#!/usr/bin/perl #package: UAM Text Tools #component: tags for utt #version: 1.0 #author: Tomasz Obrebski use strict; use locale; my $input = <>; chomp $input; our $pos_re = qr/(?:[[:upper:]]+)/; our $attr_re = qr/(?:[[:upper:]]+)/; our $val_re = qr/(?:[[:lower:][:digit:]+?!*-]|<[^>\n]+>)/; our $av_re = qr/(?:$attr_re$val_re+)/; our $avlist_re = qr/(?:$av_re+)/; our $cat_re = qr/(?:$pos_re(?:\/$avlist_re)?)/; print pre($input); sub parse ($) { my ($dstr)=@_; my $avs={}; my ($cat,$attrlist) = split '/', $dstr; ATTR: while( $attrlist =~ /($attr_re)($val_re+)/g ) { my ($attrstr,$valstr)=($1,$2); my %vals; while($valstr =~ /$val_re/g) { my $val = $&; next ATTR if $val eq '*'; $val =~ s/^<([[:lower:]])>$/$1/; $vals{$val}=1; } $avs->{$attrstr} = \%vals; # dlaczego to dziala? %vals jest lokalne } [$cat, $avs]; } sub unparse (\@) { my ($cat,$avs)= @{shift @_}; my $dstr=$cat; my @attrs = keys %$avs; if(@attrs) { $dstr .= '/'; for my $attr ( sort @attrs ) { $dstr .= $attr . (join '', sort keys %{$avs->{$attr}}); } } $dstr; } sub canonize ($) { unparse @{parse shift} ; } sub pre { my $pos_res = '[[:upper:]]+'; my $attr_res = '[[:upper:]]+'; my $val_res = '[[:lower:][:digit:]+?!*-]|<[^>\n[:cntrl:]]+>'; my $av_res = "$attr_res($val_res)+"; my $avlist_res = "($av_res)+"; my $pat = canonize(shift); my $ret; my ($pos,$avlist) = split /\//, $pat; $ret = $pos.'(\/'; while ($avlist =~ /($attr_res)(${val_res}+)/g) { my $attr = $1; my $vals = $2; my $vals = "($val_res)*(".join('|',($vals =~ /$val_res/g)).")($val_res)*"; $ret .= "($av_res)*$attr$vals"; } $ret .= "($av_res)*)?"; return $ret; }