[25ae32e] | 1 | #!/usr/bin/perl |
---|
| 2 | |
---|
[20b4e44] | 3 | #package: UAM Text Tools |
---|
| 4 | #component: tags for utt |
---|
| 5 | #version: 1.0 |
---|
| 6 | #author: Tomasz Obrebski |
---|
| 7 | |
---|
| 8 | use strict; |
---|
[25ae32e] | 9 | use locale; |
---|
| 10 | |
---|
| 11 | my $input = <>; |
---|
| 12 | chomp $input; |
---|
| 13 | |
---|
| 14 | our $pos_re = qr/(?:[[:upper:]]+)/; |
---|
| 15 | our $attr_re = qr/(?:[[:upper:]]+)/; |
---|
| 16 | our $val_re = qr/(?:[[:lower:][:digit:]+?!*-]|<[^>\n]+>)/; |
---|
| 17 | our $av_re = qr/(?:$attr_re$val_re+)/; |
---|
| 18 | our $avlist_re = qr/(?:$av_re+)/; |
---|
| 19 | our $cat_re = qr/(?:$pos_re(?:\/$avlist_re)?)/; |
---|
| 20 | |
---|
| 21 | print pre($input); |
---|
| 22 | |
---|
| 23 | sub parse ($) |
---|
| 24 | { |
---|
| 25 | my ($dstr)=@_; |
---|
| 26 | my $avs={}; |
---|
| 27 | my ($cat,$attrlist) = split '/', $dstr; |
---|
| 28 | ATTR: |
---|
| 29 | while( $attrlist =~ /($attr_re)($val_re+)/g ) |
---|
| 30 | { |
---|
| 31 | my ($attrstr,$valstr)=($1,$2); |
---|
| 32 | my %vals; |
---|
| 33 | while($valstr =~ /$val_re/g) |
---|
| 34 | { |
---|
| 35 | my $val = $&; |
---|
| 36 | next ATTR if $val eq '*'; |
---|
| 37 | $val =~ s/^<([[:lower:]])>$/$1/; |
---|
| 38 | $vals{$val}=1; |
---|
| 39 | } |
---|
| 40 | |
---|
| 41 | $avs->{$attrstr} = \%vals; # dlaczego to dziala? %vals jest lokalne |
---|
| 42 | } |
---|
| 43 | [$cat, $avs]; |
---|
| 44 | } |
---|
| 45 | |
---|
| 46 | sub unparse (\@) |
---|
| 47 | { |
---|
| 48 | my ($cat,$avs)= @{shift @_}; |
---|
| 49 | my $dstr=$cat; |
---|
| 50 | my @attrs = keys %$avs; |
---|
| 51 | if(@attrs) |
---|
| 52 | { |
---|
| 53 | $dstr .= '/'; |
---|
| 54 | for my $attr ( sort @attrs ) |
---|
| 55 | { |
---|
| 56 | $dstr .= $attr . (join '', sort keys %{$avs->{$attr}}); |
---|
| 57 | } |
---|
| 58 | } |
---|
| 59 | $dstr; |
---|
| 60 | } |
---|
| 61 | |
---|
| 62 | sub canonize ($) |
---|
| 63 | { |
---|
| 64 | unparse @{parse shift} ; |
---|
| 65 | } |
---|
| 66 | |
---|
| 67 | sub pre |
---|
| 68 | { |
---|
| 69 | my $pos_res = '[[:upper:]]+'; |
---|
| 70 | my $attr_res = '[[:upper:]]+'; |
---|
| 71 | my $val_res = '[[:lower:][:digit:]+?!*-]|<[^>\n[:cntrl:]]+>'; |
---|
| 72 | my $av_res = "$attr_res($val_res)+"; |
---|
| 73 | my $avlist_res = "($av_res)+"; |
---|
| 74 | |
---|
| 75 | my $pat = canonize(shift); |
---|
| 76 | my $ret; |
---|
| 77 | my ($pos,$avlist) = split /\//, $pat; |
---|
| 78 | $ret = $pos.'(\/'; |
---|
| 79 | while ($avlist =~ /($attr_res)(${val_res}+)/g) |
---|
| 80 | { |
---|
| 81 | my $attr = $1; |
---|
| 82 | my $vals = $2; |
---|
| 83 | my $vals = "($val_res)*(".join('|',($vals =~ /$val_res/g)).")($val_res)*"; |
---|
| 84 | $ret .= "($av_res)*$attr$vals"; |
---|
| 85 | } |
---|
| 86 | $ret .= "($av_res)*)?"; |
---|
| 87 | return $ret; |
---|
| 88 | } |
---|
| 89 | |
---|