1 | #!/usr/bin/perl |
---|
2 | |
---|
3 | #package: UAM Text Tools |
---|
4 | #component: tags for utt |
---|
5 | #version: 1.0 |
---|
6 | #author: Tomasz Obrebski |
---|
7 | |
---|
8 | use strict; |
---|
9 | use locale; |
---|
10 | |
---|
11 | my $input = <>; |
---|
12 | chomp $input; |
---|
13 | |
---|
14 | our $pos_re = qr/(?:[[:upper:]]+)/; |
---|
15 | our $attr_re = qr/(?:[[:upper:]]+)/; |
---|
16 | our $val_re = qr/(?:[[:lower:][:digit:]+?!*-]|<[^>\n]+>)/; |
---|
17 | our $av_re = qr/(?:$attr_re$val_re+)/; |
---|
18 | our $avlist_re = qr/(?:$av_re+)/; |
---|
19 | our $cat_re = qr/(?:$pos_re(?:\/$avlist_re)?)/; |
---|
20 | |
---|
21 | print pre($input); |
---|
22 | |
---|
23 | sub parse ($) |
---|
24 | { |
---|
25 | my ($dstr)=@_; |
---|
26 | my $avs={}; |
---|
27 | my ($cat,$attrlist) = split '/', $dstr; |
---|
28 | ATTR: |
---|
29 | while( $attrlist =~ /($attr_re)($val_re+)/g ) |
---|
30 | { |
---|
31 | my ($attrstr,$valstr)=($1,$2); |
---|
32 | my %vals; |
---|
33 | while($valstr =~ /$val_re/g) |
---|
34 | { |
---|
35 | my $val = $&; |
---|
36 | next ATTR if $val eq '*'; |
---|
37 | $val =~ s/^<([[:lower:]])>$/$1/; |
---|
38 | $vals{$val}=1; |
---|
39 | } |
---|
40 | |
---|
41 | $avs->{$attrstr} = \%vals; # dlaczego to dziala? %vals jest lokalne |
---|
42 | } |
---|
43 | [$cat, $avs]; |
---|
44 | } |
---|
45 | |
---|
46 | sub unparse (\@) |
---|
47 | { |
---|
48 | my ($cat,$avs)= @{shift @_}; |
---|
49 | my $dstr=$cat; |
---|
50 | my @attrs = keys %$avs; |
---|
51 | if(@attrs) |
---|
52 | { |
---|
53 | $dstr .= '/'; |
---|
54 | for my $attr ( sort @attrs ) |
---|
55 | { |
---|
56 | $dstr .= $attr . (join '', sort keys %{$avs->{$attr}}); |
---|
57 | } |
---|
58 | } |
---|
59 | $dstr; |
---|
60 | } |
---|
61 | |
---|
62 | sub canonize ($) |
---|
63 | { |
---|
64 | unparse @{parse shift} ; |
---|
65 | } |
---|
66 | |
---|
67 | sub pre |
---|
68 | { |
---|
69 | my $pos_res = '[[:upper:]]+'; |
---|
70 | my $attr_res = '[[:upper:]]+'; |
---|
71 | my $val_res = '[[:lower:][:digit:]+?!*-]|<[^>\n[:cntrl:]]+>'; |
---|
72 | my $av_res = "$attr_res($val_res)+"; |
---|
73 | my $avlist_res = "($av_res)+"; |
---|
74 | |
---|
75 | my $pat = canonize(shift); |
---|
76 | my $ret; |
---|
77 | my ($pos,$avlist) = split /\//, $pat; |
---|
78 | $ret = $pos.'(\/'; |
---|
79 | while ($avlist =~ /($attr_res)(${val_res}+)/g) |
---|
80 | { |
---|
81 | my $attr = $1; |
---|
82 | my $vals = $2; |
---|
83 | my $vals = "($val_res)*(".join('|',($vals =~ /$val_res/g)).")($val_res)*"; |
---|
84 | $ret .= "($av_res)*$attr$vals"; |
---|
85 | } |
---|
86 | $ret .= "($av_res)*)?"; |
---|
87 | return $ret; |
---|
88 | } |
---|
89 | |
---|