1 | #!/usr/bin/perl |
---|
2 | |
---|
3 | #package: UAM Text Tools |
---|
4 | #component name: mrk |
---|
5 | #author: Marcin Walas |
---|
6 | |
---|
7 | #this program tags the tokenized file with given tags |
---|
8 | #tags can be given in any order and configuration through the expression |
---|
9 | #which is one of the parametres of the script |
---|
10 | #contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com |
---|
11 | |
---|
12 | use strict; |
---|
13 | use Getopt::Long; |
---|
14 | |
---|
15 | use attr; |
---|
16 | |
---|
17 | Getopt::Long::Configure('no_ignore_case_always'); |
---|
18 | |
---|
19 | my $help=0; |
---|
20 | my $pattern=0; |
---|
21 | my $macrofile=0; |
---|
22 | my $define=0; |
---|
23 | my $command=0; |
---|
24 | my $action="pgP"; |
---|
25 | my $eos="seg(EOS)"; |
---|
26 | my $explicit_space=0; |
---|
27 | |
---|
28 | #this is our help function to cut the re to get another tag |
---|
29 | #it takes only one argument which is our patern (after m4 processing) |
---|
30 | #returns: the first root-level brace with content |
---|
31 | sub cutRe |
---|
32 | { |
---|
33 | my $i = 0; |
---|
34 | my $level = 0; |
---|
35 | my $text = $_[0]; |
---|
36 | my $temp; |
---|
37 | for( $i =0; $i < (length $text);$i++) |
---|
38 | { |
---|
39 | $temp = substr($text, $i,1); |
---|
40 | if( $temp eq "(") |
---|
41 | {#we have an opening |
---|
42 | $level++; |
---|
43 | } |
---|
44 | elsif ( $temp eq ")") |
---|
45 | {#we close |
---|
46 | $level--; |
---|
47 | } |
---|
48 | if ( $level == 0) |
---|
49 | { |
---|
50 | $temp = substr($text,0,$i+1); |
---|
51 | last; |
---|
52 | } |
---|
53 | } |
---|
54 | $temp; |
---|
55 | } |
---|
56 | |
---|
57 | #the same function as above althought it returns everything after the |
---|
58 | #first root level brace |
---|
59 | sub restRe |
---|
60 | { |
---|
61 | my $i = 0; |
---|
62 | my $level = 0; |
---|
63 | my $text = $_[0]; |
---|
64 | my $temp; |
---|
65 | for( $i =0; $i < (length $text);$i++) |
---|
66 | { |
---|
67 | $temp = substr($text, $i,1); |
---|
68 | if( $temp eq "(") |
---|
69 | {#we have an opening |
---|
70 | $level++; |
---|
71 | } |
---|
72 | elsif ( $temp eq ")") |
---|
73 | {#we close |
---|
74 | $level--; |
---|
75 | } |
---|
76 | if ( $level == 0) |
---|
77 | { #we cut everything in the begining |
---|
78 | $temp = substr($text,$i+1); |
---|
79 | last; |
---|
80 | } |
---|
81 | } |
---|
82 | $temp; |
---|
83 | } |
---|
84 | |
---|
85 | GetOptions("pattern|e=s" => \$pattern, |
---|
86 | "eos|E=s" => \$eos, |
---|
87 | "macros=s" => \$macrofile, |
---|
88 | "define=s" => \$macrofile, |
---|
89 | "command" => \$command, |
---|
90 | "action=s" => \$action, |
---|
91 | "help|h" => \$help, |
---|
92 | "space|s" => \$explicit_space |
---|
93 | ); |
---|
94 | |
---|
95 | if($help) |
---|
96 | { |
---|
97 | print <<'END' |
---|
98 | Usage: mar [OPTIONS] [file ..] |
---|
99 | |
---|
100 | Options: |
---|
101 | --pattern -e PATTERN Pattern. |
---|
102 | --bos -E PATTERN Segment serving as sentence beginning marker. [TODO] |
---|
103 | --macros=FILE Read macrodefinitions from FILE. [TODO] |
---|
104 | --define=FILE Add macrodefinitions from FILE. [TODO] |
---|
105 | --action -a [p][s][P] Perform only indicated actions. |
---|
106 | p - preprocess |
---|
107 | s - search |
---|
108 | P - postprocess |
---|
109 | (default pgP) |
---|
110 | --command Print generated shell command and exit. |
---|
111 | --help -h Print help. |
---|
112 | |
---|
113 | In patern you can put any tag. Tags should begin with the @ character. |
---|
114 | They don't have to be closed. |
---|
115 | They can't contain white spaces! |
---|
116 | |
---|
117 | Note: If you don't define any custom tags, whole pattern will be taged with |
---|
118 | default tags (begining of match and end of match) |
---|
119 | |
---|
120 | Tags examples: |
---|
121 | |
---|
122 | mar -e '@BEG cat(<ADJ>) @END' |
---|
123 | it will find any adjectives in the text and tag them with surrounding tags |
---|
124 | mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)' |
---|
125 | this will find two neighbouring adjectives and parcel them with tag MYTAG |
---|
126 | |
---|
127 | Some example patterns: |
---|
128 | 'word(domu)' - form of the word domu |
---|
129 | 'lexeme(dom)' - any form of lexeme dom |
---|
130 | 'space' - space |
---|
131 | 'cat(<ADJ>)' - adjective |
---|
132 | |
---|
133 | You can use * in patterns to make zero or more counts of word. |
---|
134 | |
---|
135 | END |
---|
136 | ; |
---|
137 | exit 0; |
---|
138 | } |
---|
139 | |
---|
140 | die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/; |
---|
141 | |
---|
142 | die("$0: macro file not found") unless -e "terms.m4" and $macrofile="terms.m4"; |
---|
143 | |
---|
144 | my $preproc = ($action =~ /p/) ? ' fla | ' : ''; |
---|
145 | |
---|
146 | my $postproc = ($action =~ /P/) ? ' | unfla ' : ''; |
---|
147 | |
---|
148 | #here we are preparing re for extended matching |
---|
149 | my @tags; |
---|
150 | |
---|
151 | #we must find what our the tags |
---|
152 | #some pattern adjustment |
---|
153 | my $end = 0; |
---|
154 | my $temp = " ".$pattern." "; |
---|
155 | $temp =~ s/(\@[^ ]*) (\@[^ ]* )/\1 \2/g; |
---|
156 | $pattern = $temp; |
---|
157 | |
---|
158 | while ($end != 1) |
---|
159 | { |
---|
160 | #we seek for the first tag in pattern |
---|
161 | if ($temp =~ /^.*?\@(.*?) /) |
---|
162 | { |
---|
163 | #we add this to tags array |
---|
164 | push (@tags, $1); |
---|
165 | #and cut the pattern |
---|
166 | $temp =~ s/^.*?\@(.*?) / /; |
---|
167 | #print $temp."\n"; |
---|
168 | } |
---|
169 | else |
---|
170 | { |
---|
171 | #if we dont find any tags we end |
---|
172 | $end = 1; |
---|
173 | } |
---|
174 | } |
---|
175 | |
---|
176 | #here we have our patern with tags removed (we set sections of ()) between tags |
---|
177 | my $patternmod = "( ".$pattern." )"; |
---|
178 | $patternmod =~ s/\s@.*?\s/\)\(/g; |
---|
179 | |
---|
180 | #discarding spaces |
---|
181 | $patternmod =~ s/\s+/\\`'/g; #` |
---|
182 | # quoting escaped commas |
---|
183 | $patternmod =~ s/\\,/\\`\\`\\,''/g; |
---|
184 | # quoting commas in {m,n} r.e. operator |
---|
185 | $patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g; |
---|
186 | #print "After m4:".$re."\n"; |
---|
187 | my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' $macrofile - 2>/dev/null`; |
---|
188 | |
---|
189 | die("Incorrect pattern (m4).") if $? >> 8; |
---|
190 | |
---|
191 | |
---|
192 | chomp $re; |
---|
193 | |
---|
194 | # <> expansion |
---|
195 | |
---|
196 | $re =~ s/<([^>]+)>/`echo $1 | .\/terms\.tag2re`/ge; |
---|
197 | |
---|
198 | # Perl-like special sequences |
---|
199 | $re =~ s/\./[^ [:cntrl:]]/g; |
---|
200 | $re =~ s/\\s/[ ]/g; |
---|
201 | $re =~ s/\\S/[^ [:cntrl:]]/g; |
---|
202 | $re =~ s/\\d/[0-9]/g; |
---|
203 | $re =~ s/\\D/[^0-9 [:cntrl:]]/g; |
---|
204 | $re =~ s/\\w/[a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_]/g; |
---|
205 | $re =~ s/\\W/[^a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_ [:cntrl:]]/g; |
---|
206 | # extensions |
---|
207 | $re =~ s/\\l/[a-z±æê³ñ󶌿]/g; #lowercase letter |
---|
208 | $re =~ s/\\L/[A-Z¡ÆÊ£ÑÓŠ¬¯]/g; #upercase letter |
---|
209 | |
---|
210 | my $sedcommand; |
---|
211 | my $grepcommand; |
---|
212 | |
---|
213 | #now we must built a sed script from our re |
---|
214 | #we do this by cuting our re each tag until we cut them all |
---|
215 | #if an user dint input any tags we do our default |
---|
216 | my $defBOM = "BOM"; |
---|
217 | my $defEOM = "EOM"; |
---|
218 | my $defTempTagBeg = "####TempTAGBEG####"; |
---|
219 | my $defTempTagEnd = "####TempTAGEND####"; |
---|
220 | |
---|
221 | if (@tags == 0) |
---|
222 | { |
---|
223 | $sedcommand = "sed -r 's/($re)/\\500 $defBOM *\\f\\1###EOM###/g; s/###EOM###([0-9]+)/\\1 00 $defEOM *\\f\\1/g'"; |
---|
224 | } |
---|
225 | else #we have custom tags |
---|
226 | { |
---|
227 | #first tag is easy to tag :) |
---|
228 | my $sedscript="sed -r 's/($re)/\\600 $defTempTagBeg *\\f\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $defTempTagEnd *\\f\\1/g;"; |
---|
229 | #after first step we have temp tagged parts of input matching re |
---|
230 | #now we need to insert our custom tags |
---|
231 | #we will find temp tags and process our input |
---|
232 | |
---|
233 | my $i = 0; |
---|
234 | #copy of re which will be cut |
---|
235 | my $rec = $re; |
---|
236 | my $restre = $re; |
---|
237 | |
---|
238 | for ($i = 0 ; $i < @tags ; $i++) |
---|
239 | { |
---|
240 | #re cutting |
---|
241 | $rec = cutRe($restre); |
---|
242 | $restre = restRe($restre); |
---|
243 | if ($rec =~ / *\( *\) */) |
---|
244 | { |
---|
245 | $sedscript = $sedscript."s/([0-9]+) 00 $defTempTagBeg \\*\\f([0-9]+)/\\2 00 $tags[$i] *\\f\\2 00 $defTempTagBeg *\\f\\2/g;"; |
---|
246 | } |
---|
247 | else |
---|
248 | { |
---|
249 | $sedscript = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f($rec)/\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $tags[$i] *\\f\\1 00 $defTempTagBeg *\\f\\1/g;"; |
---|
250 | } |
---|
251 | |
---|
252 | } |
---|
253 | |
---|
254 | $sedcommand = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f//g;s/[0-9]+ 00 $defTempTagEnd \\*\\f//g'"; |
---|
255 | } |
---|
256 | |
---|
257 | if($command) |
---|
258 | { |
---|
259 | print $sedcommand."\n"; |
---|
260 | exit 0; |
---|
261 | } |
---|
262 | exec $preproc.$sedcommand.$postproc; |
---|