1 | #!/usr/bin/perl |
---|
2 | |
---|
3 | #package: UAM Text Tools |
---|
4 | #component: mar |
---|
5 | #version: 1.0 |
---|
6 | #author: Marcin Walas |
---|
7 | |
---|
8 | #this program tags the tokenized file with given tags |
---|
9 | #tags can be given in any order and configuration through the expression |
---|
10 | #which is one of the parameters of the script |
---|
11 | #contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com |
---|
12 | |
---|
13 | my $version = '1.0'; |
---|
14 | |
---|
15 | use lib "/usr/local/lib/utt"; |
---|
16 | use lib "$ENV{'HOME'}/.local/lib/utt"; |
---|
17 | |
---|
18 | use strict; |
---|
19 | use Getopt::Long; |
---|
20 | use File::HomeDir; |
---|
21 | |
---|
22 | use attr; |
---|
23 | |
---|
24 | |
---|
# Installation directory of the UTT library files and the two configuration
# files (system-wide first, then per-user) consulted before the command line.
my $LIB_DIR="/usr/local/lib/utt";
my $systemconfigfile='/usr/local/etc/utt/mar.conf';
my $userconfigfile=home()."/.utt/mar.conf";

# Turn off case-insensitive option matching so that -e and -E stay distinct.
Getopt::Long::Configure('no_ignore_case_always');

# Option defaults. They may be overridden by the configuration files below
# and, after that, by the command line.
my $help=0;
my $pattern=0;
my $macrofile=0;
my $define=0;            # declared for --define; not referenced anywhere yet
my $command=0;
my $action="pgP";
my $eos="seg(EOS)";
my $explicit_space=0;
my $morfield='lem';
my $tags=0;
my $show_version = 0;

#read configuration files###########################
# Keys whose value (the text after '=') is stored in the referenced variable.
my %config_value_for = (
    pattern => \$pattern,
    e       => \$pattern,
    eos     => \$eos,
    macros  => \$macrofile,
    tags    => \$tags,
    morph   => \$morfield,
    action  => \$action,
);
# Keys that act as switches: their mere presence sets the variable to 1,
# whatever value (if any) follows them.
my %config_flag_for = (
    command => \$command,
    space   => \$explicit_space,
    help    => \$help,
    h       => \$help,
);

foreach my $config_path ($systemconfigfile, $userconfigfile) {
    # A missing or unreadable configuration file is silently skipped.
    # Three-argument open keeps odd characters in the path from being
    # interpreted as an open mode.
    open(my $config, '<', $config_path) or next;
    while (my $line = <$config>) {
        chomp $line;
        $line =~ s/#.*//;       # strip comments
        $line =~ s/^\s+//;      # strip leading whitespace
        $line =~ s/\s+$//;      # strip trailing whitespace
        next unless length $line;
        my ($name, $value) = split(/\s*=\s*/, $line, 2);
        if (exists $config_value_for{$name}) {
            ${ $config_value_for{$name} } = $value;
        }
        elsif (exists $config_flag_for{$name}) {
            ${ $config_flag_for{$name} } = 1;
        }
        # unknown keys are ignored
    }
    close $config;
}
---|
86 | ######################################################### |
---|
87 | |
---|
# Command-line options; they override whatever the configuration files set.
# Note: --define currently writes to the same variable as --macros.
# -a is accepted as a short form of --action, as the help text promises.
GetOptions("pattern|e=s" => \$pattern,
           "eos|E=s"     => \$eos,
           "macros=s"    => \$macrofile,
           "define=s"    => \$macrofile,
           "command"     => \$command,
           "action|a=s"  => \$action,
           "help|h"      => \$help,
           "space|s"     => \$explicit_space,
           "version|v"   => \$show_version,
          );

# --version: print the script version and stop.
if ($show_version) {
    print "Version: $version\n";
    exit 0;
}

# --help: print the usage text and stop.
# The action letters shown here match the code: the default is "pgP" and the
# pattern check looks for the letter 'g'.
if ($help) {
    print <<'END';
Usage: mar [OPTIONS] [file ..]

Options:
--pattern -e PATTERN Pattern.
--eos -E PATTERN Segment serving as sentence beginning marker. [TODO]
--macros=FILE Read macrodefinitions from FILE. [TODO]
--define=FILE Add macrodefinitions from FILE. [TODO]
--action -a [p][g][P] Perform only indicated actions.
p - preprocess
g - search
P - postprocess
(default pgP)
--command Print generated shell command and exit.
--help -h Print help.
--version -v Script version

In patern you can put any tag. Tags should begin with the @ character.
They don't have to be closed.
They can't contain white spaces!

Note: If you don't define any custom tags, whole pattern will be taged with
default tags (begining of match and end of match)

Tags examples:

mar -e '@BEG cat(<ADJ>) @END'
it will find any adjectives in the text and tag them with surrounding tags
mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)'
this will find two neighbouring adjectives and parcel them with tag MYTAG

Some example patterns:
'word(domu)' - form of the word domu
'lexeme(dom)' - any form of lexeme dom
'space' - space
'cat(<ADJ>)' - adjective

You can use * in patterns to make zero or more counts of word.

END
    exit 0;
}
---|
151 | |
---|
# A pattern is mandatory whenever the action string contains 'g'.
if (!$pattern && $action =~ /g/) {
    die("$0: no pattern given. Run with -h to get help.\n");
}

# When no macro file was selected, fall back to terms.m4 from the library
# directory; give up if that file does not exist either.
if (!$macrofile) {
    my $default_macros = "$LIB_DIR/terms.m4";
    if (-e $default_macros) {
        $macrofile = $default_macros;
    }
    else {
        die("$0: macro file not found");
    }
}

# Pipeline stage put in front of the sed command when the action has 'p'.
my $preproc = '';
$preproc = ' fla | ' if $action =~ /p/;

# Pipeline stage appended after the sed command when the action has 'P'.
my $postproc = '';
$postproc = ' | unfla ' if $action =~ /P/;
---|
161 | |
---|
162 | |
---|
#this is our helper function which cuts the re to get the part for another tag
#it takes only one argument which is our pattern (after m4 processing)
#returns: the first root-level parenthesised group together with its content
#
#The text is scanned token by token so that only real grouping parentheses
#are counted: a backslash-escaped character (e.g. \( ) and a whole bracket
#expression (e.g. [^)] or [[:cntrl:]]) are each treated as one opaque token.
#If the text does not start with "(", the first token is returned.
#If the text is empty or the parentheses never balance, the whole text is
#returned unchanged.
sub cutRe
{
    my ($text) = @_;

    # One regex token; only a plain single character is captured in $1.
    my $token = qr{
          \\ .                      # backslash escape, next char is literal
        | \[ \^? \]?                # bracket expression start, optional negation
          (?: \[: [^\]]* :\]        #   character class name
            | \[= [^\]]* =\]        #   equivalence class
            | \[\. [^\]]* \.\]      #   collating symbol
            | [^\]]                 #   any other member
          )*
          \]
        | ( . )                     # any other single character
    }xs;

    my $depth = 0;
    while ($text =~ /\G$token/g)
    {
        if (defined $1)
        {
            $depth++ if $1 eq "(";      #we have an opening
            $depth-- if $1 eq ")";      #we close
        }
        # back at root level: everything scanned so far is the first group
        return substr($text, 0, pos($text)) if $depth == 0;
    }
    return $text;
}
---|
191 | |
---|
#companion of cutRe: it takes the pattern (after m4 processing) and
#returns everything that follows the first root-level parenthesised group
#
#The text is scanned token by token so that only real grouping parentheses
#are counted: a backslash-escaped character (e.g. \( ) and a whole bracket
#expression (e.g. [^)] or [[:cntrl:]]) are each treated as one opaque token.
#If the text does not start with "(", everything after the first token is
#returned. If the text is empty or the parentheses never balance, nothing is
#left over and an empty string is returned.
sub restRe
{
    my ($text) = @_;

    # One regex token; only a plain single character is captured in $1.
    my $token = qr{
          \\ .                      # backslash escape, next char is literal
        | \[ \^? \]?                # bracket expression start, optional negation
          (?: \[: [^\]]* :\]        #   character class name
            | \[= [^\]]* =\]        #   equivalence class
            | \[\. [^\]]* \.\]      #   collating symbol
            | [^\]]                 #   any other member
          )*
          \]
        | ( . )                     # any other single character
    }xs;

    my $depth = 0;
    while ($text =~ /\G$token/g)
    {
        if (defined $1)
        {
            $depth++ if $1 eq "(";      #we have an opening
            $depth-- if $1 eq ")";      #we close
        }
        # back at root level: we cut everything scanned so far
        return substr($text, pos($text)) if $depth == 0;
    }
    return '';
}
---|
219 | |
---|
220 | |
---|
#here we are preparing re for extended matching
#names of the tags found in the pattern (without the leading @), in order
my @tags;

#we must find what the tags are
#some pattern adjustment: the pattern is padded with a space on both sides
#and the single space between two directly adjacent tags is doubled.
#Every tag then has its own whitespace on both sides, which the later
#substitution of " @TAG " by ")(" relies on (its matches cannot overlap).
#The lookahead leaves the following tag unconsumed, so runs of three or more
#adjacent tags are handled as well.
my $temp = " ".$pattern." ";
$temp =~ s/(\@[^ ]*) (?=\@)/$1  /g;
$pattern = $temp;

#we repeatedly cut the first tag out of the working copy and remember its
#name; the loop ends when no tag is left
while ($temp =~ s/^.*?\@(.*?) / /)
{
    push(@tags, $1);
}
---|
248 | |
---|
#the pattern is wrapped in one pair of parentheses and every tag (together
#with the whitespace around it) becomes ")(", so the text between tags ends
#up in its own group
my $patternmod = "( $pattern )";
$patternmod =~ s/\s@.*?\s/\)\(/g;

#the substitutions below prepare the text for being echoed to m4 from inside
#a double-quoted shell string
#discarding spaces
$patternmod =~ s/\s+/\\`'/g; #`
# quoting escaped commas
$patternmod =~ s/\\,/\\`\\`\\,''/g;
# quoting commas in {m,n} r.e. operator
$patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g;

#macro expansion is done by m4 using the selected macro file
my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`;

if ($? >> 8)
{
    die("Incorrect pattern (m4).");
}

chomp($re);

# <> expansion: each <...> is replaced by the output of the external
# command "$tags.tag2re" fed with the text between the angle brackets

$re =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge;

# Rewriting table, applied strictly in this order: [ what to find, literal
# replacement text ]
my @sequence_rules = (
    # Perl-like special sequences
    [ qr/\./,  '[^ [:cntrl:]]' ],
    [ qr/\\s/, '[ ]' ],
    [ qr/\\S/, '[^ [:cntrl:]]' ],
    [ qr/\\d/, '[0-9]' ],
    [ qr/\\D/, '[^0-9 [:cntrl:]]' ],
    [ qr/\\w/, '[a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_]' ],
    [ qr/\\W/, '[^a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_ [:cntrl:]]' ],
    # extensions
    [ qr/\\l/, '[a-z±æê³ñ󶌿]' ],    #lowercase letter
    [ qr/\\L/, '[A-Z¡ÆÊ£ÑÓŠ¬¯]' ],    #uppercase letter
);
foreach my $rule (@sequence_rules)
{
    my ($sequence, $class) = @$rule;
    $re =~ s/$sequence/$class/g;
}
---|
283 | |
---|
my $sedcommand;
my $grepcommand;

#now we must build a sed script from our re
#the re is cut group by group, one cut per tag, until all tags are placed
#if the user did not give any tags we use the default ones
my $defBOM = "BOM";
my $defEOM = "EOM";
my $defTempTagBeg = "####TempTAGBEG####";
my $defTempTagEnd = "####TempTAGEND####";

if (!@tags)
{
    #no custom tags: the whole match is surrounded by the default markers
    $sedcommand = "sed -r 's/($re)/\\500 $defBOM *\\f\\1###EOM###/g; s/###EOM###([0-9]+)/\\1 00 $defEOM *\\f\\1/g'";
}
else
{
    #custom tags
    #step one: every match of the whole re is surrounded by temporary markers
    my $sedscript = "sed -r 's/($re)/\\600 $defTempTagBeg *\\f\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $defTempTagEnd *\\f\\1/g;";

    #step two: for every tag the next root-level group is cut off the front
    #of the remaining re and a sed expression is appended which inserts the
    #tag and moves the temporary begin marker forward
    my $restre = $re;
    foreach my $tag (@tags)
    {
        my $rec = cutRe($restre);
        $restre = restRe($restre);

        if ($rec =~ / *\( *\) */)
        {
            #the group has no content: the tag goes right where the marker is
            $sedscript .= "s/([0-9]+) 00 $defTempTagBeg \\*\\f([0-9]+)/\\2 00 $tag *\\f\\2 00 $defTempTagBeg *\\f\\2/g;";
        }
        else
        {
            #the group has content: the tag goes after the text it matches
            $sedscript .= "s/[0-9]+ 00 $defTempTagBeg \\*\\f($rec)/\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $tag *\\f\\1 00 $defTempTagBeg *\\f\\1/g;";
        }
    }

    #step three: the temporary markers are removed again
    $sedcommand = $sedscript . "s/[0-9]+ 00 $defTempTagBeg \\*\\f//g;s/[0-9]+ 00 $defTempTagEnd \\*\\f//g'";
}

#--command: only show the generated sed command
if ($command)
{
    print "$sedcommand\n";
    exit 0;
}

#otherwise this process is replaced by the whole shell pipeline
my $pipeline = $preproc . $sedcommand . $postproc;
exec $pipeline;
---|