source: app/src/mar/mar @ e1b08a2

help
Last change on this file since e1b08a2 was 25ae32e, checked in by obrebski <obrebski@…>, 16 years ago

git-svn-id: svn://atos.wmid.amu.edu.pl/utt@4 e293616e-ec6a-49c2-aa92-f4a8b91c5d16

  • Property mode set to 100755
File size: 6.2 KB
Line 
1#!/usr/bin/perl
2
3#package: UAM Text Tools
4#component name: mrk
5#author: Marcin Walas
6
7#this program tags the tokenized file with given tags
8#tags can be given in any order and configuration through the expression
9#which is one of the parametres of the script
10#contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com
11
12use strict;
13use Getopt::Long;
14
15use attr;
16
17Getopt::Long::Configure('no_ignore_case_always');
18
19my $help=0;
20my $pattern=0;
21my $macrofile=0;
22my $define=0;
23my $command=0;
24my $action="pgP";
25my $eos="seg(EOS)";
26my $explicit_space=0;
27
28#this is our help function to  cut the re to get another tag
29#it takes only one argument which is our patern (after m4 processing)
30#returns: the first root-level brace with content
31sub cutRe
32{
33 my $i = 0;
34 my $level = 0;
35 my $text = $_[0];
36 my $temp;
37 for( $i =0; $i < (length $text);$i++)
38 {
39         $temp = substr($text, $i,1);
40         if( $temp eq "(")
41         {#we have an opening
42                 $level++;
43         }
44         elsif ( $temp eq ")")
45         {#we close
46                 $level--;
47         }
48         if ( $level == 0)
49         { 
50                $temp  = substr($text,0,$i+1);
51                last;
52         }
53 }
54 $temp;
55}
56
57#the same function as above althought it returns everything after the
58#first root level brace
59sub restRe
60{
61 my $i = 0;
62 my $level = 0;
63 my $text = $_[0];
64 my $temp;
65 for( $i =0; $i < (length $text);$i++)
66 {
67         $temp = substr($text, $i,1);
68         if( $temp eq "(")
69         {#we have an opening
70                 $level++;
71         }
72         elsif ( $temp eq ")")
73         {#we close
74                 $level--;
75         }
76         if ( $level == 0)
77         { #we cut everything in the begining
78                $temp  = substr($text,$i+1);
79                last;
80         }
81 }
82 $temp;
83}
84
85GetOptions("pattern|e=s" => \$pattern,
86           "eos|E=s" => \$eos,
87           "macros=s" => \$macrofile,
88           "define=s" => \$macrofile,
89           "command" => \$command,
90           "action=s" => \$action,
91           "help|h" => \$help,
92           "space|s" => \$explicit_space
93   );
94
95if($help)
96{
97    print <<'END'
98Usage: mar [OPTIONS] [file ..]
99
100Options:
101   --pattern -e PATTERN         Pattern.
102   --bos -E PATTERN             Segment serving as sentence beginning marker. [TODO]
103   --macros=FILE                Read macrodefinitions from FILE. [TODO]
104   --define=FILE                Add macrodefinitions from FILE. [TODO]
105   --action -a [p][s][P]        Perform only indicated actions.
106                                    p - preprocess
107                                    s - search
108                                    P - postprocess
109                                (default pgP)
110   --command                    Print generated shell command and exit.
111   --help -h                    Print help.
112
113In patern you can put any tag. Tags should begin with the @ character.
114They don't have to be closed.
115They can't contain white spaces!
116
117Note: If you don't define any custom tags, whole pattern will be taged with
118      default tags (begining of match and end of match)
119
120Tags examples:
121
122mar -e '@BEG cat(<ADJ>) @END'
123  it will find any adjectives in the text and tag them with surrounding tags
124mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)'
125  this will find two neighbouring adjectives and parcel them with tag MYTAG
126
127Some example patterns:
128'word(domu)'  - form of the word domu
129'lexeme(dom)' - any form of lexeme dom
130'space'       - space
131'cat(<ADJ>)'  - adjective
132
133You can use * in patterns to make zero or more counts of word.
134
135END
136;
137    exit 0;
138}
139
140die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/;
141
142die("$0: macro file not found") unless -e "terms.m4" and $macrofile="terms.m4";
143
144my $preproc    = ($action =~ /p/) ? ' fla  | '  : '';
145
146my $postproc   = ($action =~ /P/) ? ' | unfla '  : '';
147
148#here we are preparing re for extended matching
149my @tags;
150
151#we must find what our the tags
152#some pattern adjustment
153my $end = 0;
154my $temp = " ".$pattern." ";
155$temp =~ s/(\@[^ ]*) (\@[^ ]* )/\1  \2/g;
156$pattern = $temp;
157
158while ($end != 1)
159{
160        #we seek for the first tag in pattern
161 if ($temp =~ /^.*?\@(.*?) /)
162 {
163         #we add this to tags array
164         push (@tags, $1);
165         #and cut the pattern
166         $temp =~ s/^.*?\@(.*?) / /;
167         #print $temp."\n";
168 }
169 else
170 {
171         #if we dont find any tags we end
172         $end = 1;
173 }
174}
175
176#here we have our patern with tags removed (we set sections of ()) between tags
177my $patternmod = "( ".$pattern." )";
178$patternmod =~ s/\s@.*?\s/\)\(/g;
179
180#discarding spaces
181$patternmod =~ s/\s+/\\`'/g; #`
182# quoting escaped commas
183$patternmod =~ s/\\,/\\`\\`\\,''/g;
184# quoting commas in {m,n} r.e. operator
185$patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g;
186#print "After m4:".$re."\n";
187my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' $macrofile - 2>/dev/null`;
188
189die("Incorrect pattern (m4).") if $? >> 8;
190
191
192chomp $re;
193
194# <> expansion
195
196$re =~ s/<([^>]+)>/`echo $1 | .\/terms\.tag2re`/ge;
197
198# Perl-like special sequences
199$re =~ s/\./[^ [:cntrl:]]/g;
200$re =~ s/\\s/[ ]/g;
201$re =~ s/\\S/[^ [:cntrl:]]/g;
202$re =~ s/\\d/[0-9]/g;
203$re =~ s/\\D/[^0-9 [:cntrl:]]/g;
204$re =~ s/\\w/[a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_]/g;
205$re =~ s/\\W/[^a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_ [:cntrl:]]/g;
206# extensions
207$re =~ s/\\l/[a-z±æê³ñ󶌿]/g; #lowercase letter
208$re =~ s/\\L/[A-Z¡ÆÊ£ÑÓŠ¬¯]/g; #upercase letter
209
210my $sedcommand;
211my $grepcommand;
212
213#now we must built a sed script from our re
214#we do this by cuting our re each tag until we cut them all
215#if an user dint input any tags we do our default
216my $defBOM = "BOM";
217my $defEOM = "EOM";
218my $defTempTagBeg = "####TempTAGBEG####";
219my $defTempTagEnd = "####TempTAGEND####";
220
221if (@tags == 0)
222{
223        $sedcommand = "sed -r 's/($re)/\\500 $defBOM *\\f\\1###EOM###/g; s/###EOM###([0-9]+)/\\1 00 $defEOM *\\f\\1/g'";
224}
225else #we have custom tags
226{
227        #first tag is easy to tag :)
228        my $sedscript="sed -r 's/($re)/\\600 $defTempTagBeg *\\f\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $defTempTagEnd *\\f\\1/g;";
229    #after first step we have temp tagged parts of input matching re
230    #now we need to insert our custom tags
231    #we will find temp tags and process our input
232   
233        my $i = 0;
234        #copy of re which will be cut
235        my $rec = $re;
236        my $restre = $re;
237       
238        for ($i = 0 ; $i < @tags ; $i++)
239        {
240         #re cutting
241         $rec = cutRe($restre);
242         $restre = restRe($restre);
243         if ($rec =~ / *\( *\) */)
244         {
245            $sedscript = $sedscript."s/([0-9]+) 00 $defTempTagBeg \\*\\f([0-9]+)/\\2 00 $tags[$i] *\\f\\2 00 $defTempTagBeg *\\f\\2/g;";
246         }
247         else
248         {
249            $sedscript = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f($rec)/\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $tags[$i] *\\f\\1 00 $defTempTagBeg *\\f\\1/g;";
250         }
251
252        }
253       
254        $sedcommand = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f//g;s/[0-9]+ 00 $defTempTagEnd \\*\\f//g'";
255}
256
257if($command)
258{
259    print $sedcommand."\n";
260    exit 0;
261}
262exec $preproc.$sedcommand.$postproc;
Note: See TracBrowser for help on using the repository browser.