Context Navigation

source: app/src/mar/mar @ e1b08a2

help

Last change on this file since e1b08a2 was 25ae32e, checked in by obrebski <obrebski@…>, 16 years ago
git-svn-id: svn://atos.wmid.amu.edu.pl/utt@4 e293616e-ec6a-49c2-aa92-f4a8b91c5d16
Property mode set to `100755`
File size: 6.2 KB

Line
1	#!/usr/bin/perl
2
3	#package: UAM Text Tools
4	#component name: mrk
5	#author: Marcin Walas
6
7	#this program tags the tokenized file with given tags
8	#tags can be given in any order and configuration through the expression
9	#which is one of the parametres of the script
10	#contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com
11
12	use strict;
13	use Getopt::Long;
14
15	use attr;
16
17	Getopt::Long::Configure('no_ignore_case_always');
18
19	my $help=0;
20	my $pattern=0;
21	my $macrofile=0;
22	my $define=0;
23	my $command=0;
24	my $action="pgP";
25	my $eos="seg(EOS)";
26	my $explicit_space=0;
27
28	#this is our help function to cut the re to get another tag
29	#it takes only one argument which is our patern (after m4 processing)
30	#returns: the first root-level brace with content
31	sub cutRe
32	{
33	my $i = 0;
34	my $level = 0;
35	my $text = $_[0];
36	my $temp;
37	for( $i =0; $i < (length $text);$i++)
38	{
39	$temp = substr($text, $i,1);
40	if( $temp eq "(")
41	{#we have an opening
42	$level++;
43	}
44	elsif ( $temp eq ")")
45	{#we close
46	$level--;
47	}
48	if ( $level == 0)
49	{
50	$temp = substr($text,0,$i+1);
51	last;
52	}
53	}
54	$temp;
55	}
56
57	#the same function as above althought it returns everything after the
58	#first root level brace
59	sub restRe
60	{
61	my $i = 0;
62	my $level = 0;
63	my $text = $_[0];
64	my $temp;
65	for( $i =0; $i < (length $text);$i++)
66	{
67	$temp = substr($text, $i,1);
68	if( $temp eq "(")
69	{#we have an opening
70	$level++;
71	}
72	elsif ( $temp eq ")")
73	{#we close
74	$level--;
75	}
76	if ( $level == 0)
77	{ #we cut everything in the begining
78	$temp = substr($text,$i+1);
79	last;
80	}
81	}
82	$temp;
83	}
84
85	GetOptions("pattern\|e=s" => \$pattern,
86	"eos\|E=s" => \$eos,
87	"macros=s" => \$macrofile,
88	"define=s" => \$macrofile,
89	"command" => \$command,
90	"action=s" => \$action,
91	"help\|h" => \$help,
92	"space\|s" => \$explicit_space
93	);
94
95	if($help)
96	{
97	print <<'END'
98	Usage: mar [OPTIONS] [file ..]
99
100	Options:
101	--pattern -e PATTERN Pattern.
102	--bos -E PATTERN Segment serving as sentence beginning marker. [TODO]
103	--macros=FILE Read macrodefinitions from FILE. [TODO]
104	--define=FILE Add macrodefinitions from FILE. [TODO]
105	--action -a [p][s][P] Perform only indicated actions.
106	p - preprocess
107	s - search
108	P - postprocess
109	(default pgP)
110	--command Print generated shell command and exit.
111	--help -h Print help.
112
113	In patern you can put any tag. Tags should begin with the @ character.
114	They don't have to be closed.
115	They can't contain white spaces!
116
117	Note: If you don't define any custom tags, whole pattern will be taged with
118	default tags (begining of match and end of match)
119
120	Tags examples:
121
122	mar -e '@BEG cat(<ADJ>) @END'
123	it will find any adjectives in the text and tag them with surrounding tags
124	mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)'
125	this will find two neighbouring adjectives and parcel them with tag MYTAG
126
127	Some example patterns:
128	'word(domu)' - form of the word domu
129	'lexeme(dom)' - any form of lexeme dom
130	'space' - space
131	'cat(<ADJ>)' - adjective
132
133	You can use * in patterns to make zero or more counts of word.
134
135	END
136	;
137	exit 0;
138	}
139
140	die("$0: no pattern given. Run with -h to get help.\n") unless $pattern \|\| $action !~ /g/;
141
142	die("$0: macro file not found") unless -e "terms.m4" and $macrofile="terms.m4";
143
144	my $preproc = ($action =~ /p/) ? ' fla \| ' : '';
145
146	my $postproc = ($action =~ /P/) ? ' \| unfla ' : '';
147
148	#here we are preparing re for extended matching
149	my @tags;
150
151	#we must find what our the tags
152	#some pattern adjustment
153	my $end = 0;
154	my $temp = " ".$pattern." ";
155	$temp =~ s/(\@[^ ]) (\@[^ ] )/\1 \2/g;
156	$pattern = $temp;
157
158	while ($end != 1)
159	{
160	#we seek for the first tag in pattern
161	if ($temp =~ /^.?\@(.?) /)
162	{
163	#we add this to tags array
164	push (@tags, $1);
165	#and cut the pattern
166	$temp =~ s/^.?\@(.?) / /;
167	#print $temp."\n";
168	}
169	else
170	{
171	#if we dont find any tags we end
172	$end = 1;
173	}
174	}
175
176	#here we have our patern with tags removed (we set sections of ()) between tags
177	my $patternmod = "( ".$pattern." )";
178	$patternmod =~ s/\s@.*?\s/\)\(/g;
179
180	#discarding spaces
181	$patternmod =~ s/\s+/\\`'/g; #`
182	# quoting escaped commas
183	$patternmod =~ s/\\,/\\`\\`\\,''/g;
184	# quoting commas in {m,n} r.e. operator
185	$patternmod =~ s/(\{\d),(\d\})/\1\\`\\`,''\2/g;
186	#print "After m4:".$re."\n";
187	my $re = `echo \"$patternmod\" \| m4 --define=ENDOFSEGMENT='[[:cntrl:]]' $macrofile - 2>/dev/null`;
188
189	die("Incorrect pattern (m4).") if $? >> 8;
190
191
192	chomp $re;
193
194	# <> expansion
195
196	$re =~ s/<([^>]+)>/`echo $1 \| .\/terms\.tag2re`/ge;
197
198	# Perl-like special sequences
199	$re =~ s/\./[^ [:cntrl:]]/g;
200	$re =~ s/\\s/[ ]/g;
201	$re =~ s/\\S/[^ [:cntrl:]]/g;
202	$re =~ s/\\d/[0-9]/g;
203	$re =~ s/\\D/[^0-9 [:cntrl:]]/g;
204	$re =~ s/\\w/[a-z±æê³ñó¶Œ¿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_]/g;
205	$re =~ s/\\W/[^a-z±æê³ñó¶Œ¿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_ [:cntrl:]]/g;
206	# extensions
207	$re =~ s/\\l/[a-z±æê³ñó¶Œ¿]/g; #lowercase letter
208	$re =~ s/\\L/[A-Z¡ÆÊ£ÑÓŠ¬¯]/g; #upercase letter
209
210	my $sedcommand;
211	my $grepcommand;
212
213	#now we must built a sed script from our re
214	#we do this by cuting our re each tag until we cut them all
215	#if an user dint input any tags we do our default
216	my $defBOM = "BOM";
217	my $defEOM = "EOM";
218	my $defTempTagBeg = "####TempTAGBEG####";
219	my $defTempTagEnd = "####TempTAGEND####";
220
221	if (@tags == 0)
222	{
223	$sedcommand = "sed -r 's/($re)/\\500 $defBOM \\f\\1###EOM###/g; s/###EOM###([0-9]+)/\\1 00 $defEOM \\f\\1/g'";
224	}
225	else #we have custom tags
226	{
227	#first tag is easy to tag :)
228	my $sedscript="sed -r 's/($re)/\\600 $defTempTagBeg \\f\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $defTempTagEnd \\f\\1/g;";
229	#after first step we have temp tagged parts of input matching re
230	#now we need to insert our custom tags
231	#we will find temp tags and process our input
232
233	my $i = 0;
234	#copy of re which will be cut
235	my $rec = $re;
236	my $restre = $re;
237
238	for ($i = 0 ; $i < @tags ; $i++)
239	{
240	#re cutting
241	$rec = cutRe($restre);
242	$restre = restRe($restre);
243	if ($rec =~ / \( \) */)
244	{
245	$sedscript = $sedscript."s/([0-9]+) 00 $defTempTagBeg \\\\f([0-9]+)/\\2 00 $tags[$i] \\f\\2 00 $defTempTagBeg *\\f\\2/g;";
246	}
247	else
248	{
249	$sedscript = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\\\f($rec)/\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $tags[$i] \\f\\1 00 $defTempTagBeg *\\f\\1/g;";
250	}
251
252	}
253
254	$sedcommand = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\\\f//g;s/[0-9]+ 00 $defTempTagEnd \\\\f//g'";
255	}
256
257	if($command)
258	{
259	print $sedcommand."\n";
260	exit 0;
261	}
262	exec $preproc.$sedcommand.$postproc;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: