1 | #!/usr/bin/perl |
---|
2 | |
---|
3 | #package: UAM Text Tools |
---|
4 | #component: ser (pattern search tool) |
---|
5 | #version: 1.0 |
---|
6 | #author: Tomasz Obrebski |
---|
7 | |
---|
8 | use strict; |
---|
9 | use Getopt::Long; |
---|
10 | use File::Temp; |
---|
11 | use File::HomeDir; |
---|
12 | |
---|
13 | my $LIB_DIR="/usr/local/lib/utt"; |
---|
14 | my $systemconfigfile='/usr/local/etc/utt/ser.conf'; |
---|
15 | my $userconfigfile=home()."/.utt/ser.conf"; |
---|
16 | |
---|
17 | Getopt::Long::Configure('no_ignore_case_always'); |
---|
18 | |
---|
19 | my $help=0; |
---|
20 | my $pattern=0; |
---|
21 | my $only_matching=0; |
---|
22 | my $no_markers=0; |
---|
23 | my $macros=0; |
---|
24 | my $flextemplate=0; |
---|
25 | my $flex=0; |
---|
26 | my $morfield='lem'; |
---|
27 | my $tags=0; |
---|
28 | |
---|
29 | #read configuration files########################### |
---|
30 | my $file; |
---|
31 | foreach $file ($systemconfigfile, $userconfigfile){ |
---|
32 | if(open(CONFIG, $file)){ |
---|
33 | while (<CONFIG>) { |
---|
34 | chomp; |
---|
35 | s/#.*//; |
---|
36 | s/^\s+//; |
---|
37 | s/\s+$//; |
---|
38 | next unless length; |
---|
39 | my ($name, $value) = split(/\s*=\s*/, $_, 2); |
---|
40 | if(($name eq "pattern")or($name eq "e")){ |
---|
41 | $pattern=$value; |
---|
42 | } |
---|
43 | elsif($name eq "morph"){ |
---|
44 | $morfield=$value; |
---|
45 | } |
---|
46 | elsif(($name eq "only-matching")or($name eq "m")){ |
---|
47 | $only_matching=1; |
---|
48 | } |
---|
49 | elsif(($name eq "no-markers")or($name eq "M")){ |
---|
50 | $no_markers=1; |
---|
51 | } |
---|
52 | elsif($name eq "macros"){ |
---|
53 | $macros=$value; |
---|
54 | } |
---|
55 | elsif($name eq "flex-template"){ |
---|
56 | $flextemplate=$value; |
---|
57 | } |
---|
58 | elsif($name eq "tags"){ |
---|
59 | $tags=$value; |
---|
60 | } |
---|
61 | elsif($name eq "flex"){ |
---|
62 | $flex=1; |
---|
63 | } |
---|
64 | elsif(($name eq "help")or($name eq "h")){ |
---|
65 | $help=1; |
---|
66 | } |
---|
67 | |
---|
68 | } |
---|
69 | close CONFIG; |
---|
70 | } |
---|
71 | } |
---|
72 | ######################################################### |
---|
73 | |
---|
74 | GetOptions("pattern|e=s" => \$pattern, |
---|
75 | "morph=s" => \$morfield, |
---|
76 | "only-matching|m" => \$only_matching, |
---|
77 | "no-markers|M" => \$no_markers, |
---|
78 | "macros=s" => \$macros, |
---|
79 | "flex-template=s" => \$flextemplate, |
---|
80 | "tags=s" => \$tags, |
---|
81 | "flex" => \$flex, |
---|
82 | "help|h" => \$help); |
---|
83 | |
---|
84 | if($help) |
---|
85 | { |
---|
86 | print <<'END' |
---|
87 | Usage: ser [OPTIONS] [file ..] |
---|
88 | |
---|
89 | Options: |
---|
90 | --help -h Help. |
---|
91 | --pattern=PATTERN -e PATTERN Search pattern. |
---|
92 | --morph=STRING Field containing morphological information (default 'lem'). |
---|
93 | --macros=FILE Read macrodefinitions from FILE. |
---|
94 | --flex-template=FILE Read flex code template from FILE. |
---|
95 | --tags=STRING Morphosyntactic tag format. |
---|
96 | --only-matching -m Print only fragments matching PATTERN. |
---|
97 | --no-markers -M Do not print BOM and EOM markers [TODO]. |
---|
98 | --flex Print only the generated flex code and exit. |
---|
99 | END |
---|
100 | ; |
---|
101 | exit 0; |
---|
102 | } |
---|
103 | |
---|
104 | |
---|
105 | die("$0: no pattern given.\n") unless $pattern; |
---|
106 | |
---|
107 | die("$0: flex template file not found") unless |
---|
108 | $flextemplate or |
---|
109 | -e "$LIB_DIR/ser.l.template" and $flextemplate="$LIB_DIR/ser.l.template"; |
---|
110 | |
---|
111 | die("$0: macro file not found") unless |
---|
112 | $macros or |
---|
113 | -e "$LIB_DIR/terms.m4" and $macros="$LIB_DIR/terms.m4"; |
---|
114 | |
---|
115 | die("$0: $tags.tag2re program not found") unless |
---|
116 | 1; #JAK NAPISAC WARUNEK??? |
---|
117 | |
---|
118 | die("$0: undefined tagset format (tags option missing)") unless |
---|
119 | $tags; |
---|
120 | |
---|
121 | |
---|
122 | #$pattern =~ s/cat\(([^)]+)\)/'cat('.pre($1).')'/ge; |
---|
123 | # quoting escaped commas /NIE DZIA£A/ |
---|
124 | $pattern =~ s/\\,/\\`\\`\\,''/g; |
---|
125 | |
---|
126 | # protecting backslash |
---|
127 | $pattern =~ s/\\/\\\\\\/g; |
---|
128 | |
---|
129 | # discarding spaces |
---|
130 | $pattern =~ s/\s+/\\`'/g; #` |
---|
131 | |
---|
132 | |
---|
133 | my $flexpattern = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT=\\\\n --define=MORFIELD=$morfield $macros - 2>/dev/null`; |
---|
134 | |
---|
135 | die("Incorrect pattern (m4).") if $? >> 8; |
---|
136 | |
---|
137 | |
---|
138 | chomp $flexpattern; |
---|
139 | |
---|
140 | # <> expansion |
---|
141 | $flexpattern =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge; |
---|
142 | |
---|
143 | # restricting the value of the . special symbol |
---|
144 | $flexpattern =~ s/\./[^ \\t\\n\\r\\f]/g; |
---|
145 | |
---|
146 | # perl-like shortcuts for character classes |
---|
147 | # perl exact |
---|
148 | $flexpattern =~ s/\\s/[ \\t]/g; |
---|
149 | $flexpattern =~ s/\\S/[^ \\t\\n\\r\\f]/g; |
---|
150 | $flexpattern =~ s/\\d/[0-9]/g; |
---|
151 | $flexpattern =~ s/\\D/[^0-9 \\t\\n\\r\\f]/g; |
---|
152 | $flexpattern =~ s/\\w/[a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_]/g; |
---|
153 | $flexpattern =~ s/\\W/[^a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_ \\t\\n\\r\\f]/g; |
---|
154 | # extensions |
---|
155 | $flexpattern =~ s/\\l/[a-z±æê³ñ󶌿]/g; #lowercase letter |
---|
156 | $flexpattern =~ s/\\L/[A-Z¡ÆÊ£ÑÓŠ¬¯]/g; #upercase letter |
---|
157 | |
---|
158 | # protecting slash |
---|
159 | $flexpattern =~ s/\//\\\//g; |
---|
160 | |
---|
161 | my $defaultaction = ($only_matching) ? '' : 'ECHO'; |
---|
162 | |
---|
163 | # docelowo posrednie pliki powinny byc w jakims tempie !!! |
---|
164 | |
---|
165 | (undef, my $tmpfile_l) = File::Temp::tempfile(SUFFIX=>'.l'); |
---|
166 | (undef, my $tmpfile_c) = File::Temp::tempfile(SUFFIX=>'.c'); |
---|
167 | (undef, my $tmpfile_x) = File::Temp::tempfile(); |
---|
168 | |
---|
169 | # w tych `` nie dziala |
---|
170 | #`m4 "--define=PATTERN=$flexpattern" "--define=DEFAULTACTION=$defaultaction" $flextemplate > $tmpfile_l`; |
---|
171 | |
---|
172 | system "m4 \"--define=PATTERN=$flexpattern\" \"--define=DEFAULTACTION=$defaultaction\" $flextemplate > $tmpfile_l"; |
---|
173 | |
---|
174 | if($flex) |
---|
175 | { |
---|
176 | # w tych `` nie dziala |
---|
177 | system "cat $tmpfile_l"; |
---|
178 | # if(open(FLEX, $tmpfile_l)) { |
---|
179 | # while(<FLEX>) { |
---|
180 | # print @_; |
---|
181 | # } |
---|
182 | # close FLEX; |
---|
183 | # } |
---|
184 | # else { |
---|
185 | # print "Unable to open file $tmpfile_l\n"; |
---|
186 | # } |
---|
187 | exit(0); |
---|
188 | } |
---|
189 | |
---|
190 | `flex -o$tmpfile_c $tmpfile_l`; |
---|
191 | `cc -O3 -o $tmpfile_x $tmpfile_c -lfl`; |
---|
192 | #`$tmpfile_x`; |
---|
193 | |
---|
194 | system "$tmpfile_x"; |
---|
195 | |
---|
196 | unlink $tmpfile_l; |
---|
197 | unlink $tmpfile_c; |
---|
198 | unlink $tmpfile_x; |
---|