[25ae32e] | 1 | #!/usr/bin/perl |
---|
| 2 | |
---|
| 3 | #package: UAM Text Tools |
---|
| 4 | #component: ser (pattern search tool) |
---|
[20b4e44] | 5 | #version: 1.0 |
---|
| 6 | #author: Tomasz Obrebski |
---|
[25ae32e] | 7 | |
---|
| 8 | use strict; |
---|
| 9 | use Getopt::Long; |
---|
[754d2a3] | 10 | use File::Temp; |
---|
[20b4e44] | 11 | use File::HomeDir; |
---|
[25ae32e] | 12 | |
---|
[754d2a3] | 13 | my $LIB_DIR="/usr/local/lib/utt"; |
---|
| 14 | my $systemconfigfile='/usr/local/etc/utt/ser.conf'; |
---|
[20b4e44] | 15 | my $userconfigfile=home()."/.utt/ser.conf"; |
---|
[25ae32e] | 16 | |
---|
| 17 | Getopt::Long::Configure('no_ignore_case_always'); |
---|
| 18 | |
---|
| 19 | my $help=0; |
---|
| 20 | my $pattern=0; |
---|
| 21 | my $only_matching=0; |
---|
| 22 | my $no_markers=0; |
---|
| 23 | my $macros=0; |
---|
| 24 | my $flextemplate=0; |
---|
| 25 | my $flex=0; |
---|
| 26 | my $morfield='lem'; |
---|
[f5d3b20] | 27 | my $tags=0; |
---|
[25ae32e] | 28 | |
---|
| 29 | #read configuration files########################### |
---|
| 30 | my $file; |
---|
[754d2a3] | 31 | foreach $file ($systemconfigfile, $userconfigfile){ |
---|
[25ae32e] | 32 | if(open(CONFIG, $file)){ |
---|
| 33 | while (<CONFIG>) { |
---|
| 34 | chomp; |
---|
| 35 | s/#.*//; |
---|
| 36 | s/^\s+//; |
---|
| 37 | s/\s+$//; |
---|
| 38 | next unless length; |
---|
| 39 | my ($name, $value) = split(/\s*=\s*/, $_, 2); |
---|
| 40 | if(($name eq "pattern")or($name eq "e")){ |
---|
| 41 | $pattern=$value; |
---|
| 42 | } |
---|
| 43 | elsif($name eq "morph"){ |
---|
| 44 | $morfield=$value; |
---|
| 45 | } |
---|
| 46 | elsif(($name eq "only-matching")or($name eq "m")){ |
---|
| 47 | $only_matching=1; |
---|
| 48 | } |
---|
| 49 | elsif(($name eq "no-markers")or($name eq "M")){ |
---|
| 50 | $no_markers=1; |
---|
| 51 | } |
---|
[754d2a3] | 52 | elsif($name eq "macros"){ |
---|
[25ae32e] | 53 | $macros=$value; |
---|
| 54 | } |
---|
| 55 | elsif($name eq "flex-template"){ |
---|
| 56 | $flextemplate=$value; |
---|
| 57 | } |
---|
[f5d3b20] | 58 | elsif($name eq "tags"){ |
---|
| 59 | $tags=$value; |
---|
| 60 | } |
---|
[25ae32e] | 61 | elsif($name eq "flex"){ |
---|
| 62 | $flex=1; |
---|
| 63 | } |
---|
| 64 | elsif(($name eq "help")or($name eq "h")){ |
---|
| 65 | $help=1; |
---|
| 66 | } |
---|
| 67 | |
---|
| 68 | } |
---|
| 69 | close CONFIG; |
---|
| 70 | } |
---|
| 71 | } |
---|
| 72 | ######################################################### |
---|
| 73 | |
---|
| 74 | GetOptions("pattern|e=s" => \$pattern, |
---|
| 75 | "morph=s" => \$morfield, |
---|
| 76 | "only-matching|m" => \$only_matching, |
---|
| 77 | "no-markers|M" => \$no_markers, |
---|
[754d2a3] | 78 | "macros=s" => \$macros, |
---|
[25ae32e] | 79 | "flex-template=s" => \$flextemplate, |
---|
[f5d3b20] | 80 | "tags=s" => \$tags, |
---|
[25ae32e] | 81 | "flex" => \$flex, |
---|
| 82 | "help|h" => \$help); |
---|
| 83 | |
---|
| 84 | if($help) |
---|
| 85 | { |
---|
| 86 | print <<'END' |
---|
| 87 | Usage: ser [OPTIONS] [file ..] |
---|
| 88 | |
---|
| 89 | Options: |
---|
| 90 | --help -h Help. |
---|
| 91 | --pattern=PATTERN -e PATTERN Search pattern. |
---|
| 92 | --morph=STRING Field containing morphological information (default 'lem'). |
---|
[754d2a3] | 93 | --macros=FILE Read macrodefinitions from FILE. |
---|
[25ae32e] | 94 | --flex-template=FILE Read flex code template from FILE. |
---|
[f5d3b20] | 95 | --tags=STRING Morphosyntactic tag format. |
---|
[25ae32e] | 96 | --only-matching -m Print only fragments matching PATTERN. |
---|
| 97 | --no-markers -M Do not print BOM and EOM markers [TODO]. |
---|
| 98 | --flex Print only the generated flex code and exit. |
---|
| 99 | END |
---|
| 100 | ; |
---|
| 101 | exit 0; |
---|
| 102 | } |
---|
| 103 | |
---|
[e2bde98] | 104 | |
---|
[25ae32e] | 105 | die("$0: no pattern given.\n") unless $pattern; |
---|
| 106 | |
---|
| 107 | die("$0: flex template file not found") unless |
---|
| 108 | $flextemplate or |
---|
[754d2a3] | 109 | -e "$LIB_DIR/ser.l.template" and $flextemplate="$LIB_DIR/ser.l.template"; |
---|
[25ae32e] | 110 | |
---|
| 111 | die("$0: macro file not found") unless |
---|
| 112 | $macros or |
---|
[754d2a3] | 113 | -e "$LIB_DIR/terms.m4" and $macros="$LIB_DIR/terms.m4"; |
---|
[25ae32e] | 114 | |
---|
[f5d3b20] | 115 | die("$0: $tags.tag2re program not found") unless |
---|
| 116 | 1; #JAK NAPISAC WARUNEK??? |
---|
| 117 | |
---|
| 118 | die("$0: undefined tagset format (tags option missing)") unless |
---|
| 119 | $tags; |
---|
| 120 | |
---|
[25ae32e] | 121 | |
---|
| 122 | #$pattern =~ s/cat\(([^)]+)\)/'cat('.pre($1).')'/ge; |
---|
| 123 | # quoting escaped commas /NIE DZIA£A/ |
---|
| 124 | $pattern =~ s/\\,/\\`\\`\\,''/g; |
---|
| 125 | |
---|
| 126 | # protecting backslash |
---|
| 127 | $pattern =~ s/\\/\\\\\\/g; |
---|
| 128 | |
---|
| 129 | # discarding spaces |
---|
| 130 | $pattern =~ s/\s+/\\`'/g; #` |
---|
| 131 | |
---|
[754d2a3] | 132 | |
---|
[25ae32e] | 133 | my $flexpattern = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT=\\\\n --define=MORFIELD=$morfield $macros - 2>/dev/null`; |
---|
| 134 | |
---|
| 135 | die("Incorrect pattern (m4).") if $? >> 8; |
---|
| 136 | |
---|
| 137 | |
---|
| 138 | chomp $flexpattern; |
---|
| 139 | |
---|
| 140 | # <> expansion |
---|
[f5d3b20] | 141 | $flexpattern =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge; |
---|
[25ae32e] | 142 | |
---|
| 143 | # restricting the value of the . special symbol |
---|
| 144 | $flexpattern =~ s/\./[^ \\t\\n\\r\\f]/g; |
---|
| 145 | |
---|
| 146 | # perl-like shortcuts for character classes |
---|
| 147 | # perl exact |
---|
| 148 | $flexpattern =~ s/\\s/[ \\t]/g; |
---|
| 149 | $flexpattern =~ s/\\S/[^ \\t\\n\\r\\f]/g; |
---|
| 150 | $flexpattern =~ s/\\d/[0-9]/g; |
---|
| 151 | $flexpattern =~ s/\\D/[^0-9 \\t\\n\\r\\f]/g; |
---|
| 152 | $flexpattern =~ s/\\w/[a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_]/g; |
---|
| 153 | $flexpattern =~ s/\\W/[^a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_ \\t\\n\\r\\f]/g; |
---|
| 154 | # extensions |
---|
| 155 | $flexpattern =~ s/\\l/[a-z±æê³ñ󶌿]/g; #lowercase letter |
---|
| 156 | $flexpattern =~ s/\\L/[A-Z¡ÆÊ£ÑÓŠ¬¯]/g; #upercase letter |
---|
| 157 | |
---|
| 158 | # protecting slash |
---|
| 159 | $flexpattern =~ s/\//\\\//g; |
---|
| 160 | |
---|
| 161 | my $defaultaction = ($only_matching) ? '' : 'ECHO'; |
---|
| 162 | |
---|
| 163 | # docelowo posrednie pliki powinny byc w jakims tempie !!! |
---|
| 164 | |
---|
[754d2a3] | 165 | (undef, my $tmpfile_l) = File::Temp::tempfile(SUFFIX=>'.l'); |
---|
| 166 | (undef, my $tmpfile_c) = File::Temp::tempfile(SUFFIX=>'.c'); |
---|
| 167 | (undef, my $tmpfile_x) = File::Temp::tempfile(); |
---|
| 168 | |
---|
[e2bde98] | 169 | # w tych `` nie dziala |
---|
| 170 | #`m4 "--define=PATTERN=$flexpattern" "--define=DEFAULTACTION=$defaultaction" $flextemplate > $tmpfile_l`; |
---|
[754d2a3] | 171 | |
---|
[e2bde98] | 172 | system "m4 \"--define=PATTERN=$flexpattern\" \"--define=DEFAULTACTION=$defaultaction\" $flextemplate > $tmpfile_l"; |
---|
[25ae32e] | 173 | |
---|
| 174 | if($flex) |
---|
| 175 | { |
---|
[e2bde98] | 176 | # w tych `` nie dziala |
---|
| 177 | system "cat $tmpfile_l"; |
---|
| 178 | # if(open(FLEX, $tmpfile_l)) { |
---|
| 179 | # while(<FLEX>) { |
---|
| 180 | # print @_; |
---|
| 181 | # } |
---|
| 182 | # close FLEX; |
---|
| 183 | # } |
---|
| 184 | # else { |
---|
| 185 | # print "Unable to open file $tmpfile_l\n"; |
---|
| 186 | # } |
---|
| 187 | exit(0); |
---|
[25ae32e] | 188 | } |
---|
| 189 | |
---|
[389de9a] | 190 | `flex -o$tmpfile_c $tmpfile_l`; |
---|
| 191 | `cc -O3 -o $tmpfile_x $tmpfile_c -lfl`; |
---|
[e2bde98] | 192 | #`$tmpfile_x`; |
---|
| 193 | |
---|
| 194 | system "$tmpfile_x"; |
---|
[25ae32e] | 195 | |
---|
[20b4e44] | 196 | unlink $tmpfile_l; |
---|
| 197 | unlink $tmpfile_c; |
---|
| 198 | unlink $tmpfile_x; |
---|