| [25ae32e] | 1 | #!/usr/bin/perl |
|---|
| 2 | |
|---|
| 3 | #package: UAM Text Tools |
|---|
| 4 | #component: ser (pattern search tool) |
|---|
| [20b4e44] | 5 | #version: 1.0 |
|---|
| 6 | #author: Tomasz Obrebski |
|---|
| [25ae32e] | 7 | |
|---|
| 8 | use strict; |
|---|
| 9 | use Getopt::Long; |
|---|
| [754d2a3] | 10 | use File::Temp; |
|---|
| [20b4e44] | 11 | use File::HomeDir; |
|---|
| [25ae32e] | 12 | |
|---|
| [754d2a3] | 13 | my $LIB_DIR="/usr/local/lib/utt"; |
|---|
| 14 | my $systemconfigfile='/usr/local/etc/utt/ser.conf'; |
|---|
| [20b4e44] | 15 | my $userconfigfile=home()."/.utt/ser.conf"; |
|---|
| [25ae32e] | 16 | |
|---|
| 17 | Getopt::Long::Configure('no_ignore_case_always'); |
|---|
| 18 | |
|---|
| 19 | my $help=0; |
|---|
| 20 | my $pattern=0; |
|---|
| 21 | my $only_matching=0; |
|---|
| 22 | my $no_markers=0; |
|---|
| 23 | my $macros=0; |
|---|
| 24 | my $flextemplate=0; |
|---|
| 25 | my $flex=0; |
|---|
| 26 | my $morfield='lem'; |
|---|
| [f5d3b20] | 27 | my $tags=0; |
|---|
| [25ae32e] | 28 | |
|---|
| 29 | #read configuration files########################### |
|---|
| 30 | my $file; |
|---|
| [754d2a3] | 31 | foreach $file ($systemconfigfile, $userconfigfile){ |
|---|
| [25ae32e] | 32 | if(open(CONFIG, $file)){ |
|---|
| 33 | while (<CONFIG>) { |
|---|
| 34 | chomp; |
|---|
| 35 | s/#.*//; |
|---|
| 36 | s/^\s+//; |
|---|
| 37 | s/\s+$//; |
|---|
| 38 | next unless length; |
|---|
| 39 | my ($name, $value) = split(/\s*=\s*/, $_, 2); |
|---|
| 40 | if(($name eq "pattern")or($name eq "e")){ |
|---|
| 41 | $pattern=$value; |
|---|
| 42 | } |
|---|
| 43 | elsif($name eq "morph"){ |
|---|
| 44 | $morfield=$value; |
|---|
| 45 | } |
|---|
| 46 | elsif(($name eq "only-matching")or($name eq "m")){ |
|---|
| 47 | $only_matching=1; |
|---|
| 48 | } |
|---|
| 49 | elsif(($name eq "no-markers")or($name eq "M")){ |
|---|
| 50 | $no_markers=1; |
|---|
| 51 | } |
|---|
| [754d2a3] | 52 | elsif($name eq "macros"){ |
|---|
| [25ae32e] | 53 | $macros=$value; |
|---|
| 54 | } |
|---|
| 55 | elsif($name eq "flex-template"){ |
|---|
| 56 | $flextemplate=$value; |
|---|
| 57 | } |
|---|
| [f5d3b20] | 58 | elsif($name eq "tags"){ |
|---|
| 59 | $tags=$value; |
|---|
| 60 | } |
|---|
| [25ae32e] | 61 | elsif($name eq "flex"){ |
|---|
| 62 | $flex=1; |
|---|
| 63 | } |
|---|
| 64 | elsif(($name eq "help")or($name eq "h")){ |
|---|
| 65 | $help=1; |
|---|
| 66 | } |
|---|
| 67 | |
|---|
| 68 | } |
|---|
| 69 | close CONFIG; |
|---|
| 70 | } |
|---|
| 71 | } |
|---|
| 72 | ######################################################### |
|---|
| 73 | |
|---|
| 74 | GetOptions("pattern|e=s" => \$pattern, |
|---|
| 75 | "morph=s" => \$morfield, |
|---|
| 76 | "only-matching|m" => \$only_matching, |
|---|
| 77 | "no-markers|M" => \$no_markers, |
|---|
| [754d2a3] | 78 | "macros=s" => \$macros, |
|---|
| [25ae32e] | 79 | "flex-template=s" => \$flextemplate, |
|---|
| [f5d3b20] | 80 | "tags=s" => \$tags, |
|---|
| [25ae32e] | 81 | "flex" => \$flex, |
|---|
| 82 | "help|h" => \$help); |
|---|
| 83 | |
|---|
| 84 | if($help) |
|---|
| 85 | { |
|---|
| 86 | print <<'END' |
|---|
| 87 | Usage: ser [OPTIONS] [file ..] |
|---|
| 88 | |
|---|
| 89 | Options: |
|---|
| 90 | --help -h Help. |
|---|
| 91 | --pattern=PATTERN -e PATTERN Search pattern. |
|---|
| 92 | --morph=STRING Field containing morphological information (default 'lem'). |
|---|
| [754d2a3] | 93 | --macros=FILE Read macrodefinitions from FILE. |
|---|
| [25ae32e] | 94 | --flex-template=FILE Read flex code template from FILE. |
|---|
| [f5d3b20] | 95 | --tags=STRING Morphosyntactic tag format. |
|---|
| [25ae32e] | 96 | --only-matching -m Print only fragments matching PATTERN. |
|---|
| 97 | --no-markers -M Do not print BOM and EOM markers [TODO]. |
|---|
| 98 | --flex Print only the generated flex code and exit. |
|---|
| 99 | END |
|---|
| 100 | ; |
|---|
| 101 | exit 0; |
|---|
| 102 | } |
|---|
| 103 | |
|---|
| [e2bde98] | 104 | |
|---|
| [25ae32e] | 105 | die("$0: no pattern given.\n") unless $pattern; |
|---|
| 106 | |
|---|
| 107 | die("$0: flex template file not found") unless |
|---|
| 108 | $flextemplate or |
|---|
| [754d2a3] | 109 | -e "$LIB_DIR/ser.l.template" and $flextemplate="$LIB_DIR/ser.l.template"; |
|---|
| [25ae32e] | 110 | |
|---|
| 111 | die("$0: macro file not found") unless |
|---|
| 112 | $macros or |
|---|
| [754d2a3] | 113 | -e "$LIB_DIR/terms.m4" and $macros="$LIB_DIR/terms.m4"; |
|---|
| [25ae32e] | 114 | |
|---|
| [f5d3b20] | 115 | die("$0: $tags.tag2re program not found") unless |
|---|
| 116 | 1; #JAK NAPISAC WARUNEK??? |
|---|
| 117 | |
|---|
| 118 | die("$0: undefined tagset format (tags option missing)") unless |
|---|
| 119 | $tags; |
|---|
| 120 | |
|---|
| [25ae32e] | 121 | |
|---|
| 122 | #$pattern =~ s/cat\(([^)]+)\)/'cat('.pre($1).')'/ge; |
|---|
| 123 | # quoting escaped commas /NIE DZIA£A/ |
|---|
| 124 | $pattern =~ s/\\,/\\`\\`\\,''/g; |
|---|
| 125 | |
|---|
| 126 | # protecting backslash |
|---|
| 127 | $pattern =~ s/\\/\\\\\\/g; |
|---|
| 128 | |
|---|
| 129 | # discarding spaces |
|---|
| 130 | $pattern =~ s/\s+/\\`'/g; #` |
|---|
| 131 | |
|---|
| [754d2a3] | 132 | |
|---|
| [25ae32e] | 133 | my $flexpattern = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT=\\\\n --define=MORFIELD=$morfield $macros - 2>/dev/null`; |
|---|
| 134 | |
|---|
| 135 | die("Incorrect pattern (m4).") if $? >> 8; |
|---|
| 136 | |
|---|
| 137 | |
|---|
| 138 | chomp $flexpattern; |
|---|
| 139 | |
|---|
| 140 | # <> expansion |
|---|
| [f5d3b20] | 141 | $flexpattern =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge; |
|---|
| [25ae32e] | 142 | |
|---|
| 143 | # restricting the value of the . special symbol |
|---|
| 144 | $flexpattern =~ s/\./[^ \\t\\n\\r\\f]/g; |
|---|
| 145 | |
|---|
| 146 | # perl-like shortcuts for character classes |
|---|
| 147 | # perl exact |
|---|
| 148 | $flexpattern =~ s/\\s/[ \\t]/g; |
|---|
| 149 | $flexpattern =~ s/\\S/[^ \\t\\n\\r\\f]/g; |
|---|
| 150 | $flexpattern =~ s/\\d/[0-9]/g; |
|---|
| 151 | $flexpattern =~ s/\\D/[^0-9 \\t\\n\\r\\f]/g; |
|---|
| 152 | $flexpattern =~ s/\\w/[a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_]/g; |
|---|
| 153 | $flexpattern =~ s/\\W/[^a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_ \\t\\n\\r\\f]/g; |
|---|
| 154 | # extensions |
|---|
| 155 | $flexpattern =~ s/\\l/[a-z±æê³ñ󶌿]/g; #lowercase letter |
|---|
| 156 | $flexpattern =~ s/\\L/[A-Z¡ÆÊ£ÑÓŠ¬¯]/g; #upercase letter |
|---|
| 157 | |
|---|
| 158 | # protecting slash |
|---|
| 159 | $flexpattern =~ s/\//\\\//g; |
|---|
| 160 | |
|---|
| 161 | my $defaultaction = ($only_matching) ? '' : 'ECHO'; |
|---|
| 162 | |
|---|
| 163 | # docelowo posrednie pliki powinny byc w jakims tempie !!! |
|---|
| 164 | |
|---|
| [754d2a3] | 165 | (undef, my $tmpfile_l) = File::Temp::tempfile(SUFFIX=>'.l'); |
|---|
| 166 | (undef, my $tmpfile_c) = File::Temp::tempfile(SUFFIX=>'.c'); |
|---|
| 167 | (undef, my $tmpfile_x) = File::Temp::tempfile(); |
|---|
| 168 | |
|---|
| [e2bde98] | 169 | # w tych `` nie dziala |
|---|
| 170 | #`m4 "--define=PATTERN=$flexpattern" "--define=DEFAULTACTION=$defaultaction" $flextemplate > $tmpfile_l`; |
|---|
| [754d2a3] | 171 | |
|---|
| [e2bde98] | 172 | system "m4 \"--define=PATTERN=$flexpattern\" \"--define=DEFAULTACTION=$defaultaction\" $flextemplate > $tmpfile_l"; |
|---|
| [25ae32e] | 173 | |
|---|
| 174 | if($flex) |
|---|
| 175 | { |
|---|
| [e2bde98] | 176 | # w tych `` nie dziala |
|---|
| 177 | system "cat $tmpfile_l"; |
|---|
| 178 | # if(open(FLEX, $tmpfile_l)) { |
|---|
| 179 | # while(<FLEX>) { |
|---|
| 180 | # print @_; |
|---|
| 181 | # } |
|---|
| 182 | # close FLEX; |
|---|
| 183 | # } |
|---|
| 184 | # else { |
|---|
| 185 | # print "Unable to open file $tmpfile_l\n"; |
|---|
| 186 | # } |
|---|
| 187 | exit(0); |
|---|
| [25ae32e] | 188 | } |
|---|
| 189 | |
|---|
| [389de9a] | 190 | `flex -o$tmpfile_c $tmpfile_l`; |
|---|
| 191 | `cc -O3 -o $tmpfile_x $tmpfile_c -lfl`; |
|---|
| [e2bde98] | 192 | #`$tmpfile_x`; |
|---|
| 193 | |
|---|
| 194 | system "$tmpfile_x"; |
|---|
| [25ae32e] | 195 | |
|---|
| [20b4e44] | 196 | unlink $tmpfile_l; |
|---|
| 197 | unlink $tmpfile_c; |
|---|
| 198 | unlink $tmpfile_x; |
|---|