[25ae32e] | 1 | #!/usr/bin/perl |
---|
| 2 | |
---|
[20b4e44] | 3 | #package: UAM Text Tools |
---|
| 4 | #component: mar |
---|
| 5 | #version: 1.0 |
---|
| 6 | #author: Marcin Walas |
---|
[25ae32e] | 7 | |
---|
# This program tags a tokenized file with the given tags.
# Tags can be given in any order and combination through the pattern
# expression, which is one of the parameters of the script.
---|
| 11 | #contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com |
---|
| 12 | |
---|
# Version string printed by --version.
my $version = '1.0';

# Add the system-wide and the per-user UTT library directories to @INC.
use lib "/usr/local/lib/utt";
use lib "$ENV{'HOME'}/.local/lib/utt";

use strict;
use Getopt::Long;
use File::HomeDir;

# NOTE(review): presumably a UTT module found through the "use lib" paths
# above; nothing in this file refers to it by name - verify before removing.
use attr;


# Directory in which the default macro file (terms.m4) is looked up.
my $LIB_DIR="/usr/local/lib/utt";
# Configuration files; both are read if they can be opened.
my $systemconfigfile='/usr/local/etc/utt/mar.conf';
my $userconfigfile=home()."/.utt/mar.conf";

# NOTE(review): per the Getopt::Long documentation, disabling
# ignore_case_always also disables ignore_case, which keeps -e and -E apart
# as two different options - confirm against the installed version.
Getopt::Long::Configure('no_ignore_case_always');

# Option values.  These defaults may be overridden first by the
# configuration files and then by the command line.
# Print the usage text and exit.
my $help=0;
# The pattern to search for (--pattern / -e).
my $pattern=0;
# m4 macro file; both --macros and --define store into this variable.
my $macrofile=0;
# Declared but not referenced anywhere else in this file.
my $define=0;
# Print the generated sed command instead of running it.
my $command=0;
# Action letters; the script tests this string for "p", "g" and "P".
my $action="pgP";
# Set by --eos / -E or the "eos" setting; not read elsewhere in this file.
my $eos="seg(EOS)";
# Set by --space / -s or the "space" setting; not read elsewhere in this file.
my $explicit_space=0;
# Value handed to m4 as the MORFIELD macro.
my $morfield='lem';
# Name prefix of the "<tags>.tag2re" command used for <...> expansion; it can
# only be set through the "tags" setting of a configuration file.
my $tags=0;
# Print the version and exit.
my $show_version = 0;
---|
[7562131] | 42 | |
---|
| 43 | #read configuration files########################### |
---|
| 44 | my $file; |
---|
| 45 | foreach $file ($systemconfigfile, $userconfigfile){ |
---|
| 46 | if(open(CONFIG, $file)){ |
---|
| 47 | while (<CONFIG>) { |
---|
| 48 | chomp; |
---|
| 49 | s/#.*//; |
---|
| 50 | s/^\s+//; |
---|
| 51 | s/\s+$//; |
---|
| 52 | next unless length; |
---|
| 53 | my ($name, $value) = split(/\s*=\s*/, $_, 2); |
---|
| 54 | if(($name eq "pattern")or($name eq "e")){ |
---|
| 55 | $pattern=$value; |
---|
| 56 | } |
---|
| 57 | elsif($name eq "eos"){ |
---|
| 58 | $eos=$value; |
---|
| 59 | } |
---|
| 60 | elsif($name eq "macros"){ |
---|
| 61 | $macrofile=$value; |
---|
| 62 | } |
---|
| 63 | elsif($name eq "tags"){ |
---|
| 64 | $tags=$value; |
---|
| 65 | } |
---|
| 66 | elsif($name eq "morph"){ |
---|
| 67 | $morfield=$value; |
---|
| 68 | } |
---|
| 69 | elsif($name eq "command"){ |
---|
| 70 | $command=1; |
---|
| 71 | } |
---|
| 72 | elsif($name eq "action"){ |
---|
| 73 | $action=$value; |
---|
| 74 | } |
---|
| 75 | elsif($name eq "space"){ |
---|
| 76 | $explicit_space=1; |
---|
| 77 | } |
---|
| 78 | elsif(($name eq "help")or($name eq "h")){ |
---|
| 79 | $help=1; |
---|
| 80 | } |
---|
| 81 | |
---|
| 82 | } |
---|
| 83 | close CONFIG; |
---|
| 84 | } |
---|
[25ae32e] | 85 | } |
---|
[7562131] | 86 | ######################################################### |
---|
[25ae32e] | 87 | |
---|
# Parse the command line.  Values given here override those read from the
# configuration files.
#
# --action also accepts -a, the short form advertised in the help text.
# --define stores into the same variable as --macros (the help text marks
# both as TODO), so whichever of the two is given last wins.
# GetOptions prints a diagnostic for every unknown or malformed option and
# then returns false; in that case the script stops instead of running with
# a half-parsed command line.
GetOptions("pattern|e=s" => \$pattern,
           "eos|E=s"     => \$eos,
           "macros=s"    => \$macrofile,
           "define=s"    => \$macrofile,
           "command"     => \$command,
           "action|a=s"  => \$action,
           "help|h"      => \$help,
           "space|s"     => \$explicit_space,
           "version|v"   => \$show_version,
          )
    or die("$0: invalid command line. Run with -h to get help.\n");
---|
| 98 | |
---|
[7562131] | 99 | |
---|
| 100 | |
---|
# --version: print the script version and stop.
if ($show_version) {
    print "Version: $version\n";
    exit 0;
}

# --help (or a "help"/"h" line in a configuration file): print the usage text
# and stop.  The action letters listed here are the ones the script actually
# tests for (p, g, P) and the default shown is the real default ("pgP").
if ($help) {
    print <<'END';
Usage: mar [OPTIONS] [file ..]

Options:
--pattern -e PATTERN Pattern.
--eos -E PATTERN Segment serving as sentence beginning marker. [TODO]
--macros=FILE Read macrodefinitions from FILE. [TODO]
--define=FILE Add macrodefinitions from FILE. [TODO]
--action -a [p][g][P] Perform only indicated actions.
p - preprocess
g - search
P - postprocess
(default pgP)
--command Print generated shell command and exit.
--help -h Print help.
--version -v Script version

In patern you can put any tag. Tags should begin with the @ character.
They don't have to be closed.
They can't contain white spaces!

Note: If you don't define any custom tags, whole pattern will be taged with
default tags (begining of match and end of match)

Tags examples:

mar -e '@BEG cat(<ADJ>) @END'
it will find any adjectives in the text and tag them with surrounding tags
mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)'
this will find two neighbouring adjectives and parcel them with tag MYTAG

Some example patterns:
'word(domu)' - form of the word domu
'lexeme(dom)' - any form of lexeme dom
'space' - space
'cat(<ADJ>)' - adjective

You can use * in patterns to make zero or more counts of word.

END
    exit 0;
}
---|
| 151 | |
---|
# A pattern is mandatory whenever the action string contains "g".
if ($action =~ /g/ and not $pattern) {
    die("$0: no pattern given. Run with -h to get help.\n");
}

# Without an explicit macro file, fall back to terms.m4 in the library
# directory; give up when that file does not exist either.
unless ($macrofile) {
    my $default_macrofile = "$LIB_DIR/terms.m4";
    die("$0: macro file not found") unless -e $default_macrofile;
    $macrofile = $default_macrofile;
}

# Shell fragment placed in front of the sed command when the action string
# contains "p".
my $preproc = '';
$preproc = ' fla | ' if $action =~ /p/;

# Shell fragment placed after the sed command when the action string
# contains "P".
my $postproc = '';
$postproc = ' | unfla ' if $action =~ /P/;
---|
| 161 | |
---|
[7562131] | 162 | |
---|
# Helper shared by cutRe and restRe.
# Scans the given text for the end of its first root-level parenthesised
# group.
# Argument: the regular expression text.
# Returns:  the index of the character at which the nesting depth is back at
#           zero, or -1 when that never happens (empty text or unbalanced
#           parentheses).
#
# Inside a group a backslash escapes the following character, so "\(" and
# "\)" are literal parentheses and do not change the depth.  Parentheses
# inside a bracket expression such as "[()]" are NOT recognised as literals.
#
# When the text does not start with "(", the depth is already zero at the
# first character and index 0 is returned; callers are expected to pass text
# that begins with an opening parenthesis.
sub _rootGroupEnd
{
    my ($text) = @_;
    my $depth = 0;
    my $length = length $text;

    for (my $pos = 0; $pos < $length; $pos++)
    {
        my $char = substr($text, $pos, 1);
        if ($char eq "\\" and $depth > 0)
        {
            # step over the escaped character; the loop increment then moves
            # on to the character after it
            $pos++;
            next;
        }
        elsif ($char eq "(")
        {
            $depth++;
        }
        elsif ($char eq ")")
        {
            $depth--;
        }
        return $pos if $depth == 0;
    }
    return -1;
}

# Cuts the first root-level group off a regular expression.
# Argument: the pattern (after m4 processing).
# Returns:  the first root-level parenthesised group including its content.
#           When the parentheses never balance, the whole text is returned;
#           for an empty text the result is the empty string.
sub cutRe
{
    my $text = defined $_[0] ? $_[0] : '';
    my $group_end = _rootGroupEnd($text);
    return $text if $group_end < 0;
    return substr($text, 0, $group_end + 1);
}

# Counterpart of cutRe.
# Argument: the pattern (after m4 processing).
# Returns:  everything that follows the first root-level group.  When the
#           parentheses never balance nothing is left, so the result is the
#           empty string.
sub restRe
{
    my $text = defined $_[0] ? $_[0] : '';
    my $group_end = _rootGroupEnd($text);
    return '' if $group_end < 0;
    return substr($text, $group_end + 1);
}
---|
| 219 | |
---|
| 220 | |
---|
# Here we prepare the pattern for extended matching and collect the tags
# (words starting with "@") that it contains, in order of appearance.
my @tags;

# Pad the pattern with one space on each side so that every tag, including
# one at the very start or the very end, is delimited by spaces.
my $temp = " ".$pattern." ";

# Two tags in a row share a single separating space.  Double that space so
# each tag has its own leading and trailing space; the later replacement of
# tags uses a pattern that consumes the whitespace on both sides of a tag.
# The look-ahead leaves the following tag unconsumed, so runs of three or
# more adjacent tags are handled as well.
$temp =~ s/(\@[^ ]*) (?=\@)/$1  /g;
$pattern = $temp;

# Peel the tags off the front of the working copy one at a time: everything
# up to and including the first tag (and the space that ends it) is replaced
# by a single space, and the tag name without the "@" is recorded.  The loop
# stops as soon as no tag is left.
while ($temp =~ s/^.*?\@(.*?) / /)
{
    push(@tags, $1);
}
---|
| 248 | |
---|
# Turn the tagged pattern into a sequence of parenthesised groups: the whole
# pattern is wrapped in "( ... )" and every tag, together with the whitespace
# character on each side of it, is replaced by ")(".
my $patternmod = "( ".$pattern." )";
$patternmod =~ s/\s@.*?\s/\)\(/g;

# Discard spaces: every run of whitespace becomes backslash, backtick,
# apostrophe.
# NOTE(review): presumably the backslash protects the backtick inside the
# double-quoted shell string used below, and m4 then sees an empty quoted
# string (backtick + apostrophe in its default quoting) - confirm.
$patternmod =~ s/\s+/\\`'/g; #`
# quoting escaped commas
$patternmod =~ s/\\,/\\`\\`\\,''/g;
# quoting commas in the {m,n} regular-expression operator
$patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g;
#print "After m4:".$re."\n";

# Expand the macros: the pattern is echoed into m4 together with the macro
# file; ENDOFSEGMENT and MORFIELD are predefined on the m4 command line and
# anything m4 writes to stderr is thrown away.
# NOTE(review): $patternmod, $morfield and $macrofile are interpolated into
# the shell command as they are; a pattern containing a double quote, "$" or
# a backtick changes the command that is run.
my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`;

# A non-zero exit status of the shell command is reported as a bad pattern.
die("Incorrect pattern (m4).") if $? >> 8;


# Drop the trailing newline of the command output.
chomp $re;

# <> expansion: every <...> is replaced by whatever the command
# "$tags.tag2re" prints when the text between the angle brackets is piped
# into it.
# NOTE(review): $tags defaults to 0 and can only be set from a configuration
# file, so without a "tags" setting the command run is "0.tag2re".  The
# command output is inserted unchanged, including any trailing newline.

$re =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge;

# Perl-like special sequences, rewritten as bracket expressions.
# NOTE(review): the first substitution rewrites every "." in $re, including
# one that is preceded by a backslash.
# NOTE(review): the non-ASCII characters in the classes below look like
# Polish letters in a legacy single-byte encoding - confirm the encoding of
# this file before editing these lines.
$re =~ s/\./[^ [:cntrl:]]/g;
$re =~ s/\\s/[ ]/g;
$re =~ s/\\S/[^ [:cntrl:]]/g;
$re =~ s/\\d/[0-9]/g;
$re =~ s/\\D/[^0-9 [:cntrl:]]/g;
$re =~ s/\\w/[a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_]/g;
$re =~ s/\\W/[^a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_ [:cntrl:]]/g;
# extensions
$re =~ s/\\l/[a-z±æê³ñ󶌿]/g; #lowercase letter
$re =~ s/\\L/[A-Z¡ÆÊ£ÑÓŠ¬¯]/g; #uppercase letter
---|
| 283 | |
---|
# Build the sed command from the regular expression.
# With custom tags the expression is cut group by group, one group per tag;
# when the user gave no tags the whole match is wrapped in the default
# begin-of-match / end-of-match tags instead.
my $sedcommand;

# Default tag names.
my $defBOM = "BOM";
my $defEOM = "EOM";
# Temporary markers that bracket a match while the custom tags are being
# inserted; the last two sed expressions remove them again.
my $defTempTagBeg = "####TempTAGBEG####";
my $defTempTagEnd = "####TempTAGEND####";

if (@tags == 0)
{
    $sedcommand = "sed -r 's/($re)/\\500 $defBOM *\\f\\1###EOM###/g; s/###EOM###([0-9]+)/\\1 00 $defEOM *\\f\\1/g'";
}
else #we have custom tags
{
    # First step: bracket every match of the whole expression with the
    # temporary begin/end markers.
    my $sedscript = "sed -r 's/($re)/\\600 $defTempTagBeg *\\f\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $defTempTagEnd *\\f\\1/g;";

    # Following steps: for each tag take the next root-level group off the
    # expression and add the sed expressions that put the tag in place and
    # carry the temporary begin marker forward.
    my $restre = $re;    # part of the expression that has not been cut yet

    for my $tag (@tags)
    {
        my $rec = cutRe($restre);
        $restre = restRe($restre);

        # NOTE(review): this test is not anchored, so it is also true for a
        # group that merely contains "()" somewhere - confirm the intent.
        if ($rec =~ / *\( *\) */)
        {
            # empty group: the tag goes directly where the marker is
            $sedscript .= "s/([0-9]+) 00 $defTempTagBeg \\*\\f([0-9]+)/\\2 00 $tag *\\f\\2 00 $defTempTagBeg *\\f\\2/g;";
        }
        else
        {
            # the tag goes after the text matched by this group
            $sedscript .= "s/[0-9]+ 00 $defTempTagBeg \\*\\f($rec)/\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $tag *\\f\\1 00 $defTempTagBeg *\\f\\1/g;";
        }
    }

    # Finally remove the temporary markers.
    $sedcommand = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f//g;s/[0-9]+ 00 $defTempTagEnd \\*\\f//g'";
}

# --command: show the generated sed command instead of running it.
if ($command)
{
    print $sedcommand."\n";
    exit 0;
}

# Replace this process with the pipeline: optional preprocessor, sed,
# optional postprocessor.  exec only returns when the command could not be
# started, so say why instead of falling off the end of the script silently.
# NOTE(review): file names left in @ARGV are not passed on; the pipeline
# reads standard input only.
exec($preproc.$sedcommand.$postproc)
    or die("$0: cannot run the tagging pipeline: $!\n");
---|