| 1 | #!/usr/bin/perl |
|---|
| 2 | |
|---|
| 3 | #package: UAM Text Tools |
|---|
| 4 | #component: mar |
|---|
| 5 | #version: 1.0 |
|---|
| 6 | #author: Marcin Walas |
|---|
| 7 | |
|---|
| 8 | #this program tags the tokenized file with given tags |
|---|
| 9 | #tags can be given in any order and configuration through the expression |
|---|
| 10 | #which is one of the parameters of the script |
|---|
| 11 | #contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com |
|---|
| 12 | |
|---|
| 13 | my $version = '1.0'; |
|---|
| 14 | |
|---|
| 15 | use lib "/usr/local/lib/utt"; |
|---|
| 16 | use lib "$ENV{'HOME'}/.local/lib/utt"; |
|---|
| 17 | |
|---|
| 18 | use strict; |
|---|
| 19 | use Getopt::Long; |
|---|
| 20 | use File::HomeDir; |
|---|
| 21 | |
|---|
| 22 | use attr; |
|---|
| 23 | |
|---|
| 24 | |
|---|
# Installation directory of the UTT library files and the locations of the
# system-wide and per-user configuration files.
my $LIB_DIR="/usr/local/lib/utt";
my $systemconfigfile='/usr/local/etc/utt/mar.conf';
my $userconfigfile=home()."/.utt/mar.conf";

Getopt::Long::Configure('no_ignore_case_always');

# Built-in defaults. They may be overridden by the configuration files read
# below.
my $help=0;
my $pattern=0;
my $macrofile=0;
my $define=0;
my $command=0;
my $action="pgP";
my $eos="seg(EOS)";
my $explicit_space=0;
my $morfield='lem';
my $tags=0;
my $show_version = 0;

#read configuration files###########################
# Each file consists of "name = value" lines; "#" starts a comment.
# The system file is read first and the user file second, so a setting in
# the user file replaces the same setting from the system file.
{
    # Settings that receive the value written after "=".
    my %value_setting = (
        pattern => \$pattern,
        e       => \$pattern,
        eos     => \$eos,
        macros  => \$macrofile,
        tags    => \$tags,
        morph   => \$morfield,
        action  => \$action,
    );

    # Settings that are switched on by their mere presence (value ignored).
    my %flag_setting = (
        command => \$command,
        space   => \$explicit_space,
        help    => \$help,
        h       => \$help,
    );

    for my $config_path ($systemconfigfile, $userconfigfile) {
        # A missing or unreadable configuration file is silently skipped.
        # Three-argument open keeps odd characters in the path (e.g. a
        # leading ">" or a trailing "|") from being taken as an open mode.
        open(my $config_fh, '<', $config_path) or next;

        while (my $line = <$config_fh>) {
            chomp $line;
            $line =~ s/#.*//;       # strip comment
            $line =~ s/^\s+//;      # strip leading blanks
            $line =~ s/\s+$//;      # strip trailing blanks
            next unless length $line;

            my ($name, $value) = split(/\s*=\s*/, $line, 2);

            if (exists $value_setting{$name}) {
                ${ $value_setting{$name} } = $value;
            }
            elsif (exists $flag_setting{$name}) {
                ${ $flag_setting{$name} } = 1;
            }
            # Unknown setting names are ignored.
        }

        close $config_fh;
    }
}
|---|
| 86 | ######################################################### |
|---|
| 87 | |
|---|
# Parse the command line into the option variables.
# NOTE(review): --define stores into the same variable as --macros, so it
# replaces the macro file instead of adding to it (the help text marks both
# as TODO).
# Getopt::Long reports the offending option itself on STDERR; here we stop
# instead of continuing with a half-parsed command line.
GetOptions(
    "pattern|e=s" => \$pattern,
    "eos|E=s"     => \$eos,
    "macros=s"    => \$macrofile,
    "define=s"    => \$macrofile,
    "command"     => \$command,
    "action|a=s"  => \$action,      # -a is the short form shown in the help
    "help|h"      => \$help,
    "space|s"     => \$explicit_space,
    "version|v"   => \$show_version,
) or die("$0: invalid command line. Run with -h to get help.\n");
|---|
| 98 | |
|---|
| 99 | |
|---|
| 100 | |
|---|
# --version: print the script version and stop.
if ($show_version) {
    print "Version: $version\n";
    exit 0;
}

# --help: print the usage text and stop.
# The action letters listed here match what the script actually tests for
# (p, g, P) and its built-in default ("pgP").
if ($help) {
    print <<'END';
Usage: mar [OPTIONS] [file ..]

Options:
--pattern -e PATTERN Pattern.
--eos -E PATTERN Segment serving as sentence beginning marker. [TODO]
--macros=FILE Read macrodefinitions from FILE. [TODO]
--define=FILE Add macrodefinitions from FILE. [TODO]
--action -a [p][g][P] Perform only indicated actions.
p - preprocess
g - search
P - postprocess
(default pgP)
--command Print generated shell command and exit.
--help -h Print help.
--version -v Script version

In patern you can put any tag. Tags should begin with the @ character.
They don't have to be closed.
They can't contain white spaces!

Note: If you don't define any custom tags, whole pattern will be taged with
default tags (begining of match and end of match)

Tags examples:

mar -e '@BEG cat(<ADJ>) @END'
it will find any adjectives in the text and tag them with surrounding tags
mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)'
this will find two neighbouring adjectives and parcel them with tag MYTAG

Some example patterns:
'word(domu)' - form of the word domu
'lexeme(dom)' - any form of lexeme dom
'space' - space
'cat(<ADJ>)' - adjective

You can use * in patterns to make zero or more counts of word.

END
    exit 0;
}
|---|
| 151 | |
|---|
# A pattern is mandatory whenever the search action ("g") is requested.
if (!$pattern && $action =~ /g/) {
    die("$0: no pattern given. Run with -h to get help.\n");
}

# When no macro file was configured, fall back to the one shipped in the
# library directory; give up if that one does not exist either.
if (!$macrofile) {
    my $default_macros = "$LIB_DIR/terms.m4";
    if (-e $default_macros) {
        $macrofile = $default_macros;
    }
    else {
        die("$0: macro file not found");
    }
}

# Shell pipeline fragments placed before / after the sed command, enabled by
# the "p" (preprocess) and "P" (postprocess) action letters.
my $preproc = '';
$preproc = ' fla | ' if $action =~ /p/;

my $postproc = '';
$postproc = ' | unfla ' if $action =~ /P/;
|---|
| 161 | |
|---|
| 162 | |
|---|
# Helper shared by cutRe and restRe.
# Scans a regular expression and returns the length of its first root-level
# unit: the text from the start up to and including the ")" that closes the
# first "(" (or just the first character when the text does not start with
# a parenthesis).
#  - A backslash escapes the following character, so "\(" and "\)" are
#    literal parentheses and do not affect nesting.
#  - When the parentheses never balance, the whole text is one unit.
#  - Empty or undefined text yields 0.
# NOTE(review): parentheses inside bracket expressions such as "[(]" are
# still counted as grouping.
sub _first_unit_length
{
    my ($text) = @_;
    return 0 unless defined $text;

    my $length = length $text;
    my $depth  = 0;

    for (my $pos = 0; $pos < $length; $pos++) {
        my $char = substr($text, $pos, 1);

        if ($char eq '\\') {
            $pos++;                 # skip the escaped character
        }
        elsif ($char eq '(') {
            $depth++;               # opening
        }
        elsif ($char eq ')') {
            $depth--;               # closing
        }

        if ($depth == 0) {
            my $end = $pos + 1;
            # a trailing lone backslash must not push us past the end
            return $end > $length ? $length : $end;
        }
    }

    return $length;
}

# Cuts the regular expression to obtain the part belonging to the next tag.
# Takes one argument: the pattern (after m4 processing).
# Returns the first root-level parenthesised group together with its
# content; the whole text when the parentheses are unbalanced; the empty
# string for empty input.
sub cutRe
{
    my $text = defined $_[0] ? $_[0] : '';
    return substr($text, 0, _first_unit_length($text));
}

# Counterpart of cutRe: takes the same argument and returns everything that
# follows the first root-level parenthesised group (the empty string when
# nothing follows or the parentheses are unbalanced).
sub restRe
{
    my $text = defined $_[0] ? $_[0] : '';
    return substr($text, _first_unit_length($text));
}
|---|
| 219 | |
|---|
| 220 | |
|---|
# Prepare the pattern for extended matching: collect the tag names
# (words starting with "@") in the order they appear.
my @tags;

# Pad the pattern so that every tag is surrounded by spaces.
my $scan = " ".$pattern." ";

# Separate directly adjacent tags ("@A @B") by two spaces. Each tag is later
# replaced together with one blank on either side, so neighbouring tags must
# not share a single blank; otherwise the second one is collected here but
# never turned into a group boundary.
$scan =~ s/(\@[^ ]*) (?=\@)/$1  /g;
$pattern = $scan;

# Repeatedly strip everything up to and including the first remaining tag,
# remembering the tag name (without the "@"), until no tag is left.
while ($scan =~ s/^.*?\@(.*?) / /) {
    push(@tags, $1);
}
|---|
| 248 | |
|---|
# Turn the pattern into a sequence of parenthesised sections: the whole
# pattern is wrapped in "( ... )" and every tag (with its surrounding
# blanks) becomes a ")(" boundary between two sections.
my $patternmod = "( ".$pattern." )";
$patternmod =~ s/\s\@.*?\s/\)\(/g;

# Replace runs of whitespace by an empty m4 quotation.
$patternmod =~ s/\s+/\\`'/g; #`
# quoting escaped commas
$patternmod =~ s/\\,/\\`\\`\\,''/g;
# quoting commas in {m,n} r.e. operator
$patternmod =~ s/(\{\d*),(\d*\})/$1\\`\\`,''$2/g;

# Expand the macros with m4; its standard output is the regular expression.
my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`;

die("Incorrect pattern (m4).") if $? >> 8;

chomp $re;

# <> expansion: the text between < and > is piped through the external
# "<tags>.tag2re" program and replaced by that program's output.
# NOTE(review): $tags is only ever set from the configuration files; with
# the built-in default this runs "0.tag2re" - confirm this is intended.
$re =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge;

# Polish letters with diacritics. The character classes below previously
# held these letters mis-decoded into unrelated symbols.
# NOTE(review): assumes the processed text and the locale seen by sed use
# the same encoding as this source file - confirm.
my $polish_lower = 'ąćęłńóśźż';
my $polish_upper = 'ĄĆĘŁŃÓŚŹŻ';

# Perl-like special sequences
$re =~ s/\./[^ [:cntrl:]]/g;
$re =~ s/\\s/[ ]/g;
$re =~ s/\\S/[^ [:cntrl:]]/g;
$re =~ s/\\d/[0-9]/g;
$re =~ s/\\D/[^0-9 [:cntrl:]]/g;
$re =~ s/\\w/[a-z${polish_lower}A-Z${polish_upper}0-9_]/g;
$re =~ s/\\W/[^a-z${polish_lower}A-Z${polish_upper}0-9_ [:cntrl:]]/g;
# extensions
$re =~ s/\\l/[a-z${polish_lower}]/g; #lowercase letter
$re =~ s/\\L/[A-Z${polish_upper}]/g; #uppercase letter
|---|
| 283 | |
|---|
my $sedcommand;
my $grepcommand;

# Build the sed script from the regular expression.
# Without custom tags every match is wrapped in the default begin/end
# markers. With custom tags the match is first wrapped in temporary markers;
# then, tag by tag, the leading group of the regular expression is cut off
# and the tag is inserted at the position of the temporary begin marker.
my $defBOM = "BOM";
my $defEOM = "EOM";
my $defTempTagBeg = "####TempTAGBEG####";
my $defTempTagEnd = "####TempTAGEND####";

if (!@tags) {
    # no custom tags: default begin-of-match / end-of-match markers
    $sedcommand = "sed -r 's/($re)/\\500 $defBOM *\\f\\1###EOM###/g; s/###EOM###([0-9]+)/\\1 00 $defEOM *\\f\\1/g'";
}
else {
    # first step: surround every match with the temporary markers
    my @script_parts = (
        "sed -r 's/($re)/\\600 $defTempTagBeg *\\f\\1###EOM###/g;",
        "s/###EOM###([0-9]+)/\\1 00 $defTempTagEnd *\\f\\1/g;",
    );

    # part of the regular expression not yet consumed by a tag
    my $remaining = $re;

    foreach my $tag (@tags) {
        # split off the group that precedes this tag
        my $group = cutRe($remaining);
        $remaining = restRe($remaining);

        if ($group =~ / *\( *\) */) {
            # empty group: put the tag right at the temporary begin marker
            push @script_parts,
                "s/([0-9]+) 00 $defTempTagBeg \\*\\f([0-9]+)/\\2 00 $tag *\\f\\2 00 $defTempTagBeg *\\f\\2/g;";
        }
        else {
            # skip over the text matched by the group, then put the tag and
            # a fresh temporary begin marker behind it
            push @script_parts,
                "s/[0-9]+ 00 $defTempTagBeg \\*\\f($group)/\\1###EOM###/g;",
                "s/###EOM###([0-9]+)/\\1 00 $tag *\\f\\1 00 $defTempTagBeg *\\f\\1/g;";
        }
    }

    # finally remove the temporary markers again
    push @script_parts,
        "s/[0-9]+ 00 $defTempTagBeg \\*\\f//g;",
        "s/[0-9]+ 00 $defTempTagEnd \\*\\f//g'";

    $sedcommand = join('', @script_parts);
}

# --command: only show the generated sed command
if ($command) {
    print "$sedcommand\n";
    exit 0;
}

# run the pipeline: optional preprocessing, sed, optional postprocessing
exec $preproc.$sedcommand.$postproc;
|---|