#!/usr/bin/perl #package: UAM Text Tools #component: mar #version: 1.0 #author: Marcin Walas #this program tags the tokenized file with given tags #tags can be given in any order and configuration through the expression #which is one of the parametres of the script #contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com my $version = '1.0'; use lib "/usr/local/lib/utt"; use lib "$ENV{'HOME'}/.local/lib/utt"; use strict; use Getopt::Long; use File::HomeDir; use attr; my $LIB_DIR="/usr/local/lib/utt"; my $systemconfigfile='/usr/local/etc/utt/mar.conf'; my $userconfigfile=home()."/.utt/mar.conf"; Getopt::Long::Configure('no_ignore_case_always'); my $help=0; my $pattern=0; my $macrofile=0; my $define=0; my $command=0; my $action="pgP"; my $eos="seg(EOS)"; my $explicit_space=0; my $morfield='lem'; my $tags=0; my $show_version = 0; #read configuration files########################### my $file; foreach $file ($systemconfigfile, $userconfigfile){ if(open(CONFIG, $file)){ while () { chomp; s/#.*//; s/^\s+//; s/\s+$//; next unless length; my ($name, $value) = split(/\s*=\s*/, $_, 2); if(($name eq "pattern")or($name eq "e")){ $pattern=$value; } elsif($name eq "eos"){ $eos=$value; } elsif($name eq "macros"){ $macrofile=$value; } elsif($name eq "tags"){ $tags=$value; } elsif($name eq "morph"){ $morfield=$value; } elsif($name eq "command"){ $command=1; } elsif($name eq "action"){ $action=$value; } elsif($name eq "space"){ $explicit_space=1; } elsif(($name eq "help")or($name eq "h")){ $help=1; } } close CONFIG; } } ######################################################### GetOptions("pattern|e=s" => \$pattern, "eos|E=s" => \$eos, "macros=s" => \$macrofile, "define=s" => \$macrofile, "command" => \$command, "action=s" => \$action, "help|h" => \$help, "space|s" => \$explicit_space, "version|v" => \$show_version, ); if($show_version){ print "Version: $version\n"; exit 0; } if($help) { print <<'END' Usage: mar [OPTIONS] [file ..] Options: --pattern -e PATTERN Pattern. --eos -E PATTERN Segment serving as sentence beginning marker. [TODO] --macros=FILE Read macrodefinitions from FILE. [TODO] --define=FILE Add macrodefinitions from FILE. [TODO] --action -a [p][s][P] Perform only indicated actions. p - preprocess s - search P - postprocess (default psP) --command Print generated shell command and exit. --help -h Print help. --version -v Script version In patern you can put any tag. Tags should begin with the @ character. They don't have to be closed. They can't contain white spaces! Note: If you don't define any custom tags, whole pattern will be taged with default tags (begining of match and end of match) Tags examples: mar -e '@BEG cat() @END' it will find any adjectives in the text and tag them with surrounding tags mar -e 'cat() @MYTAG cat()' this will find two neighbouring adjectives and parcel them with tag MYTAG Some example patterns: 'word(domu)' - form of the word domu 'lexeme(dom)' - any form of lexeme dom 'space' - space 'cat()' - adjective You can use * in patterns to make zero or more counts of word. END ; exit 0; } die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/; die("$0: macro file not found") unless $macrofile or -e "$LIB_DIR/terms.m4" and $macrofile="$LIB_DIR/terms.m4"; my $preproc = ($action =~ /p/) ? ' fla | ' : ''; my $postproc = ($action =~ /P/) ? ' | unfla ' : ''; #this is our help function to cut the re to get another tag #it takes only one argument which is our patern (after m4 processing) #returns: the first root-level brace with content sub cutRe { my $i = 0; my $level = 0; my $text = $_[0]; my $temp; for( $i =0; $i < (length $text);$i++) { $temp = substr($text, $i,1); if( $temp eq "(") {#we have an opening $level++; } elsif ( $temp eq ")") {#we close $level--; } if ( $level == 0) { $temp = substr($text,0,$i+1); last; } } $temp; } #the same function as above althought it returns everything after the #first root level brace sub restRe { my $i = 0; my $level = 0; my $text = $_[0]; my $temp; for( $i =0; $i < (length $text);$i++) { $temp = substr($text, $i,1); if( $temp eq "(") {#we have an opening $level++; } elsif ( $temp eq ")") {#we close $level--; } if ( $level == 0) { #we cut everything in the begining $temp = substr($text,$i+1); last; } } $temp; } #here we are preparing re for extended matching my @tags; #we must find what our the tags #some pattern adjustment my $end = 0; my $temp = " ".$pattern." "; $temp =~ s/(\@[^ ]*) (\@[^ ]* )/\1 \2/g; $pattern = $temp; while ($end != 1) { #we seek for the first tag in pattern if ($temp =~ /^.*?\@(.*?) /) { #we add this to tags array push (@tags, $1); #and cut the pattern $temp =~ s/^.*?\@(.*?) / /; #print $temp."\n"; } else { #if we dont find any tags we end $end = 1; } } #here we have our patern with tags removed (we set sections of ()) between tags my $patternmod = "( ".$pattern." )"; $patternmod =~ s/\s@.*?\s/\)\(/g; #discarding spaces $patternmod =~ s/\s+/\\`'/g; #` # quoting escaped commas $patternmod =~ s/\\,/\\`\\`\\,''/g; # quoting commas in {m,n} r.e. operator $patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g; #print "After m4:".$re."\n"; my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`; die("Incorrect pattern (m4).") if $? >> 8; chomp $re; # <> expansion $re =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge; # Perl-like special sequences $re =~ s/\./[^ [:cntrl:]]/g; $re =~ s/\\s/[ ]/g; $re =~ s/\\S/[^ [:cntrl:]]/g; $re =~ s/\\d/[0-9]/g; $re =~ s/\\D/[^0-9 [:cntrl:]]/g; $re =~ s/\\w/[a-z±æê³ñó¶¼¿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_]/g; $re =~ s/\\W/[^a-z±æê³ñó¶¼¿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_ [:cntrl:]]/g; # extensions $re =~ s/\\l/[a-z±æê³ñó¶¼¿]/g; #lowercase letter $re =~ s/\\L/[A-Z¡ÆÊ£ÑÓ¦¬¯]/g; #upercase letter my $sedcommand; my $grepcommand; #now we must built a sed script from our re #we do this by cuting our re each tag until we cut them all #if an user dint input any tags we do our default my $defBOM = "BOM"; my $defEOM = "EOM"; my $defTempTagBeg = "####TempTAGBEG####"; my $defTempTagEnd = "####TempTAGEND####"; if (@tags == 0) { $sedcommand = "sed -r 's/($re)/\\500 $defBOM *\\f\\1###EOM###/g; s/###EOM###([0-9]+)/\\1 00 $defEOM *\\f\\1/g'"; } else #we have custom tags { #first tag is easy to tag :) my $sedscript="sed -r 's/($re)/\\600 $defTempTagBeg *\\f\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $defTempTagEnd *\\f\\1/g;"; #after first step we have temp tagged parts of input matching re #now we need to insert our custom tags #we will find temp tags and process our input my $i = 0; #copy of re which will be cut my $rec = $re; my $restre = $re; for ($i = 0 ; $i < @tags ; $i++) { #re cutting $rec = cutRe($restre); $restre = restRe($restre); if ($rec =~ / *$ *$ */) { $sedscript = $sedscript."s/([0-9]+) 00 $defTempTagBeg \\*\\f([0-9]+)/\\2 00 $tags[$i] *\\f\\2 00 $defTempTagBeg *\\f\\2/g;"; } else { $sedscript = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f($rec)/\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $tags[$i] *\\f\\1 00 $defTempTagBeg *\\f\\1/g;"; } } $sedcommand = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f//g;s/[0-9]+ 00 $defTempTagEnd \\*\\f//g'"; } if($command) { print $sedcommand."\n"; exit 0; } exec $preproc.$sedcommand.$postproc;