#!/usr/bin/perl #package: UAM Text Tools #component name: gre #author: Tomasz Obrêbski use strict; use Getopt::Long; my $LIB_DIR="/usr/local/lib/utt"; # katalog zawierajacy terms.m4 my $systemconfigfile="/usr/local/etc/utt/grp.conf"; my $userconfigfile="$ENV{'HOME'}/.utt/grp.conf"; Getopt::Long::Configure('no_ignore_case_always'); my $help=0; my $pattern=0; my $matches_only=0; my $macrofile=0; my $define=0; my $show_command=0; my $action="pgP"; my $eos="seg(EOS)"; my $morfield='lem'; #read configuration files########################### my $file; foreach $file ($systemconfigfile, $userconfigfile){ if(open(CONFIG, $file)){ while () { chomp; s/#.*//; s/^\s+//; s/\s+$//; next unless length; my ($name, $value) = split(/\s*=\s*/, $_, 2); if(($name eq "pattern")or($name eq "e")){ $pattern=$value; } elsif(($name eq "eos")or($name eq "E")){ $eos=$value; } elsif($name eq "morph"){ $morfield=$value; } elsif($name eq "macros"){ $macrofile=$value; } elsif($name eq "define"){ $define=$value; } elsif($name eq "command"){ $show_command=1; } elsif($name eq "action"){ $action; } elsif(($name eq "help")or($name eq "h")){ $help=1; } } close CONFIG; } } ######################################################### GetOptions("pattern|e=s" => \$pattern, "eos|E=s" => \$eos, "morph=s" => \$morfield, "macros=s" => \$macrofile, "define=s" => \$macrofile, "command" => \$show_command, "action=s" => \$action, "help|h" => \$help); if($help) { print <<'END' Usage: gre [OPTIONS] [file ..] Options: --pattern -e PATTERN Pattern. --eos -E PATTERN Segment serving as sentence delimiter. --morph=STRING Field containing morphological information (default 'lem'). --macros=FILE Read macrodefinitions from FILE. --define=FILE Add macrodefinitions from FILE. --action -a [u][p][g][P] Perform only indicated actions. u - uncompress with 'lzop -cd' p - preprocess g - grep P - postprocess (default pgP) --command Print the shell command to be executed and exit. --help -h Help. END ; exit 0; } die("$0: no pattern given.\n") unless $pattern || $action !~ /g/; die("$0: macro file not found") unless $macrofile or -e "$LIB_DIR/terms.m4" and $macrofile="$LIB_DIR/terms.m4"; my $uncompress = ($action =~ /u/) ? ' lzop -cd | ' : ''; my $preproc = ($action =~ /p/) ? ' fla | ' : ''; my $postproc = ($action =~ /P/) ? ' | unfla ' : ''; # discarding spaces $pattern =~ s/\s+/\\`'/g; #` # quoting escaped commas $pattern =~ s/\\,/\\`\\`\\,''/g; # quoting commas in {m,n} r.e. operator $pattern =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g; my $grepre = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`; die("Incorrect pattern (m4).") if $? >> 8; chomp $grepre; # <> expansion $grepre =~ s/<([^>]+)>/`echo $1 | tag2re`/ge; $grepre =~ s/\./[^ [:cntrl:]]/g; $grepre =~ s/\\s/[ ]/g; $grepre =~ s/\\S/[^ [:cntrl:]]/g; $grepre =~ s/\\d/[0-9]/g; $grepre =~ s/\\D/[^0-9 [:cntrl:]]/g; $grepre =~ s/\\w/[a-z±æê³ñ󶼿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_]/g; $grepre =~ s/\\W/[^a-z±æê³ñ󶼿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_ [:cntrl:]]/g; # extensions $grepre =~ s/\\l/[a-z±æê³ñ󶼿]/g; #lowercase letter $grepre =~ s/\\L/[A-Z¡ÆÊ£ÑÓ¦¬¯]/g; #upercase letter my $grep_command = ($action =~ /g/) ? "egrep '$grepre'" : " cat "; if($show_command) { print $grep_command."\n"; exit 0; } #print $preproc.$grep_command.$postproc."\n"; exec $preproc.$grep_command.$postproc;