#!/usr/bin/perl

#package: UAM Text Tools
#component name: grp
#version: 1.0
#author: Tomasz Obrebski

use strict;
use Getopt::Long;
use File::HomeDir;

# Directory containing terms.m4 (the default macro file).
my $LIB_DIR = "/usr/local/lib/utt";

# System-wide and per-user configuration files.
my $systemconfigfile = "/usr/local/etc/utt/grp.conf";
my $userconfigfile   = home() . "/.utt/grp.conf";

# Option names are case-sensitive: -e (pattern) and -E (eos) are distinct.
Getopt::Long::Configure('no_ignore_case_always');

# Settings that start out unset/false.
my ($help, $pattern, $matches_only, $macrofile, $define, $show_command, $tags)
    = (0) x 7;

# Settings with non-trivial defaults.
my ($action, $eos, $morfield) = ("pgP", "seg(EOS)", 'lem');

# Read configuration files ###########################
#
# Both files are read in the order listed, so a setting in the user file
# replaces the same setting from the system file.  A file that cannot be
# opened is skipped silently.  Each meaningful line has the form
# "name = value"; everything from '#' to the end of the line is a comment
# and surrounding whitespace is ignored.  Unknown names are ignored.

# Settings that take the value written after '='.
my %config_value_for = (
    pattern => \$pattern,
    e       => \$pattern,
    eos     => \$eos,
    E       => \$eos,
    morph   => \$morfield,
    macros  => \$macrofile,
    define  => \$define,
    action  => \$action,      # was a no-op ("$action;") and never took effect
    tags    => \$tags,
);

# Settings that act as switches: their mere presence turns them on.
my %config_flag_for = (
    command => \$show_command,
    help    => \$help,
    h       => \$help,
);

my $file;
foreach $file ($systemconfigfile, $userconfigfile) {
    # A missing or unreadable configuration file is not an error.
    open(my $config, '<', $file) or next;

    while (my $line = <$config>) {
        chomp $line;
        $line =~ s/#.*//;       # drop comment
        $line =~ s/^\s+//;      # trim leading whitespace
        $line =~ s/\s+$//;      # trim trailing whitespace
        next unless length $line;

        # Split on the first '=' only, so values may themselves contain '='.
        my ($name, $value) = split(/\s*=\s*/, $line, 2);

        if (exists $config_value_for{$name}) {
            ${ $config_value_for{$name} } = $value;
        }
        elsif (exists $config_flag_for{$name}) {
            ${ $config_flag_for{$name} } = 1;
        }
    }
    close $config;
}
#########################################################

# Command-line options; they override values read from configuration files.
GetOptions(
    "pattern|e=s" => \$pattern,
    "eos|E=s"     => \$eos,
    "morph=s"     => \$morfield,
    "macros=s"    => \$macrofile,
    # --define ADDS a macro file; it used to overwrite $macrofile by mistake.
    "define=s"    => \$define,
    "command"     => \$show_command,
    # -a is advertised in the help text, so declare the alias explicitly.
    "action|a=s"  => \$action,
    "tags=s"      => \$tags,
    "help|h"      => \$help,
);

if ($help) {
    print <<'END';
Usage: grp [OPTIONS] [file ..]

Options:
   --pattern -e PATTERN         Pattern.
   --eos -E PATTERN             Segment serving as sentence delimiter.
   --morph=STRING               Field containing morphological information (default 'lem').
   --macros=FILE                Read macrodefinitions from FILE.
   --define=FILE                Add macrodefinitions from FILE.
   --action -a [u][p][g][P]     Perform only indicated actions.
                                  u - uncompress with 'lzop -cd'
                                  p - preprocess
                                  g - grep
                                  P - postprocess
                                (default pgP)
   --tags=STRING                Morphosyntactic tag format.
   --command                    Print the shell command to be executed and exit.
   --help -h                    Help.
END
    exit 0;
}

# A pattern is only required when the grep step is going to run.
die("$0: no pattern given.\n") unless $pattern || $action !~ /g/;

# Fall back to the stock macro file when none was configured.
unless ($macrofile) {
    my $default_macros = "$LIB_DIR/terms.m4";
    die("$0: macro file not found") unless -e $default_macros;
    $macrofile = $default_macros;
}

die("$0: undefined tagset format (tags option missing)") unless $tags;

# TODO: verify that the "$tags.tag2re" converter program is available and
# die with "$0: $tags.tag2re program not found" otherwise.  The original
# check was a placeholder ("unless 1") that could never fire.

# Optional pipeline stages, selected by the letters present in $action.
my $uncompress = ($action =~ /u/) ? ' lzop -cd | ' : '';
my $preproc    = ($action =~ /p/) ? ' fla | '      : '';
my $postproc   = ($action =~ /P/) ? ' | unfla '    : '';

# --- Prepare the pattern for m4 -------------------------------------------
# The backslashes inserted below are consumed by the shell's double-quote
# processing in the echo command further down, so that m4 finally sees its
# own quote characters (` and ').

# discarding spaces: every whitespace run becomes an empty m4 quoted string
$pattern =~ s/\s+/\\`'/g;
# quoting escaped commas, so m4 does not treat them as argument separators
$pattern =~ s/\\,/\\`\\`\\,''/g;
# quoting commas in the {m,n} r.e. operator
$pattern =~ s/(\{\d*),(\d*\})/${1}\\`\\`,''${2}/g;

# Macro files are processed by m4 in order: the main file, then the
# additional definitions given with "define" (if any), then the pattern.
my $macro_files = $define ? "$macrofile $define" : $macrofile;

# NOTE(review): the pattern and file names are interpolated into a shell
# command line unescaped; a pattern containing '"', '$' or a backquote will
# be interpreted by the shell.
my $grepre = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macro_files - 2>/dev/null`;

die("Incorrect pattern (m4).") if $? != 0;

chomp $grepre;

# <> expansion: each <tag> is translated by the tagset-specific converter.
# NOTE(review): the converter's output is inserted verbatim, including any
# trailing newline it prints - confirm that tag2re emits none.
$grepre =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge;

# --- Translate Perl-style shorthands into POSIX bracket expressions --------
# A space separates fields and control characters delimit segments, so
# "any character" classes exclude both.

# NOTE(review): this rewrites every dot, including an escaped "\." - kept
# as in the original.
$grepre =~ s/\./[^ [:cntrl:]]/g;

# Polish letters as ISO-8859-2 bytes, in the order: a-ogonek, c-acute,
# e-ogonek, l-stroke, n-acute, o-acute, s-acute, z-acute, z-dot-above.
# Written as escapes so the classes do not depend on this file's encoding.
my $pl_lower = "\xB1\xE6\xEA\xB3\xF1\xF3\xB6\xBC\xBF";
my $pl_upper = "\xA1\xC6\xCA\xA3\xD1\xD3\xA6\xAC\xAF";

my %class_for = (
    's' => "[ ]",
    'S' => "[^ [:cntrl:]]",
    'd' => "[0-9]",
    'D' => "[^0-9 [:cntrl:]]",
    'w' => "[a-z${pl_lower}A-Z${pl_upper}0-9_]",
    'W' => "[^a-z${pl_lower}A-Z${pl_upper}0-9_ [:cntrl:]]",
    # extensions
    'l' => "[a-z${pl_lower}]",      # lowercase letter
    'L' => "[A-Z${pl_upper}]",      # uppercase letter
);

# One pass is equivalent to replacing each shorthand in turn: the inserted
# classes contain no backslashes, so they can never form a new shorthand.
$grepre =~ s/\\([sSdDwWlL])/$class_for{$1}/g;

my $grep_command = ($action =~ /g/) ? "egrep '$grepre'" : " cat ";

# --command shows only the grep stage, not the surrounding pipeline.
if ($show_command) {
    print $grep_command . "\n";
    exit 0;
}

# NOTE(review): file arguments left in @ARGV are not used; the pipeline
# reads standard input.
my $pipeline = $uncompress . $preproc . $grep_command . $postproc;

#print $pipeline."\n";

# Replace this process with the shell pipeline; exec returns only on failure.
exec($pipeline) or die("$0: cannot execute pipeline: $!\n");