1 | #!/usr/bin/perl |
---|
2 | |
---|
3 | #package: UAM Text Tools |
---|
4 | #component name: gre |
---|
5 | #author: Tomasz Obrêbski |
---|
6 | |
---|
7 | use strict; |
---|
8 | use Getopt::Long; |
---|
9 | |
---|
10 | my $LIB_DIR="/usr/local/lib/utt"; # katalog zawierajacy terms.m4 |
---|
11 | |
---|
12 | my $systemconfigfile="/usr/local/etc/utt/grp.conf"; |
---|
13 | my $userconfigfile="$ENV{'HOME'}/.utt/grp.conf"; |
---|
14 | |
---|
15 | Getopt::Long::Configure('no_ignore_case_always'); |
---|
16 | |
---|
17 | my $help=0; |
---|
18 | my $pattern=0; |
---|
19 | my $matches_only=0; |
---|
20 | my $macrofile=0; |
---|
21 | my $define=0; |
---|
22 | my $show_command=0; |
---|
23 | my $action="pgP"; |
---|
24 | my $eos="seg(EOS)"; |
---|
25 | my $morfield='lem'; |
---|
26 | |
---|
27 | #read configuration files########################### |
---|
28 | my $file; |
---|
29 | foreach $file ($systemconfigfile, $userconfigfile){ |
---|
30 | if(open(CONFIG, $file)){ |
---|
31 | while (<CONFIG>) { |
---|
32 | chomp; |
---|
33 | s/#.*//; |
---|
34 | s/^\s+//; |
---|
35 | s/\s+$//; |
---|
36 | next unless length; |
---|
37 | my ($name, $value) = split(/\s*=\s*/, $_, 2); |
---|
38 | if(($name eq "pattern")or($name eq "e")){ |
---|
39 | $pattern=$value; |
---|
40 | } |
---|
41 | elsif(($name eq "eos")or($name eq "E")){ |
---|
42 | $eos=$value; |
---|
43 | } |
---|
44 | elsif($name eq "morph"){ |
---|
45 | $morfield=$value; |
---|
46 | } |
---|
47 | elsif($name eq "macros"){ |
---|
48 | $macrofile=$value; |
---|
49 | } |
---|
50 | elsif($name eq "define"){ |
---|
51 | $define=$value; |
---|
52 | } |
---|
53 | elsif($name eq "command"){ |
---|
54 | $show_command=1; |
---|
55 | } |
---|
56 | elsif($name eq "action"){ |
---|
57 | $action; |
---|
58 | } |
---|
59 | elsif(($name eq "help")or($name eq "h")){ |
---|
60 | $help=1; |
---|
61 | } |
---|
62 | |
---|
63 | } |
---|
64 | close CONFIG; |
---|
65 | } |
---|
66 | } |
---|
67 | ######################################################### |
---|
68 | |
---|
69 | GetOptions("pattern|e=s" => \$pattern, |
---|
70 | "eos|E=s" => \$eos, |
---|
71 | "morph=s" => \$morfield, |
---|
72 | "macros=s" => \$macrofile, |
---|
73 | "define=s" => \$macrofile, |
---|
74 | "command" => \$show_command, |
---|
75 | "action=s" => \$action, |
---|
76 | "help|h" => \$help); |
---|
77 | |
---|
78 | if($help) |
---|
79 | { |
---|
80 | print <<'END' |
---|
81 | Usage: gre [OPTIONS] [file ..] |
---|
82 | |
---|
83 | Options: |
---|
84 | --pattern -e PATTERN Pattern. |
---|
85 | --eos -E PATTERN Segment serving as sentence delimiter. |
---|
86 | --morph=STRING Field containing morphological information (default 'lem'). |
---|
87 | --macros=FILE Read macrodefinitions from FILE. |
---|
88 | --define=FILE Add macrodefinitions from FILE. |
---|
89 | --action -a [u][p][g][P] Perform only indicated actions. |
---|
90 | u - uncompress with 'lzop -cd' |
---|
91 | p - preprocess |
---|
92 | g - grep |
---|
93 | P - postprocess |
---|
94 | (default pgP) |
---|
95 | --command Print the shell command to be executed and exit. |
---|
96 | --help -h Help. |
---|
97 | END |
---|
98 | ; |
---|
99 | exit 0; |
---|
100 | } |
---|
101 | |
---|
102 | die("$0: no pattern given.\n") unless $pattern || $action !~ /g/; |
---|
103 | |
---|
104 | die("$0: macro file not found") unless |
---|
105 | $macrofile or |
---|
106 | -e "$LIB_DIR/terms.m4" and $macrofile="$LIB_DIR/terms.m4"; |
---|
107 | |
---|
108 | my $uncompress = ($action =~ /u/) ? ' lzop -cd | ' : ''; |
---|
109 | my $preproc = ($action =~ /p/) ? ' fla | ' : ''; |
---|
110 | |
---|
111 | my $postproc = ($action =~ /P/) ? ' | unfla ' : ''; |
---|
112 | |
---|
113 | |
---|
114 | # discarding spaces |
---|
115 | $pattern =~ s/\s+/\\`'/g; #` |
---|
116 | # quoting escaped commas |
---|
117 | $pattern =~ s/\\,/\\`\\`\\,''/g; |
---|
118 | # quoting commas in {m,n} r.e. operator |
---|
119 | $pattern =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g; |
---|
120 | |
---|
121 | my $grepre = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`; |
---|
122 | |
---|
123 | die("Incorrect pattern (m4).") if $? >> 8; |
---|
124 | |
---|
125 | |
---|
126 | chomp $grepre; |
---|
127 | |
---|
128 | # <> expansion |
---|
129 | |
---|
130 | $grepre =~ s/<([^>]+)>/`echo $1 | tag2re`/ge; |
---|
131 | |
---|
132 | $grepre =~ s/\./[^ [:cntrl:]]/g; |
---|
133 | |
---|
134 | $grepre =~ s/\\s/[ ]/g; |
---|
135 | $grepre =~ s/\\S/[^ [:cntrl:]]/g; |
---|
136 | $grepre =~ s/\\d/[0-9]/g; |
---|
137 | $grepre =~ s/\\D/[^0-9 [:cntrl:]]/g; |
---|
138 | $grepre =~ s/\\w/[a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_]/g; |
---|
139 | $grepre =~ s/\\W/[^a-z±æê³ñ󶌿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_ [:cntrl:]]/g; |
---|
140 | # extensions |
---|
141 | $grepre =~ s/\\l/[a-z±æê³ñ󶌿]/g; #lowercase letter |
---|
142 | $grepre =~ s/\\L/[A-Z¡ÆÊ£ÑÓŠ¬¯]/g; #upercase letter |
---|
143 | |
---|
144 | my $grep_command = ($action =~ /g/) ? "egrep '$grepre'" : " cat "; |
---|
145 | |
---|
146 | if($show_command) |
---|
147 | { |
---|
148 | print $grep_command."\n"; |
---|
149 | exit 0; |
---|
150 | } |
---|
151 | |
---|
152 | #print $preproc.$grep_command.$postproc."\n"; |
---|
153 | |
---|
154 | exec $preproc.$grep_command.$postproc; |
---|