#!/usr/bin/perl

#package:	UAM Text Tools
#component name:	grp
#version:	1.0
#author:	Tomasz Obrebski

use strict;
use warnings;
use Getopt::Long;
use File::HomeDir;

# Directory containing terms.m4, the macro file used when none is given
# explicitly.
my $LIB_DIR="/usr/local/lib/utt";

# Configuration files; both are read if present, the system-wide one
# first and the per-user one second, so later settings win.
my $systemconfigfile="/usr/local/etc/utt/grp.conf";
my $userconfigfile=home()."/.utt/grp.conf";

# Option names are matched case-sensitively, so that -e and -E can be
# two different options.
Getopt::Long::Configure('no_ignore_case_always');

# Option defaults.  A value of 0 means "not set"; each variable may be
# overridden by the configuration files and then by the command line.
my $help=0;            # print the usage text and exit
my $pattern=0;         # search pattern (--pattern / -e)
my $matches_only=0;    # not referenced anywhere else in this script
my $macrofile=0;       # m4 macro file (--macros)
my $define=0;          # file with additional macro definitions (--define)
my $show_command=0;    # print the grep command instead of running it
my $action="pgP";      # processing stages to perform (see the usage text)
my $eos="seg(EOS)";    # segment serving as sentence delimiter (--eos / -E)
my $morfield='lem';    # field containing morphological information
my $tags=0;            # morphosyntactic tag format (--tags)

#read configuration files###########################

# Store one "name = value" setting read from a configuration file in the
# corresponding option variable.
#
#   $name  - option name as written in the file
#   $value - text after the first '=' (undef when the line has no '=')
#
# "command" and "help" are switches: their mere presence turns them on,
# whatever the value.  Unknown names are silently ignored.
sub _apply_config_setting {
    my ($name, $value) = @_;

    if ($name eq "pattern" or $name eq "e") {
        $pattern = $value;
    }
    elsif ($name eq "eos" or $name eq "E") {
        $eos = $value;
    }
    elsif ($name eq "morph") {
        $morfield = $value;
    }
    elsif ($name eq "macros") {
        $macrofile = $value;
    }
    elsif ($name eq "define") {
        $define = $value;
    }
    elsif ($name eq "command") {
        $show_command = 1;
    }
    elsif ($name eq "action") {
        # This branch used to be the no-op statement "$action;", so the
        # setting was never applied.
        $action = $value;
    }
    elsif ($name eq "tags") {
        $tags = $value;
    }
    elsif ($name eq "help" or $name eq "h") {
        $help = 1;
    }
    return;
}

# Read the system-wide file first and the per-user file second, so that
# user settings override system ones.  A file that cannot be opened is
# simply skipped.
foreach my $file ($systemconfigfile, $userconfigfile) {
    open(my $config, '<', $file) or next;
    while (my $line = <$config>) {
        chomp $line;
        $line =~ s/#.*//;       # strip comments
        $line =~ s/^\s+//;      # strip leading whitespace
        $line =~ s/\s+$//;      # strip trailing whitespace
        next unless length $line;

        # Split on the first '=' only, so values may contain '='.
        my ($name, $value) = split(/\s*=\s*/, $line, 2);
        _apply_config_setting($name, $value);
    }
    close $config;
}
#########################################################

# Command-line options override the defaults and the configuration files.
# --define is stored in its own variable: it used to be bound to
# $macrofile, which made it replace the macro file instead of adding to it.
# -a is declared explicitly because the usage text advertises it.
GetOptions("pattern|e=s" => \$pattern,
           "eos|E=s"     => \$eos,
           "morph=s"     => \$morfield,
           "macros=s"    => \$macrofile,
           "define=s"    => \$define,
           "command"     => \$show_command,
           "action|a=s"  => \$action,
           "tags=s"      => \$tags,
           "help|h"      => \$help);

if ($help) {
    print <<'END';
Usage: grp [OPTIONS] [file ..]

Options:
   --pattern -e PATTERN         Pattern.
   --eos -E PATTERN             Segment serving as sentence delimiter.
   --morph=STRING               Field containing morphological information (default 'lem').
   --macros=FILE                Read macrodefinitions from FILE.
   --define=FILE                Add macrodefinitions from FILE.
   --action -a [u][p][g][P]     Perform only indicated actions.
                                    u - uncompress with 'lzop -cd'
                                    p - preprocess
                                    g - grep
                                    P - postprocess
                                (default pgP)
   --tags=STRING                Morphosyntactic tag format.
   --command                    Print the shell command to be executed and exit.
   --help -h                    Help.
END
    exit 0;
}

# Tell whether the shell would be able to run $program.
#
# A name containing a slash is checked as a path; any other name is looked
# up in the directories listed in $PATH (an empty entry means the current
# directory).  When $PATH is not set nothing can be verified here, so the
# decision is left to the shell and 1 is returned.
sub _program_available {
    my ($program) = @_;

    if ($program =~ m{/}) {
        return (-f $program && -x _) ? 1 : 0;
    }
    return 1 unless defined $ENV{PATH};

    foreach my $dir (split /:/, $ENV{PATH}, -1) {
        my $candidate = ($dir eq '' ? '.' : $dir) . "/$program";
        return 1 if -f $candidate && -x _;
    }
    return 0;
}

# A pattern is required only when the grep stage is going to run.
die("$0: no pattern given.\n") unless $pattern || $action !~ /g/;

# Fall back to the stock macro file when none was configured.
unless ($macrofile) {
    my $default_macrofile = "$LIB_DIR/terms.m4";
    die("$0: macro file not found") unless -e $default_macrofile;
    $macrofile = $default_macrofile;
}

die("$0: undefined tagset format (tags option missing)") unless
    $tags;

# The original condition here was the placeholder "unless 1", with a note
# asking how to write it; check that the converter can actually be run.
die("$0: $tags.tag2re program not found") unless
    _program_available("$tags.tag2re");


# Pipeline fragments for the stages selected with --action.
my $uncompress = ($action =~ /u/) ? ' lzop -cd | ' : '';
my $preproc    = ($action =~ /p/) ? ' fla | ' : '';

my $postproc   = ($action =~ /P/) ? ' | unfla ' : '';


# The substitutions below prepare the pattern for m4.  The backslashes in
# front of the backquotes are consumed by the shell (the pattern is passed
# inside a double-quoted word), so m4 receives plain `' pairs -- quoted
# strings, assuming m4's default quote characters are in effect.

# discarding spaces
$pattern =~ s/\s+/\\`'/g; #`
# quoting escaped commas
$pattern =~ s/\\,/\\`\\`\\,''/g;
# quoting commas in {m,n} r.e. operator
# ($1/$2 instead of the deprecated \1/\2 in the replacement part)
$pattern =~ s/(\{\d*),(\d*\})/$1\\`\\`,''$2/g;

# Macro files are read in order: the main macro file first, then the
# additional definitions given with --define (if any), then the pattern
# itself from standard input.
my $m4_files = $define ? "$macrofile $define" : $macrofile;

# NOTE(review): the pattern is interpolated into a double-quoted shell
# word, so '$', '"' and backquotes typed by the user are interpreted by
# the shell.  Left as is, because changing the quoting would also change
# how backslashes in existing patterns are interpreted.
my $grepre = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $m4_files - 2>/dev/null`;

die("Incorrect pattern (m4).") if $? >> 8;


chomp $grepre;

# Translate one tag description (the text between '<' and '>') into a
# regular expression by piping it through the "<tags>.tag2re" program.
#
# Dies when the shell reports that the program could not be found or
# executed (exit status 127 or 126) or when the shell itself could not be
# started.  Other exit statuses are not treated as errors.
sub _tag_to_re {
    my ($tag) = @_;

    my $re = `echo $tag | $tags.tag2re`;
    my $status = $?;
    if ($status == -1 or ($status >> 8) == 127 or ($status >> 8) == 126) {
        die("$0: $tags.tag2re program not found\n");
    }

    # Without this a trailing newline printed by the converter would end
    # up in the middle of the regular expression.
    chomp $re;
    return $re;
}

# Rewrite '.' and the Perl-style shorthands \s \S \d \D \w \W (plus the
# extensions \l and \L) as bracket expressions that never match a space
# or a control character.
#
# The string is scanned once, left to right, and a backslash always
# consumes the character after it.  Hence '\.' stays a literal dot and
# '\\' followed by 's' is an escaped backslash plus the letter s; the
# previous chain of independent substitutions rewrote both by mistake.
# Escapes that are not listed below are passed through unchanged.
#
# NOTE: an unescaped '.' inside a bracket expression is still rewritten,
# as before.
sub _expand_char_classes {
    my ($re) = @_;

    # Polish letters.  The original literals were mis-decoded (they read
    # "±æê³ñ󶌿" and "¡ÆÊ£ÑÓŠ¬¯").
    # NOTE(review): this assumes the processed text uses the same
    # encoding as this file -- confirm.
    my $lower = 'a-ząćęłńóśźż';
    my $upper = 'A-ZĄĆĘŁŃÓŚŹŻ';

    my $any_char = '[^ [:cntrl:]]';
    my %class_for = (
        's' => '[ ]',
        'S' => '[^ [:cntrl:]]',
        'd' => '[0-9]',
        'D' => '[^0-9 [:cntrl:]]',
        'w' => "[${lower}${upper}0-9_]",
        'W' => "[^${lower}${upper}0-9_ [:cntrl:]]",
        # extensions
        'l' => "[${lower}]",    # lowercase letter
        'L' => "[${upper}]",    # uppercase letter
    );

    $re =~ s{ \\ (.) | \. }{
        !defined $1             ? $any_char
      : exists $class_for{$1}   ? $class_for{$1}
      :                           "\\$1"
    }gexs;

    return $re;
}

# <> expansion
$grepre =~ s/<([^>]+)>/_tag_to_re($1)/ge;

$grepre = _expand_char_classes($grepre);

# The regular expression is handed to egrep inside single quotes, so any
# single quote it contains has to be written as '\'' to keep the shell
# command well formed.
(my $quoted_re = $grepre) =~ s/'/'\\''/g;

# When the grep stage is not selected the data is passed through
# unchanged.
my $grep_command = ($action =~ /g/) ? "egrep '$quoted_re'" : " cat ";

# --command: show the grep stage only and do not run anything.
if ($show_command) {
    print $grep_command."\n";
    exit 0;
}

#print $uncompress.$preproc.$grep_command.$postproc."\n";

# Replace this process with the shell pipeline.  $uncompress was computed
# from the 'u' action but never made it into the executed command, so
# that action had no effect; it now comes first in the pipeline.
# exec returns only on failure.
exec $uncompress.$preproc.$grep_command.$postproc
    or die("$0: cannot execute the pipeline: $!\n");