1 | #!/usr/bin/perl |
---|
2 | |
---|
3 | #package: UAM Text Tools |
---|
4 | #component: tok (tokenizer) |
---|
5 | #version: 1.0 |
---|
6 | #author: Tomasz Obrebski |
---|
7 | |
---|
8 | use strict; |
---|
9 | use locale; |
---|
10 | use Getopt::Long; |
---|
11 | use File::HomeDir; |
---|
12 | |
---|
13 | my $max_form_length = 50; |
---|
14 | |
---|
15 | my $interactive=0; |
---|
16 | my $help; |
---|
17 | |
---|
18 | my $systemconfigfile='/usr/local/etc/utt/tok.conf'; |
---|
19 | #my $userconfigfile="$ENV{'HOME'}/.utt/tok.conf"; |
---|
20 | my $userconfigfile=home()."/.utt/tok.conf"; |
---|
21 | |
---|
22 | #read configuration files########################### |
---|
23 | my $file; |
---|
24 | foreach $file ($systemconfigfile, $userconfigfile){ |
---|
25 | if(open(CONFIG, $file)){ |
---|
26 | while (<CONFIG>) { |
---|
27 | chomp; |
---|
28 | s/#.*//; |
---|
29 | s/^\s+//; |
---|
30 | s/\s+$//; |
---|
31 | next unless length; |
---|
32 | my ($name, $value) = split(/\s*=\s*/, $_, 2); |
---|
33 | if(($name eq "interactive")or($name eq "i")){ |
---|
34 | $interactive=1; |
---|
35 | } |
---|
36 | elsif(($name eq "help")or($name eq "h")){ |
---|
37 | $help=1; |
---|
38 | } |
---|
39 | } |
---|
40 | close CONFIG; |
---|
41 | } |
---|
42 | } |
---|
43 | #########################################################s |
---|
44 | |
---|
45 | GetOptions("interactive|i" => \$interactive, |
---|
46 | "help|h" => \$help); |
---|
47 | |
---|
48 | if($help) |
---|
49 | { |
---|
50 | print <<'END' |
---|
51 | Usage: tok [OPTIONS] |
---|
52 | |
---|
53 | Options: |
---|
54 | --interactive Interactive (no output buffering). |
---|
55 | --help -h Help. |
---|
56 | END |
---|
57 | ; |
---|
58 | exit 0; |
---|
59 | } |
---|
60 | |
---|
61 | |
---|
62 | $| = $interactive; |
---|
63 | |
---|
64 | my $offset = 0; |
---|
65 | |
---|
66 | while(<>) |
---|
67 | { |
---|
68 | 1 while |
---|
69 | / [[:alpha:]]+ (?{seg('W',$&)}) |
---|
70 | | \d+ (?{seg('N',$&)}) |
---|
71 | | \s+ (?{seg('S',$&)}) |
---|
72 | | [[:punct:]] (?{seg('P',$&)}) |
---|
73 | | . (?{seg('B',$&)}) |
---|
74 | /gxo; |
---|
75 | } |
---|
76 | |
---|
77 | # | [^[:print:]] (?{seg("B",$&)}) |
---|
78 | |
---|
79 | sub min { |
---|
80 | my ($val1, $val2) = @_; |
---|
81 | if($val1 < $val2) { |
---|
82 | return $val1; |
---|
83 | } |
---|
84 | else { |
---|
85 | return $val2; |
---|
86 | } |
---|
87 | } |
---|
88 | |
---|
89 | |
---|
90 | sub seg |
---|
91 | { |
---|
92 | my ($tag,$match) = @_; |
---|
93 | my $length = length $match; |
---|
94 | my $idx = 0; |
---|
95 | while($idx < $length) { |
---|
96 | my $l = min $max_form_length, $length - $idx; |
---|
97 | my $m = substr $match, $idx, $l; |
---|
98 | |
---|
99 | printf "%04d %02d %s ", $offset + $idx, $l, $tag; |
---|
100 | if($tag eq 'S') |
---|
101 | { |
---|
102 | for(my $i=0; $i<$l; ++$i) |
---|
103 | { |
---|
104 | my $c = substr $m, $i, 1; |
---|
105 | print '_' if $c eq ' '; |
---|
106 | print '\n' if $c eq "\n"; |
---|
107 | print '\t' if $c eq "\t"; |
---|
108 | print '\r' if $c eq "\r"; |
---|
109 | print '\f' if $c eq "\f"; |
---|
110 | } |
---|
111 | } |
---|
112 | elsif($tag eq 'B') |
---|
113 | { |
---|
114 | printf "\\x%02X", ord($m); |
---|
115 | } |
---|
116 | else |
---|
117 | { |
---|
118 | print $m; |
---|
119 | } |
---|
120 | print "\n"; |
---|
121 | $idx += $l; |
---|
122 | } # while($idx < $length) |
---|
123 | $offset += $length; |
---|
124 | } #sub seg |
---|
125 | |
---|