1 | #!/usr/bin/perl |
---|
2 | |
---|
3 | #package: UAM Text Tools |
---|
4 | #component: tok (tokenizer) |
---|
5 | #version: 1.0 |
---|
6 | #author: Tomasz Obrebski |
---|
7 | |
---|
8 | use strict; |
---|
9 | use locale; |
---|
10 | use Getopt::Long; |
---|
11 | use File::HomeDir; |
---|
12 | |
---|
13 | my $interactive=0; |
---|
14 | my $help; |
---|
15 | |
---|
16 | my $systemconfigfile='/usr/local/etc/utt/tok.conf'; |
---|
17 | #my $userconfigfile="$ENV{'HOME'}/.utt/tok.conf"; |
---|
18 | my $userconfigfile=home()."/.utt/tok.conf"; |
---|
19 | |
---|
20 | #read configuration files########################### |
---|
21 | my $file; |
---|
22 | foreach $file ($systemconfigfile, $userconfigfile){ |
---|
23 | if(open(CONFIG, $configfile1)){ |
---|
24 | while (<CONFIG>) { |
---|
25 | chomp; |
---|
26 | s/#.*//; |
---|
27 | s/^\s+//; |
---|
28 | s/\s+$//; |
---|
29 | next unless length; |
---|
30 | my ($name, $value) = split(/\s*=\s*/, $_, 2); |
---|
31 | if(($name eq "interactive")or($name eq "i")){ |
---|
32 | $interactive=1; |
---|
33 | } |
---|
34 | elsif(($name eq "help")or($name eq "h")){ |
---|
35 | $help=1; |
---|
36 | } |
---|
37 | } |
---|
38 | close CONFIG; |
---|
39 | } |
---|
40 | } |
---|
41 | #########################################################s |
---|
42 | |
---|
43 | GetOptions("interactive|i" => \$interactive, |
---|
44 | "help|h" => \$help); |
---|
45 | |
---|
46 | if($help) |
---|
47 | { |
---|
48 | print <<'END' |
---|
49 | Usage: tok [OPTIONS] |
---|
50 | |
---|
51 | Options: |
---|
52 | --interactive Interactive (no output buffering). |
---|
53 | --help -h Help. |
---|
54 | END |
---|
55 | ; |
---|
56 | exit 0; |
---|
57 | } |
---|
58 | |
---|
59 | |
---|
60 | $| = $interactive; |
---|
61 | |
---|
62 | my $offset = 0; |
---|
63 | |
---|
64 | while(<>) |
---|
65 | { |
---|
66 | 1 while |
---|
67 | / [[:alpha:]]+ (?{seg('W',$&)}) |
---|
68 | | \d+ (?{seg('N',$&)}) |
---|
69 | | \s+ (?{seg('S',$&)}) |
---|
70 | | [[:punct:]] (?{seg('P',$&)}) |
---|
71 | | . (?{seg('B',$&)}) |
---|
72 | /gxo; |
---|
73 | } |
---|
74 | |
---|
75 | # | [^[:print:]] (?{seg("B",$&)}) |
---|
76 | |
---|
77 | |
---|
78 | sub seg |
---|
79 | { |
---|
80 | my ($tag,$match) = @_; |
---|
81 | my $len=length $match; |
---|
82 | printf "%04d %02d %s ", $offset, $len, $tag; |
---|
83 | if($tag eq 'S') |
---|
84 | { |
---|
85 | for(my $i=0; $i<$len; ++$i) |
---|
86 | { |
---|
87 | my $c = substr $match, $i, 1; |
---|
88 | print '_' if $c eq ' '; |
---|
89 | print '\n' if $c eq "\n"; |
---|
90 | print '\t' if $c eq "\t"; |
---|
91 | print '\r' if $c eq "\r"; |
---|
92 | print '\f' if $c eq "\f"; |
---|
93 | } |
---|
94 | } |
---|
95 | elsif($tag eq 'B') |
---|
96 | { |
---|
97 | printf "\\x%02X", ord($match); |
---|
98 | } |
---|
99 | else |
---|
100 | { |
---|
101 | print $match; |
---|
102 | } |
---|
103 | print "\n"; |
---|
104 | $offset += $len; |
---|
105 | } |
---|