#!/usr/bin/perl

#package:	UAM Text Tools
#component:	tok (tokenizer)
#version:	1.0
#author:	Tomasz Obrebski

use strict;
use locale;
use Getopt::Long;
use File::HomeDir;

# Upper bound on the number of characters printed as one form; seg() splits
# longer matches into several output lines.
my $max_form_length = 50;

# Option flags. Either may be switched on by a configuration file or by the
# command line.
my $interactive = 0;
my $help;

# Configuration files: the system-wide one and the per-user one.
my $systemconfigfile = '/usr/local/etc/utt/tok.conf';

# The per-user path was once built from $ENV{'HOME'}; it now uses home()
# as imported from File::HomeDir.
my $userconfigfile = home() . "/.utt/tok.conf";

# Read the configuration files ######################
#
# Each file is optional: a file that cannot be opened is skipped silently.
# Inside a file, '#' starts a comment and blank lines are ignored. Every
# remaining line is split at the first '=' into a name and a value; only the
# name is examined, so a line naming "interactive"/"i" or "help"/"h"
# switches that option on.
# NOTE(review): the value is ignored, so "interactive = 0" still enables
# interactive mode -- confirm whether that is intended.
foreach my $file ($systemconfigfile, $userconfigfile) {
	# Three-argument open with a lexical handle: the path can never be
	# taken for an open mode, and no global bareword handle is left behind.
	open(my $config, '<', $file) or next;

	while (my $line = <$config>) {
		chomp $line;
		$line =~ s/#.*//;	# drop comments
		$line =~ s/^\s+//;	# trim leading whitespace
		$line =~ s/\s+$//;	# trim trailing whitespace
		next unless length $line;

		my ($name) = split(/\s*=\s*/, $line, 2);
		if ($name eq "interactive" or $name eq "i") {
			$interactive = 1;
		}
		elsif ($name eq "help" or $name eq "h") {
			$help = 1;
		}
	}

	close $config;
}
#########################################################

# Usage text, shown for --help and after a command-line error.
my $usage = <<'END';
Usage: tok [OPTIONS]

Options:
   --interactive -i		Interactive (no output buffering).
   --help -h			Help.
END

# Parse the command line. GetOptions reports each problem on STDERR and
# returns false; stop in that case instead of tokenizing with options that
# were not understood.
unless (GetOptions("interactive|i" => \$interactive,
		   "help|h" => \$help))
{
    print STDERR $usage;
    exit 1;
}

if($help)
{
    print $usage;
    exit 0;
}


# In interactive mode turn on autoflush for the selected output handle, so
# each segment is written as soon as it is produced.
$| = $interactive;

# Running position in the input; seg() adds the length of every segment it
# prints. It is never reset, so it keeps counting across input files.
my $offset = 0;

# Tokenize the input line by line. At each position the first alternative
# that matches wins, in this order:
#   W - a run of alphabetic characters
#   N - a run of digits
#   S - a run of whitespace (this is where the line terminator ends up)
#   P - a single punctuation character
#   B - any other single character
# Every character matches one of the alternatives, so nothing is skipped.
# Plain capture groups are used instead of embedded (?{ ... }) code blocks
# and $&; exactly one group is defined after each successful match.
while (my $line = <>)
{
    while ($line =~ / ([[:alpha:]]+)
		    | (\d+)
		    | (\s+)
		    | ([[:punct:]])
		    | (.)
		    /gx)
    {
	my ($tag, $text) = defined $1 ? ('W', $1)
			 : defined $2 ? ('N', $2)
			 : defined $3 ? ('S', $3)
			 : defined $4 ? ('P', $4)
			 :              ('B', $5);
	seg($tag, $text);
    }
}

#	| [^[:print:]]	 (?{seg("B",$&)})

# Return the smaller of two numbers, compared numerically. When neither is
# smaller than the other, the second argument is returned.
sub min {
  my ($left, $right) = @_;
  return $left < $right ? $left : $right;
}


# Print the output line(s) for one matched piece of input and advance the
# running input offset.
#
# Arguments:
#   $tag   - segment type. 'S' and 'B' are rendered specially (see below);
#            text with any other tag is printed verbatim.
#   $match - the matched text.
#
# Text longer than $max_form_length is split into consecutive chunks of at
# most that many characters. Each chunk is printed as
#   "<offset> <length> <tag> <form>\n"
# with the offset zero-padded to at least 4 digits and the length to 2.
#
# Form rendering:
#   'S' - space becomes '_'; newline, tab, carriage return and form feed
#         become the two-character escapes \n, \t, \r and \f; any other
#         whitespace character (e.g. vertical tab) becomes \xHH, so the
#         form never comes out empty or shorter than the declared length.
#   'B' - every character becomes \xHH (two upper-case hex digits). In this
#         file 'B' segments are always a single character.
#
# Side effects: writes to the currently selected output handle and adds
# length($match) to the file-level $offset.
sub seg
{
    my ($tag, $match) = @_;
    my $length = length $match;

    my $idx = 0;
    while ($idx < $length) {
        my $chunk_len = min($max_form_length, $length - $idx);
        my $chunk     = substr $match, $idx, $chunk_len;

        printf "%04d %02d %s ", $offset + $idx, $chunk_len, $tag;

        if ($tag eq 'S') {
            foreach my $c (split //, $chunk) {
                if    ($c eq ' ')  { print '_'; }
                elsif ($c eq "\n") { print '\n'; }
                elsif ($c eq "\t") { print '\t'; }
                elsif ($c eq "\r") { print '\r'; }
                elsif ($c eq "\f") { print '\f'; }
                else               { printf '\\x%02X', ord($c); }
            }
        }
        elsif ($tag eq 'B') {
            foreach my $c (split //, $chunk) {
                printf '\\x%02X', ord($c);
            }
        }
        else {
            print $chunk;
        }

        print "\n";
        $idx += $chunk_len;
    }

    $offset += $length;
} #sub seg

