#! /usr/bin/env perl

#package:	UAM Text Tools
#component:	compiledic
#version:	1.0
#author:	Tomasz Obrebski

use strict;
use locale;
use File::HomeDir;
use File::Basename;
use File::Temp;
use File::Copy;
use Getopt::Long;


# Number of dictionary lines after which a new temporary chunk file is started.
my $linesPerFile = 20_000;

# Configuration files: the system-wide one and the per-user one.
my $systemconfigfile = '/usr/local/etc/utt/compiledic.conf';
my $userconfigfile   = home() . "/.utt/compiledic.conf";

# Option names are matched case-sensitively.
Getopt::Long::Configure('no_ignore_case_always');

# Settings that the configuration files and the command line may override.
# $symbols holds the symbol file path (for example
# ~/.utt/pl/pl_PL.iso-8859-2.sym); $help requests the usage message.
my ($symbols, $help) = (0, 0);



# Read configuration files ##########################
# The system-wide file is read first and the per-user file second, so a
# setting in the user file overrides the same setting in the system file.
# Files that cannot be opened are skipped silently.
# Each line has the form "name = value"; "#" starts a comment.
foreach my $configfile ($systemconfigfile, $userconfigfile) {
    # Three-argument open with a lexical handle: the path can never be
    # interpreted as a mode or pipe specification, and the handle is scoped
    # to this loop iteration.
    open(my $config, '<', $configfile) or next;

    while (my $line = <$config>) {
        chomp $line;
        $line =~ s/#.*//;      # strip comments
        $line =~ s/^\s+//;     # strip leading whitespace
        $line =~ s/\s+$//;     # strip trailing whitespace
        next unless length $line;

        my ($name, $value) = split(/\s*=\s*/, $line, 2);
        if ($name eq "symbols") {
            $symbols = $value;
        }
        elsif (($name eq "help") or ($name eq "h")) {
            # Any mention of help/h enables it; the value is ignored.
            $help = 1;
        }
    }

    close $config;
}
#########################################################

# Command-line options are processed after the configuration files, so they
# overwrite any values set there.
GetOptions(
    "symbols=s" => \$symbols,
    "help|h"    => \$help,
);

# Help requested: print the usage summary and exit successfully.
if ($help) {
    my $usage = <<'END';
Usage: compiledic [OPTIONS] dictionaryfile

Options:
   --help -h                      Help.
   --symbols=FILE                 Symbol file.
END
    print $usage;
    exit 0;
}

##################################################

# Validate the inputs before any work is done.

-f $symbols or die("Symbol file not found.");

@ARGV > 0   or die("Source dictionary not given.");

# The first remaining command-line argument is the source dictionary.
my $file = shift;

-f $file or die("Source dictionary not found.");

# The pattern is anchored so that ".dic" must really be the extension; an
# unanchored match would also accept names such as "foo.dic.bak" or
# "foo.dict" and derive the wrong output prefix from them.
$file =~ /\A(.*)\.dic\z/s or die("The input file must have .dic extension.");

# Dictionary path without the .dic extension; the compiled automaton is
# written to "<prefix>.bin".
my $filenameprefix = $1;

##################################################

# Create a temporary directory in which all intermediate files are placed.
# CLEANUP => 1 asks File::Temp to remove it when the program exits.
my $tmp_root = File::Temp::tempdir( CLEANUP => 1 );

# Reserve a unique *.sym file name in that directory; only the name is used,
# the returned file handle is discarded.
(undef, my $symfile) = File::Temp::tempfile( DIR => $tmp_root, SUFFIX => ".sym" );
my $symfilenoext=$symfile;
$symfilenoext =~ s/\.sym$//;
my $labfile = $symfilenoext . '.lab';
my $sclfile = $symfilenoext . '.scl';

# Work on a private copy of the symbol file. A failed copy would make every
# later step operate on an empty file, so stop here instead.
copy($symbols, $symfile)
    or die "Unable to copy $symbols to $symfile: $!\n";

# Prepare the labels.
# NOTE(review): lexmakelab presumably generates $labfile and $sclfile from
# the copied symbol file -- confirm against the lexmakelab documentation.
`lexmakelab $symfilenoext`;


# Dictionary file preprocessing

print "preparing file...........................................";
(undef, my $file_sed) = File::Temp::tempfile( DIR => $tmp_root, SUFFIX => ".sed" );
{
    # Single-quote both paths for the shell (embedded single quotes are
    # escaped), so that file names containing spaces or shell metacharacters
    # are passed to sed intact.
    my ($src, $dst) = map {
        (my $quoted = $_) =~ s/'/'\\''/g;
        "'$quoted'";
    } ($file, $file_sed);

    # Wrap every punctuation character in square brackets ("." becomes "[.]").
    # NOTE(review): presumably this protects the characters from being read
    # as operators by lexcomplex -- confirm.
    `sed -r "s/([[:punct:]])/\[\\1\]/g" < $src > $dst`;
}

print "OK\n";

# Split the file into many parts, run lexcomplex on each part separately,
# then join the results with fsmunion.
#
# This section only does the splitting: the preprocessed dictionary is
# written to $tmp_root/slo_0 .. $tmp_root/slo_$fileCount.

open(my $in, '<', $file_sed)
    or die "Unable to open $file_sed: $!\n";

my $lineCount = 0;
my $fileCount = 0;    # index of the chunk file currently being written

open(my $chunk, '>', "$tmp_root/slo_$fileCount")
    or die "Unable to create $tmp_root/slo_$fileCount: $!\n";

while (my $line = <$in>) {

    # Once the line counter reaches the limit, switch to a new chunk file;
    # the current line becomes the first line of the new chunk.
    if (++$lineCount >= $linesPerFile) {
        $fileCount++;
        $lineCount = 0;

        close($chunk)
            or die "Unable to write $tmp_root/slo_" . ($fileCount - 1) . ": $!\n";
        open($chunk, '>', "$tmp_root/slo_$fileCount")
            or die "Unable to create $tmp_root/slo_$fileCount: $!\n";
    }

    print {$chunk} $line;
}

# The last chunk must be closed explicitly: its buffered contents have to be
# on disk before the external tools read the chunk files.
close($chunk)
    or die "Unable to write $tmp_root/slo_$fileCount: $!\n";
close($in);

print "building partial automata";

# Progress bar: 32 dots spread over the chunk files (indices 0..fileCount).
my $filesPerDot = $fileCount/32;
my $files=$filesPerDot;
my $dots=0;

foreach my $i (0 .. $fileCount) {

    # Print a dot whenever enough chunks have been processed since the
    # previous one.
    unless ($files < $filesPerDot) {
        print ".";
        $dots++;
        $files = 0;
    }
    $files++;

    # Compile chunk number $i into its own partial automaton.
    `lexcomplex -l $labfile -S $sclfile < $tmp_root/slo_$i > $tmp_root/slownik_$i.fsm`;
}

# Pad the progress bar to its full width of 32 dots.
print "." x (32 - $dots) if $dots < 32;

print "OK\n";

# The chunk files are no longer needed once they have been compiled.
unlink glob("$tmp_root/slo_*");

print "building final automaton";

# Progress bar of 33 dots.
my $ndots=33;
$filesPerDot = $fileCount/$ndots;
$files=$filesPerDot;
$dots=0;

# The running union starts out as the automaton of chunk 0.
copy("$tmp_root/slownik_0.fsm", "$tmp_root/slownik1.fsm")
    or die "Unable to copy $tmp_root/slownik_0.fsm: $!\n";

for (my $i=1; $i<=$fileCount; $i++) {

    if ($files >= $filesPerDot) {
        $files = 0;
        print ".";
        $dots++;
    }
    $files++;

    # Merge partial automaton $i into the running union. Both operands live
    # in $tmp_root, and the partial automata carry the .fsm suffix they were
    # created with.
    `fsmunion $tmp_root/slownik_$i.fsm $tmp_root/slownik1.fsm > $tmp_root/slownik2.fsm`;

    # The merged result becomes the running union for the next iteration.
    move("$tmp_root/slownik2.fsm", "$tmp_root/slownik1.fsm") || die "Unable to move $tmp_root/slownik2.fsm!\n";
}

# Pad the progress bar to its full width.
if ($dots < $ndots) {
    print "." x ($ndots - $dots);
}

print "OK\n";

# Post-processing pipeline. Each stage prints its progress label, runs its
# shell commands in order, deletes the intermediate files it has consumed,
# and then reports OK.
my @stages = (
    {
        label    => "removing epsilon-transitions.............................",
        commands => ["fsmrmepsilon $tmp_root/slownik1.fsm > $tmp_root/slownik2.fsm"],
        consumed => ["$tmp_root/slownik1.fsm"],
    },
    {
        label    => "determinizing automaton..................................",
        commands => ["fsmdeterminize $tmp_root/slownik2.fsm > $tmp_root/slownik1.fsm"],
        consumed => ["$tmp_root/slownik2.fsm"],
    },
    {
        label    => "minimizing automaton.....................................",
        commands => ["fsmminimize $tmp_root/slownik1.fsm > $tmp_root/slownik.fsm"],
        consumed => [],
    },
    {
        # The last command writes the final result next to the source
        # dictionary, as "<prefix>.bin".
        label    => "converting fsm format to bin.............................",
        commands => [
            "fsmprint -i $labfile $tmp_root/slownik.fsm > $tmp_root/slownik.txt",
            "fsm2aut $tmp_root/slownik.txt > $tmp_root/slownik.aut",
            "aut2fsa < $tmp_root/slownik.aut > $filenameprefix.bin",
        ],
        consumed => [],
    },
);

foreach my $stage (@stages) {
    print $stage->{label};

    foreach my $command (@{ $stage->{commands} }) {
        `$command`;
    }

    foreach my $leftover (@{ $stage->{consumed} }) {
        unlink($leftover);
    }

    print "OK\n";
}

print "removing temporary files.................................";

# Delete the files left in the temporary directory, then the directory
# itself. unlink() does not remove directories, so rmdir() is used for the
# latter; its result is deliberately ignored.
unlink <$tmp_root/*>;
rmdir($tmp_root);

print "OK\n";
