#! /bin/bash

# Defaults: part_size is the target number of words per part when the part
# count is chosen automatically; no_of_parts=0 means "choose automatically".
part_size=100000
no_of_parts=0

# Consume leading arguments until only the two positional ones remain.
# "-p <n>" sets the number of parts; anything else prints a hint and is
# dropped.
while (( $# > 2 )); do
  if [[ "$1" == "-p" ]]; then
    no_of_parts="$2"
    shift 2
  else
    printf '%s\n' "The arguments to use are" "-p: number of parts"
    shift
  fi
done

# Both positional arguments (<sourcefile> and <fst>) must remain after option
# parsing. Otherwise print the usage summary to stderr and exit with a
# non-zero status so that calling scripts can detect the failure.
if [ $# -lt 2 ]
then
    {
        echo "Usage:"
        echo "        compdic [-p <parts>] <sourcefile> <fst>"
        echo "where"
        echo "    <sourcefile> - file containing a list of words, one per line, iso-8859-2 encoded"
        echo "    <fst>        - file to which the compiled automaton in openfst format will be written"
    } >&2
    exit 1
fi


# Positional arguments: the word list to compile and the output automaton path.
source=$1
fst=$2

# Fail early with a clear message instead of continuing with a line count of 0.
if [ ! -r "$source" ]
then
    echo "compdic: cannot read source file: $source" >&2
    exit 1
fi

# No part count requested (or 0 given): choose one so that each part holds
# at most roughly $part_size lines.
if [ "$no_of_parts" -eq 0 ]
then
    no_of_parts=$(( $(wc -l < "$source") / part_size + 1 ))
fi


echo "number of parts: $no_of_parts"


# Private scratch directory for the split parts and intermediate automata.
tempdir=$(mktemp -d /tmp/compdic.XXXXXX) || {
    echo "compdic: cannot create temporary directory" >&2
    exit 1
}
# Remove the scratch directory whenever the script exits, including early
# exits caused by errors. -f keeps this silent if it was already removed.
trap 'rm -rf -- "$tempdir"' EXIT

# File that receives the symbol table. mktemp is used because the Debian
# "tempfile" utility is deprecated and not available on many systems.
alphabet=$(mktemp "$tempdir/alphabet.XXXXXX") || {
    echo "compdic: cannot create temporary file in $tempdir" >&2
    exit 1
}

cat <<EOF > $alphabet
<eps> 0
a 1
A 2
 3
 4
 5
b 6
B 7
c 8
C 9
 10
 11
d 12
D 13
e 14
E 15
 16
 17
 18
f 19
F 20
g 21
G 22
h 23
H 24
i 25
I 26
j 27
J 28
k 29
K 30
l 31
L 32
 33
 34
m 35
M 36
n 37
N 38
 39
 40
o 41
O 42
 43
 44
 45
p 46
P 47
q 48
Q 49
r 50
R 51
s 52
S 53
 54
 55
t 56
T 57
u 58
U 59
 60
v 61
V 62
w 63
W 64
x 65
X 66
y 67
Y 68
z 69
Z 70
 71
 72
 73
 74
0 75
1 76
2 77
3 78
4 79
5 80
6 81
7 82
8 83
9 84
_ 85
- 86
? 87
! 88
~ 89
; 90
, 91
/ 92
* 93
+ 94
 95 
EOF


# Lines per part: the +1 rounds up so that split never produces more than
# $no_of_parts files.
no_of_lines=$(( $(wc -l < "$source") / no_of_parts + 1 ))

# Cut the word list into $tempdir/part.aa, part.ab, ...
split -l "$no_of_lines" -- "$source" "$tempdir/part."

automaton=$tempdir/output.fst

# Seed the accumulated automaton by compiling empty input; each part is
# later unioned into this file.
fstcompile --acceptor --isymbols="$alphabet" < /dev/null > "$automaton"

# Number of parts processed so far (used only for progress output).
n=0

# Compile every part into a minimal deterministic acceptor and fold it into
# the accumulated automaton.
for f in "$tempdir"/part.*
do
    # An unmatched glob stays as the literal pattern; skip it rather than
    # feeding a non-existent file to the pipeline.
    [ -e "$f" ] || continue

    # mktemp replaces the deprecated, Debian-only "tempfile" utility.
    temp1=$(mktemp "$tempdir/tmp.XXXXXX") || exit 1
    temp2=$(mktemp "$tempdir/tmp.XXXXXX") || exit 1
    temp3=$(mktemp "$tempdir/tmp.XXXXXX") || exit 1

    n=$(( n + 1 ))
    echo "processing part $n"

    # Word list -> textual FST -> binary acceptor -> epsilon-free,
    # deterministic automaton.
    lst2fstext < "$f" |
    fstcompile --acceptor --isymbols="$alphabet" |
    fstrmepsilon |
    fstdeterminize > "$temp1"
    fstminimize "$temp1" "$temp2"

    # Merge this part into the running result and re-minimize it.
    fstunion "$automaton" "$temp2" | fstrmepsilon | fstdeterminize > "$temp3"
    fstminimize "$temp3" "$automaton"

    # Intermediate files are no longer needed; free the space now instead
    # of letting three files per part pile up until the end.
    rm -f -- "$temp1" "$temp2" "$temp3"
done

echo "generating binary automaton file ..."

# Topologically sort the final automaton (read from stdin, as before) and
# write it to the requested output path.
fsttopsort < "$automaton" > "$fst"
# ${tempdir:?} aborts instead of running rm on an empty path.
rm -r -- "${tempdir:?}"

#echo generating cats file ...

#cat $1 | cut -d ',' -f 2 | sort -u > $1.cats
