Index: src/compdic/compdic-dic-to-fst
===================================================================
--- src/compdic/compdic-dic-to-fst	(revision 555c7f814bf87fa10240e98295372095bc422115)
+++ src/compdic/compdic-dic-to-fst	(revision 555c7f814bf87fa10240e98295372095bc422115)
@@ -0,0 +1,187 @@
+#! /bin/bash
+
+no_of_parts=0
+
+while [ $# -gt 2 ]
+do
+  case $1
+  in
+    -p)
+      no_of_parts=$2
+      shift 2
+    ;;
+
+    *)
+      echo "The arguments to use are"
+      echo "-p: number of parts"
+      shift 1
+    ;;
+  esac
+done
+
+if [ $# -lt 2 ]
+then
+    echo "Usage:"
+    echo "        compdic [-p <parts>] <sourcefile> <fst>"
+    echo "where"
+    echo "    <sourcefile> - file containig a list of words, one per line, iso-8859-2 encoded"
+    echo "    <dict>       - file to which the compiled automaton in openfst format will be written"
+    exit 0
+fi	
+
+
+source=$1
+fst=$2
+
+
+if [ $no_of_parts -eq 0 ]
+then
+    no_of_parts=$(( `cat $1 | wc -l` / 75000 + 1 ))
+fi
+
+
+echo number of parts: $no_of_parts
+
+
+tempdir=`mktemp -d /tmp/compdic.XXXXXX`
+
+alphabet=`tempfile -d $tempdir`
+
+cat <<EOF > $alphabet
+<eps> 0
+a 1
+A 2
+ä 3
+± 4
+¡ 5
+b 6
+B 7
+c 8
+C 9
+æ 10
+Æ 11
+d 12
+D 13
+e 14
+E 15
+é 16
+ê 17
+Ê 18
+f 19
+F 20
+g 21
+G 22
+h 23
+H 24
+i 25
+I 26
+j 27
+J 28
+k 29
+K 30
+l 31
+L 32
+³ 33
+£ 34
+m 35
+M 36
+n 37
+N 38
+ñ 39
+Ñ 40
+o 41
+O 42
+ö 43
+ó 44
+Ó 45
+p 46
+P 47
+q 48
+Q 49
+r 50
+R 51
+s 52
+S 53
+¶ 54
+Š 55
+t 56
+T 57
+u 58
+U 59
+ü 60
+v 61
+V 62
+w 63
+W 64
+x 65
+X 66
+y 67
+Y 68
+z 69
+Z 70
+Œ 71
+¬ 72
+¿ 73
+¯ 74
+0 75
+1 76
+2 77
+3 78
+4 79
+5 80
+6 81
+7 82
+8 83
+9 84
+_ 85
+- 86
+? 87
+! 88
+~ 89
+; 90
+, 91
+/ 92
+* 93
++ 94
+EOF
+
+
+no_of_lines=$(( (`cat $source | wc -l` / $no_of_parts) + 1 ))
+
+split -l $no_of_lines $source $tempdir/part.
+
+automaton=$tempdir/output.fst
+
+cat <<EOF | fstcompile --acceptor --isymbols=$alphabet > $automaton
+EOF
+
+n=0
+
+for f in $tempdir/part.*
+do
+    temp1=`tempfile -d $tempdir`
+    temp2=`tempfile -d $tempdir`
+    temp3=`tempfile -d $tempdir`
+
+    n=$(( $n + 1 ))
+    echo processing part $n
+
+    cat $f |\
+    lst2fstext |\
+    fstcompile --acceptor --isymbols=$alphabet |\
+    fstrmepsilon |\
+    fstdeterminize > $temp1
+    fstminimize $temp1 $temp2
+
+    fstunion $automaton $temp2 | fstrmepsilon | fstdeterminize > $temp3
+    fstminimize $temp3 $automaton
+done
+
+echo generating binary automaton file ...
+
+cat $automaton | fsttopsort > $fst
+rm -r $tempdir
+
+#echo generating cats file ...
+
+#cat $1 | cut -d ',' -f 2 | sort -u > $1.cats
