source: src/compdic/compdic-dic-to-fst @ 0a58b3f

Last change on this file since 0a58b3f was f600a02, checked in by Tomasz Obrebski <obrebski@…>, 11 years ago

Fixed bugs in the build and installation process; lem.bin and gram.dgp are now built during compilation

  • Property mode set to 100755
File size: 2.1 KB
Line 
#!/bin/bash
#
# Compile a word list into a single minimized acceptor in OpenFst format.
#
# Usage:
#   compdic [-p <parts>] <sourcefile> <fst>
#
# The word list is split into <parts> chunks. Each chunk is compiled,
# determinized and minimized on its own, then unioned into the automaton
# accumulated so far, which is determinized and minimized again. When -p is
# not given (or is 0) one part is used per 75000 input lines.
#
# External tools that must be on PATH:
#   lst2fstext fstcompile fstrmepsilon fstdeterminize fstminimize
#   fstunion fsttopsort
#
# Exit status: 0 on success, non-zero on a usage error or when any step fails.

# Abort on the first failing command, on use of an unset variable, and when
# any stage of a pipeline fails, so a broken automaton is never written quietly.
set -euo pipefail

# Print an error message to stderr and terminate with a failure status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the usage text to stderr and terminate with a failure status.
usage() {
  {
    echo "Usage:"
    echo "        compdic [-p <parts>] <sourcefile> <fst>"
    echo "where"
    echo "    -p <parts>   - number of parts the source is split into (default: one per 75000 lines)"
    echo "    <sourcefile> - file containing a list of words, one per line, iso-8859-2 encoded"
    echo "    <fst>        - file to which the compiled automaton in openfst format will be written"
  } >&2
  exit 1
}

no_of_parts=0

# Everything before the last two arguments is treated as options. Anything
# other than "-p <n>" is reported and skipped, as it always has been.
while [[ $# -gt 2 ]]; do
  case "$1" in
    -p)
      no_of_parts=$2
      shift 2
      ;;
    *)
      echo "The arguments to use are" >&2
      echo "-p: number of parts" >&2
      shift 1
      ;;
  esac
done

if [[ $# -lt 2 ]]; then
  usage
fi

source=$1
fst=$2

[[ "$no_of_parts" =~ ^[0-9]+$ ]] \
  || die "compdic: -p expects a non-negative integer, got '$no_of_parts'"
# Force base 10 so that a value such as "08" is not parsed as octal.
no_of_parts=$((10#$no_of_parts))

[[ -r "$source" ]] || die "compdic: cannot read source file '$source'"

for tool in lst2fstext fstcompile fstrmepsilon fstdeterminize fstminimize \
  fstunion fsttopsort; do
  command -v "$tool" > /dev/null || die "compdic: required tool '$tool' not found in PATH"
done

# Count the input lines once; the value drives both the default number of
# parts and the size of each part.
total_lines=$(wc -l < "$source")

if [[ "$no_of_parts" -eq 0 ]]; then
  no_of_parts=$((total_lines / 75000 + 1))
fi

echo "number of parts: $no_of_parts"

tempdir=$(mktemp -d "${TMPDIR:-/tmp}/compdic.XXXXXX")
# Remove the scratch directory on every exit path, including failures.
trap 'rm -rf -- "$tempdir"' EXIT

# The scratch directory is private to this run, so fixed names are safe here.
# None of them starts with "part." so they cannot be picked up by the loop.
alphabet=$tempdir/alphabet.syms
automaton=$tempdir/output.fst
part_det=$tempdir/current-det.fst
part_min=$tempdir/current-min.fst
merged_det=$tempdir/merged-det.fst

# Symbol table handed to fstcompile via --isymbols: one "<symbol> <id>" pair
# per line.
# NOTE(review): the non-ASCII symbols below presumably have to be stored in
# the same single-byte encoding as the word list (the usage text says
# iso-8859-2) -- confirm this file's encoding is preserved when editing.
cat << 'EOF' > "$alphabet"
<eps> 0
a 1
A 2
ä 3
± 4
¡ 5
b 6
B 7
c 8
C 9
æ 10
Æ 11
d 12
D 13
e 14
E 15
é 16
ê 17
Ê 18
f 19
F 20
g 21
G 22
h 23
H 24
i 25
I 26
j 27
J 28
k 29
K 30
l 31
L 32
³ 33
£ 34
m 35
M 36
n 37
N 38
ñ 39
Ñ 40
o 41
O 42
ö 43
ó 44
Ó 45
p 46
P 47
q 48
Q 49
r 50
R 51
s 52
S 53
¶ 54
Š 55
t 56
T 57
u 58
U 59
ü 60
v 61
V 62
w 63
W 64
x 65
X 66
y 67
Y 68
z 69
Z 70
Œ 71
¬ 72
¿ 73
¯ 74
0 75
1 76
2 77
3 78
4 79
5 80
6 81
7 82
8 83
9 84
_ 85
- 86
? 87
! 88
~ 89
; 90
, 91
/ 92
* 93
+ 94
Ö 95
EOF

# Lines per part; the "+ 1" keeps split from producing more than
# no_of_parts files when the division is not exact.
no_of_lines=$((total_lines / no_of_parts + 1))

split -l "$no_of_lines" -- "$source" "$tempdir/part."

# Start from an automaton compiled from empty input; every part is unioned
# into it in turn.
fstcompile --acceptor --isymbols="$alphabet" < /dev/null > "$automaton"

# With an empty source, split creates no files; let the glob expand to
# nothing instead of to the literal pattern.
shopt -s nullglob

n=0
for part in "$tempdir"/part.*; do
  n=$((n + 1))
  echo "processing part $n"

  # Compile this part on its own and shrink it before merging.
  lst2fstext < "$part" \
    | fstcompile --acceptor --isymbols="$alphabet" \
    | fstrmepsilon \
    | fstdeterminize > "$part_det"
  fstminimize "$part_det" "$part_min"

  # Merge the part into the accumulated automaton and shrink the result.
  fstunion "$automaton" "$part_min" \
    | fstrmepsilon \
    | fstdeterminize > "$merged_det"
  fstminimize "$merged_det" "$automaton"
done

echo "generating binary automaton file ..."

fsttopsort < "$automaton" > "$fst"

# Disabled in the original script; kept for reference.
#echo generating cats file ...

#cat $1 | cut -d ',' -f 2 | sort -u > $1.cats
Note: See TracBrowser for help on using the repository browser.