source: src/compdic/compdic-dic-to-fst @ c03f8a5

Last change on this file since c03f8a5 was acbabee, checked in by Tomasz Obrebski <obrebski@…>, 10 years ago

many changes, mainly dgp1 algorithm

  • Property mode set to 100755
File size: 2.1 KB
RevLine 
#!/bin/bash
#
# Compiles a word list into a single automaton file: the list is split
# into parts, each part is compiled with the OpenFst command-line tools,
# and the parts are merged one by one into the output.
#
# Options:
#   -p <parts>   number of parts to split the input into
#                (0 or omitted: derived from the input size)

# Number of parts requested with -p; 0 means "work it out from the input".
no_of_parts=0
# Lines per part used when the number of parts is derived automatically.
part_size=100000

# Consume leading options for as long as more than the two mandatory
# operands (<sourcefile> <fst>) remain on the command line.
while [ $# -gt 2 ]
do
    case "$1" in
        -p)
            # Reject anything that is not a plain decimal number up front;
            # otherwise the later arithmetic fails with a confusing error.
            if ! [[ "$2" =~ ^[0-9]+$ ]]
            then
                echo "compdic: -p expects a non-negative integer, got '$2'" >&2
                exit 1
            fi
            # Force base 10 so that a value such as 010 is not read as octal.
            no_of_parts=$(( 10#$2 ))
            shift 2
            ;;

        *)
            # Unknown argument: print a hint and skip it.
            echo "The arguments to use are" >&2
            echo "-p: number of parts" >&2
            shift 1
            ;;
    esac
done

# Both operands (<sourcefile> and <fst>) are mandatory. Without them,
# print the usage text on stderr and signal the misuse with a non-zero
# exit status so that calling scripts and makefiles can detect it.
if [ $# -lt 2 ]
then
    {
        echo "Usage:"
        echo "        compdic [-p <parts>] <sourcefile> <fst>"
        echo "where"
        echo "    <sourcefile> - file containing a list of words, one per line, iso-8859-2 encoded"
        echo "    <fst>        - file to which the compiled automaton in openfst format will be written"
    } >&2
    exit 1
fi


# Mandatory operands: the input word list and the output automaton file.
source=$1
fst=$2

# Fail early if the word list cannot be read: its line count is used in
# the arithmetic below, and an empty count would abort with a syntax error.
if [ ! -r "$source" ]
then
    echo "compdic: cannot read source file '$source'" >&2
    exit 1
fi

# No explicit part count was given: use one part per $part_size input
# lines. The "+ 1" guarantees at least one part for short inputs.
if [ "$no_of_parts" -eq 0 ]
then
    no_of_parts=$(( $(wc -l < "$source") / part_size + 1 ))
fi


echo "number of parts: $no_of_parts"


# Private scratch directory for all intermediate files.
tempdir=$(mktemp -d /tmp/compdic.XXXXXX) || exit 1
# Remove the scratch directory when the script exits, including early
# exits caused by errors.
trap 'rm -rf -- "$tempdir"' EXIT

# Symbol table file, passed to fstcompile via --isymbols further down.
# mktemp replaces the deprecated Debian-only `tempfile` utility.
alphabet=$(mktemp "$tempdir/alphabet.XXXXXX") || exit 1

# One "<symbol> <label>" pair per line; <eps> takes label 0.
# NOTE(review): the usage text says the word list is iso-8859-2 encoded,
# so the non-ASCII symbols below presumably have to be stored in that same
# encoding to match the input - confirm this file's on-disk encoding.
# The delimiter is quoted so nothing in the table is ever expanded.
cat <<'EOF' > "$alphabet"
<eps> 0
a 1
A 2
ä 3
± 4
¡ 5
b 6
B 7
c 8
C 9
æ 10
Æ 11
d 12
D 13
e 14
E 15
é 16
ê 17
Ê 18
f 19
F 20
g 21
G 22
h 23
H 24
i 25
I 26
j 27
J 28
k 29
K 30
l 31
L 32
³ 33
£ 34
m 35
M 36
n 37
N 38
ñ 39
Ñ 40
o 41
O 42
ö 43
ó 44
Ó 45
p 46
P 47
q 48
Q 49
r 50
R 51
s 52
S 53
¶ 54
Š 55
t 56
T 57
u 58
U 59
ü 60
v 61
V 62
w 63
W 64
x 65
X 66
y 67
Y 68
z 69
Z 70
Œ 71
¬ 72
¿ 73
¯ 74
0 75
1 76
2 77
3 78
4 79
5 80
6 81
7 82
8 83
9 84
_ 85
- 86
? 87
! 88
~ 89
; 90
, 91
/ 92
* 93
+ 94
Ö 95
EOF


# Lines per part. The "+ 1" rounds up so that split never produces more
# than $no_of_parts files.
no_of_lines=$(( $(wc -l < "$source") / no_of_parts + 1 ))

# Cut the word list into $tempdir/part.aa, part.ab, ...
if ! split -l "$no_of_lines" "$source" "$tempdir/part."
then
    echo "compdic: failed to split '$source'" >&2
    exit 1
fi

# Accumulator automaton into which every part is merged.
automaton=$tempdir/output.fst

# Start the accumulator as the acceptor compiled from empty input.
fstcompile --acceptor --isymbols="$alphabet" < /dev/null > "$automaton"

# Counter used only for the progress messages.
n=0

# Scratch files reused by every iteration (created once instead of three
# new files per part). The names must not match the part.* glob below.
part_det=$(mktemp "$tempdir/tmp.XXXXXX") || exit 1
part_min=$(mktemp "$tempdir/tmp.XXXXXX") || exit 1
merged_det=$(mktemp "$tempdir/tmp.XXXXXX") || exit 1

for f in "$tempdir"/part.*
do
    # If no part file exists the glob stays literal; skip it.
    [ -e "$f" ] || continue

    n=$(( n + 1 ))
    echo "processing part $n"

    # 1. Compile this part into its own determinized, minimized acceptor.
    # 2. Union it with the accumulator, then determinize and minimize the
    #    result back into the accumulator.
    # Each step runs only if the previous one succeeded, so a failure can
    # no longer overwrite the accumulator with garbage. Only the last stage
    # of each pipeline is checked (pipefail is deliberately not enabled).
    if ! {
        lst2fstext < "$f" |
            fstcompile --acceptor --isymbols="$alphabet" |
            fstrmepsilon |
            fstdeterminize > "$part_det" &&
        fstminimize "$part_det" "$part_min" &&
        fstunion "$automaton" "$part_min" |
            fstrmepsilon |
            fstdeterminize > "$merged_det" &&
        fstminimize "$merged_det" "$automaton"
    }
    then
        echo "compdic: failed to process part $n ($f)" >&2
        rm -rf -- "$tempdir"
        exit 1
    fi
done

echo "generating binary automaton file ..."

# Topologically sort the merged automaton into the requested output file.
# Report a failure instead of leaving a broken output file unnoticed.
if ! fsttopsort < "$automaton" > "$fst"
then
    echo "compdic: fsttopsort failed, '$fst' is not usable" >&2
    rm -rf -- "$tempdir"
    exit 1
fi

# Drop the scratch directory with all intermediate files.
rm -r -- "$tempdir"

#echo generating cats file ...

#cat $1 | cut -d ',' -f 2 | sort -u > $1.cats
Note: See TracBrowser for help on using the repository browser.