source: src/compdic/compdic @ f4bf33e

Last change on this file since f4bf33e was abd28d1, checked in by Tomasz Obrebski <to@…>, 13 years ago

fixed further bugs around compdic
cats file renamed and moved
config files updated

  • Property mode set to 100755
File size: 2.1 KB
RevLine 
#!/bin/bash
#
# compdic - compile a word list into a binary automaton file.
#
# Usage: compdic [-p <parts>] <wordlist> <automaton>
#
# NOTE(review): line 1 of the listing is empty; the interpreter line above is
# assumed to have been lost when the page was extracted -- confirm against the
# repository copy.

# Number of chunks the word list is processed in.  0 means "not given on the
# command line"; the script then derives a value from the word list size.
no_of_parts=0

# Consume leading options.  The loop stops as soon as at most two arguments
# (<wordlist> <automaton>) are left, so those are never mistaken for options.
while [ $# -gt 2 ]
do
  case "$1" in
    -p)
      # The value is used in integer tests and arithmetic later on, so
      # reject anything that is not a plain non-negative integer here.
      case "$2" in
        '' | *[!0-9]*)
          echo "compdic: -p requires a non-negative integer, got '$2'" >&2
          exit 1
        ;;
      esac
      no_of_parts=$2
      shift 2
    ;;

    *)
      # Unknown argument: report it on stderr and drop it, then keep parsing.
      echo "The arguments to use are" >&2
      echo "-p: number of parts" >&2
      shift 1
    ;;
  esac
done

# Both positional arguments are required.  Without them, print the calling
# convention on stderr and fail, so callers (make, other scripts) can tell
# that nothing was compiled.
if [ $# -lt 2 ]
then
    {
        echo "Usage:"
        echo "        compdic [-p <parts>] <wordlist> <automaton>"
        echo "where"
        echo "    <wordlist> - file containig a list of words, one per line, iso-8859-2 encoded"
        echo "    <automaton> - a file to which the compiled automaton (cor/kor format) shoul be written"
    } >&2
    exit 1
fi

# Every later step reads the word list; stop early with a clear message
# instead of letting wc/split/cut fail one after another.
if [ ! -r "$1" ]
then
    echo "compdic: cannot read word list '$1'" >&2
    exit 1
fi

# No explicit part count was given: derive one from the size of the word
# list -- one part per 75000 lines (integer division), plus one.
if [ "$no_of_parts" -eq 0 ]
then
    # Reading via redirection makes wc print the bare count without a file
    # name; bail out if the word list cannot be read at all.
    wordlist_lines=$(wc -l < "$1") || exit 1
    no_of_parts=$(( wordlist_lines / 75000 + 1 ))
fi

echo "number of parts: $no_of_parts"

# Private scratch directory holding every intermediate file of this run.
tempdir=$(mktemp -d "${TMPDIR:-/tmp}/compdic.XXXXXX") || {
    echo "compdic: cannot create temporary directory" >&2
    exit 1
}
# Remove the scratch directory on every exit path, including early failures.
trap 'rm -rf -- "$tempdir"' EXIT

# Symbol table file passed to the fst* tools through --isymbols.
# mktemp is used instead of tempfile(1), which is Debian-specific and marked
# deprecated in favour of mktemp by its own manual page.
alphabet=$(mktemp "$tempdir/alphabet.XXXXXX") || {
    echo "compdic: cannot create alphabet file" >&2
    exit 1
}

# One "<symbol> <id>" pair per line; id 0 is the epsilon symbol.
# The delimiter is quoted so the table is written out literally.
# NOTE(review): the non-ASCII symbols have to be stored in the same
# single-byte encoding as the word list (the usage text says iso-8859-2);
# verify the on-disk bytes of these lines against the repository copy.
cat <<'EOF' > "$alphabet"
<eps> 0
a 1
A 2
ä 3
± 4
¡ 5
b 6
B 7
c 8
C 9
æ 10
Æ 11
d 12
D 13
e 14
E 15
é 16
ê 17
Ê 18
f 19
F 20
g 21
G 22
h 23
H 24
i 25
I 26
j 27
J 28
k 29
K 30
l 31
L 32
³ 33
£ 34
m 35
M 36
n 37
N 38
ñ 39
Ñ 40
o 41
O 42
ö 43
ó 44
Ó 45
p 46
P 47
q 48
Q 49
r 50
R 51
s 52
S 53
¶ 54
Š 55
t 56
T 57
u 58
U 59
ü 60
v 61
V 62
w 63
W 64
x 65
X 66
y 67
Y 68
z 69
Z 70
Œ 71
¬ 72
¿ 73
¯ 74
0 75
1 76
2 77
3 78
4 79
5 80
6 81
7 82
8 83
9 84
_ 85
- 86
? 87
! 88
~ 89
; 90
, 91
/ 92
* 93
+ 94
EOF

# Lines per chunk: word list length divided by the number of parts (integer
# division), plus one so the chunk size is never zero.
no_of_lines=$(( $(wc -l < "$1") / no_of_parts + 1 ))

# Cut the word list into $tempdir/part.aa, part.ab, ...
split -l "$no_of_lines" -- "$1" "$tempdir/part."

# Accumulated result.  It starts out as the automaton compiled from empty
# input and has each compiled part merged into it afterwards.
automaton=$tempdir/output.fst

fstcompile --acceptor --isymbols="$alphabet" < /dev/null > "$automaton"

# Counter used only for the progress messages.
n=0

# Scratch files for the intermediate automata.  They are created once and
# overwritten by each iteration rather than allocating three new files per
# part.  The names do not match the part.* glob below.
temp1=$(mktemp "$tempdir/tmp.XXXXXX") || exit 1
temp2=$(mktemp "$tempdir/tmp.XXXXXX") || exit 1
temp3=$(mktemp "$tempdir/tmp.XXXXXX") || exit 1

for f in "$tempdir"/part.*
do
    # When split produced no chunks the pattern is left unexpanded; skip it
    # instead of feeding a non-existent file to the pipeline.
    [ -e "$f" ] || continue

    n=$(( n + 1 ))
    echo "processing part $n"

    # Compile this chunk: lst2fstext output is fed to fstcompile as an
    # acceptor over the alphabet, epsilons are removed, then the result is
    # determinized and minimized.
    lst2fstext < "$f" |
        fstcompile --acceptor --isymbols="$alphabet" |
        fstrmepsilon |
        fstdeterminize > "$temp1"
    fstminimize "$temp1" "$temp2" || {
        echo "compdic: failed to compile part $n" >&2
        exit 1
    }

    # Merge the chunk into the accumulated automaton and minimize again.
    fstunion "$automaton" "$temp2" | fstrmepsilon | fstdeterminize > "$temp3"
    fstminimize "$temp3" "$automaton" || {
        echo "compdic: failed to merge part $n" >&2
        exit 1
    }
done

echo "generating binary automaton file ..."

# Topologically sort the accumulated automaton, print it as text using the
# alphabet, and pass it through fsm2aut and aut2fsa into <automaton>.
fsttopsort < "$automaton" |
    fstprint --acceptor --isymbols="$alphabet" |
    fsm2aut |
    aut2fsa > "$2"

# The intermediate files are no longer needed.  ${tempdir:?} aborts instead
# of running rm on an empty path should the variable be unset.
rm -r -- "${tempdir:?}"

echo "generating cats file ..."

# Collect the distinct values of the second comma-separated field of the
# word list into <automaton>.cats.  The output redirection was missing, so
# sort read "$2.cats" as its input and the file was never written.
cut -d ',' -f 2 < "$1" | sort -u > "$2.cats"
Note: See TracBrowser for help on using the repository browser.