source: src/compdic/compdic @ c03f8a5

Last change on this file since c03f8a5 was c03f8a5, checked in by Tomasz Obrebski <obrebski@…>, 10 years ago

minor fixes

  • Property mode set to 100755
File size: 2.1 KB
Line 
#!/bin/bash
#
# compdic - compile a word list into a binary dictionary automaton.
#
# Usage:
#   compdic [-p <parts>] <wordlist> <automaton>
#
# The word list is split into <parts> chunks. Each chunk is compiled into a
# minimized acceptor and unioned into an accumulated automaton, which keeps
# the intermediate automata small. The final automaton is converted with
# fsttopsort | fstprint | fsm2aut | aut2fsa and written to <automaton>.
# The sorted, unique values of the second comma-separated field of the word
# list are written to <automaton>.cats.
#
# When -p is omitted (or 0), the number of parts is derived from the size of
# the word list: one part per 75000 lines, plus one.
#
# External commands used: fstcompile, fstrmepsilon, fstdeterminize,
# fstminimize, fstunion, fsttopsort, fstprint, lst2fstext, fsm2aut, aut2fsa.

# Abort on the first failing command, on unset variables and on a failure in
# any pipeline stage, so a broken automaton is never silently written.
set -euo pipefail

# Print an error message to stderr and terminate with a non-zero status.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Print the usage text to stderr and terminate with a non-zero status.
usage() {
  {
    echo "Usage:"
    echo "        compdic [-p <parts>] <wordlist> <automaton>"
    echo "where"
    echo "    <wordlist> - file containing a list of words, one per line, iso-8859-2 encoded"
    echo "    <automaton> - a file to which the compiled automaton (cor/kor format) should be written"
  } >&2
  exit 1
}

# Echo the invocation, as a simple progress/log line.
echo "${0##*/}" "$@"

no_of_parts=0

# Consume leading options; the last two arguments are always the positional
# <wordlist> and <automaton>. Unknown leading arguments are reported and
# skipped.
while (( $# > 2 )); do
  case "$1" in
    -p)
      no_of_parts=$2
      shift 2
      ;;
    *)
      echo "The arguments to use are" >&2
      echo "-p: number of parts" >&2
      shift 1
      ;;
  esac
done

(( $# >= 2 )) || usage

wordlist=$1
output=$2

[[ $no_of_parts =~ ^[0-9]+$ ]] \
  || die "number of parts must be a non-negative integer: $no_of_parts"
# Force base 10 so that a value such as "08" is not rejected as bad octal.
no_of_parts=$(( 10#$no_of_parts ))

[[ -r $wordlist ]] || die "cannot read word list: $wordlist"

# Count the lines once; the value drives both the default part count and the
# chunk size handed to split.
total_lines=$(wc -l < "$wordlist")

if (( no_of_parts == 0 )); then
  no_of_parts=$(( total_lines / 75000 + 1 ))
fi

echo "number of parts: $no_of_parts"

# All intermediate files live in a private directory that is removed on every
# exit path, including failures.
tempdir=$(mktemp -d "${TMPDIR:-/tmp}/compdic.XXXXXX") \
  || die "cannot create temporary directory"
cleanup() { rm -rf -- "$tempdir"; }
trap cleanup EXIT

alphabet=$tempdir/alphabet.syms

# Symbol table passed to the fst tools via --isymbols ("<symbol> <id>" lines).
# NOTE(review): the non-ASCII symbols below presumably have to be stored in
# the same single-byte encoding as the word list (iso-8859-2 according to the
# usage text); they may have been transcoded in this copy of the file -
# verify against the repository before use.
cat > "$alphabet" <<'EOF'
<eps> 0
a 1
A 2
ä 3
± 4
¡ 5
b 6
B 7
c 8
C 9
æ 10
Æ 11
d 12
D 13
e 14
E 15
é 16
ê 17
Ê 18
f 19
F 20
g 21
G 22
h 23
H 24
i 25
I 26
j 27
J 28
k 29
K 30
l 31
L 32
³ 33
£ 34
m 35
M 36
n 37
N 38
ñ 39
Ñ 40
o 41
O 42
ö 43
ó 44
Ó 45
p 46
P 47
q 48
Q 49
r 50
R 51
s 52
S 53
¶ 54
Š 55
t 56
T 57
u 58
U 59
ü 60
v 61
V 62
w 63
W 64
x 65
X 66
y 67
Y 68
z 69
Z 70
Œ 71
¬ 72
¿ 73
¯ 74
0 75
1 76
2 77
3 78
4 79
5 80
6 81
7 82
8 83
9 84
_ 85
- 86
? 87
! 88
~ 89
; 90
, 91
/ 92
* 93
+ 94
Ö 95
EOF

# The "+ 1" keeps the chunk size at least 1 even for an empty word list.
no_of_lines=$(( total_lines / no_of_parts + 1 ))

split -l "$no_of_lines" -- "$wordlist" "$tempdir/part."

automaton=$tempdir/output.fst

# Start from an automaton compiled from empty input; each part is unioned
# into it below.
fstcompile --acceptor --isymbols="$alphabet" < /dev/null > "$automaton"

# Scratch files reused by every iteration (none of them matches "part.*").
det_fst=$tempdir/stage-det.fst
min_fst=$tempdir/stage-min.fst
union_fst=$tempdir/stage-union.fst

n=0

for f in "$tempdir"/part.*; do
  # An empty word list produces no parts; skip the unexpanded glob pattern.
  [[ -e $f ]] || continue

  n=$(( n + 1 ))
  echo "processing part $n"

  # Compile this part into a deterministic, minimized acceptor.
  lst2fstext < "$f" \
    | fstcompile --acceptor --isymbols="$alphabet" \
    | fstrmepsilon \
    | fstdeterminize > "$det_fst"
  fstminimize "$det_fst" "$min_fst"

  # Merge it into the accumulated automaton and minimize again.
  fstunion "$automaton" "$min_fst" \
    | fstrmepsilon \
    | fstdeterminize > "$union_fst"
  fstminimize "$union_fst" "$automaton"
done

echo "generating binary automaton file ..."

fsttopsort < "$automaton" \
  | fstprint --acceptor --isymbols="$alphabet" \
  | fsm2aut \
  | aut2fsa > "$output"

echo "generating cats file ..."

cut -d ',' -f 2 < "$wordlist" | sort -u > "$output.cats"
Note: See TracBrowser for help on using the repository browser.