source: src/compdic/compdic-dic-to-fst @ e0cd003

Last change on this file since e0cd003 was 555c7f8, checked in by Tomasz Obrebski <to@…>, 12 years ago

dodanie możliwości szybkiego uaktualniania słownika dla lema

  • Property mode set to 100755
File size: 2.1 KB
RevLine 
#! /bin/bash
#
# Compile a word list into a finite-state automaton in OpenFst format.
#
# Usage: compdic [-p <parts>] <sourcefile> <fst>
#
# Options are only looked for while more than two arguments remain, so the
# last two arguments are always taken as <sourcefile> and <fst>.

# 0 means "not given": the number of parts is then derived from the size
# of the source file further down.
no_of_parts=0

while [ $# -gt 2 ]
do
  case "$1" in
    -p)
      # Reject anything that is not a plain non-negative integer; otherwise
      # the numeric test and the arithmetic on no_of_parts fail later on.
      case "$2" in
        ''|*[!0-9]*)
          echo "compdic: -p expects a non-negative integer, got '$2'" >&2
          exit 1
        ;;
      esac
      # Force base 10 so that a value such as 08 is not parsed as octal.
      no_of_parts=$(( 10#$2 ))
      shift 2
    ;;

    *)
      # Unknown argument: complain, drop it and keep scanning.
      echo "The arguments to use are" >&2
      echo "-p: number of parts" >&2
      shift 1
    ;;
  esac
done
21
# Both <sourcefile> and <fst> must remain once the options are consumed.
# The usage text goes to stderr and the exit status is non-zero, so that a
# calling script or Makefile notices that nothing was compiled.
if [ $# -lt 2 ]
then
    {
        echo "Usage:"
        echo "        compdic [-p <parts>] <sourcefile> <fst>"
        echo "where"
        echo "    <sourcefile> - file containing a list of words, one per line, iso-8859-2 encoded"
        echo "    <fst>        - file to which the compiled automaton in openfst format will be written"
    } >&2
    exit 1
fi
31
32
# Positional arguments: input word list and output automaton file.
source=$1
fst=$2

# Stop early with a clear message instead of letting every later step fail.
if [ ! -r "$source" ]
then
    echo "compdic: cannot read source file: $source" >&2
    exit 1
fi

# No explicit part count: use one part per 75000 lines (integer division,
# plus one so that there is always at least one part).
if [ "${no_of_parts:-0}" -eq 0 ]
then
    no_of_parts=$(( $(wc -l < "$source") / 75000 + 1 ))
fi


echo "number of parts: $no_of_parts"
44
45
# Private scratch directory for all intermediate files.
tempdir=$(mktemp -d "${TMPDIR:-/tmp}/compdic.XXXXXX") || {
    echo "compdic: cannot create temporary directory" >&2
    exit 1
}
# Remove the scratch directory on every exit path, including failures.
trap 'rm -rf -- "$tempdir"' EXIT

# The directory is private, so a fixed name is safe here and avoids the
# deprecated, Debian-specific tempfile(1).
alphabet=$tempdir/alphabet.syms

# Symbol table passed to fstcompile via --isymbols: one "<symbol> <id>" pair
# per line, with <eps> as id 0.
# NOTE(review): the usage text says the word list is iso-8859-2 encoded, so
# the non-ASCII symbols below presumably have to stay in that encoding to
# match the input - do not re-encode them; confirm against the original file.
cat <<'EOF' > "$alphabet"
<eps> 0
a 1
A 2
ä 3
± 4
¡ 5
b 6
B 7
c 8
C 9
æ 10
Æ 11
d 12
D 13
e 14
E 15
é 16
ê 17
Ê 18
f 19
F 20
g 21
G 22
h 23
H 24
i 25
I 26
j 27
J 28
k 29
K 30
l 31
L 32
³ 33
£ 34
m 35
M 36
n 37
N 38
ñ 39
Ñ 40
o 41
O 42
ö 43
ó 44
Ó 45
p 46
P 47
q 48
Q 49
r 50
R 51
s 52
S 53
¶ 54
Š 55
t 56
T 57
u 58
U 59
ü 60
v 61
V 62
w 63
W 64
x 65
X 66
y 67
Y 68
z 69
Z 70
Œ 71
¬ 72
¿ 73
¯ 74
0 75
1 76
2 77
3 78
4 79
5 80
6 81
7 82
8 83
9 84
_ 85
- 86
? 87
! 88
~ 89
; 90
, 91
/ 92
* 93
+ 94
EOF
147
148
# Lines per part: integer quotient plus one, which guarantees that split
# produces no more than $no_of_parts files.
no_of_lines=$(( $(wc -l < "$source") / no_of_parts + 1 ))

split -l "$no_of_lines" -- "$source" "$tempdir/part." || {
    echo "compdic: cannot split $source" >&2
    exit 1
}

# Accumulator that every part is unioned into.
automaton=$tempdir/output.fst

# Seed the accumulator with the automaton compiled from empty text input.
fstcompile --acceptor --isymbols="$alphabet" < /dev/null > "$automaton" || {
    echo "compdic: cannot create the initial automaton" >&2
    exit 1
}
157
# Counter used only for progress and error messages.
n=0

# Make a pipeline fail when any of its stages fails, not just the last one.
set -o pipefail

# Report a failed step, drop the scratch directory and stop: a partially
# built automaton must not be written out as if it were complete.
abort_part() {
    echo "compdic: $1 failed for part $n" >&2
    rm -rf -- "${tempdir:?}"
    exit 1
}

# Intermediate files are reused for every part instead of creating three
# new ones per iteration; the scratch directory is private, so fixed names
# are safe.
temp1=$tempdir/part-determinized.fst
temp2=$tempdir/part-minimized.fst
temp3=$tempdir/union-determinized.fst

for f in "$tempdir"/part.*
do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$f" ] || continue

    n=$(( n + 1 ))
    echo "processing part $n"

    # Word list -> text FST -> binary acceptor -> minimal deterministic
    # automaton for this part.
    lst2fstext < "$f" \
        | fstcompile --acceptor --isymbols="$alphabet" \
        | fstrmepsilon \
        | fstdeterminize > "$temp1" || abort_part "compiling"
    fstminimize "$temp1" "$temp2" || abort_part "minimizing"

    # Fold the part into the accumulated automaton and minimize again.
    fstunion "$automaton" "$temp2" \
        | fstrmepsilon \
        | fstdeterminize > "$temp3" || abort_part "merging"
    fstminimize "$temp3" "$automaton" || abort_part "minimizing the merged automaton"
done
179
echo "generating binary automaton file ..."

# Write the topologically sorted automaton to the requested output file and
# remember whether that worked before cleaning up.
fsttopsort < "$automaton" > "$fst"
status=$?

# ${tempdir:?} aborts instead of running rm with an empty path.
rm -rf -- "${tempdir:?}"

# Disabled step kept from the original script:
#echo generating cats file ...

#cat $1 | cut -d ',' -f 2 | sort -u > $1.cats

# Report the result of fsttopsort, not of the cleanup.
exit "$status"
Note: See TracBrowser for help on using the repository browser.