source: src/compdic/compdic @ 243d027

Last change on this file since 243d027 was 555c7f8, checked in by Tomasz Obrebski <to@…>, 12 years ago

dodanie możliwości szybkiego uaktualniania słownika dla lema

  • Property mode set to 100755
File size: 2.1 KB
Line 
1#!/bin/bash
2
# Number of chunks the word list is split into before compilation.
# 0 means "not given on the command line".
no_of_parts=0

# Consume leading options.  The last two arguments are always the positional
# <wordlist> and <automaton>, so option parsing only runs while more than two
# arguments remain.
while [ $# -gt 2 ]; do
  case "$1" in
    -p)
      # Accept only a plain non-negative integer.  Anything else would reach
      # the numeric test and the arithmetic later in the script and break
      # them there, after temporary files have already been created.
      case "$2" in
        '' | *[!0-9]*)
          echo "compdic: -p expects a non-negative integer, got '$2'" >&2
          exit 2
          ;;
      esac
      # Force base 10 so that a value such as "08" is not read as octal.
      no_of_parts=$(( 10#$2 ))
      shift 2
      ;;

    *)
      # Unknown leading argument: print a hint, drop it and keep parsing.
      echo "The arguments to use are"
      echo "-p: number of parts"
      shift 1
      ;;
  esac
done
21
# Both positional arguments are mandatory.  Getting here with fewer than two
# is a usage error: report it on stderr and exit non-zero so that calling
# scripts and makefiles can detect the failed invocation.
if [ $# -lt 2 ]; then
  {
    echo "Usage:"
    echo "        compdic [-p <parts>] <wordlist> <automaton>"
    echo "where"
    echo "    <wordlist> - file containig a list of words, one per line, iso-8859-2 encoded"
    echo "    <automaton> - a file to which the compiled automaton (cor/kor format) shoul be written"
  } >&2
  exit 1
fi
31
# When no part count was requested (no_of_parts is 0), derive one from the
# size of the word list: line count divided by 75000 (integer division),
# plus one.
if [ "$no_of_parts" -eq 0 ]; then
  # Read the file through a quoted redirection so that paths containing
  # spaces or glob characters work, and stop right away when the word list
  # cannot be read instead of silently continuing with a count of 0.
  if ! word_count=$(wc -l < "$1"); then
    echo "compdic: cannot read word list '$1'" >&2
    exit 1
  fi
  no_of_parts=$(( word_count / 75000 + 1 ))
fi


echo "number of parts: $no_of_parts"
39
40
# Private scratch directory that holds every intermediate file of this run.
tempdir=$(mktemp -d /tmp/compdic.XXXXXX) || {
  echo "compdic: cannot create temporary directory" >&2
  exit 1
}
# Remove the scratch directory on every exit path, including early failures.
trap 'rm -rf -- "$tempdir"' EXIT

# File holding the symbol table passed to the fst* tools via --isymbols.
# Created with mktemp (already used above) rather than tempfile(1), which is
# Debian-specific and deprecated.
alphabet=$(mktemp "$tempdir/alphabet.XXXXXX") || {
  echo "compdic: cannot create temporary file in $tempdir" >&2
  exit 1
}

# One "<symbol> <id>" pair per line; ids are consecutive starting at 0.
# The delimiter is quoted so the shell never expands anything in the table.
# NOTE(review): the non-ASCII symbols look like ISO-8859-2 letters displayed
# through a different charset (the usage text says the word list is
# iso-8859-2 encoded) - confirm the on-disk encoding before editing them.
cat <<'EOF' > "$alphabet"
<eps> 0
a 1
A 2
ä 3
± 4
¡ 5
b 6
B 7
c 8
C 9
æ 10
Æ 11
d 12
D 13
e 14
E 15
é 16
ê 17
Ê 18
f 19
F 20
g 21
G 22
h 23
H 24
i 25
I 26
j 27
J 28
k 29
K 30
l 31
L 32
³ 33
£ 34
m 35
M 36
n 37
N 38
ñ 39
Ñ 40
o 41
O 42
ö 43
ó 44
Ó 45
p 46
P 47
q 48
Q 49
r 50
R 51
s 52
S 53
¶ 54
Š 55
t 56
T 57
u 58
U 59
ü 60
v 61
V 62
w 63
W 64
x 65
X 66
y 67
Y 68
z 69
Z 70
Œ 71
¬ 72
¿ 73
¯ 74
0 75
1 76
2 77
3 78
4 79
5 80
6 81
7 82
8 83
9 84
_ 85
- 86
? 87
! 88
~ 89
; 90
, 91
/ 92
* 93
+ 94
EOF
142
143
# Lines per part: the integer share of the word list per part, plus one, so
# that split never produces more than no_of_parts files.
# "$1" is quoted and read by redirection so paths with spaces work.
no_of_lines=$(( $(wc -l < "$1") / no_of_parts + 1 ))

# Cut the word list into $tempdir/part.aa, $tempdir/part.ab, ...
split -l "$no_of_lines" -- "$1" "$tempdir/part."

# Accumulated result; each processed part is merged into this file.
automaton=$tempdir/output.fst

# Seed the accumulator by compiling empty input.
fstcompile --acceptor --isymbols="$alphabet" < /dev/null > "$automaton"
152
# Running part counter, used only for the progress messages.
n=0

for f in "$tempdir"/part.*; do
  # When no part file exists the glob stays as the literal pattern; skip it
  # instead of feeding a non-existent file into the pipeline.
  [ -e "$f" ] || continue

  # Scratch files for this part.  mktemp replaces tempfile(1), which is
  # Debian-specific and deprecated.
  temp1=$(mktemp "$tempdir/tmp.XXXXXX")
  temp2=$(mktemp "$tempdir/tmp.XXXXXX")
  temp3=$(mktemp "$tempdir/tmp.XXXXXX")

  n=$(( n + 1 ))
  echo "processing part $n"

  # Compile this part on its own: convert, compile, remove epsilons,
  # determinize, then minimize into temp2.
  lst2fstext < "$f" \
    | fstcompile --acceptor --isymbols="$alphabet" \
    | fstrmepsilon \
    | fstdeterminize > "$temp1"
  fstminimize "$temp1" "$temp2"

  # Union the part with the accumulated automaton and minimize the result
  # back into the accumulator.
  fstunion "$automaton" "$temp2" | fstrmepsilon | fstdeterminize > "$temp3"
  fstminimize "$temp3" "$automaton"
done
174
echo "generating binary automaton file ..."

# Topologically sort the final automaton, print it as text using the symbol
# table, and pipe it through fsm2aut and aut2fsa into the output file given
# as the second argument.  "$2" is quoted so that output paths containing
# spaces do not produce an "ambiguous redirect" error.
fsttopsort < "$automaton" \
  | fstprint --acceptor --isymbols="$alphabet" \
  | fsm2aut \
  | aut2fsa > "$2"

# Intermediate files are no longer needed.
rm -r -- "$tempdir"
179
echo "generating cats file ..."

# Write the distinct values of the second comma-separated field of the word
# list to "<automaton>.cats".  The output redirection was missing: sort was
# handed "$2.cats" as an input file, so the cut output was discarded and no
# cats file was ever written.
cut -d ',' -f 2 < "$1" | sort -u > "$2.cats"
Note: See TracBrowser for help on using the repository browser.