source: src/compdic/compdic @ 854bece

Last change on this file since 854bece was 519eaf5, checked in by Tomasz Obrebski <obrebski@…>, 11 years ago

Bug fixes: bubbles,props

  • Property mode set to 100755
File size: 2.1 KB
RevLine 
#!/bin/bash

# compdic - compile a word list into an automaton file.
# Invocation: compdic [-p <parts>] <wordlist> <automaton>

# Number of parts the word list is split into; 0 means "choose automatically".
no_of_parts=0

# Parses the leading options of the command line.
#
# Everything except the last two arguments (word list and automaton) is
# treated as options; only "-p <parts>" is recognised.
#
# Globals:   no_of_parts   (written) value given with -p, normalised to decimal
#            opts_consumed (written) number of leading arguments consumed
# Arguments: the script's command line ("$@")
# Returns:   0 on success, 1 on an unknown argument or a malformed -p value
parse_options() {
  opts_consumed=0
  while [ $# -gt 2 ]
  do
    case "$1" in
      -p)
        # Reject anything that is not a plain non-negative integer; it would
        # otherwise break the numeric tests and arithmetic done later on.
        case "$2" in
          '' | *[!0-9]*)
            echo "compdic: -p expects a non-negative integer, got '$2'" >&2
            return 1
            ;;
        esac
        # Force base 10 so that a value such as "08" is not read as octal.
        no_of_parts=$(( 10#$2 ))
        shift 2
        opts_consumed=$(( opts_consumed + 2 ))
        ;;

      *)
        # Dropping the argument and carrying on would shift the positional
        # arguments and could make the script write to the wrong file.
        echo "compdic: unexpected argument '$1'" >&2
        echo "The arguments to use are" >&2
        echo "-p: number of parts" >&2
        return 1
        ;;
    esac
  done
  return 0
}

parse_options "$@" || exit 1
# Leave only the positional arguments (<wordlist> <automaton>) in "$@".
shift "$opts_consumed"
21
# Both positional arguments are mandatory. Without them, print the usage text
# on stderr and exit non-zero so that callers can detect the failed invocation.
if [ $# -lt 2 ]
then
    {
        echo "Usage:"
        echo "        compdic [-p <parts>] <wordlist> <automaton>"
        echo "where"
        echo "    <wordlist> - file containing a list of words, one per line, iso-8859-2 encoded"
        echo "    <automaton> - a file to which the compiled automaton (cor/kor format) should be written"
    } >&2
    exit 1
fi
31
# Prints the default number of parts for a word list: one part per started
# block of 75000 lines.
#
# Arguments: $1 - path of the word list
# Outputs:   the number of parts on stdout
# Returns:   non-zero if the word list cannot be read
parts_for_wordlist() {
    local line_count
    line_count=$(wc -l < "$1") || return 1
    echo $(( line_count / 75000 + 1 ))
}

# 0 means the number of parts was not set explicitly: derive it from the size
# of the word list.
if [ "$no_of_parts" -eq 0 ]
then
    no_of_parts=$(parts_for_wordlist "$1") || exit 1
fi


echo "number of parts: $no_of_parts"
39
40
# Private scratch directory holding every intermediate file of this run.
tempdir=$(mktemp -d /tmp/compdic.XXXXXX) || {
    echo "compdic: cannot create a temporary directory" >&2
    exit 1
}
# Remove the scratch directory on every exit path, including early failures.
# -f keeps this quiet when the directory has already been removed.
trap 'rm -rf -- "$tempdir"' EXIT

# File that will hold the symbol table; mktemp replaces the Debian-only
# tempfile utility.
alphabet=$(mktemp "$tempdir/alphabet.XXXXXX") || {
    echo "compdic: cannot create a temporary file in $tempdir" >&2
    exit 1
}
44
# Write the symbol table to the file named by $alphabet; that file is later
# passed as --isymbols to the fstcompile and fstprint calls. Each line is a
# "<symbol> <integer id>" pair, ids 0-95, with <eps> as id 0.
# NOTE(review): the non-ASCII symbols look mis-decoded in this view. The usage
# text says the word list is iso-8859-2 encoded, so presumably these entries
# must match that encoding byte-for-byte - confirm before editing this table.
45cat <<EOF > $alphabet
46<eps> 0
47a 1
48A 2
49ä 3
50± 4
51¡ 5
52b 6
53B 7
54c 8
55C 9
56æ 10
57Æ 11
58d 12
59D 13
60e 14
61E 15
62é 16
63ê 17
64Ê 18
65f 19
66F 20
67g 21
68G 22
69h 23
70H 24
71i 25
72I 26
73j 27
74J 28
75k 29
76K 30
77l 31
78L 32
79³ 33
80£ 34
81m 35
82M 36
83n 37
84N 38
85ñ 39
86Ñ 40
87o 41
88O 42
89ö 43
90ó 44
91Ó 45
92p 46
93P 47
94q 48
95Q 49
96r 50
97R 51
98s 52
99S 53
100¶ 54
101Š 55
102t 56
103T 57
104u 58
105U 59
106ü 60
107v 61
108V 62
109w 63
110W 64
111x 65
112X 66
113y 67
114Y 68
115z 69
116Z 70
117Œ 71
118¬ 72
119¿ 73
120¯ 74
1210 75
1221 76
1232 77
1243 78
1254 79
1265 80
1276 81
1287 82
1298 83
1309 84
131_ 85
132- 86
133? 87
134! 88
135~ 89
136; 90
137, 91
138/ 92
139* 93
140+ 94
[f600a02]141Ö 95
[e7de6cc]142EOF
143
144
# Lines per part: the "+ 1" rounds up, so split never produces more than
# $no_of_parts files.
total_lines=$(wc -l < "$1") || {
    echo "compdic: cannot read word list '$1'" >&2
    exit 1
}
no_of_lines=$(( total_lines / no_of_parts + 1 ))

# Cut the word list into $tempdir/part.aa, part.ab, ...
split -l "$no_of_lines" -- "$1" "$tempdir/part." || {
    echo "compdic: cannot split word list '$1'" >&2
    exit 1
}

automaton=$tempdir/output.fst

# Seed the result by compiling empty input; the per-part automata are merged
# into this file one at a time.
fstcompile --acceptor --isymbols="$alphabet" < /dev/null > "$automaton"
153
# Number of parts processed so far (used for progress messages only).
n=0

# Report a failure of any pipeline stage, not just the last one, so that a
# broken part is not silently merged as an empty or invalid automaton.
set -o pipefail

for f in "$tempdir"/part.*
do
    # An unmatched glob stays as the literal pattern; skip it.
    [ -e "$f" ] || continue

    # Intermediate files: determinized part, minimized part, merged result.
    temp1=$(mktemp "$tempdir/det.XXXXXX") || exit 1
    temp2=$(mktemp "$tempdir/min.XXXXXX") || exit 1
    temp3=$(mktemp "$tempdir/union.XXXXXX") || exit 1

    n=$(( n + 1 ))
    echo "processing part $n"

    # Compile this part into an epsilon-free, deterministic, minimized acceptor.
    lst2fstext < "$f" |
        fstcompile --acceptor --isymbols="$alphabet" |
        fstrmepsilon |
        fstdeterminize > "$temp1" || {
        echo "compdic: compiling part $n ($f) failed" >&2
        exit 1
    }
    fstminimize "$temp1" "$temp2" || {
        echo "compdic: minimizing part $n ($f) failed" >&2
        exit 1
    }

    # Merge the part into the accumulated automaton, then minimize the result
    # back into $automaton.
    fstunion "$automaton" "$temp2" | fstrmepsilon | fstdeterminize > "$temp3" || {
        echo "compdic: merging part $n ($f) failed" >&2
        exit 1
    }
    fstminimize "$temp3" "$automaton" || {
        echo "compdic: minimizing the merged automaton failed at part $n" >&2
        exit 1
    }

    # The intermediates are no longer needed; do not let them pile up.
    rm -f -- "$temp1" "$temp2" "$temp3"
done
175
# Writes the sorted, de-duplicated second comma-separated field of every line
# of a word list to "<automaton>.cats".
#
# Arguments: $1 - path of the word list
#            $2 - path of the automaton file; ".cats" is appended to it
# Returns:   the exit status of sort
generate_cats() {
    cut -d ',' -f 2 -- "$1" | sort -u > "$2.cats"
}

echo "generating binary automaton file ..."

# Topologically sort the final automaton, print it as text using the symbol
# table, and feed that through fsm2aut and aut2fsa into the output file.
fsttopsort < "$automaton" |
    fstprint --acceptor --isymbols="$alphabet" |
    fsm2aut |
    aut2fsa > "$2"

# The scratch directory is no longer needed; never call rm with an empty path.
if [ -n "$tempdir" ]
then
    rm -r -- "$tempdir"
fi

echo "generating cats file ..."

generate_cats "$1" "$2"
Note: See TracBrowser for help on using the repository browser.