source: _old/app/src/compdic/compdic @ 8f395c8

Last change on this file since 8f395c8 was 8f395c8, checked in by tom <tom@…>, 13 years ago

poprawki w komponencie compdic

  • Property mode set to 100755
File size: 2.2 KB
RevLine 
[93afab8]1
# Number of parts the word list is split into before compilation.
# 0 (the default) means "derive it from the size of the word list".
no_of_parts=0

# Options are only recognised while more than two arguments remain, so the
# last two arguments are always left in place as <wordlist> and <automaton>.
while [ $# -gt 2 ]
do
  case "$1" in
    -p)
      # The value is used further down in an integer test and in arithmetic;
      # anything but plain digits would make those fail, so it is rejected
      # here with a clear message instead.
      case "$2" in
        ''|*[!0-9]*)
          echo "compdic: -p requires a non-negative integer, got '$2'" >&2
          exit 1
        ;;
      esac
      no_of_parts=$2
      shift 2
    ;;

    *)
      # Unknown arguments are reported and skipped, not treated as fatal.
      echo "The arguments to use are"
      echo "-p: number of parts"
      shift 1
    ;;
  esac
done
20
# Print the help text for compdic on standard output.
print_usage() {
    echo
    echo "compdic is a tool to compile lists of automaton paths (words) into the automaton format"
    echo "suitable for use with lem, gue, cor, and kor components"
    echo
    echo "Usage:"
    echo "        compdic [-p <parts>] <wordlist> <automaton>"
    echo "where"
    echo "    <wordlist> - file containing a list of words, one per line, iso-8859-2 encoded"
    echo "    <automaton> - a file to which the compiled automaton (lem/gue/cor/kor format) should be written"
}

# Fewer than two arguments are left after option parsing, so <wordlist>
# and/or <automaton> is missing: explain the usage and stop.
if [ $# -lt 2 ]
then
    # No arguments at all is treated as a plain help request and keeps the
    # historical behaviour (stdout, success).
    if [ $# -eq 0 ]
    then
        print_usage
        exit 0
    fi

    # A partial command line is a usage error: report it on stderr and
    # fail, so that calling scripts do not mistake it for a successful run.
    print_usage >&2
    exit 1
fi
34
# The word list is read several times below; stop right away if it cannot
# be read instead of silently compiling an empty list into the output file.
if [ ! -r "$1" ]
then
    echo "compdic: cannot read word list '$1'" >&2
    exit 1
fi

# No explicit part count was requested: derive one from the number of lines
# in the word list (one part per started block of 75000 lines).
if [ "$no_of_parts" -eq 0 ]
then
    # Redirecting the file into wc keeps the file name out of its output
    # and works for names that contain spaces.
    no_of_parts=$(( $(wc -l < "$1") / 75000 + 1 ))
fi


echo "number of parts: $no_of_parts"
42
43
# Private scratch directory holding every intermediate file of this run.
tempdir=$(mktemp -d /tmp/compdic.XXXXXX) || {
    echo "compdic: cannot create a temporary directory" >&2
    exit 1
}

# Remove the scratch directory however the script ends. The signal trap
# turns HUP/INT/TERM into a normal exit so that the EXIT trap runs too.
trap 'rm -rf -- "$tempdir"' EXIT
trap 'exit 1' HUP INT TERM

# File receiving the symbol table below. Its name must not match "part.*",
# because that pattern is used later to enumerate the split word list.
alphabet=$(mktemp "$tempdir/alphabet.XXXXXX") || {
    echo "compdic: cannot create the alphabet file" >&2
    exit 1
}

# Symbol table: one "<symbol> <id>" pair per line, <eps> being id 0.
# The delimiter is quoted so the table is written literally, without any
# shell expansion.
# NOTE(review): the non-ASCII symbols presumably have to be stored in the
# same encoding as the word list (iso-8859-2 according to the usage text);
# confirm before re-encoding or editing these lines.
cat <<'EOF' > "$alphabet"
<eps> 0
a 1
A 2
ä 3
± 4
¡ 5
b 6
B 7
c 8
C 9
æ 10
Æ 11
d 12
D 13
e 14
E 15
é 16
ê 17
Ê 18
f 19
F 20
g 21
G 22
h 23
H 24
i 25
I 26
j 27
J 28
k 29
K 30
l 31
L 32
³ 33
£ 34
m 35
M 36
n 37
N 38
ñ 39
Ñ 40
o 41
O 42
ö 43
ó 44
Ó 45
p 46
P 47
q 48
Q 49
r 50
R 51
s 52
S 53
¶ 54
Š 55
t 56
T 57
u 58
U 59
ü 60
v 61
V 62
w 63
W 64
x 65
X 66
y 67
Y 68
z 69
Z 70
Œ 71
¬ 72
¿ 73
¯ 74
0 75
1 76
2 77
3 78
4 79
5 80
6 81
7 82
8 83
9 84
_ 85
- 86
? 87
! 88
~ 89
; 90
, 91
/ 92
* 93
+ 94
EOF
145
146
# Lines per part: the line count of the word list divided by the number of
# parts, plus one so that the integer division never leaves lines over.
no_of_lines=$(( $(wc -l < "$1") / no_of_parts + 1 ))

# Cut the word list into files named part.aa, part.ab, ... in the scratch
# directory. Quoting keeps paths with spaces intact.
split -l "$no_of_lines" -- "$1" "$tempdir/part."

# Running result: each compiled part is merged into this file in turn.
automaton=$tempdir/output.fst

# Seed the running result by compiling empty input.
fstcompile --acceptor --isymbols="$alphabet" --keep_isymbols < /dev/null > "$automaton"
155
# Abort the run after a failed step: report which part was being processed,
# remove the scratch directory and exit with a failure status.
part_failed() {
    echo "compdic: failed while processing part $n" >&2
    rm -rf -- "${tempdir:?}"
    exit 1
}

# Counter used only for the progress messages.
n=0

# Compile every part on its own, then merge it into the running automaton.
for f in "$tempdir"/part.*
do
    # When nothing matches, the pattern itself is handed to the loop
    # (e.g. if split produced no files); there is nothing to compile then.
    [ -e "$f" ] || continue

    # Scratch files for this part. The "tmp." prefix keeps them apart from
    # the part.* files being iterated over.
    temp1=$(mktemp "$tempdir/tmp.XXXXXX") || part_failed
    temp2=$(mktemp "$tempdir/tmp.XXXXXX") || part_failed
    temp3=$(mktemp "$tempdir/tmp.XXXXXX") || part_failed

    n=$(( n + 1 ))
    echo "processing part $n"

    # Word list part -> FST text -> compiled, epsilon-free, determinized
    # and finally minimized acceptor for this part alone.
    # Only the status of the last pipeline stage can be checked here.
    lst2fstext < "$f" |
    fstcompile --acceptor --isymbols="$alphabet" --keep_isymbols |
    fstrmepsilon |
    fstdeterminize > "$temp1" || part_failed
    fstminimize "$temp1" "$temp2" || part_failed

    # Merge the part into the running automaton and normalize the union the
    # same way, writing the result back over the running automaton.
    fstunion "$automaton" "$temp2" | fstrmepsilon | fstdeterminize > "$temp3" || part_failed
    fstminimize "$temp3" "$automaton" || part_failed
done
177
# Convert the merged automaton into the target format and write it to the
# <automaton> file named on the command line; quoting keeps an output path
# that contains spaces in one piece.
fsttopsort < "$automaton" | fstprint --acceptor | fsm2aut | aut2fsa > "$2"

# Drop the scratch directory. ":?" aborts instead of running rm with an
# empty operand should the variable ever be unset.
rm -r -- "${tempdir:?}"
Note: See TracBrowser for help on using the repository browser.