Last change
on this file was
e7de6cc,
checked in by Tomasz Obrebski <to@…>, 13 years ago
|
new version of dgp
added dgc, tre and compdic components
compiledic renamed to compdic_utf8
./configure updated
|
-
Property mode set to
100755
|
File size:
1.1 KB
|
Line | |
---|
1 | #!/usr/bin/env python |
---|
2 | # -*- coding: utf-8 -*- |
---|
3 | """----------------------------------------------------------------------------- |
---|
4 | # Name: text2FST |
---|
5 | # Purpose: A tool for translating a dictionary file into the OpenFST text format. |
---|
6 | # |
---|
7 | # Author: Krzysztof Szarzynski <szarznyski.wmi.amu.edu.pl> |
---|
8 | # |
---|
9 | # Created: 19/11/2012 |
---|
10 | # Copyright: (c) UAM Text Tools 2012 |
---|
11 | # Licence: Simplified BSD License |
---|
12 | # Usage: |
---|
13 | # cat dictionary.dic | ./text2fst > output.fst |
---|
14 | # Warning: the dictionary.dic file must be UTF-8 encoded, _without_ a BOM |
---|
15 | # |
---|
16 | # TODO: Check for a BOM and remove it from the dictionary file |
---|
17 | -----------------------------------------------------------------------------""" |
---|
18 | |
---|
19 | import sys |
---|
20 | import locale |
---|
21 | |
---|
# Encoding applied to everything written to stdout.
# getpreferredencoding() replaces the deprecated getdefaultlocale(); the
# UTF-8 fallback covers environments where no encoding is reported, which
# previously left `encoding` as None and made prn() fail with a TypeError.
encoding = locale.getpreferredencoding() or "UTF-8"


def prn(str):
    """Encode the text `str` with the module-level `encoding` and write it to stdout.

    No newline is appended; callers include it in `str` themselves.
    """
    # Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
    # sys.stdout accepts byte strings directly.
    out = getattr(sys.stdout, "buffer", sys.stdout)
    out.write(str.encode(encoding))
---|
26 | |
---|
begState = 0
endState = 1
eps = u"<eps>"


def main(lines=None, emit=None):
    """Write an OpenFST text-format acceptor for the words found in `lines`.

    Every non-empty line is treated as one word.  For each word the output
    contains an epsilon arc from `begState` to a fresh state, one arc per
    character along a chain of fresh states, and an epsilon arc from the
    last state of the chain to `endState`.  Arcs are written as
    "source destination label" lines; the final line holds `endState`
    alone, marking it as the accepting state.

    Parameters:
        lines: iterable of input lines, either UTF-8 encoded byte strings
            or already-decoded text.  Defaults to standard input.
        emit:  callable receiving each output line as text.  Defaults to
            the module-level `prn`.

    NOTE: characters are emitted verbatim as arc labels, so a word that
    contains whitespace yields a line with extra columns.
    """
    if lines is None:
        # Read raw bytes on both Python 2 and 3 so the input is always
        # decoded as UTF-8, independent of the locale.
        lines = getattr(sys.stdin, "buffer", sys.stdin)
    if emit is None:
        emit = prn

    # State ids are allocated above the two reserved ones.  Starting at
    # begState (as before) made the first letter's target collide with
    # endState.
    nextState = max(begState, endState) + 1
    for line in lines:
        if isinstance(line, bytes):
            line = line.decode('UTF-8')
        # Drop the line terminator; otherwise it is emitted as an arc label
        # and splits the output line in two.
        word = line.rstrip(u"\r\n")
        if not word:
            # A blank line would add the empty word to the language.
            continue
        currentState = nextState
        emit(u"%s %s %s\n" % (begState, currentState, eps))
        for letter in word:
            emit(u"%s %s %s\n" % (currentState, currentState + 1, letter))
            currentState += 1
        emit(u"%s %s %s\n" % (currentState, endState, eps))
        # Start the next word on an unused state so that words do not
        # share a path through the automaton.
        nextState = currentState + 1
    emit(u"%s\n" % endState)


if __name__ == "__main__":
    main()
---|
39 | |
---|
Note: See
TracBrowser
for help on using the repository browser.