Last change
on this file since f4bf33e was
e7de6cc,
checked in by Tomasz Obrebski <to@…>, 13 years ago
|
new version of dgp
added dgc, tre and compdic components
compiledic renamed to compdic_utf8
./configure updated
|
-
Property mode set to
100755
|
File size:
1.1 KB
|
Rev | Line | |
---|
[5f4d9c3] | 1 | #!/usr/bin/env python |
---|
| 2 | # -*- coding: utf-8 -*- |
---|
| 3 | """----------------------------------------------------------------------------- |
---|
| 4 | # Name: text2FST |
---|
| 5 | # Purpose: A tool for translating a dictionary file into OpenFST format. |
---|
| 6 | # |
---|
| 7 | # Author: Krzysztof Szarzynski <szarznyski.wmi.amu.edu.pl> |
---|
| 8 | # |
---|
| 9 | # Created: 19/11/2012 |
---|
| 10 | # Copyright: (c) UAM Text Tools 2012 |
---|
| 11 | # Licence: Simplified BSD License |
---|
| 12 | # Usage: |
---|
| 13 | # cat dictionary.dic | ./text2fst > output.fst |
---|
| 14 | # Warning: the dictionary.dic file must be UTF8 _without_ BOM |
---|
| 15 | # |
---|
| 16 | # TODO: Check for a BOM and remove it from the dictionary file |
---|
| 17 | -----------------------------------------------------------------------------""" |
---|
| 18 | |
---|
| 19 | import sys |
---|
| 20 | import locale |
---|
| 21 | |
---|
| 22 | encoding = locale.getdefaultlocale()[1] |
---|
| 23 | |
---|
def prn(str):
    """Write the unicode string *str* to standard output as encoded bytes.

    The text is encoded with the module-level ``encoding``, which is taken
    from the default locale.  ``locale.getdefaultlocale()`` reports ``None``
    for the encoding when the locale cannot be determined; UTF-8 is used in
    that case instead of letting ``encode(None)`` raise ``TypeError``.
    """
    sys.stdout.write(str.encode(encoding or 'UTF-8'))
---|
| 26 | |
---|
# Emit an acceptor for the word list read from stdin, one arc per output
# line ("source target label").  Every word gets its own linear chain:
#   begState --<eps>--> chain start --letters--> chain end --<eps>--> endState
begState = 0
endState = 1
eps = u"<eps>"
# Chain states are numbered from endState + 1 so that no chain ever passes
# through the shared final state.
currentState = endState + 1
for line in sys.stdin:
    # Drop the line terminator: iterating it as a "letter" would emit an arc
    # whose label is a literal newline and break the line-oriented output.
    word = line.decode('UTF-8').rstrip(u"\r\n")
    if not word:
        # A blank line would otherwise make the acceptor accept the empty
        # string.
        continue
    prn(u"%s %s %s\n" % (begState, currentState, eps))
    for letter in word:
        prn(u"%s %s %s\n" % (currentState, currentState + 1, letter))
        currentState += 1
    prn(u"%s %s %s\n" % (currentState, endState, eps))
    # Start the next word on a fresh state; reusing this chain's last state
    # would let concatenations of consecutive words be accepted.
    currentState += 1
# The last output line names the final state.
sys.stdout.write("%s\n" % endState)
---|
| 39 | |
---|
Note: See
TracBrowser
for help on using the repository browser.