|
Last change
on this file since ac25afd was
5f4d9c3,
checked in by Maciej Prill <mprill@…>, 14 years ago
|
|
Rewritten the build system, added lem UTF-8 version.
|
-
Property mode set to
100755
|
|
File size:
1.1 KB
|
| Line | |
|---|
| 1 | #!/usr/bin/env python |
|---|
| 2 | # -*- coding: utf-8 -*- |
|---|
| 3 | """----------------------------------------------------------------------------- |
|---|
| 4 | # Name: text2FST |
|---|
| 5 | # Purpose: A tool for translating a dictionary file into the OpenFST format. |
|---|
| 6 | # |
|---|
| 7 | # Author: Krzysztof Szarzynski <szarznyski.wmi.amu.edu.pl> |
|---|
| 8 | # |
|---|
| 9 | # Created: 19/11/2012 |
|---|
| 10 | # Copyright: (c) UAM Text Tools 2012 |
|---|
| 11 | # Licence: Simplified BSD License |
|---|
| 12 | # Usage: |
|---|
| 13 | # cat dictionary.dic | ./text2fst > output.fst |
|---|
| 14 | # Warning: the dictionary.dic file must be UTF8 _without_ BOM |
|---|
| 15 | # |
|---|
| 16 | # TODO: Check for a BOM and strip it from the dictionary file. |
|---|
| 17 | -----------------------------------------------------------------------------""" |
|---|
| 18 | |
|---|
| 19 | import sys |
|---|
| 20 | import locale |
|---|
| 21 | |
|---|
# Output encoding, taken from the user's locale.  getdefaultlocale() returns
# (None, None) when no locale is configured (e.g. LANG=C) and encode(None)
# raises TypeError, so fall back to UTF-8, the encoding the input is read in.
encoding = locale.getdefaultlocale()[1] or 'UTF-8'


def prn(str, out=None):
    """Write the unicode string *str*, encoded with ``encoding``, to *out*.

    The parameter keeps its historical name ``str`` (shadowing the builtin
    inside this function only) so existing callers are unaffected.

    Args:
        str: Unicode text to write.
        out: Byte-oriented stream with a ``write`` method.  Defaults to
            standard output.

    Raises:
        UnicodeEncodeError: If *str* cannot be represented in ``encoding``.
    """
    if out is None:
        # Python 3 exposes the byte stream as sys.stdout.buffer; Python 2's
        # sys.stdout accepts byte strings directly.
        out = getattr(sys.stdout, 'buffer', sys.stdout)
    out.write(str.encode(encoding))


begState = 0
endState = 1
eps = u"<eps>"


def text2fst(lines, out=None):
    """Translate dictionary lines into OpenFST text-format arcs.

    Every non-empty line is treated as one word and becomes its own path::

        begState --<eps>--> s --c1--> s+1 --c2--> ... --<eps>--> endState

    Intermediate states are allocated from ``endState + 1`` upwards and are
    never shared between words, so the automaton accepts exactly the listed
    words (no prefixes, no concatenations of neighbouring entries).  The last
    line written is the final state, ``endState``.

    Args:
        lines: Iterable of lines, either UTF-8 byte strings or unicode
            strings.  Trailing CR/LF is stripped; blank lines are skipped.
        out: Byte-oriented output stream forwarded to ``prn``.  Defaults to
            standard output.

    Raises:
        UnicodeDecodeError: If a byte line is not valid UTF-8.
    """
    # States 0 and 1 are reserved for the shared start and final states.
    nextState = endState + 1
    for line in lines:
        if isinstance(line, bytes):
            line = line.decode('UTF-8')
        # The line terminator is not part of the word and must not become
        # an arc label (it would split the arc line in two).
        word = line.rstrip(u"\r\n")
        if not word:
            continue
        # NOTE(review): characters are emitted verbatim as labels; a word
        # containing whitespace would still yield a malformed arc line.
        currentState = nextState
        prn(u"%s %s %s\n" % (begState, currentState, eps), out)
        for letter in word:
            prn(u"%s %s %s\n" % (currentState, currentState + 1, letter), out)
            currentState += 1
        prn(u"%s %s %s\n" % (currentState, endState, eps), out)
        # The next word starts on a fresh state so the paths stay disjoint.
        nextState = currentState + 1
    # Written through prn() as well so it shares one stream (and ordering)
    # with the arcs.
    prn(u"%s\n" % endState, out)


if __name__ == '__main__':
    # Read raw bytes on both Python 2 (sys.stdin) and Python 3
    # (sys.stdin.buffer); decoding happens inside text2fst().
    text2fst(getattr(sys.stdin, 'buffer', sys.stdin))
|---|
| 39 | |
|---|
Note: See
TracBrowser
for help on using the repository browser.