source: src/compdic_utf8/text2fst.py

Last change on this file was e7de6cc, checked in by Tomasz Obrebski <to@…>, 12 years ago

new version of dgp
added dgc, tre and compdic components
compiledic renamed to compdic_utf8
./configure updated

  • Property mode set to 100755
File size: 1.1 KB
Line 
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3"""-----------------------------------------------------------------------------
4# Name:         text2FST     
5# Purpose:      A tool for translating a dictionary file into the OpenFST text format.
6#
7# Author:      Krzysztof Szarzynski <szarznyski.wmi.amu.edu.pl>
8#
9# Created:     19/11/2012
10# Copyright:   (c) UAM Text Tools 2012
11# Licence:     Simplified BSD License
12# Usage:
13#       cat dictionary.dic | ./text2fst > output.fst
14# Warning: the dictionary.dic file must be UTF8 _without_ BOM
15#
16# TODO: Check for a BOM and remove it from the dictionary file
17-----------------------------------------------------------------------------"""
18
19import sys
20import locale
21
# Encoding used for everything written to stdout.
# locale.getdefaultlocale() reports None as the encoding when the locale is
# unset or is the plain "C" locale; encoding with None would raise TypeError
# on the first write, so fall back to UTF-8 (the encoding required of the
# input dictionary).
encoding = locale.getdefaultlocale()[1] or 'UTF-8'


def prn(str):
    """Write the unicode string `str` to stdout, encoded with `encoding`.

    No newline is appended; callers include it in `str` themselves.
    The parameter keeps its original name (which shadows the builtin
    inside this function only) so that existing callers are unaffected.
    """
    sys.stdout.write(str.encode(encoding))
26
begState = 0        # single start state shared by all entries
endState = 1        # single final state shared by all entries
eps = u"<eps>"      # epsilon label used on the connecting arcs


def fst_lines(entries):
    """Yield the lines of an OpenFST text-format acceptor for `entries`.

    `entries` is an iterable of unicode strings, one dictionary entry each;
    a trailing line terminator on an entry is ignored and blank entries are
    skipped.  Every entry becomes its own chain of states:

        begState --<eps>--> s0 --c1--> s1 ... --cn--> sn --<eps>--> endState

    The last line yielded is the final-state declaration (`endState` alone).
    Each yielded string already ends with a newline.
    """
    # Fresh states are numbered from endState + 1 so that no chain ever
    # passes through the start or the final state.
    nextState = endState + 1
    for entry in entries:
        # The line terminator is not part of the word and must not become
        # an arc label.
        entry = entry.rstrip(u"\r\n")
        if not entry:
            continue
        state = nextState
        yield u"%s %s %s\n" % (begState, state, eps)
        for letter in entry:
            yield u"%s %s %s\n" % (state, state + 1, letter)
            state += 1
        yield u"%s %s %s\n" % (state, endState, eps)
        # Do not share the chain's last state with the next entry; sharing
        # it would let the automaton run from one entry into the next.
        nextState = state + 1
    yield u"%s\n" % endState


def main():
    """Read a UTF-8 dictionary from stdin and print its FST to stdout."""
    for text in fst_lines(line.decode('UTF-8') for line in sys.stdin):
        prn(text)


if __name__ == "__main__":
    main()
39   
Note: See TracBrowser for help on using the repository browser.