#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""-----------------------------------------------------------------------------
# Name:        text2FST
# Purpose:     A tool for translating a dictionary file into an OpenFST
#              text-format acceptor (one path per dictionary entry).
#
# Author:      Krzysztof Szarzynski
#
# Created:     19/11/2012
# Copyright:   (c) UAM Text Tools 2012
# Licence:     Simplified BSD License
# Usage:
#   cat dictionary.dic | ./text2fst > output.fst
# Note: the dictionary.dic file must be UTF-8 encoded, one entry per line.
#       A leading BOM, line terminators and blank lines are ignored.
-----------------------------------------------------------------------------"""

import locale
import sys

# Output encoding.  Fall back to UTF-8 when the locale does not define one
# (e.g. the plain C/POSIX locale), instead of crashing in ``encode(None)``.
encoding = locale.getpreferredencoding() or "UTF-8"

begState = 0        # single start state shared by every entry
endState = 1        # single final state shared by every entry
eps = u"<eps>"      # epsilon label; an empty label would drop the third column

_BOM = u"\ufeff"


def prn(str):
    """Write the unicode text *str* to stdout, encoded with ``encoding``.

    The parameter name is kept from the original script for compatibility.
    """
    # Python 3 text streams reject bytes; use the underlying binary buffer
    # when there is one (Python 2's sys.stdout accepts bytes directly).
    out = getattr(sys.stdout, "buffer", sys.stdout)
    out.write(str.encode(encoding))


def fst_lines(words):
    """Yield the OpenFST text-format lines for an acceptor of *words*.

    Each non-empty entry gets its own chain of fresh states:
    ``begState --eps--> s0 --c1--> s1 ... --cn--> sn --eps--> endState``.
    The last line yielded is the final-state declaration (``endState``).

    Args:
        words: iterable of unicode strings, one dictionary entry each.
            Trailing CR/LF is stripped, a BOM at the start of the first
            entry is removed, and empty entries are skipped.

    Yields:
        Unicode lines without a trailing newline.
    """
    # Fresh states must never collide with the shared start/final states.
    next_state = max(begState, endState) + 1
    for index, word in enumerate(words):
        word = word.rstrip(u"\r\n")
        if index == 0 and word.startswith(_BOM):
            word = word[len(_BOM):]
        if not word:
            # An empty entry would make the automaton accept the empty string.
            continue
        # NOTE(review): a whitespace character inside an entry still produces
        # a line that a whitespace-delimited reader cannot parse; confirm
        # whether such entries occur in real dictionaries.
        yield u"%s %s %s" % (begState, next_state, eps)
        for letter in word:
            yield u"%s %s %s" % (next_state, next_state + 1, letter)
            next_state += 1
        yield u"%s %s %s" % (next_state, endState, eps)
        # Do not let the next entry start on this entry's last state.
        next_state += 1
    yield u"%s" % endState


def main():
    """Read UTF-8 dictionary entries from stdin and print the FST to stdout."""
    # Read raw bytes on both Python 2 and 3 and decode them explicitly,
    # independent of the locale that stdin was opened with.
    source = getattr(sys.stdin, "buffer", sys.stdin)
    words = (raw.decode("UTF-8") for raw in source)
    for fst_line in fst_lines(words):
        prn(fst_line + u"\n")


if __name__ == "__main__":
    main()