- Timestamp:
- 12/11/08 22:20:14 (16 years ago)
- Branches:
- master, help
- Children:
- 2d89d4b
- Parents:
- 91ed676
- git-author:
- obrebski <obrebski@…> (12/11/08 22:20:14)
- git-committer:
- obrebski <obrebski@…> (12/11/08 22:20:14)
- Location:
- app
- Files:
-
- 8 edited
Legend:
- Unmodified
- Added
- Removed
-
app/doc/utt.texinfo
re28a625 r9ace5d2 1 1 2 \input texinfo @c -*-texinfo-*- 2 @documentencoding ISO-8859-2 3 @c @documentencoding ISO-8859-2 4 @documentencoding UTF-8 3 5 @c @documentlanguage pl 4 6 … … 11 13 This manual is for UAM Text Tools (version 0.90, October, 2008) 12 14 13 Copyright @copyright{} 2005, 2007 Tomasz Obr êbski, Micha³ Stolarski, Justyna Walkowska, Pawe³Konieczka.15 Copyright @copyright{} 2005, 2007 Tomasz ObrÄbski, MichaÅ Stolarski, Justyna Walkowska, PaweÅ Konieczka. 14 16 15 17 Permission is granted to copy, distribute and/or modify this document … … 31 33 @subtitle edition 0.01, @today 32 34 @subtitle status: prescript 33 @author by Justyna Walkowska, Tomasz Obr @,{}ebski and Micha@l{}Stolarski35 @author by Justyna Walkowska, Tomasz ObrÄbski and MichaÅ Stolarski 34 36 @page 35 37 @vskip 0pt plus 1filll … … 42 44 43 45 @iftex 46 @tex 47 % \usepackage[T1]{fontenc} 48 % \usepackage[utf8]{inputenc} 49 % \usepackage{times} 50 @end tex 51 44 52 @parskip = 0.5@normalbaselineskip plus 3pt minus 1pt 45 53 @end iftex 46 47 54 @c @headings off 48 55 @c @everyheading LEM(1) @| @| LEM(1) … … 84 91 85 92 @item 86 tokenization 93 tokenization óÅÄ 94 ÅŒ 87 95 @item 88 96 dictionary-based morphological analysis … … 90 98 heuristic morphological analysis of unknown words 91 99 @item 92 spelling correction 100 spelling correction óÅÄ 101 ÅÄÅŒ 93 102 @item 94 103 pattern search … … 125 134 @itemize 126 135 @item Pawel Konieczka 127 @item Tomasz Obr ebski128 @item Micha lStolarski136 @item Tomasz ObrÄbski 137 @item MichaÅ Stolarski 129 138 @item Marcin Walas 130 139 @item Justyna Walkowska 131 @item Pawe l Werenski140 @item PaweÅ WereÅski 132 141 @end itemize 133 142 … … 251 260 @example 252 261 0000 00 BOS * 253 0000 07 W Piszemy lem:pisa Ê,V262 0000 07 W Piszemy lem:pisaÄ,V 254 263 0007 01 S _ 255 264 0008 05 W dobre lem:dobry,ADJ … … 262 271 0024 11 W Warszawiacy lem:Warszawiak,N 263 272 0035 01 S _ 264 0036 03 W te ¿273 0036 03 W teÅŒ 265 274 0039 01 P . 
266 275 0040 00 EOS * … … 270 279 @example 271 280 0000 BOS * 272 0000 W Piszemy lem:pisać,V 281 0000 W Piszemy lem:pisać,V 273 282 0007 S _ 274 283 0008 W dobre lem:dobry,ADJ … … 283 292 @example 284 293 0000 BOS * 285 W Piszemy lem:pisać,V 294 W Piszemy lem:pisać,V 286 295 S _ 287 296 W dobre lem:dobry,ADJ … … 294 303 W Warszawiacy lem:Warszawiak,N 295 304 S _ 296 W też 305 W też 297 306 P . 298 307 EOS * … … 429 438 430 439 431 @c [JAK UZYSKAĆ POLSKIE CZCIONKI W DVI???] 440 @c [JAK UZYSKAĆ POLSKIE CZCIONKI W DVI???] 432 441 433 442 @macro parhelp … … 651 660 652 661 @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} 653 @item @strong{Authors:} @tab Tomasz Obrębski 662 @item @strong{Authors:} @tab Tomasz Obrębski 654 663 @item @strong{Component category:} @tab source 655 664 @item @strong{Input format:} @tab raw text file … … 756 765 @c @chapter sen - sentencizer 757 766 758 @c Authors: Tomasz Obrębski 767 @c Authors: Tomasz Obrębski 759 768 760 769 @c --------------------------------------------------------------------- … … 767 776 768 777 @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} 769 @item @strong{Authors:} @tab Tomasz Obrębski, Michał Stolarski 778 @item @strong{Authors:} @tab Tomasz Obrębski, Michał Stolarski 770 779 @item @strong{Component category:} @tab filter 771 780 @item @strong{Input format:} @tab UTT regular … … 871 880 872 881 @example 873 0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1 882 0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1 874 883 0007 01 B _ 875 884 0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn … … 886 895 887 896 @example 888 0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1 897 0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1 889 898 0007 01 S _ 890 899 0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn lem:dobry,ADJ/DpNsCnavGn … … 898 907 899 908 @example 900 0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1 909 0000 07 W Piszemy lem:pisać,V/AiVpMdTrfNpP1 901 910 0007 
01 S _ 902 911 0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn,ADJ/DpNsCnavGn … … 932 941 string @code{<add1>}, replace suffix of length @code{<cut2>} with string 933 942 @code{<add2>}. For example @code{3t} transforms @samp{kocie} into 934 @samp{kot}, @code{3-4ały} transforms @samp{najbielsi} into @samp{biały} 943 @samp{kot}, @code{3-4ały} transforms @samp{najbielsi} into @samp{biały} 935 944 936 945 Each dictionary entry must be written in one line and must not contain blank characters. … … 943 952 kotem;2,N/GaNsCi 944 953 kocie;3t,N/GaNsCl;3t,N/GaNsCv 945 najbielsi;3-4ały,ADJ/DsNpCnGp 946 najbielsze;3-5ały,ADJ/DsNpCnGaifn 954 najbielsi;3-4ały,ADJ/DsNpCnGp 955 najbielsze;3-5ały,ADJ/DsNpCnGaifn 947 956 najlepsi;dobry,ADJ/DsNpCnGp 948 957 najlepsze;dobry,ADJ/DsNpCnGaifn … … 1009 1018 @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} 1010 1019 1011 @item @strong{Authors:} @tab Michał Stolarski, Tomasz Obrębski 1020 @item @strong{Authors:} @tab Michał Stolarski, Tomasz Obrębski 1012 1021 @item @strong{Component category:} @tab filter 1013 1022 … … 1106 1115 1107 1116 1108 Example: @code{3-4ały} transforms @i{najbielsi} into @i{biały} 1117 Example: @code{3-4ały} transforms @i{najbielsi} into @i{biały} 1109 1118 1110 1119 … … 1114 1123 likelihood of the guess. 1115 1124 1116 @example 1117 *łkę;1a,N/GfNsCa 1118 naj*elszy;3-4ały,ADJ/...:... 1119 @end example 1125 @c @example 1126 @c *łkę;1a,N/GfNsCa 1127 @c naj*elszy;3-4ały,ADJ/...:... 
1128 @c @end example 1120 1129 1121 1130 … … 1129 1138 1130 1139 @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} 1131 @item @strong{Authors:} @tab Tomasz Obrębski, Michał Stolarski 1140 @item @strong{Authors:} @tab Tomasz Obrębski, Michał Stolarski 1132 1141 @item @strong{Component category:} @tab filter 1133 1142 @item @strong{Input format:} @tab UTT regular … … 1216 1225 @section kor - configurable spelling corrector 1217 1226 1218 [TODO] 1227 @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} 1228 @item @strong{Authors:} @tab Paweł Werenski, Tomasz Obrębski, Michał Stolarski 1229 @item @strong{Component category:} @tab filter 1230 @item @strong{Input format:} @tab UTT regular 1231 @item @strong{Output format:} @tab UTT regular 1232 @item @strong{Required annotation:} @tab tok 1233 @end multitable 1234 1235 @menu 1236 * kor description:: 1237 * kor command line options:: 1238 * kor weights definition file:: 1239 * kor dictionaries:: 1240 @end menu 1241 1242 1243 @node kor description 1244 @subsection Description 1245 1246 The spelling corrector applies a Pawel Werenski's dynamic programming 1247 algorithm to the FSA representation of the set of word forms of the 1248 Polex/PMDBF dictionary. The algorithm is an extension of K. Oflazer 1249 algorithm used by @command{cor}. In the extended version it is 1250 possible to assign weights to individual edit operations. 1251 1252 Given an incorrect word form it returns all word forms 1253 present in the dictionary whose edit distance is smaller than the 1254 threshold given as the parameter. 
1255 1256 1257 @node kor command line options 1258 @subsection Command line options 1259 1260 @table @code 1261 1262 @parhelp 1263 @parversion 1264 @parinteractive 1265 @c @parfile 1266 @c @paroutput 1267 @c @parfail 1268 @c @parcopy 1269 @parinputfield 1270 @paroutputfield 1271 @pardictionary 1272 @parprocess 1273 @parselect 1274 @parunselect 1275 @paroneline 1276 @paronefield 1277 1278 @item @b{@minus{}@minus{}distance=@var{int}, @minus{}n @var{int}} 1279 Maximum edit distance (default='1'). 1280 1281 @item @b{@minus{}@minus{}weights=@var{filename}, @minus{}w @var{filename}} 1282 Edit operations' weights file. 1283 1284 @c @item @b{@minus{}@minus{}replace, @minus{}r} 1285 @c Replace original form with corrected form, place original form in the 1286 @c cor field. This option has no effect in @option{--one-*} modes (default=off) 1287 1288 1289 @end table 1290 1291 1292 @node kor weights definition file 1293 @subsection Weights definition file 1294 1295 Example: 1296 1297 @example 1298 1299 %stdcor 1 1300 %xchg 1 1301 ż rz 0.5 1302 ch h 0.5 1303 u ó 0.5 1304 1305 @end example 1306 1307 1308 Default weight is set to 1 (@code{%stdcor 1}), the weight of exchange 1309 operation is set to 1 (@code{%xchg 1}), the three principal orthographic 1310 errors are assigned the weight 0.5. 1311 1312 The edit operation weight declaration, such as 1313 1314 @example 1315 ż rz 0.5 1316 @end example 1317 1318 works in both ways, i.e. ż->rz, rz->ż. 
1319 1320 The default weights definition file for @code{kor} is: 1321 1322 @example 1323 $HOME/.local/share/utt/weights.kor 1324 @end example 1325 1326 or, if the above mentioned file is absent: 1327 1328 @example 1329 /usr/local/share/utt/weights.kor 1330 @end example 1331 1332 1333 @node kor dictionaries 1334 @subsection Dictionaries 1335 1336 see @command{cor} 1219 1337 1220 1338 @c --------------------------------------------------------------------- … … 1228 1346 @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} 1229 1347 1230 @item @strong{Authors:} @tab Tomasz Obrębski 1348 @item @strong{Authors:} @tab Tomasz Obrębski 1231 1349 @item @strong{Component category:} @tab filter 1232 1350 @item @strong{Input format:} @tab UTT regular … … 1256 1374 1257 1375 input: 1258 0000 05 W Cześć 1376 0000 05 W Cześć 1259 1377 0005 01 P ! 1260 1378 0006 01 S _ … … 1267 1385 output: 1268 1386 0000 00 BOS * 1269 0000 05 W Cześć 1387 0000 05 W Cześć 1270 1388 0005 01 P ! 
1271 1389 0006 00 EOS * … … 1288 1406 @c @chapter gph - graphizer 1289 1407 1290 @c Authors: Tomasz Obrębski 1408 @c Authors: Tomasz Obrębski 1291 1409 1292 1410 … … 1301 1419 1302 1420 @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} 1303 @item @strong{Authors:} @tab Tomasz Obrębski 1421 @item @strong{Authors:} @tab Tomasz Obrębski 1304 1422 @item @strong{Component category:} @tab filter 1305 1423 @item @strong{Input format:} @tab UTT regular … … 1537 1655 1538 1656 @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} 1539 @item @strong{Authors:} @tab Tomasz Obrębski 1657 @item @strong{Authors:} @tab Tomasz Obrębski 1540 1658 @item @strong{Component category:} @tab filter 1541 1659 @item @strong{Input format:} @tab UTT flattened … … 1626 1744 1627 1745 @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} 1628 @item @strong{Authors:} @tab Marcin Walas, Tomasz Obrębski 1746 @item @strong{Authors:} @tab Marcin Walas, Tomasz Obrębski 1629 1747 @item @strong{Input format:} @tab UTT flattened 1630 1748 @item @strong{Output format:} @tab UTT flattened … … 1646 1764 1647 1765 @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} 1648 @item @strong{Authors:} @tab Tomasz Obrębski 1766 @item @strong{Authors:} @tab Tomasz Obrębski 1649 1767 @item @strong{Component category:} @tab filter 1650 1768 @item @strong{Input format:} @tab UTT regular … … 1839 1957 1840 1958 @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} 1841 @item @strong{Authors:} @tab Michal Stolarski, Tomasz Obrebski 1959 @item @strong{Authors:} @tab Michał Stolarski, Tomasz Obrębski 1842 1960 @item @strong{Component category:} @tab additional tool 1843 1961 @end multitable … … 1884 2002 1885 2003 @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} 1886 
@item @strong{Authors:} @tab Tomasz Obrębski 2004 @item @strong{Authors:} @tab Tomasz Obrębski 1887 2005 @item @strong{Input format:} @tab UTT regular 1888 2006 @item @strong{Output format:} @tab UTT flattened … … 1932 2050 1933 2051 @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} 1934 @item @strong{Authors:} @tab Tomasz Obrębski 2052 @item @strong{Authors:} @tab Tomasz Obrębski 1935 2053 @item @strong{Input format:} @tab UTT flattened 1936 2054 @item @strong{Output format:} @tab UTT regular … … 2236 2354 @tab @code{v} @tab vocative. 2237 2355 @item 2238 @item 2239 2356 @code{G} @tab @tab Gender 2240 2357 @item … … 2729 2846 @c @chapter Copyright 2730 2847 @c 2731 @c Copyright 2004 by Tomasz Obrebski 2848 @c Copyright 2004 by Tomasz Obrębski 2732 2849 @c This software is free for research and educational use. 2733 2850 -
app/src/dgp/const.hh
r0214596 r9ace5d2 3 3 4 4 #define MAXTYPES 32 5 #define MAXFLAGS 64 5 6 #define MAXNODES 1024 6 7 #define MAXCONSTRS 32 -
app/src/dgp/dgc
r3748bd1 r9ace5d2 98 98 my $nreq=0; 99 99 my $nlink=0; 100 my $nflag=0; 100 101 101 102 my %cats; … … 113 114 open(OUTPUT, ">$outputfile") or die("Can't open output file: $outputfile!"); 114 115 } 115 116 116 117 117 … … 192 192 } 193 193 } 194 } 195 elsif(/^FLAG\s+\S+$/) 196 { 197 ++$nflag; 198 print OUTPUT "$_\n" 194 199 } 195 200 elsif(/^$/) { … … 249 254 printf STDERR "%6d RIGHT statements\n", $nright; 250 255 printf STDERR "%6d LINK statements\n", $nlink; 256 printf STDERR "%6d FLAG statements\n", $nflag; 251 257 252 258 -
app/src/dgp/grammar.cc
r0214596 r9ace5d2 44 44 lt.resize(types_sz); 45 45 gt.resize(types_sz); 46 } 47 } 48 49 void Grammar::add_flag(const char* s) 50 { 51 Flag::add(s); 52 if(Flag::count()>flags_sz) 53 { 54 flags_sz += 16; 55 pass.resize(flags_sz); 46 56 } 47 57 } … … 124 134 set_connect(arg1,arg2,arg3); 125 135 } 136 // FLAG DECLARATION 137 else if(strcmp(key,"FLAG")==0 && fields>=2) 138 { 139 add_flag(arg1); 140 } 126 141 127 142 else fprintf(stderr,"Invalid line %d. Ignored.\n", lineno); … … 160 175 if(connect[c][d].count(t)) 161 176 fprintf(f,"LINK\t%s\t%s\t%s\n",c.str(),d.str(),t.str()); 177 178 for(Flag i=1; i<Flag::count(); ++i) 179 fprintf(f,"FLAG\t%s\n",i.str()); 162 180 } 163 181 -
app/src/dgp/grammar.hh
r3748bd1 r9ace5d2 11 11 #include "sgraph.hh" 12 12 13 14 class Link 15 { 16 Role role; 17 FlagSet hflags; 18 FlagSet dflags; 19 } 20 21 13 22 class Grammar 14 23 { … … 18 27 // enum CONSTR { SGL, OBL, LEFT, RIGHT, INIT, NONINIT, FIN, NONFIN }; 19 28 20 Grammar() : types_sz(0), cats_sz(0) {} ;29 Grammar() : types_sz(0), cats_sz(0), flags_sz(0) {} ; 21 30 22 31 int types_sz; 23 32 int cats_sz; 33 int flags_sz; 24 34 25 35 vector< vector< Roles > > connect; … … 31 41 vector< RoleSet > gt; 32 42 43 44 // vector< vector< vector< 45 vector< FlagSet > set; 46 vector< FlagSet > pass; 47 33 48 bool read(FILE* f); 34 49 void write(FILE* f); … … 36 51 void add_category(const char* s); 37 52 void add_type(const char* s); 53 void add_flag(const char* s); 38 54 39 55 void set_sgl(Role r) { sgl.set(r); } -
app/src/dgp/sgraph.cc
r0214596 r9ace5d2 128 128 { 129 129 buf+=sprintf(buf,";"); 130 int cont=0; 130 131 for(Role i=1; i<=Role::count(); ++i) 131 if(node.prop.forbidden[i]) buf+=sprintf(buf," !%s",i.str());132 if(node.prop.forbidden[i]) buf+=sprintf(buf,"%s!%s",(cont++)?",":"",i.str()); 132 133 for(Role i=1; i<=Role::count(); ++i) 133 if(node.prop.required[i]) buf+=sprintf(buf," &%s",i.str());134 if(node.prop.required[i]) buf+=sprintf(buf,"%s&%s",(cont++)?",":"",i.str()); 134 135 } 135 136 … … 140 141 141 142 142 int SGraph::sprint_node_debug(char* buf, c har* pref, int n)143 int SGraph::sprint_node_debug(char* buf, const char* pref, int n) 143 144 { 144 145 char *buf0 = buf; … … 149 150 } 150 151 151 int SGraph::print_node_debug(FILE* f, c har* pref, int n)152 int SGraph::print_node_debug(FILE* f, const char* pref, int n) 152 153 { 153 154 char buf[1000]; -
app/src/dgp/sgraph.hh
r0214596 r9ace5d2 88 88 int sprint_node(char* buf, int n, unsigned int info); 89 89 int print_node(FILE* f, int n, unsigned int info); 90 int sprint_node_debug(char* buf, c har* pref, int n);91 int print_node_debug(FILE* f, c har* pref, int n);90 int sprint_node_debug(char* buf, const char* pref, int n); 91 int print_node_debug(FILE* f, const char* pref, int n); 92 92 93 93 void print_arc(FILE* f, int left, int right, Role role, int dir); // 0 - left, 1 - right -
app/src/dgp/thesymbols.hh
r0214596 r9ace5d2 23 23 24 24 typedef Symbol<4> Rel; 25 25 26 typedef Symbol<5> Flag; 27 typedef bitset<MAXFLAGS> FlagSet; 26 28 27 29 #endif
Note: See TracChangeset
for help on using the changeset viewer.