source: src/tre/tre @ e7de6cc

Last change on this file since e7de6cc was e7de6cc, checked in by Tomasz Obrebski <to@…>, 12 years ago

new version of dgp
added dgc, tre and compdic components
compiledic renamed to compdic_utf8
./configure updated

  • Property mode set to 100755
File size: 10.4 KB
Line 
1#!/usr/bin/ruby1.9.1 -I /usr/local/lib/utt -I $HOME/.local/lib/utt
2# -*- coding: iso-8859-2 -*-
3
4$: << "#{ENV['HOME']}/.local/lib/utt"
5$: << "/usr/local/lib/utt"
6
7require 'getoptlong'
8require 'seg.rb'
9
10opts = GetoptLong.new(
11[ '--help',     '-h',   GetoptLong::NO_ARGUMENT ],
12[ '--debug',    '-d',   GetoptLong::NO_ARGUMENT ],
13[ '--format',   '-F',   GetoptLong::REQUIRED_ARGUMENT ],
14[ '--info',     '-I',   GetoptLong::REQUIRED_ARGUMENT ],
15[ '--span',     '-s',   GetoptLong::REQUIRED_ARGUMENT ],
16[ '--maxsize',          GetoptLong::REQUIRED_ARGUMENT ],
17[ '--forest',           GetoptLong::NO_ARGUMENT ],
18[ '--ground',           GetoptLong::NO_ARGUMENT ],
19[ '--only-trees','-t',  GetoptLong::NO_ARGUMENT ])
20
21$helptext=
22"The program generates trees from the graph output by dgp. dgp must\n"+
23"must be run with '-i ds' option.\n\n"+
24"Command:       tre [options]\n\n"+
25"Options:\n"+
26"--help         -h      Print help (this text) and exit.\n"+
27"--debug        -d      Verbose output. For developers only.\n"+
28"--format=s     -F s    Output format. Recognized values:\n"+
29"                               a       root + list of arcs\n"+
30"                               p       parenthesized notation\n"+
31"                               h       human readable indented tree format\n"+
32"                       Multiple values are allowed. (default p)\n"+
33"--info=s       -I s    Information printed. Recognized values:\n"+
34"                               n       node identifier\n"+
35"                               f       surface form\n"+
36"                               m       morphological information\n"+
37"                               l       arc labels\n"+
38"--only-trees   -t      Do not copy input. Print trees only.\n"
39
40$DEBUG=false
41$FORMAT='p'
42$INFO='DEFAULT'
43$ONLYTREES=false
44$START=nil
45$END=nil
46$FOREST=false
47$MAXSIZE=nil
48
49opts.each do |opt, arg|
50  case opt
51  when '--help'
52    print $helptext
53    exit 0
54  when '--debug'
55    $DEBUG=true
56  when '--format'
57    $FORMAT=arg
58  when '--info'
59    $INFO=arg
60  when '--only-trees'
61    $ONLYTREES=true
62  when '--forest'
63    $FOREST=true
64  when '--ground'
65    $GROUND=true
66  when '--maxsize'
67    $MAXSIZE=arg.to_i
68  when '--span'
69    $START,$END = arg.split ','
70  else
71    print "Unknown option #{opt}. Ignored.\n"
72  end
73end
74
75if $INFO=='DEFAULT'
76  case $FORMAT
77    when 'p','a'
78    $INFO='nl'
79    when 'h'
80    $INFO='fmnl'
81  end
82end
83
84$dgpsep=';'
85
86def tre(input)
87  $gphid=[]
88  $form=[]
89  $lem=[]
90  $ord1=[]
91  $count=0
92  nodes=[]
93  prevpos=-1
94  tokennumber=0
95  for line in input
96    seg=Seg.new(line)
97    print line unless $ONLYTREES || seg.field(3) == 'EOS'
98   
99    if dgp=seg['dgp']
100      if nodes==[] && seg[3]!='BOS'
101        print "A sentence must start with BOS segment. Aborting.\n"
102        return
103      end
104
105      id=dgp[/^\d+/].to_i
106
107      if gph=seg['gph']
108        $gphid[id]=gph[/^\d+/].to_i
109      else
110        print "No gph field. Aborting.\n"
111        return
112      end
113
114      $form[$gphid[id]] = seg[4]
115      $lem[$gphid[id]] = seg['lem']
116      $ord1[$gphid[id]] = if prevpos==seg[1].to_i then tokennumber
117                         else prevpos=seg[1].to_i; tokennumber+=1 end
118             
119      nodes[id] = [seg[1].to_i,seg[2].to_i,dgp]
120
121      if seg[3]=='EOS'
122
123        $pref = "#{seg[1]} #{seg[2]} SYN *"
124
125        parsegraph(nodes)
126
127        set_ord #(0...(nodes.length)).each{|i| set_distance_from_i i }
128
129        printgraph if $DEBUG
130
131        if $GROUND
132          printground
133        else
134          thetrees = $FOREST ? genforest : gentrees
135         
136          output_trees thetrees
137         
138          print line unless $ONLYTREES
139         
140          $gphid=[]   # POWTÓRZENIE
141          $form=[]
142          $lem=[]
143          $ord1=[]
144          $count=0     
145          nodes=[]
146          prevpos=-1
147          tokennumber=0
148        end
149      end
150    end
151  end
152end
153
154
155def output_trees trees
156  for t in trees
157    $count += 1
158    t1=ground(t)
159
160    span = $FOREST ? " span:" + (ground_tree_min(t1).to_s + ","+ground_tree_max(t1).to_s)+";" : ""
161    case $FORMAT
162    when /a/
163      print "#{$pref} tre:#{$count}#{span} #{arcsinfo(t1[0],t1[1])}"
164#       print arcsinfo(t1[0],t1[1])
165      print "\n"
166    when /p/
167      print "#{$pref}#{span} tre:#{$count} par:"
168      printpar(t1[0],t1[1])
169      print "\n"
170    when /h/
171      print "#\n# tree #{$count}\n# ------\n"
172      printtree(t1[0],t1[1],0)
173    end
174  end
175end
176
177
178def nodeinfo(id)
179  info=""
180  if $INFO =~ /o/
181    info += $ord1[id].to_s                           
182    info += '.' if $INFO =~ /[nfm]/
183  end
184  if $INFO =~ /n/
185    info += id.to_s                           
186    info += '.' if $INFO =~ /[fm]/
187  end
188  if $INFO =~ /f/
189    info += $form[id]
190    info += ';' if $INFO =~ /m/
191  end
192  if $INFO =~ /m/
193    info += $lem[id] 
194  end
195  info
196end
197
198
199def arcsinfo(root,arcs)
200  "head:#{nodeinfo(root)} links:" + arcs.map{|a| "(#{($INFO =~ /l/) ? a[2]+":" : ""}#{nodeinfo(a[0])}-#{nodeinfo(a[1])})"}.join("")
201#   for a in arcs
202#     print ';'
203#     print "#{a[2]}:" if $INFO =~ /l/
204#       print nodeinfo(a[0])+'-'+nodeinfo(a[1])
205#   end
206end
207
208def printtree(root,arcs,o)
209  if o==0
210        print "# %-16s" % "root: "
211  end
212  print nodeinfo(root),"\n"
213  for arc in arcs.select{ |a| a[0]==root }.sort{|a,b| a[1]<=>b[1] }
214    print '# ',"   "*(o+1)
215    print "%-16s" % (arc[2]+": ")
216    printtree(arc[1],arcs,o+1)
217  end
218end
219
220def printpar(root,arcs)
221  print nodeinfo(root)
222  deps = arcs.select{ |a| a[0]==root }.sort{|a,b| a[1]<=>b[1] }
223  unless deps == []
224    print '('
225    cont=false
226    for arc in deps
227      if cont then print ',' else cont=true end
228      print arc[2],':' if $INFO =~ /l/
229      printpar(arc[1],arcs)
230    end
231    print ')'
232  end
233end
234
235
236def ground_tree_min t
237  ([t[0]]+t[1].map{|e| [e[0],e[1]]}).flatten.min
238end
239
240def ground_tree_max t
241  ([t[0]]+t[1].map{|e| [e[0],e[1]]}).flatten.max
242end
243
244
245
246def parsegraph(nodes)
247
248  $n   =nodes.length
249  $sat =[]; $vis =[]; $succ=[]; $lhs =[]; $arcs=[]; $pos=[]; $len=[]; $ord=[]; $distance={}
250
251  for dgp in nodes
252
253    parts  = dgp[2].split($dgpsep,7)
254
255    if parts[3]==nil || parts[4]==nil || parts[5]==nil
256      $stderr.print "ERR: tre requires dgp be called with '--info s' option. Aborting.\n"
257      exit
258    end
259
260    i      = parts[0].to_i
261    $pos[i] = dgp[0].to_i
262    $len[i] = dgp[1].to_i
263    $sat << i if parts[1]=="s"
264
265    $arcs |= parts[2].split(',').map{ |a| case a
266                                          when /\-\-(\w+)-(\d+)\((\d+)~(\d+)\)/
267                                            [i, $2.to_i, $1, $3.to_i, $4.to_i]
268                                          when /\+\+(\w+)-(\d+)\((\d+)~(\d+)\)/
269                                            [$2.to_i, i, $1, $3.to_i, $4.to_i]
270                                          end }
271    $succ |= parts[3][1..-2].split(',').map{|x| [x.to_i,i]}
272    $vis  |= parts[4][1..-2].split(',').map{|x| [x.to_i,i]}
273    $lhs  |= parts[5][1..-2].split(',').map{|x| [x.to_i,i]} + [[i,i]]
274
275  end
276
277end
278
279
280def ground(t)
281  [ $gphid[t[0]] , t[1].map{|a| [$gphid[a[0]],$gphid[a[1]],a[2]]} ]
282end 
283
284
285#NOWE-START
286
287def successors i
288  $succ.select{|e| e[0]==i}.map{|e| e[1]}
289end
290
291def predecessors i
292  $succ.select{|e| e[1]==i}.map{|e| e[0]}
293end
294
295def start_nodes
296  $succ.map{|e| e[1]}.map{|e| predecessors(e)}.uniq.map{|e| e[0]}
297end
298
299def end_nodes
300  $succ.map{|e| e[0]}.map{|e| successors(e)}.uniq.map{|e| e[0]}
301end
302
303def set_ord
304  positions = $pos.uniq.sort
305  (0...$n).each{|i| $ord[i] = positions.index($pos[i]) }
306end
307
308
309def set_distance_from_i i
310  set_distance_from_i_to_jth_successors_to_v i, i, 1
311end
312
313def set_distance_from_i_to_jth_successors_to_v i, j , v
314  succ = successors(j)
315  for j1 in succ
316    $distance[[i,j1]] = v
317    set_distance_from_i_to_jth_successors_to_v i, j1, v+1
318  end
319end
320
321#NOWE-END
322
323
324def gentrees
325  bos=0; eos=$n-1;
326  gentrees2 bos, eos
327end
328
329
330def genforest
331  forest=[]
332  for bos in start_nodes
333    for eos in end_nodes # tu s± te¿ wierzcho³ki poprzedzaj±ce!!!
334      next if $ord[bos] > $ord[eos] or ($MAXSIZE != nil and $ord[eos] - $ord[bos] > $MAXSIZE+1)
335      forest += gentrees2(bos,eos)
336    end
337  end
338  forest
339end
340
341def gentrees2 bos, eos
342  $thetrees=[];
343  roots = (1...eos).select{|i| $vis.include? [i,eos]}.select{|i| $vis.include? [bos,i]}
344
345  if $DEBUG then print "ROOTS: #{roots.inspect}\n" end
346  for root in roots
347    gentrees3 bos, eos, root
348  end
349  $thetrees
350end
351
352def gentrees3 bos, eos, root
353  $theroot=root
354  $thebos=bos
355  $theeos=eos
356  for r in buildR(root , eos, [])
357    (rmin,rmax,rtree) = r
358    buildR(bos, rmin, rtree)
359  end
360end
361
362def buildR(min, max, tree)
363  if $DEBUG then print "buildR--#{min}--#{max}--#{tree.inspect}\n" end
364  trees=[]
365  for a in $arcs.select{|a| a[0]==max && $vis.include?([min,a[1]]) }
366    if $DEBUG then print "ARC: #{a.inspect}\n" end
367    for r in buildR(a[4],a[3],tree+[a])                 #!!! buildR(a[1],a[3],tree+[a])
368      (rmin,rmax,rarcs) = r
369      for l in buildR(min,rmin,rarcs)
370        (lmin,lmax,larcs) = l
371        trees << [lmin,rmax,larcs]
372      end
373    end
374  end
375  for i in (0...$n).select{|i| $succ.include?([i,max])}.select{|i| $lhs.include?([min,i])}
376    for l in buildL(min,i,tree)
377      (lmin,lmax,larcs) = l
378      trees << [lmin,lmax,larcs]
379    end
380  end
381  trees 
382end
383   
384
385def buildL(min,max,tree)
386  if $DEBUG then print "buildL--#{min}--#{max}--#{tree.inspect}\n" end
387  if $pos[min]==$pos[max]
388    if min==$thebos && max==$thebos
389      $thetrees.push [$theroot,tree]
390      if $DEBUG then print "adding tree: #{tree.inspect}\n" end
391    end
392    return [[max,max,tree]]
393  end
394  trees=[]
395  for arc in $arcs.select{|a| a[1]==max && $lhs.include?([min,a[0]]) }
396    if $DEBUG then print "ARC: #{arc.inspect}\n" end
397    for r in buildR(arc[3],arc[4],tree+[arc]) ### buildR(arc[3],max,tree+[arc])
398      (rmin,rmax,rarcs) = r
399      for l in buildL(min,rmin,rarcs)
400        (lmin,lmax,larcs) = l
401        trees << [lmin,lmax,larcs]
402      end
403    end
404  end
405  trees
406end
407
408
409def printgraph()
410  print "N:    #{$n}\n"
411  print "SAT:  #{set_to_s($sat)}\n"
412  print "SUCC: #{rel_to_s($succ)}\n"
413  print "VIS:  #{rel_to_s($vis)}\n"
414  print "LHS:  #{rel_to_s($lhs)}\n"
415  print "ARCS: #{arcs_to_s($arcs)}\n"
416end
417
418
419def printground
420  for i in 1...($form.length-1)
421    print "#{$ord1[i]} #{$form[i]} #{$lem[i]} "
422    print $arcs.select{|a| $ord1[$gphid[a[1]]] == $ord1[i]}.map{|a| "#{a[2]}:#{$ord1[$gphid[a[0]]]}"}.sort.uniq.join(' ')
423    print "\n"
424  end
425end
426
427
428def set_to_s(s) "{#{s.join(',')}}" end
429def rel_to_s(r) "{#{r.map{|p| "(#{p[0]},#{p[1]})"}.join(',')}}" end
430def arc_to_s(q) "-#{q[0]}-#{q[2]}-#{q[1]}/#{q[3]}" end
431def arcs_to_s(a) "{#{a.map{|q| arc_to_s(q)}.join(',')}}" end
432
433######################################################################
434
435tre($stdin)
Note: See TracBrowser for help on using the repository browser.