#!/usr/bin/env ruby
# -*- coding: iso-8859-2 -*-

# $: << "#{ENV['HOME']}/.local/lib/utt"
$: << "/usr/local/lib/utt"

require 'getoptlong'
require 'seg.rb'

opts = GetoptLong.new(
[ '--help',     '-h',   GetoptLong::NO_ARGUMENT ],
[ '--debug',    '-d',   GetoptLong::NO_ARGUMENT ],
[ '--format',   '-F',   GetoptLong::REQUIRED_ARGUMENT ],
[ '--info',     '-I',   GetoptLong::REQUIRED_ARGUMENT ],
[ '--dgpids',           GetoptLong::NO_ARGUMENT ],
[ '--graph',            GetoptLong::NO_ARGUMENT ],
[ '--uniq',     '-u',   GetoptLong::NO_ARGUMENT ],
[ '--utt',              GetoptLong::NO_ARGUMENT ],
[ '--span',     '-s',   GetoptLong::REQUIRED_ARGUMENT ],
[ '--maxsize',          GetoptLong::REQUIRED_ARGUMENT ],
[ '--forest',           GetoptLong::NO_ARGUMENT ],
[ '--only-trees','-t',  GetoptLong::NO_ARGUMENT ])

$helptext = <<END
The program generates trees from the graph output by dgp. dgp must be run 
with '--info=ds' option.

Command:       tre [options]

Options:
--help         -h      Print help (this text) and exit.
--debug        -d      Verbose output. For developers only.
--format=s     -F s    Output format. Recognized values:
                               a       root + list of arcs
                               p       parenthesized notation
                               h       human readable indented format
                               c       CONLL format
                       Multiple values are allowed. (default p)
--info=s       -I s    Information printed. Recognized values:
                               n       node identifier
                               f       surface form
                               m       morphological information
                               l       arc labels\
--gphids               Used gph node identifiers (default: linear)
--dgpids               Used dgp node identifiers (default: linear)
--graph                Do not generate trees, just print the graph.
--uniq         -u      Remove duplicate trees.
--utt                  UTT formatted output.

END

$DEBUG=false
$FORMAT='p'
$INFO='DEFAULT'
$UTTOUTPUT=false
$START=nil
$END=nil
$FOREST=false
$MAXSIZE=nil
$GPHIDS=false
$DGPIDS=false
$GRAPH==false

opts.each do |opt, arg|
  case opt
  when '--help'
    print $helptext
    exit 0
  when '--debug'
    $DEBUG=true
  when '--format'
    $FORMAT=arg
  when '--info'
    $INFO=arg
  when '--gphids'
    $GPHIDS=true
  when '--dgpids'
    $DGPIDS=true
  when '--graph'
    $GRAPH=true
  when '--uniq'
    $UNIQ=true
  when '--utt'
    $UTTOUTPUT=true
  when '--forest'
    $FOREST=true
  when '--maxsize'
    $MAXSIZE=arg.to_i
  when '--span'
    $START,$END = arg.split ','
  else
    print "Unknown option #{opt}. Ignored.\n"
  end
end

if $INFO=='DEFAULT'
  case $FORMAT
    when 'p','a'
    $INFO='fl'
    when 'h'
    $INFO='fmnl'
  end
end

$dgpsep=';'

def tre(input)
  $gphid=[]
  $form=[]
  $lem=[]
  $ord1=[]
  $count=0
  nodes=[]
  prevpos=-1
  tokennumber=0
  for line in input
    seg=Seg.new(line)
    print line if $UTTOUTPUT && seg.field(3) == 'EOS'
    
    if dgp=seg['dgp']
      if nodes==[] && seg[3]!='BOS'
        print "A sentence must start with BOS segment. Aborting.\n"
        return
      end

      id=dgp[/^\d+/].to_i

      if gph=seg['gph']
        $gphid[id]=gph[/^\d+/].to_i
      else
        print "No gph field. Aborting.\n"
        return
      end

      $form[$gphid[id]] = seg[4]
      $lem[$gphid[id]] = seg['lem']
      $ord1[$gphid[id]] = if prevpos==seg[1].to_i then tokennumber
                         else prevpos=seg[1].to_i; tokennumber+=1 end
              
      nodes[id] = [seg[1].to_i,seg[2].to_i,dgp]

      if seg[3]=='EOS'

        $pref = "#{seg[1]} #{seg[2]} SYN *"
        parsegraph(nodes)
        set_ord #(0...(nodes.length)).each{|i| set_distance_from_i i }
        printgraph if $DEBUG
        if $GRAPH
          if $FORMAT =~ /c/
            printconll
          else
            printground
          end
        else
          thetrees = $FOREST ? genforest : gentrees
          outputs = output_trees thetrees
          outputs = outputs.sort.uniq if $UNIQ
          print outputs.join
          print line if $UTTOUTPUT
          $gphid=[]
          $form=[]
          $lem=[] 
          $ord1=[]
          $count=0	
          nodes=[]
          prevpos=-1
          tokennumber=0
        end
      end
    end
  end
end


def output_trees trees
  
  outputs = []
  
  for t in trees
    $count += 1
    # t1=ground(t)

    t1=t

    # span = $FOREST ? " span:" + (ground_tree_min(t1).to_s + ","+ground_tree_max(t1).to_s)+";" : ""
    # case $FORMAT
    # when /a/
    #   outputs << "#{$pref} tre:#{$count}#{span} #{arc_output(t1)}\n"
    # when /p/
    #   outputs << "#{$pref}#{span} tre:#{$count} par:#{par_output(t1)}\n"
    # when /h/
    #   outputs << "#\n# tree #{$count}\n# ------\n#{dgp_output(t1,0)}"
    # when /c/
    #   outputs << conll_output(t1,0)
    # end

    case $FORMAT
    when /a/
      outputs << "#{arc_output(t1)}\n"
    when /p/
      outputs << "#{par_output(t1)}\n"
    when /h/
      outputs << human_output(t1,0)
    when /c/
      outputs << conll_output(t1,0)
    end

  end

  outputs

end

def id_output id
  if $DGPIDS then id elsif $GPHIDS then $gphid[id] else $ord1[$gphid[id]] end
end

def nodeinfo(id)
  info=""
  gphid = $gphid[id]
  if $INFO =~ /o/
    info += $ord1[gphid].to_s + '/' + gphid.to_s + '/' + id.to_s
    info += '.' if $INFO =~ /[nfm]/
  end
  if $INFO =~ /n/
    info += id_output(id).to_s                           
    info += '.' if $INFO =~ /[fm]/
  end
  if $INFO =~ /f/
    info += $form[gphid] 
    info += ';' if $INFO =~ /m/
  end
  if $INFO =~ /m/
    info += $lem[gphid]  
  end
  info
end


def arc_output(tree)
  root, arcs = tree
  "head:#{nodeinfo(root)} links:" + arcs.map{|a| "(#{($INFO =~ /l/) ? a[2]+":" : ""}#{nodeinfo(a[0])}-#{nodeinfo(a[1])})"}.join("")
end

def printtree(root,arcs,o)
  if o==0
        print "# %-16s" % "root: "
  end
  print nodeinfo(root),"\n"
  for arc in arcs.select{ |a| a[0]==root }.sort{|a,b| a[1]<=>b[1] }
    print '# ',"   "*(o+1)
    print "%-16s" % (arc[2]+": ")
    printtree(arc[1],arcs,o+1)
  end
end

def human_output(tree,o)
  root, arcs = tree
  output = ''
  if o==0
        output += "%-16s" % "root: "
  end
  output += nodeinfo(root) + "\n"
  for arc in arcs.select{ |a| a[0]==root }.sort{|a,b| a[1]<=>b[1] }
    output += "   "*(o+1)
    output += "%-16s" % (arc[2]+": ")
    output += human_output([arc[1],arcs],o+1)
  end
  output
end

def conll_output(tree,o)
  root,arcs = tree
  nodes = ([root] + arcs.map{|a| a[1]}).sort{|a,b| $gphid[a] <=> $gphid[b]}
  conll_lines = []
  for i in nodes
    gphid = $gphid[i]
    id = $ord1[gphid]
    form = $form[gphid]
    /^(?<lemma>.*),(?<cpostag>[^\/]*)(\/(?<feats>.+))?/ =~ $lem[gphid]
    thearcs = arcs.select{|a| a[1]==i }.map{|a| [$ord1[$gphid[a[0]]],a[2]] } 
    thearcs = [[0,'root']] if thearcs.empty?
    for a in thearcs
      head,deprel = a
      conll_lines << [id,form,lemma,cpostag,cpostag,feats,head,deprel,nil,nil].map{|s| s ? s.to_s : "_"}.join("\t")
    end
  end
  conll_lines.join("\n") + "\n\n"
end

def par_output(tree)
  root, arcs = tree
  ldeps = arcs.select{|a| a[0]==root and $gphid[a[1]] < $gphid[root]}.sort{|a,b| $gphid[a[1]]<=>$gphid[b[1]] }
  rdeps = arcs.select{|a| a[0]==root and $gphid[a[1]] > $gphid[root]}.sort{|a,b| $gphid[a[1]]<=>$gphid[b[1]] }

  output = ''

  output_left  = ldeps.map{|arc| ' (' + (($INFO =~ /l/) ? arc[2].upcase : '') + par_output([arc[1],arcs]) + ')'}.join
  output_right = rdeps.map{|arc| ' (' + (($INFO =~ /l/) ? arc[2].upcase : '') + par_output([arc[1],arcs]) + ')'}.join

  # for arc in ldeps
  #   output += ' ('
  #   output += arc[2].upcase if $INFO =~ /l/
  #   output += par_output(arc[1],arcs)
  #   output += ')'
  # end

  # print ' ',nodeinfo(root)

  # for arc in rdeps
  #   print ' ('
  #   print arc[2].upcase if $INFO =~ /l/
  #   printpar(arc[1],arcs)
  #   print ')'
  # end

  output_left + ' ' + nodeinfo(root) + output_right

end


def ground_tree_min t
  ([t[0]]+t[1].map{|e| [e[0],e[1]]}).flatten.min
end

def ground_tree_max t
  ([t[0]]+t[1].map{|e| [e[0],e[1]]}).flatten.max
end



def parsegraph(nodes)

  $n   =nodes.length
  $sat =[]; $vis =[]; $succ=[]; $lhs =[]; $arcs=[]; $pos=[]; $len=[]; $ord=[]; $distance={}

  for dgp in nodes

    parts  = dgp[2].split($dgpsep,7)

    if parts[3]==nil || parts[4]==nil || parts[5]==nil
      $stderr.print "ERR: tre requires dgp be called with '--info=ds' option. Aborting.\n"
      exit
    end

    i      = parts[0].to_i
    $pos[i] = dgp[0].to_i
    $len[i] = dgp[1].to_i
    $sat << i if parts[1]=="s"

    $arcs |= parts[2].scan(/\([^()]+\)/).map{ |a| case a 
                                          when /\-\-(\w+):(\d+)/
                                            # [i, $2.to_i, $1, $3.to_i, $4.to_i]
                                            [i, $2.to_i, $1, 0, 0]
                                          when /\+\+(\w+):(\d+)/
                                            # [$2.to_i, i, $1, $3.to_i, $4.to_i]
                                            [$2.to_i, i, $1, 0, 0]
                                          end }
    $succ |= parts[3][1..-2].split(',').map{|x| [x.to_i,i]}
    $vis  |= parts[4][1..-2].split(',').map{|x| [x.to_i,i]} 
    $lhs  |= parts[5][1..-2].split(',').map{|x| [x.to_i,i]} + [[i,i]]

  end

end


def ground(t)
  [ $gphid[t[0]] , t[1].map{|a| [$gphid[a[0]],$gphid[a[1]],a[2]]} ]
end  


#NOWE-START

def successors i
  $succ.select{|e| e[0]==i}.map{|e| e[1]}
end

def predecessors i
  $succ.select{|e| e[1]==i}.map{|e| e[0]}
end

def start_nodes
  $succ.map{|e| e[1]}.map{|e| predecessors(e)}.uniq.map{|e| e[0]}
end

def end_nodes
  $succ.map{|e| e[0]}.map{|e| successors(e)}.uniq.map{|e| e[0]}
end

def set_ord
  positions = $pos.uniq.sort
  (0...$n).each{|i| $ord[i] = positions.index($pos[i]) }
end


def set_distance_from_i i
  set_distance_from_i_to_jth_successors_to_v i, i, 1
end

def set_distance_from_i_to_jth_successors_to_v i, j , v
  succ = successors(j)
  for j1 in succ
    $distance[[i,j1]] = v
    set_distance_from_i_to_jth_successors_to_v i, j1, v+1
  end
end

#NOWE-END


def gentrees
  bos=0; eos=$n-1;
  gentrees2 bos, eos
end


def genforest
  forest=[]
  for bos in start_nodes
    for eos in end_nodes # tu są też wierzchołki poprzedzające!!!
      next if $ord[bos] > $ord[eos] or ($MAXSIZE != nil and $ord[eos] - $ord[bos] > $MAXSIZE+1)
      forest += gentrees2(bos,eos)
    end
  end
  forest
end

def gentrees2 bos, eos
  $thetrees=[];
  roots = (bos..eos).select{|i| $vis.include? [i,eos]}.select{|i| $vis.include? [bos,i]}

  if $DEBUG then print "ROOTS: #{roots.inspect}\n" end
  for root in roots
    gentrees3 bos, eos, root
  end
  $thetrees
end

def gentrees3 bos, eos, root
  $theroot=root
  $thebos=bos
  $theeos=eos
  for r in buildR(root , eos, [])
    (rmin,rmax,rtree) = r
    buildR(bos, rmin, rtree)
  end
end

def buildR(min, max, tree)
  if $DEBUG then print "buildR--#{min}--#{max}--#{tree.inspect}\n" end
  trees=[]
  for arc in $arcs.select{|a| a[0]==max && $vis.include?([min,a[1]]) }
    if $DEBUG then print "ARC: #{arc.inspect}\n" end
    for r in buildR(arc[1],arc[0],tree+[arc])                 #!!! buildR(a[1],a[3],tree+[a])
      # for r in buildR(arc[4],arc[3],tree+[arc])                 #!!! buildR(a[1],a[3],tree+[a])
      (rmin,rmax,rarcs) = r
      for l in buildR(min,rmin,rarcs)
        (lmin,lmax,larcs) = l
        trees << [lmin,rmax,larcs]
      end
    end
  end
  for i in (0...$n).select{|i| $succ.include?([i,max])}.select{|i| $lhs.include?([min,i])}
    for l in buildL(min,i,tree)
      (lmin,lmax,larcs) = l
      trees << [lmin,lmax,larcs]
    end
  end
  trees  
end
    

def buildL(min,max,tree)
  if $DEBUG then print "buildL--#{min}--#{max}--#{tree.inspect}\n" end
  if $pos[min]==$pos[max]
    if min==$thebos && max==$thebos
      $thetrees.push [$theroot,tree]
      if $DEBUG then print "adding tree: #{tree.inspect}\n" end
    end
    return [[max,max,tree]]
  end
  trees=[]
  for arc in $arcs.select{|a| a[1]==max && $lhs.include?([min,a[0]]) }
    if $DEBUG then print "ARC: #{arc.inspect}\n" end
     for r in buildR(arc[0],arc[1],tree+[arc]) ### buildR(arc[3],max,tree+[arc])
     #for r in buildR(arc[3],arc[4],tree+[arc]) ### buildR(arc[3],max,tree+[arc])
      (rmin,rmax,rarcs) = r
      for l in buildL(min,rmin,rarcs)
        (lmin,lmax,larcs) = l
        trees << [lmin,lmax,larcs]
      end
    end
  end
  trees
end


def printgraph()
  print "N:    #{$n}\n"
  print "SAT:  #{set_to_s($sat)}\n"
  print "SUCC: #{rel_to_s($succ)}\n"
  print "VIS:  #{rel_to_s($vis)}\n"
  print "LHS:  #{rel_to_s($lhs)}\n"
  print "ARCS: #{arcs_to_s($arcs)}\n"
end


def printground
  for i in 1...($form.length-1)
    print "#{$ord1[i]} #{$form[i]} #{$lem[i]} "
    print $arcs.select{|a| $ord1[$gphid[a[1]]] == $ord1[i]}.map{|a| "#{a[2]}:#{$ord1[$gphid[a[0]]]}"}.sort.uniq.join(' ')
    print "\n"
  end
end

def printconll
  for i in 1...($form.length-1)
    id = $ord1[i]
    form = $form[i]
    /^(?<lemma>.*),(?<cpostag>[^\/]*)(\/(?<feats>.+))?/ =~ $lem[i]
    arcs = $arcs.select{|a| $ord1[$gphid[a[1]]] == $ord1[i]}.map{|a| [$ord1[$gphid[a[0]]],a[2]]}.sort.uniq
    arcs = [[0,'root']] if arcs.empty?
    for a in arcs
      head,deprel = a
      puts [id,form,lemma,cpostag,cpostag,feats,head,deprel,nil,nil].map{|s| s ? s.to_s : "_"}.join("\t")
    end
  end
  puts
end


def set_to_s(s) "{#{s.join(',')}}" end
def rel_to_s(r) "{#{r.map{|p| "(#{p[0]},#{p[1]})"}.join(',')}}" end
def arc_to_s(q) "-#{q[0]}-#{q[2]}-#{q[1]}/#{q[3]}" end
def arcs_to_s(a) "{#{a.map{|q| arc_to_s(q)}.join(',')}}" end

######################################################################

tre($stdin)
