#!/usr/bin/perl -w

#package:	UAM Text Tools
#component:	kon (search context)
#version:	1.0
#author:	Justyna Walkowska

use strict;
use Getopt::Long;
use locale;
use File::HomeDir;

Getopt::Long::Configure('no_ignore_case_always');

my $l='30c';
my $r='30c';
my $trim=0;
my $white=0;
my $bon='[0-9]+ [0-9]+ BOM .*';
my $eon='[0-9]+ [0-9]+ EOM .*';
my $bod='[';
my $eod=']';
my $column=0;
my $ignore=0;
my $help=0;

my $systemconfigfile='/usr/local/etc/utt/kon.conf';
#my $userconfigfile="$ENV{'HOME'}/.utt/kon.conf";
my $userconfigfile=home()."/.utt/kon.conf";

#read configuration files###########################
my $file;
foreach $file ($systemconfigfile, $userconfigfile){
  if(open(CONFIG, $file)){
  	while (<CONFIG>) {
  		chomp;
      		s/#.*//;
	      	s/^\s+//;
      		s/\s+$//;
    		next unless length;
    		my ($name, $value) = split(/\s*=\s*/, $_, 2);
    		if(($name eq "left")or($name eq "l")){
			$l=$value;
    		}
    		elsif(($name eq "right")or($name eq "r")){
			$r=$value;
    		}
    		elsif(($name eq "trim")or($name eq "t")){
			$trim=1;
    		}
    		elsif(($name eq "white")or($name eq "w")){
			$white=1;
    		}
    		elsif($name eq "bom"){
			$bon=$value;
    		}
    		elsif($name eq "eom"){
			$eon=$value;
    		}
    		elsif($name eq "bod"){
			$bod=$value;
    		}
    		elsif($name eq "eod"){
			$eod=$value;
    		}
    		elsif(($name eq "column")or($name eq "c")){
			$column=$value;
    		}
    		elsif(($name eq "ignore")or($name eq "i")){
			$ignore=1;
    		}
    		elsif(($name eq "help")or($name eq "h")){
			$help=1;
    		}

	}
  	close CONFIG;
  }
}
#########################################################

GetOptions("left|l=s" => \$l,
	   "right|r=s" => \$r,
	   "trim|t" => \$trim,
	   "white|w" => \$white,
           "bom=s" => \$bon,
	   "eom=s" => \$eon,
	   "bod=s" => \$bod,
	   "eod=s" => \$eod,
	   "column|c=s" => \$column,
	   "ignore|i" => \$ignore,
	   "help|h" => \$help);

if(!($column=~/^[0-9]+$/)){$column=0;}

if($help)
{
    print <<'END'
Options:
   --help -h            Help.
   --left -l            Left context info (default='30c')
                        Examples:
                                 -l=5c: left context is 5 characters
                                 -l=5w: left context is 5 words
                                 -l=5s: left context is 5 non-empty input lines
                                 -l='\s*\S+\sr\S+BOS': left context starts with the given regex
   --right -r           Right context info (default='30c')
   --trim -t            Clear incomplete words from output
   --white -w           DO NOT change all white characters into spaces
   --column -c          Left column minimal width in characters (default = 0)
   --ignore -i		Ignore input inconsistency
   --bon                Beginning of selected segment
                        (regex, default='[0-9]+ [0-9]+ BOM .*')
   --eon                End of selected segment
                        (regex, default='[0-9]+ [0-9]+ EOM .*')
   --bod                Selected segment beginning display (default='[')
   --eod                Selected segment end display (default=']')

END
;
    exit 0;
}


my $seg_no=0;
my $seg_size=0;

my $left_type;
my $left_size;
my $right_type;
my $right_size;

set_lr_types($l, $r, \$left_type,\$left_size,\$right_type,\$right_size, $trim);


my $inn=0;
my $after_bos=0;
my $before_eos=0;

my @LEFT; #tablica skalarów
my @CENTER; #tablica skalarów
my @RIGHT;

my @current_center;
my @current_left; #skalar dla c, w pp. tablica
my @current_left_words;
my @current_right_words_number;


while(<>){
                       my $line = $_;
	               chomp $line;
                       my @line = split / /, $line;
	               my $line_s=@line;

	               if(!line_format_ok(@line)){next;}

                       if(!$white){white_into_spaces(\@line);}
                       else{if($line[2] eq "S"){symbols_into_white(\$line[3]);}}

                       if(!input_consistent(\$seg_no,\$seg_size,$line[0],$line[1],$ignore)){
                         eof_or_inconsistency(\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,$left_type,$right_type);
                         @current_center=();
                         @current_left=();
                         @current_left_words=();
                         @current_right_words_number=();
                         $after_bos=0;
                         $before_eos=0;
                       }

                       remember_current_left($left_type,$left_size,\@current_left,\@line, \@current_left_words, $line, \$after_bos, \$before_eos);
                       remember_center($line,\@line,\$inn,\@current_center,$white,\@CENTER,\@current_left,\@LEFT, \$after_bos, \$before_eos, \@RIGHT, \@current_right_words_number);
                       remember_right($right_type,$left_type,$right_size,\@line,\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,\@current_right_words_number, $line, \$before_eos);
}

eof_or_inconsistency(\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,$left_type,$right_type);
exit(0);

#################procedury###############################

sub line_format_ok{
    my @line = @_;
    my $size = @line;
    if($size<4){return 0;}
    if($line[0]!~/[0-9]+/){return 0;}
    if($line[1]!~/[0-9]+/){return 0;}
    return 1;
  }

sub white_into_spaces{
    my $line_ref=shift;
    if(@{$line_ref}[2] eq "S"){
       @{$line_ref}[3]=" ";
       }
  }

sub symbols_into_white{
    my $string_ref=shift;
    ${$string_ref} =~ s/\\n/\n/g;
    ${$string_ref} =~ s/\\t/\t/g;
    ${$string_ref} =~ s/_/ /g;
  }

sub white_into_symbols{
    my $string_ref=shift;
    ${$string_ref} =~ s/\n/\\n/g;
    ${$string_ref} =~ s/\t/\\t/g;
    ${$string_ref} =~ s/ /_/g;
  }

sub input_consistent{
    my $seg_no_ref = shift;
    my $seg_size_ref = shift;
    my $line0 = shift;
    my $line1 = shift;
    my $ig = shift;
    my $ok=1;

    if(${$seg_no_ref}!=0&&(!$ig)){
       my $distance = $line0-${$seg_size_ref};
       if($distance!=${$seg_no_ref}){$ok=0;}
    }
    ${$seg_no_ref}=$line0;
    ${$seg_size_ref}=$line1;
    return $ok;
  }

sub set_lr_types{
  my $left = shift;
  my $right = shift;
  my $left_type_ref =shift;
  my $left_size_ref =shift;
  my $right_type_ref =shift;
  my $right_size_ref =shift;
  my $do_trim=shift;

  if($left=~/[0-9]+c/){
    ${$left_type_ref}='c';
    ${$left_size_ref}=get_number($left);
    if($do_trim){${$left_size_ref}++;}
  }
  else{
    if($left=~/[0-9]+w/){
      ${$left_type_ref}='w';
      ${$left_size_ref}=get_number($left);
    }
    else{
      if($left=~/[0-9]+s/){
        ${$left_type_ref}='s';
        ${$left_size_ref}=get_number($left);
      }
      else{
           ${$left_type_ref}=$left;
      }
    }
  }

if($right=~/[0-9]+c/){
    ${$right_type_ref}='c';
    ${$right_size_ref}=get_number($right);
    if($do_trim){${$right_size_ref}++;}
  }
  else{
    if($right=~/[0-9]+w/){
      ${$right_type_ref}='w';
      ${$right_size_ref}=get_number($right);
    }
    else{
      if($right=~/[0-9]+s/){
        ${$right_type_ref}='s';
        ${$right_size_ref}=get_number($right);
      }
      else{
           ${$right_type_ref}=$right;
      }
    }
  }
  }

sub get_number{
  my $string = shift;
  my @letters = split(//,$string);
  my $i=0;
  while($letters[$i]=~/[0-9]/){$i++;}
  my $j;
  my $number=0;
  my $ten=1;
  for($j=$i-1;$j>=0;$j--){
    $number+=$letters[$j]*$ten;
    $ten*=10;
    }
  return $number;
  }

sub remember_center{
 my $lin = shift;
 my $lin_ref = shift;
 my $inn_ref = shift;
 my $current_center_ref = shift;
 my $white_info = shift;
 my $CENTER_REF = shift;
 my $current_left_ref = shift;
 my $LEFT_REF = shift;
 my $after_bos_ref = shift;
 my $before_eos_ref = shift;
 my $RIGHT_REF = shift;
 my $current_words_right_number_ref = shift;

 if((!${$inn_ref}) && $lin=~/$bon/){
   ${$inn_ref}=1;
   @{$current_center_ref}=();
   ${$after_bos_ref}=0;

   push(@{$LEFT_REF},join('',@{$current_left_ref}));

   }
 if(${$inn_ref} && $lin=~/$eon/){
   ${$inn_ref}=0;
   push(@{$CENTER_REF},join('',@{$current_center_ref}));
   ${$before_eos_ref}=1;
   my @new_table;
   push(@{$RIGHT_REF},\@new_table);
   push(@{$current_words_right_number_ref},0);
   }
 if($inn && index($lin,'*')==-1){
   white_into_symbols(\${$lin_ref}[3]);
   if($white_info){push(@{$current_center_ref},${$lin_ref}[3]);}
   else{push(@{$current_center_ref},${$lin_ref}[3]);}
   }
  }

sub remember_current_left{
my $type=shift;
my $size=shift;
my $ref=shift;
my $line_ref=shift;
   if($type eq 'c'){
     if(!(${$line_ref}[3] eq '*')){
       push(@{$ref},split('',${$line_ref}[3]));
       my $lsize = @{$ref};
       if($lsize>$size){splice(@{$ref},0,$lsize-$size);}
     }
   }
   else{
     if($type eq 'w'){
              my $words_ref = shift;
              if(!(${$line_ref}[3] eq '*')){
                push(@{$ref},${$line_ref}[3]);
                if(${$line_ref}[2] eq 'W'){
                 push(@{$words_ref},${$line_ref}[3]);
                }
                my $lsize = @{$words_ref};
                if($lsize>$size){
                    my $word = ${$words_ref}[1];
                    splice(@{$words_ref},0,1);
                    while(!(${$ref}[0] eq $word)){splice(@{$ref},0,1); }
                  }
                  }

       }
       else{
         if($type eq 's'){
           if(!(${$line_ref}[3] eq '*')){
           push(@{$ref},${$line_ref}[3]);
           my $lsize = @{$ref};
           if($lsize>$size){splice(@{$ref},0,$lsize-$size);}
     }
         }
         else{#bos/eos
           shift;
           my $line = shift;
           my $after_bos_ref = shift;
           my $before_eos_ref = shift;
           if($line=~/$type/){
             ${$after_bos_ref}=1;
             @{$ref}=();
             }
           if(${$after_bos_ref} && !(${$line_ref}[3] eq '*')){
             push(@{$ref},${$line_ref}[3]);
           }
         }
       }
   }
  }

sub remember_right{
my $type=shift;
my $type_left=shift;
my $size=shift;
my $line_ref=shift;
my $LEFT_REF=shift;
my $CENTER_REF=shift;
my $RIGHT_REF=shift;
my $bod=shift;
my $eod=shift;
my $w=shift;
my $c=shift;
my $t=shift;

   if($type eq 'c'){
     if(!(${$line_ref}[3] eq '*')){
       my $right_size = @{$RIGHT_REF};
       for(my $i=0; $i<$right_size; $i++){
         push(@{${$RIGHT_REF}[$i]}, split('',${$line_ref}[3]));
         my $lsize = @{${$RIGHT_REF}[$i]};
         if($lsize>=$size){
           splice(@{${$RIGHT_REF}[$i]},$size-1); #wypisz i usun
           print_and_remove($i,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
           $right_size = @{$RIGHT_REF};
           $i--;
         }
       }
     }
   }
   else{
     if($type eq 'w'){
              my $words_number_ref = shift;
              if(!(${$line_ref}[3] eq '*')){
                my $right_size = @{$RIGHT_REF};
                for(my $i=0; $i<$right_size; $i++){
                  push(@{${$RIGHT_REF}[$i]},${$line_ref}[3]);
                  if(${$line_ref}[2] eq 'W'){
                    ${$words_number_ref}[$i]=${$words_number_ref}[$i]+1;
                    if(${$words_number_ref}[$i]==$size){
                      print_and_remove($i,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
                      $right_size = @{$RIGHT_REF};
                      $i--;
                      splice(@{$words_number_ref},$i,1);
                    }
                  }
                }
              }
        }
       else{
         if($type eq 's'){
           if(!(${$line_ref}[3] eq '*')){
             my $right_s = @{$RIGHT_REF};
             for(my $i=0; $i<$right_s; $i++){
                  push(@{${$RIGHT_REF}[$i]},${$line_ref}[3]);
                  my $rsize=@{${$RIGHT_REF}[$i]};
                    if($rsize==$size){
                      print_and_remove($i,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
                      $right_s = @{$RIGHT_REF};
                      $i--;
                    }
              }
           }
       }
       else{#bos/eos
           shift;
           my $line = shift;
           my $before_eos_ref = shift;
           if(${$before_eos_ref}){
             if(!(${$line_ref}[3] eq '*')){
               #tylko 1 pozycja
               push(@{${$RIGHT_REF}[0]},${$line_ref}[3]);
             }
             if($line=~/$type/){
               ${$before_eos_ref}=0;
               print_and_remove(0,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
             }
           }
         }
   }
  }
}

sub print_and_remove{
    my $index = shift;
    my $LEFT_REF = shift;
    my $CENTER_REF = shift;
    my $RIGHT_REF = shift;
    my $bdis = shift;
    my $edis = shift;
    my $white = shift;
    my $column = shift;
    my $trim = shift;
    my $left_type = shift;
    my $right_type = shift;

    my $left_string = "${$LEFT_REF}[$index]";
    my $right_string = join('',@{${$RIGHT_REF}[$index]});

    if($trim){
      if($left_type eq "c"){$left_string=trim_left($left_string);}
      if($right_type eq "c"){$right_string=trim_right($right_string);}
      }

    if(length($left_string)<$column){$left_string=" "x($column-length($left_string)).$left_string;}

    if($white){
      white_into_symbols(\$left_string);
      white_into_symbols(\$right_string);
#ponizsza linijka dodana 18 listopada
      white_into_symbols(\${$CENTER_REF}[$index]);
    }

    print $left_string;
    print $bdis;

#ponizsza 3 linijki (tj. 1 blok) dodana 18 listopada
    if(!$white){
	symbols_into_white(\${$CENTER_REF}[$index]);
    }

    print "${$CENTER_REF}[$index]";
    print $edis;
    print $right_string;
    print "\n";

    splice(@{$LEFT_REF},$index,1);
    splice(@{$CENTER_REF},$index,1);
    splice(@{$RIGHT_REF},$index,1);
  }

sub trim_left{
    my $string = shift;
    if(substr($string,0,1) eq " "){return substr($string,1);}
    my $position = index($string," ");
    my $temp_position = index($string,"\n");
    if(!$temp_position==-1&&($position==-1||$temp_position<$position)){$position=$temp_position;}
    $temp_position = index($string,"\t");
    if(!$temp_position==-1&&($position==-1||$temp_position<$position)){$position=$temp_position;}
    return substr($string,$position+1);
  }

sub trim_right{
    my $string = shift;
    my $length = length($string);
    if(substr($string,$length-1,1) eq " "){return substr($string,0,$length-1);}
    my $position = rindex($string," ");
    my $temp_position = rindex($string,"\n");
    if($temp_position>$position){$position=$temp_position;}
    $temp_position = rindex($string,"\t");
    if($temp_position>$position){$position=$temp_position;}
    return substr($string,0,$position);
  }

sub eof_or_inconsistency{
    my $LEFT_REF = shift;
    my $CENTER_REF = shift;
    my $RIGHT_REF = shift;
    my $bdis = shift;
    my $edis = shift;
    my $white = shift;
    my $column = shift;
    my $trim = shift;
    my $left_type = shift;
    my $right_type = shift;

    my $length = @{$CENTER_REF};
    for(my $i=0;$i<$length;$i++){
       print_and_remove(0,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bdis,$edis,$white,$column,$trim,$left_type,$right_type);
       $length = @{$CENTER_REF};
       $i--;
      }
  }
