#!/usr/bin/perl -w #package: UAM Text Tools #component: kon (search context) #version: 1.0 #author: Justyna Walkowska use strict; use Getopt::Long; use locale; use File::HomeDir; Getopt::Long::Configure('no_ignore_case_always'); my $l='30c'; my $r='30c'; my $trim=0; my $white=0; my $bon='[0-9]+ [0-9]+ BOM .*'; my $eon='[0-9]+ [0-9]+ EOM .*'; my $bod='['; my $eod=']'; my $column=0; my $ignore=0; my $help=0; my $systemconfigfile='/usr/local/etc/utt/kon.conf'; #my $userconfigfile="$ENV{'HOME'}/.utt/kon.conf"; my $userconfigfile=home()."/.utt/kon.conf"; #read configuration files########################### my $file; foreach $file ($systemconfigfile, $userconfigfile){ if(open(CONFIG, $file)){ while () { chomp; s/#.*//; s/^\s+//; s/\s+$//; next unless length; my ($name, $value) = split(/\s*=\s*/, $_, 2); if(($name eq "left")or($name eq "l")){ $l=$value; } elsif(($name eq "right")or($name eq "r")){ $r=$value; } elsif(($name eq "trim")or($name eq "t")){ $trim=1; } elsif(($name eq "white")or($name eq "w")){ $white=1; } elsif($name eq "bom"){ $bon=$value; } elsif($name eq "eom"){ $eon=$value; } elsif($name eq "bod"){ $bod=$value; } elsif($name eq "eod"){ $eod=$value; } elsif(($name eq "column")or($name eq "c")){ $column=$value; } elsif(($name eq "ignore")or($name eq "i")){ $ignore=1; } elsif(($name eq "help")or($name eq "h")){ $help=1; } } close CONFIG; } } ######################################################### GetOptions("left|l=s" => \$l, "right|r=s" => \$r, "trim|t" => \$trim, "white|w" => \$white, "bom=s" => \$bon, "eom=s" => \$eon, "bod=s" => \$bod, "eod=s" => \$eod, "column|c=s" => \$column, "ignore|i" => \$ignore, "help|h" => \$help); if(!($column=~/^[0-9]+$/)){$column=0;} if($help) { print <<'END' Options: --help -h Help. --left -l Left context info (default='30c') Examples: -l=5c: left context is 5 characters -l=5w: left context is 5 words -l=5s: left context is 5 non-empty input lines -l='\s*\S+\sr\S+BOS': left context starts with the given regex --right -r Right context info (default='30c') --trim -t Clear incomplete words from output --white -w DO NOT change all white characters into spaces --column -c Left column minimal width in characters (default = 0) --ignore -i Ignore input inconsistency --bon Beginning of selected segment (regex, default='[0-9]+ [0-9]+ BOM .*') --eon End of selected segment (regex, default='[0-9]+ [0-9]+ EOM .*') --bod Selected segment beginning display (default='[') --eod Selected segment end display (default=']') END ; exit 0; } my $seg_no=0; my $seg_size=0; my $left_type; my $left_size; my $right_type; my $right_size; set_lr_types($l, $r, \$left_type,\$left_size,\$right_type,\$right_size, $trim); my $inn=0; my $after_bos=0; my $before_eos=0; my @LEFT; #tablica skalarów my @CENTER; #tablica skalarów my @RIGHT; my @current_center; my @current_left; #skalar dla c, w pp. tablica my @current_left_words; my @current_right_words_number; while(<>){ my $line = $_; chomp $line; my @line = split / /, $line; my $line_s=@line; if(!line_format_ok(@line)){next;} if(!$white){white_into_spaces(\@line);} else{if($line[2] eq "S"){symbols_into_white(\$line[3]);}} if(!input_consistent(\$seg_no,\$seg_size,$line[0],$line[1],$ignore)){ eof_or_inconsistency(\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,$left_type,$right_type); @current_center=(); @current_left=(); @current_left_words=(); @current_right_words_number=(); $after_bos=0; $before_eos=0; } remember_current_left($left_type,$left_size,\@current_left,\@line, \@current_left_words, $line, \$after_bos, \$before_eos); remember_center($line,\@line,\$inn,\@current_center,$white,\@CENTER,\@current_left,\@LEFT, \$after_bos, \$before_eos, \@RIGHT, \@current_right_words_number); remember_right($right_type,$left_type,$right_size,\@line,\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,\@current_right_words_number, $line, \$before_eos); } eof_or_inconsistency(\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,$left_type,$right_type); exit(0); #################procedury############################### sub line_format_ok{ my @line = @_; my $size = @line; if($size<4){return 0;} if($line[0]!~/[0-9]+/){return 0;} if($line[1]!~/[0-9]+/){return 0;} return 1; } sub white_into_spaces{ my $line_ref=shift; if(@{$line_ref}[2] eq "S"){ @{$line_ref}[3]=" "; } } sub symbols_into_white{ my $string_ref=shift; ${$string_ref} =~ s/\\n/\n/g; ${$string_ref} =~ s/\\t/\t/g; ${$string_ref} =~ s/_/ /g; } sub white_into_symbols{ my $string_ref=shift; ${$string_ref} =~ s/\n/\\n/g; ${$string_ref} =~ s/\t/\\t/g; ${$string_ref} =~ s/ /_/g; } sub input_consistent{ my $seg_no_ref = shift; my $seg_size_ref = shift; my $line0 = shift; my $line1 = shift; my $ig = shift; my $ok=1; if(${$seg_no_ref}!=0&&(!$ig)){ my $distance = $line0-${$seg_size_ref}; if($distance!=${$seg_no_ref}){$ok=0;} } ${$seg_no_ref}=$line0; ${$seg_size_ref}=$line1; return $ok; } sub set_lr_types{ my $left = shift; my $right = shift; my $left_type_ref =shift; my $left_size_ref =shift; my $right_type_ref =shift; my $right_size_ref =shift; my $do_trim=shift; if($left=~/[0-9]+c/){ ${$left_type_ref}='c'; ${$left_size_ref}=get_number($left); if($do_trim){${$left_size_ref}++;} } else{ if($left=~/[0-9]+w/){ ${$left_type_ref}='w'; ${$left_size_ref}=get_number($left); } else{ if($left=~/[0-9]+s/){ ${$left_type_ref}='s'; ${$left_size_ref}=get_number($left); } else{ ${$left_type_ref}=$left; } } } if($right=~/[0-9]+c/){ ${$right_type_ref}='c'; ${$right_size_ref}=get_number($right); if($do_trim){${$right_size_ref}++;} } else{ if($right=~/[0-9]+w/){ ${$right_type_ref}='w'; ${$right_size_ref}=get_number($right); } else{ if($right=~/[0-9]+s/){ ${$right_type_ref}='s'; ${$right_size_ref}=get_number($right); } else{ ${$right_type_ref}=$right; } } } } sub get_number{ my $string = shift; my @letters = split(//,$string); my $i=0; while($letters[$i]=~/[0-9]/){$i++;} my $j; my $number=0; my $ten=1; for($j=$i-1;$j>=0;$j--){ $number+=$letters[$j]*$ten; $ten*=10; } return $number; } sub remember_center{ my $lin = shift; my $lin_ref = shift; my $inn_ref = shift; my $current_center_ref = shift; my $white_info = shift; my $CENTER_REF = shift; my $current_left_ref = shift; my $LEFT_REF = shift; my $after_bos_ref = shift; my $before_eos_ref = shift; my $RIGHT_REF = shift; my $current_words_right_number_ref = shift; if((!${$inn_ref}) && $lin=~/$bon/){ ${$inn_ref}=1; @{$current_center_ref}=(); ${$after_bos_ref}=0; push(@{$LEFT_REF},join('',@{$current_left_ref})); } if(${$inn_ref} && $lin=~/$eon/){ ${$inn_ref}=0; push(@{$CENTER_REF},join('',@{$current_center_ref})); ${$before_eos_ref}=1; my @new_table; push(@{$RIGHT_REF},\@new_table); push(@{$current_words_right_number_ref},0); } if($inn && index($lin,'*')==-1){ white_into_symbols(\${$lin_ref}[3]); if($white_info){push(@{$current_center_ref},${$lin_ref}[3]);} else{push(@{$current_center_ref},${$lin_ref}[3]);} } } sub remember_current_left{ my $type=shift; my $size=shift; my $ref=shift; my $line_ref=shift; if($type eq 'c'){ if(!(${$line_ref}[3] eq '*')){ push(@{$ref},split('',${$line_ref}[3])); my $lsize = @{$ref}; if($lsize>$size){splice(@{$ref},0,$lsize-$size);} } } else{ if($type eq 'w'){ my $words_ref = shift; if(!(${$line_ref}[3] eq '*')){ push(@{$ref},${$line_ref}[3]); if(${$line_ref}[2] eq 'W'){ push(@{$words_ref},${$line_ref}[3]); } my $lsize = @{$words_ref}; if($lsize>$size){ my $word = ${$words_ref}[1]; splice(@{$words_ref},0,1); while(!(${$ref}[0] eq $word)){splice(@{$ref},0,1); } } } } else{ if($type eq 's'){ if(!(${$line_ref}[3] eq '*')){ push(@{$ref},${$line_ref}[3]); my $lsize = @{$ref}; if($lsize>$size){splice(@{$ref},0,$lsize-$size);} } } else{#bos/eos shift; my $line = shift; my $after_bos_ref = shift; my $before_eos_ref = shift; if($line=~/$type/){ ${$after_bos_ref}=1; @{$ref}=(); } if(${$after_bos_ref} && !(${$line_ref}[3] eq '*')){ push(@{$ref},${$line_ref}[3]); } } } } } sub remember_right{ my $type=shift; my $type_left=shift; my $size=shift; my $line_ref=shift; my $LEFT_REF=shift; my $CENTER_REF=shift; my $RIGHT_REF=shift; my $bod=shift; my $eod=shift; my $w=shift; my $c=shift; my $t=shift; if($type eq 'c'){ if(!(${$line_ref}[3] eq '*')){ my $right_size = @{$RIGHT_REF}; for(my $i=0; $i<$right_size; $i++){ push(@{${$RIGHT_REF}[$i]}, split('',${$line_ref}[3])); my $lsize = @{${$RIGHT_REF}[$i]}; if($lsize>=$size){ splice(@{${$RIGHT_REF}[$i]},$size-1); #wypisz i usun print_and_remove($i,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type); $right_size = @{$RIGHT_REF}; $i--; } } } } else{ if($type eq 'w'){ my $words_number_ref = shift; if(!(${$line_ref}[3] eq '*')){ my $right_size = @{$RIGHT_REF}; for(my $i=0; $i<$right_size; $i++){ push(@{${$RIGHT_REF}[$i]},${$line_ref}[3]); if(${$line_ref}[2] eq 'W'){ ${$words_number_ref}[$i]=${$words_number_ref}[$i]+1; if(${$words_number_ref}[$i]==$size){ print_and_remove($i,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type); $right_size = @{$RIGHT_REF}; $i--; splice(@{$words_number_ref},$i,1); } } } } } else{ if($type eq 's'){ if(!(${$line_ref}[3] eq '*')){ my $right_s = @{$RIGHT_REF}; for(my $i=0; $i<$right_s; $i++){ push(@{${$RIGHT_REF}[$i]},${$line_ref}[3]); my $rsize=@{${$RIGHT_REF}[$i]}; if($rsize==$size){ print_and_remove($i,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type); $right_s = @{$RIGHT_REF}; $i--; } } } } else{#bos/eos shift; my $line = shift; my $before_eos_ref = shift; if(${$before_eos_ref}){ if(!(${$line_ref}[3] eq '*')){ #tylko 1 pozycja push(@{${$RIGHT_REF}[0]},${$line_ref}[3]); } if($line=~/$type/){ ${$before_eos_ref}=0; print_and_remove(0,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type); } } } } } } sub print_and_remove{ my $index = shift; my $LEFT_REF = shift; my $CENTER_REF = shift; my $RIGHT_REF = shift; my $bdis = shift; my $edis = shift; my $white = shift; my $column = shift; my $trim = shift; my $left_type = shift; my $right_type = shift; my $left_string = "${$LEFT_REF}[$index]"; my $right_string = join('',@{${$RIGHT_REF}[$index]}); if($trim){ if($left_type eq "c"){$left_string=trim_left($left_string);} if($right_type eq "c"){$right_string=trim_right($right_string);} } if(length($left_string)<$column){$left_string=" "x($column-length($left_string)).$left_string;} if($white){ white_into_symbols(\$left_string); white_into_symbols(\$right_string); #ponizsza linijka dodana 18 listopada white_into_symbols(\${$CENTER_REF}[$index]); } print $left_string; print $bdis; #ponizsza 3 linijki (tj. 1 blok) dodana 18 listopada if(!$white){ symbols_into_white(\${$CENTER_REF}[$index]); } print "${$CENTER_REF}[$index]"; print $edis; print $right_string; print "\n"; splice(@{$LEFT_REF},$index,1); splice(@{$CENTER_REF},$index,1); splice(@{$RIGHT_REF},$index,1); } sub trim_left{ my $string = shift; if(substr($string,0,1) eq " "){return substr($string,1);} my $position = index($string," "); my $temp_position = index($string,"\n"); if(!$temp_position==-1&&($position==-1||$temp_position<$position)){$position=$temp_position;} $temp_position = index($string,"\t"); if(!$temp_position==-1&&($position==-1||$temp_position<$position)){$position=$temp_position;} return substr($string,$position+1); } sub trim_right{ my $string = shift; my $length = length($string); if(substr($string,$length-1,1) eq " "){return substr($string,0,$length-1);} my $position = rindex($string," "); my $temp_position = rindex($string,"\n"); if($temp_position>$position){$position=$temp_position;} $temp_position = rindex($string,"\t"); if($temp_position>$position){$position=$temp_position;} return substr($string,0,$position); } sub eof_or_inconsistency{ my $LEFT_REF = shift; my $CENTER_REF = shift; my $RIGHT_REF = shift; my $bdis = shift; my $edis = shift; my $white = shift; my $column = shift; my $trim = shift; my $left_type = shift; my $right_type = shift; my $length = @{$CENTER_REF}; for(my $i=0;$i<$length;$i++){ print_and_remove(0,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bdis,$edis,$white,$column,$trim,$left_type,$right_type); $length = @{$CENTER_REF}; $i--; } }