#!/usr/bin/perl

#package:        UAM Text Tools
#component name: gre
#author:         Tomasz Obrêbski

use strict;
use Getopt::Long;

my $LIB_DIR="/usr/local/lib/utt";  # directory containing terms.m4

# Configuration files. The system-wide file is read first, then the per-user
# file, so a setting in the user file overrides the same setting in the
# system file.
my $systemconfigfile="/usr/local/etc/utt/grp.conf";
my $userconfigfile="$ENV{'HOME'}/.utt/grp.conf";

Getopt::Long::Configure('no_ignore_case_always');

# Option defaults. For the settings initialised to 0, a false value is what
# the rest of the script tests to detect "not set".
my $help=0;
my $pattern=0;
my $matches_only=0;
my $macrofile=0;
my $define=0;
my $show_command=0;
my $action="pgP";
my $eos="seg(EOS)";
my $morfield='lem';

#read configuration files###########################

# Settings of the form "name = value": the value is stored in the variable.
my %config_value_for = (
    pattern => \$pattern,
    e       => \$pattern,
    eos     => \$eos,
    E       => \$eos,
    morph   => \$morfield,
    macros  => \$macrofile,
    define  => \$define,
    action  => \$action,
);

# Settings that act as switches: their mere presence sets the variable to 1,
# whatever (if anything) follows the name.
my %config_flag_for = (
    command => \$show_command,
    help    => \$help,
    h       => \$help,
);

# Read one configuration file and store its settings through the given tables.
#
#   $path      - file to read
#   $value_for - hash ref: setting name => scalar ref receiving the value
#   $flag_for  - hash ref: setting name => scalar ref set to 1 when present
#
# Each line has '#' comments and surrounding whitespace removed; empty lines
# are skipped. The rest is split on the first '=' into name and value.
# Names found in neither table are ignored.
#
# Returns 1 if the file was read, 0 if it could not be opened.
sub _read_config_file {
    my ($path, $value_for, $flag_for) = @_;

    # A file that cannot be opened is skipped without any message.
    open(my $fh, '<', $path) or return 0;

    while (my $line = <$fh>) {
        chomp $line;
        $line =~ s/#.*//;       # drop comment
        $line =~ s/^\s+//;      # drop leading whitespace
        $line =~ s/\s+$//;      # drop trailing whitespace
        next unless length $line;

        # Limit of 2 keeps any further '=' characters inside the value.
        my ($name, $value) = split(/\s*=\s*/, $line, 2);

        if (exists $value_for->{$name}) {
            ${ $value_for->{$name} } = $value;
        }
        elsif (exists $flag_for->{$name}) {
            ${ $flag_for->{$name} } = 1;
        }
    }
    close $fh;
    return 1;
}

foreach my $file ($systemconfigfile, $userconfigfile){
    _read_config_file($file, \%config_value_for, \%config_flag_for);
}
#########################################################

# Command-line options; each one overrides the value the variable already
# holds. --define is bound to $define (it used to overwrite $macrofile), and
# -a is declared explicitly because the help text documents it.
# NOTE(review): $eos is accepted here but is not referenced anywhere else in
# this script.
GetOptions("pattern|e=s" => \$pattern,
	   "eos|E=s" => \$eos,
	   "morph=s" => \$morfield,
	   "macros=s" => \$macrofile,
	   "define=s" => \$define,
	   "command" => \$show_command,
	   "action|a=s" => \$action,
	   "help|h" => \$help);

if($help)
{
    print <<'END'
Usage: gre [OPTIONS] [file ..]

Options:
   --pattern -e	PATTERN		Pattern.
   --eos -E PATTERN             Segment serving as sentence delimiter.
   --morph=STRING               Field containing morphological information (default 'lem').
   --macros=FILE		Read macrodefinitions from FILE.
   --define=FILE		Add macrodefinitions from FILE.
   --action -a [u][p][g][P]	Perform only indicated actions.
				    u - uncompress with 'lzop -cd'
				    p - preprocess
				    g - grep
				    P - postprocess
				(default pgP)
   --command			Print the shell command to be executed and exit.
   --help -h			Help.
END
;
    exit 0;
}

# A pattern is only required when the grep stage ('g') is requested.
die("$0: no pattern given.\n") unless $pattern || $action !~ /g/;

# Fall back to the installed terms.m4 when no macro file was configured.
unless ($macrofile) {
    -e "$LIB_DIR/terms.m4" or die("$0: macro file not found");
    $macrofile = "$LIB_DIR/terms.m4";
}

# Pipeline stages selected by the letters present in $action. Each optional
# stage carries its own '|' so that the pieces can simply be concatenated.
my $uncompress = ($action =~ /u/) ? ' lzop -cd | '  : '';
my $preproc    = ($action =~ /p/) ? ' fla  | '  : '';

my $postproc   = ($action =~ /P/) ? ' | unfla '  : '';


# The pattern is passed to m4 below inside a double-quoted shell string, so
# the backslash-backtick sequences inserted here reach m4 as plain backticks.

# discarding spaces
$pattern =~ s/\s+/\\`'/g; #` 
# quoting escaped commas
$pattern =~ s/\\,/\\`\\`\\,''/g;
# quoting commas in {m,n} r.e. operator
$pattern =~ s/(\{\d*),(\d*\})/$1\\`\\`,''$2/g;

# Macro definitions given with --define are read by m4 after the main macro
# file, i.e. they are added to it rather than replacing it.
my $macro_files = $define ? "$macrofile $define" : $macrofile;

# NOTE(review): the pattern, the field name and the file names are
# interpolated into a shell command line without quoting; shell
# metacharacters in any of them are interpreted by the shell.
my $grepre = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macro_files - 2>/dev/null`;

# A non-zero exit status of the pipeline above is reported as a bad pattern.
die("Incorrect pattern (m4).") if $? >> 8;


chomp $grepre;

# <> expansion: the text between angle brackets is replaced by the output of
# the external tag2re command.

$grepre =~ s/<([^>]+)>/`echo $1 | tag2re`/ge;

# Every '.' is narrowed so that it matches neither a space nor a control
# character.
$grepre =~ s/\./[^ [:cntrl:]]/g;

# Perl-style character class escapes rewritten as bracket expressions.
$grepre =~ s/\\s/[ ]/g;
$grepre =~ s/\\S/[^ [:cntrl:]]/g;
$grepre =~ s/\\d/[0-9]/g;
$grepre =~ s/\\D/[^0-9 [:cntrl:]]/g;
$grepre =~ s/\\w/[a-z±æê³ñó¶¼¿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_]/g;
$grepre =~ s/\\W/[^a-z±æê³ñó¶¼¿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_ [:cntrl:]]/g;
# extensions
$grepre =~ s/\\l/[a-z±æê³ñó¶¼¿]/g; #lowercase letter
$grepre =~ s/\\L/[A-Z¡ÆÊ£ÑÓ¦¬¯]/g; #uppercase letter

# Without the 'g' action the grep stage is replaced by a pass-through.
my $grep_command = ($action =~ /g/) ? "egrep '$grepre'" : " cat ";

if($show_command)
{
    # Only the grep stage is printed, not the surrounding pipeline.
    print $grep_command."\n";
    exit 0;
}

#print $uncompress.$preproc.$grep_command.$postproc."\n";

# Replace this process with the shell pipeline. The uncompress stage comes
# first so that action 'u' takes effect (it used to be computed but left out).
# NOTE(review): file arguments left in @ARGV are not passed on, although the
# usage line mentions "[file ..]"; the pipeline reads standard input.
exec $uncompress.$preproc.$grep_command.$postproc;
