Index: src/mar.pl
===================================================================
--- src/mar.pl	(revision dc2176950fd43bdb6684b7730f37aaa88239b818)
+++ src/mar.pl	(revision dc2176950fd43bdb6684b7730f37aaa88239b818)
@@ -0,0 +1,336 @@
+#!/usr/bin/perl
+
+#package:	UAM Text Tools
+#component:	mar
+#version:	1.0
+#author:	Marcin Walas
+
+#this program tags the tokenized file with given tags
+#tags can be given in any order and configuration through the expression
+#which is one of the parametres of the script
+#contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com
+
+my $version = '1.0';
+
+use lib "/usr/local/lib/utt";
+use lib "$ENV{'HOME'}/.local/lib/utt";
+
+use strict;
+use Getopt::Long;
+use File::HomeDir;
+
+use attr;
+
+
+my $LIB_DIR="/usr/local/lib/utt";
+my $systemconfigfile='/usr/local/etc/utt/mar.conf';
+my $userconfigfile=home()."/.utt/mar.conf";
+
+Getopt::Long::Configure('no_ignore_case_always');
+
+my $help=0;
+my $pattern=0;
+my $macrofile=0;
+my $define=0;
+my $command=0;
+my $action="pgP";
+my $eos="seg(EOS)";
+my $explicit_space=0;
+my $morfield='lem';
+my $tags=0;
+my $show_version = 0;
+
+#read configuration files###########################
+my $file;
+foreach $file ($systemconfigfile, $userconfigfile){
+  if(open(CONFIG, $file)){
+  	while (<CONFIG>) {
+  		chomp;                  
+      		s/#.*//;                
+	      	s/^\s+//;               
+      		s/\s+$//;               
+    		next unless length;     
+    		my ($name, $value) = split(/\s*=\s*/, $_, 2);
+    		if(($name eq "pattern")or($name eq "e")){
+			$pattern=$value;
+    		}
+    		elsif($name eq "eos"){
+			$eos=$value;
+    		}
+    		elsif($name eq "macros"){
+			$macrofile=$value;
+    		}
+    		elsif($name eq "tags"){
+    			$tags=$value;
+    		}
+    		elsif($name eq "morph"){
+			$morfield=$value;
+    		}
+    		elsif($name eq "command"){
+			$command=1;
+    		}
+    		elsif($name eq "action"){
+			$action=$value;
+    		}
+    		elsif($name eq "space"){
+			$explicit_space=1;
+    		}
+    		elsif(($name eq "help")or($name eq "h")){
+			$help=1;
+    		}
+    	
+	} 
+  	close CONFIG;
+  }
+}
+#########################################################
+
+GetOptions("pattern|e=s" => \$pattern,
+	   "eos|E=s" => \$eos,
+	   "macros=s" => \$macrofile,
+	   "define=s" => \$macrofile,
+	   "command" => \$command,
+	   "action=s" => \$action,
+	   "help|h" => \$help,
+	   "space|s" => \$explicit_space,
+       "version|v" => \$show_version,
+   );
+
+
+
+if($show_version){
+    print "Version: $version\n";
+    exit 0;
+}
+
+if($help)
+{
+    print <<'END'
+Usage: mar [OPTIONS] [file ..]
+
+Options:
+   --pattern -e	PATTERN		Pattern.
+   --eos -E PATTERN             Segment serving as sentence beginning marker. [TODO]
+   --macros=FILE		Read macrodefinitions from FILE. [TODO]
+   --define=FILE		Add macrodefinitions from FILE. [TODO]
+   --action -a [p][s][P]	Perform only indicated actions.
+				    p - preprocess
+				    s - search
+				    P - postprocess
+				(default psP)
+   --command			Print generated shell command and exit.
+   --help -h			Print help.
+   --version -v         Script version
+
+In patern you can put any tag. Tags should begin with the @ character.
+They don't have to be closed. 
+They can't contain white spaces!
+
+Note: If you don't define any custom tags, whole pattern will be taged with
+      default tags (begining of match and end of match)
+
+Tags examples:
+
+mar -e '@BEG cat(<ADJ>) @END'
+  it will find any adjectives in the text and tag them with surrounding tags
+mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)' 
+  this will find two neighbouring adjectives and parcel them with tag MYTAG
+
+Some example patterns:
+'word(domu)'  - form of the word domu
+'lexeme(dom)' - any form of lexeme dom
+'space'       - space
+'cat(<ADJ>)'  - adjective
+
+You can use * in patterns to make zero or more counts of word.
+
+END
+;
+    exit 0;
+}
+
+die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/;
+
+die("$0: macro file not found") unless
+    $macrofile or
+    -e "$LIB_DIR/terms.m4" and $macrofile="$LIB_DIR/terms.m4";
+
+my $preproc    = ($action =~ /p/) ? ' fla  | '  : '';
+
+my $postproc   = ($action =~ /P/) ? ' | unfla '  : '';
+
+
+#this is our help function to  cut the re to get another tag
+#it takes only one argument which is our patern (after m4 processing)
+#returns: the first root-level brace with content
+sub cutRe
+{
+ my $i = 0;
+ my $level = 0;
+ my $text = $_[0];
+ my $temp;
+ for( $i =0; $i < (length $text);$i++)
+ {
+	 $temp = substr($text, $i,1);
+	 if( $temp eq "(")
+	 {#we have an opening
+		 $level++;
+	 }
+	 elsif ( $temp eq ")")
+	 {#we close
+		 $level--;
+	 }
+	 if ( $level == 0)
+	 { 
+		$temp  = substr($text,0,$i+1);
+		last;
+	 }
+ }
+ $temp;
+}
+
+#the same function as above althought it returns everything after the
+#first root level brace
+sub restRe
+{
+ my $i = 0;
+ my $level = 0;
+ my $text = $_[0];
+ my $temp;
+ for( $i =0; $i < (length $text);$i++)
+ {
+	 $temp = substr($text, $i,1);
+	 if( $temp eq "(")
+	 {#we have an opening
+		 $level++;
+	 }
+	 elsif ( $temp eq ")")
+	 {#we close
+		 $level--;
+	 }
+	 if ( $level == 0)
+	 { #we cut everything in the begining
+		$temp  = substr($text,$i+1);
+		last;
+	 }
+ }
+ $temp;
+}
+
+
+#here we are preparing re for extended matching
+my @tags;
+
+#we must find what our the tags
+#some pattern adjustment
+my $end = 0;
+my $temp = " ".$pattern." ";
+$temp =~ s/(\@[^ ]*) (\@[^ ]* )/\1  \2/g;
+$pattern = $temp;
+
+while ($end != 1)
+{
+	#we seek for the first tag in pattern
+ if ($temp =~ /^.*?\@(.*?) /)
+ {
+	 #we add this to tags array
+	 push (@tags, $1);
+	 #and cut the pattern
+	 $temp =~ s/^.*?\@(.*?) / /;
+	 #print $temp."\n";
+ }
+ else
+ {
+	 #if we dont find any tags we end
+	 $end = 1;
+ }
+}
+
+#here we have our patern with tags removed (we set sections of ()) between tags
+my $patternmod = "( ".$pattern." )";
+$patternmod =~ s/\s@.*?\s/\)\(/g;
+
+#discarding spaces
+$patternmod =~ s/\s+/\\`'/g; #` 
+# quoting escaped commas
+$patternmod =~ s/\\,/\\`\\`\\,''/g;
+# quoting commas in {m,n} r.e. operator
+$patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g;
+#print "After m4:".$re."\n";
+
+my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`;
+
+die("Incorrect pattern (m4).") if $? >> 8;
+
+
+chomp $re;
+
+# <> expansion
+
+$re =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge;
+
+# Perl-like special sequences
+$re =~ s/\./[^ [:cntrl:]]/g;
+$re =~ s/\\s/[ ]/g;
+$re =~ s/\\S/[^ [:cntrl:]]/g;
+$re =~ s/\\d/[0-9]/g;
+$re =~ s/\\D/[^0-9 [:cntrl:]]/g;
+$re =~ s/\\w/[a-z±æê³ñó¶Œ¿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_]/g;
+$re =~ s/\\W/[^a-z±æê³ñó¶Œ¿A-Z¡ÆÊ£ÑÓŠ¬¯0-9_ [:cntrl:]]/g;
+# extensions
+$re =~ s/\\l/[a-z±æê³ñó¶Œ¿]/g; #lowercase letter
+$re =~ s/\\L/[A-Z¡ÆÊ£ÑÓŠ¬¯]/g; #upercase letter
+
+my $sedcommand;
+my $grepcommand;
+
+#now we must built a sed script from our re
+#we do this by cuting our re each tag until we cut them all
+#if an user dint input any tags we do our default 
+my $defBOM = "BOM";
+my $defEOM = "EOM";
+my $defTempTagBeg = "####TempTAGBEG####";
+my $defTempTagEnd = "####TempTAGEND####";
+
+if (@tags == 0)
+{
+	$sedcommand = "sed -r 's/($re)/\\500 $defBOM *\\f\\1###EOM###/g; s/###EOM###([0-9]+)/\\1 00 $defEOM *\\f\\1/g'";
+}
+else #we have custom tags
+{
+	#first tag is easy to tag :)
+	my $sedscript="sed -r 's/($re)/\\600 $defTempTagBeg *\\f\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $defTempTagEnd *\\f\\1/g;";
+    #after first step we have temp tagged parts of input matching re
+    #now we need to insert our custom tags
+    #we will find temp tags and process our input
+    
+	my $i = 0;
+	#copy of re which will be cut
+	my $rec = $re;
+	my $restre = $re;
+	
+	for ($i = 0 ; $i < @tags ; $i++)
+	{
+	 #re cutting
+	 $rec = cutRe($restre);
+	 $restre = restRe($restre);
+	 if ($rec =~ / *\( *\) */)
+	 {
+	    $sedscript = $sedscript."s/([0-9]+) 00 $defTempTagBeg \\*\\f([0-9]+)/\\2 00 $tags[$i] *\\f\\2 00 $defTempTagBeg *\\f\\2/g;";
+	 }
+	 else
+	 {
+	    $sedscript = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f($rec)/\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $tags[$i] *\\f\\1 00 $defTempTagBeg *\\f\\1/g;";
+	 }
+
+	}
+	
+	$sedcommand = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f//g;s/[0-9]+ 00 $defTempTagEnd \\*\\f//g'";
+}
+
+if($command)
+{
+    print $sedcommand."\n";
+    exit 0;
+}
+exec $preproc.$sedcommand.$postproc;
