#!/usr/bin/perl #package: UAM Text Tools #component: dgc (dg compiler) #version: 1.0 #author: Tomasz Obrebski use lib "/usr/local/lib/utt"; use lib "$ENV{'HOME'}/.local/lib/utt"; #use strict; use Getopt::Long; use Data::Dumper; use attr; use File::HomeDir; use Parse::RecDescent; $::RD_HINT=1; # use List::MoreUtils; my $systemconfigfile='/etc/utt/dgc.conf'; my $userconfigfile=0; #home()."/.utt/dgc.conf"; Getopt::Long::Configure('no_ignore_case_always'); my $help=0; my $catfile=0; my $dicfile=0; my $gramfile=0; my $outputfile=0; #read configuration files########################### my $file; foreach $file ($systemconfigfile, $userconfigfile){ if(open(CONFIG, $file)){ while () { chomp; s/#.*//; s/^\s+//; s/\s+$//; next unless length; my ($name, $value) = split(/\s*=\s*/, $_, 2); if(($name eq "categories")or($name eq "c")) { $catfile=$value; } elsif(($name eq "dictionary")or($name eq "d")) { $dicfile=$value; } elsif(($name eq "grammar")or($name eq "g")) { $gramfile=$value; } elsif(($name eq "outputfile")or($name eq "o")) { $outputfile=$value; } elsif(($name eq "help")or($name eq "h")) { $help=1; } } close CONFIG; } } ######################################################### GetOptions("help|h" => \$help, "categories|c=s" => \$catfile, "dictionary|d=s" => \$dicfile, "grammar|g=s" => \$gramfile, "outputfile|o=s" => \$outputfile); my $homedir = $ENV{'HOME'}; $catfile =~ s/~/$homedir/; $dicfile =~ s/~/$homedir/; $gramfile =~ s/~/$homedir/; $outputfile =~ s/~/$homedir/; if($help) { print <<'END' Usage: dgc [OPTIONS] Options: --categories -c filename List of syntactic categories. --dictionary -d filename Dictionary. --grammar -g filename List of grammar rules. --outputfile -o filename Output file name. --help -h Help. END ; exit 0; } die("At least one of --cats and --dic must be given.\n") if !$catfile && !$dicfile; our %in; #gramatyka wejściowa our %idx; #indeks gramatyki wejściowej (niektóre stwierdzenia) our %out; #gramatyka wyjściowa our %class; #tablica klas our $attr_re = $attr::attr_re; our $cat_re = $attr::cat_re; our $cats_re = qr/(?:$attr::cat_re\s*(?:,\s*$attr::cat_re)*)/; our $class_re = qr/(?:\@\w+)/; our $av_re = $attr::av_re; our $avlist_re = $attr::avlist_re; our $role_re = qr/(?:[[:lower:][:digit:]_]+)/; our $prop_re = qr/(?:[[:upper:]]+)/; our $proplist_re = qr/(?:(?:\&$prop_re)+)/; my $inputlineno=0; our $statementgrammar = q( statement : statement1 ";" { $item[1] } statement1: /cat/i acat { ['cat', { cat=>$item{acat}, catexp=>attr::parse($item{acat}) }, $item{acat}] } | /flag/i flag { ['flag', { flag=>$item{flag} }, $item{flag}] } | /role/i role { ['role', { role=>$item{role} }, $item{role}] } | /left/i role { ['left', { role=>$item{role} }, 0] } | /right/i role { ['right', { role=>$item{role} }, 0] } | /sgl/i role { ['sgl', { role=>$item{role} }, 0] } | /req/i xcat role { ['req', { cats=>$item{xcat}, role=>$item{role} }, 0] } | /agr/i role attr { ['agr', { role=>$item{role}, attr=>$item{attr} }, $item{role}] } | /gov/i role xcat { ['gov', { role=>$item{role}, cats=>$item{xcat} }, $item{role}] } | /init/i flagconstr { ['initf', { flag=>$item{flagconstr} }, 0] } | /fin/i flagconstr { ['finf', { flag=>$item{flagconstr} }, 0] } | /init/i role { ['initr', { role=>$item{role} }, 0] } | /fin/i role { ['finr', { role=>$item{role} }, 0] } | /set/i xcat flag { ['set', { cats=>$item{xcat}, flag=>$item{flag} }, 0] } | /pass/i role flag { ['pass', { role=>$item{role}, flag=>$item{flag} }, 0] } | /constre/i role role { ['constre', { role1=>$item[2], role2=>$item[3] }, 0] } | /constri/i role role { ['constri', { role1=>$item[2], role2=>$item[3] }, 0] } | /link/i xcat optflags(?) xcat optflags(?) role prop(s?) { ['link', { hcats=>$item[2], hflagconstr=>$item[3], dcats=>$item[4], dflagconstr=>$item[5], role=>$item[6], props=>$item[7] }, 0] } | /long/i role role(s? /,/) '^' role(s? /,/) { ['long', { rel=>$item[2], up=>$item[3], down=>$item[5] }, 0] } | /class/i classname '=' xcat { ['class', { name=>$item{classname}, cats=>$item{xcat} }, $item{classname}] } acat: /$attr::cat_re/ attr: /$attr::attr_re/ xcat: classexpr role: /\w+/ flag: /\w+/ optflags: "//" flagconstr { $item[2] } flagconstr: /\w+[+-]/ prop: '&' /\w+/ { $item[2] } classname: /\$\w+[+-]/ classexpr : classexpr1 '|' classexpr { main::union($item[1],$item[3]) } | classexpr1 '~' classexpr { main::intersection( $item[1], main::complement($item[3]) ) } | classexpr1 classexpr1 : classexpr2 '&' classexpr1 { main::intersection($item[1],$item[3]) } | classexpr2 classexpr2 : '~' classexpr2 { main::complement($item[2]) } | classexpr3 classexpr3 : classexpr4 '/' /$attr::avlist_re/ { main::intersection($item[1], [main::extension('*/' . $item[3])] ) } | classexpr4 classexpr4 : class | cat | '(' classexpr ')' { $item[2] } class : classname { $main::class{$item[1]} or @{[]} } cat : /$main::cat_re/ { [main::extension($item[1])] } ); our $statementparser = Parse::RecDescent->new($statementgrammar); sub register { my ($src, $statement, $data, $index) = @_ ; $data->{line} = $inputlineno; $data->{src} = $src; push @{$in{$statement}}, $data; push @{$idx{$statement}{$index}}, $data if($index); if ($statement eq 'class') { $class{ $data->{name} } = $data->{cats} } } if(!$outputfile) { *OUTPUT = *STDOUT; } elsif($outputfile eq "-") { *OUTPUT = *STDOUT; } else { open(OUTPUT, ">$outputfile") or die("Can't open output file: $outputfile!"); } if(!$gramfile) { *INPUT = *STDIN; } elsif($gramfile eq "-") { *INPUT = *STDIN; } else { open(INPUT, "cat $gramfile | m4 |") or die("Unable to open: $gramfile!"); } # *INPUT = *STDIN; ############### TYMCZASOWO loadcats($catfile) if $catfile; extractcats($dicfile) if $dicfile; # CZYTANIE GRAMATYKI DGC while() { $inputlineno++; s/#.*//; s/^\s+//; s/\s+$//; s/\s+/ /g; next unless $_; my $result = $statementparser->statement("$_;"); # print "#input line $inputlineno\n"; # print Dumper($result); if($result) { register($_, @{$result}) } else { print STDERR "ERROR at line $inputlineno\n" } } # GENEROWANIE GRAMATYKI DGP my $inline = 0; my $outline = 0; # print Dumper($idx{gov}->{subj}); for my $x (@{$in{cat}}) { print_outin("CAT $x->{cat}", $x); } for my $x (@{$in{flag}}) { print_outin("FLAG $x->{flag}", $x); } for my $x (@{$in{role}}) { print_outin("ROLE $x->{role}", $x); } for my $x (@{$in{long}}) { print_outin("LONG $x->{rel} " . join(",",@{$x->{up}}) . "^" . join(",",@{$x->{down}}), $x) } for my $x (@{$in{left}}) { print_outin("LEFT $x->{role}", $x) if chk_role($x->{role}, $x) } for my $x (@{$in{right}}) { print_outin("RIGHT $x->{role}", $x) if chk_role($x->{role}, $x) } for my $x (@{$in{sgl}}) { print_outin("SGL $x->{role}", $x) if chk_role($x->{role}, $x) } for my $x (@{$in{req}}) { if( chk_role($x->{role}, $x) ) { for my $atomcat (map{$_->{cat}} @{$x->{cats}}) { print_outin("REQ $atomcat $x->{role}", $x); } } } for my $x (@{$in{initr}}) { print_outin("INITR $x->{role}", $x) if chk_role($x->{role}, $x) } for my $x (@{$in{finr}}) { print_outin("FINR $x->{role}", $x) if chk_role($x->{role}, $x) } for my $x (@{$in{initf}}) { print_outin("INITF $x->{flag}", $x) } # SPRAWDZIĆ CZY FLAGA JEST ZADEKLAROWANA for my $x (@{$in{finf}}) { print_outin("FINF $x->{flag}", $x); } # SPRAWDZIĆ CZY FLAGA JEST ZADEKLAROWANA for my $x (@{$in{set}}) { for my $atomcat (map{$_->{cat}} @{$x->{cats}}) { print_outin("SET $atomcat $x->{flag}", $x); } } for my $x (@{$in{pass}}) { print_outin("PASS $x->{role} $x->{flag}", $x); } for my $x (@{$in{constre}}) { print_outin("CONSTRE $x->{role1} $x->{role2}", $x) if chk_role($x->{role1}, $x) & chk_role($x->{role2}, $x) } for my $x (@{$in{constri}}) { print_outin("CONSTRI $x->{role1} $x->{role2}", $x) if chk_role($x->{role1}, $x) & chk_role($x->{role2}, $x) } for my $x (@{$in{link}}) { my @agrs = @{ $idx{agr}->{$x->{role} } or [] }; my @govs = @{ $idx{gov}->{$x->{role} } or [] }; my @deps = (@govs > 0) ? @{ intersection( $x->{dcats}, map { $_->{cats} } @govs ) } : @{ $x->{dcats} } ; for my $head ( @{ $x->{hcats} } ) { DEP: for my $dep (@deps) { for my $agr (@agrs) { next DEP unless attr::agree(@{$head->{catexp}},@{$dep->{catexp}},$agr->{attr}); } my $hflagconstr = @{$x->{hflagconstr}} ? "//@{$x->{hflagconstr}}" : ""; my $dflagconstr = @{$x->{dflagconstr}} ? "//@{$x->{dflagconstr}}" : ""; my $props = join(map { "\&$_" } $x->{props}); print_outin("LINK $head->{cat}$hflagconstr $dep->{cat}$dflagconstr $x->{role}$props",$x, @agrs, @govs); } } } sub chk_role { ($role, $statement_details) = @_; if($idx{role}{$role}) { 1; } else { print_error("undefined role", $statement_details); 0; } } sub print_outin { my ($out,@in) = (shift, @_); print OUTPUT "$out\t\t#"; printf OUTPUT " %04d@\"%s\"", $_->{line}, $_->{src} foreach @in; print OUTPUT "\n"; } sub print_error { my ($message,@in) = (shift,@_); print STDERR "ERROR: $message in statement "; printf STDERR " %04d@\"%s\"", $_->{line}, $_->{src} foreach @in; print STDERR "\n"; } sub extractcats { my $file = shift; open DICFILE, "$file"; while() { while(/,([^[:space:];]+)/g) { my $cat=$1; next if !$cat; # || exists $cats{$cat}; # print OUTPUT "CAT $1\n"; register('cat', {src=>"CAT $cat", cat=>"$cat", catexp=>attr::parse($cat)}, $cat); } } close DICFILE; } sub loadcats { my $file = shift; open CATFILE, "$file"; while() { tr/ \t\n//d; next if !$_; # || exists $cats{$_}; # print OUTPUT "CAT $_\n"; register("CAT $_", 'cat', {cat=>"$_", catexp=>attr::parse($_)}, $_); } close CATFILE; } sub extension { my $cat = shift; my $catexp = attr::parse($cat); grep { attr::match(@{$_->{catexp}},@{$catexp}) } @{$in{cat}}; } sub uniq { my %seen; grep { ! $seen{$_}++ } @_ } sub union { [ uniq( map { @{$_} } @_ ) ] } sub intersection { my $n=@_; my %seen; [ grep { ++$seen{$_} == $n } map { @{$_} } @_ ] } sub complement { my %exclude; for $c (@{shift()}) { $exclude{$c}++ }; [ grep { ! $exclude{$_} } @{$in{cat}} ] } # printf STDERR "%6d CAT statements\n", 0+keys(%cats); # printf STDERR "%6d ROLE statements\n", 0+keys(%role); # printf STDERR "%6d SGL statements\n", @sgl+0; # printf STDERR "%6d REQ statements\n", @req+0; # printf STDERR "%6d LEFT statements\n", $nleft; # printf STDERR "%6d RIGHT statements\n", $nright; # printf STDERR "%6d INITR statements\n", $ninitr; # printf STDERR "%6d FINR statements\n", $nfinr; # printf STDERR "%6d INITF statements\n", $ninitf; # printf STDERR "%6d FINF statements\n", $nfinf; # printf STDERR "%6d INITC statements\n", $ninitc; # printf STDERR "%6d FINC statements\n", $nfinc; # printf STDERR "%6d LINK statements\n", $nlink; # printf STDERR "%6d CLASS statements\n", $nclass; # printf STDERR "%6d FLAG statements\n", $nflag; # printf STDERR "%6d SET statements\n", $nset; # printf STDERR "%6d PASS statements\n", $npass; ################################################################################## # while() # { # $inputlineno++; # s/#.*//; # s/^\s+//; # s/\s+$//; # s/\s+/ /g; # if (/^CAT ($cat_re)$/) { register('cat', {src=>$&, cat=>attr::parse($1)}, $1); } # elsif(/^FLAG (\S+)$/) { register('flag', {src=>$&, flag=>$1}, $1); } # elsif(/^ROLE (\S+)$/) { register('role', {src=>$&, role=>$1}, $1); } # elsif(/^LEFT (\S+)$/) { register('left', {src=>$&, role=>$1}, 0); } # elsif(/^RIGHT (\S+)$/) { register('right', {src=>$&, role=>$1}, 0); } # elsif(/^SGL (\S+)$/) { register('sgl', {src=>$&, role=>$1}, 0); } # elsif(/^REQ (\S+) (\S+)$/) { register('req', {src=>$&, cat=>$1, role=>$2}, 0); } # elsif(/^AGR (\S+) (\S+)$/) { register('agr', {src=>$&, role=>$1, attr=>$2}, $1); } # elsif(/^GOV (\S+) (\S+)$/) { register('gov', {src=>$&, role=>$1, cat=>$2, catexp=>attr::parse($2)}, $1); } # elsif(/^INIT ($role_re)$/) { register('initr', {src=>$&, role=>$1}, 0); } # elsif(/^FIN ($role_re)$/) { register('finr', {src=>$&, role=>$1}, 0); } # elsif(/^INIT ($av_re)$/) { register('initf', {src=>$&, flag=>$1}, 0); } # elsif(/^FIN ($av_re)$/) { register('finf', {src=>$&, flag=>$1}, 0); } # elsif(/^SET ($cat_re)\s+(\S+)$/) { register('set', {src=>$&, cat=>$1, flag=>$2}, 0); } # elsif(/^PASS (\S+)\s+(\S+)$/) { register('pass', {src=>$&, role=>$1, flag=>$2}, 0); } # elsif(/^CONSTRE (\S+)\s+(\S+)$/) { register('constre', {src=>$&, role1=>$1, role2=>$2}, 0); } # elsif(/^CONSTRI (\S+)\s+(\S+)$/) { register('constri', {src=>$&, role1=>$1, role2=>$2}, 0); } # elsif(my ($hs,$hfs,$ds,$dfs,$r,$rprops) = /^LINK\s+($cats_re)((?:;$avlist_re)?)\s+($cats_re)((?:;$avlist_re)?)\s+($role_re)((?:$proplist_re)?)$/) # { register('link', {src=>$&, hs=>$hs, hfs=>$hfs, ds=>$ds, dfs=>$dfs, r=>$r, props=>$rprops},0) } # elsif(/^LONG\s+(\S+)((\s+<\S+)*)((\s+\S+>)*)$/) # { # my $rel = $1; # my $ups = $2; # my $downs = $4; # $ups =~ s///g; # $downs =~ s/^\s+//; # my @down = split(/\s+/,$downs) or (); # register('long', {src=>$&, rel=>$rel, up=>\@up, down=>\@down},0); # print OUTPUT "LONG $rel " . join(",",@up) . "^" . join(",",@down) . "\n"; # } # elsif(my ($cl,$cs) = /^CLASS\s+(\S+)\s*\=(.*)$/) # { # $class{$1} = $classparser->classexpr($2); # } # elsif(/^$/) # { # # pomijamy puste linie oraz komentarze # } # else # { # print STDERR "Illegal format: $_\n"; # } # } # sub is_cat { shift =~ /$attr::cat_re/; } # sub is_role { $role{shift}; } # sub is_flag { $flag{shift}; } # sub print_in # { # my $data = shift(); # printf "in@%04d ", $data->{line}; # print $data->{src}; # } # sub print_out # { # printf "out@%08d ", $outline++; # print @_; # } # sub addlinks # { # my ($l, $h,$hfs,$d,$dfs,$r,$rprops) = @_; # my @heads = extension($h); # my @deps = extension($d); # my @deps_gov; # DEP_GOV: # for my $dep (@deps) # { # for my $gov (@govs) # { # next DEP_GOV unless attr::match(@{$dep->{catexp}},@{$gov->{catexp}}); # } # push @deps_gov, $dep; # } # for my $head (@heads) # { # DEP: # for my $dep (@deps_gov) # { # for my $agr (@agrs) # { # next DEP unless attr::agree(@{$head->{catexp}},@{$dep->{catexp}},$agr->{attr}); # } # print_outin("LINK $head->{cat}$hfs $dep->{cat}$dfs $r$rprops",$l, @agrs,@govs); # } # } # } # elsif(/^INIT\s+([[:upper:]]\S*)$/) # { # print OUTPUT "#$_\n"; # my $cat = attr::parse $1; # for my $atomcat (keys %cats) # { # if(attr::match @$cat, @{$cats{$atomcat}}) # { # print OUTPUT "INITC ".$atomcat."\n"; # ++$ninitc; # } # } # } # elsif(/^FIN\s+([[:upper:]]\S*)$/) # { # print OUTPUT "#$_\n"; # my $cat = attr::parse $1; # for my $atomcat (keys %cats) # { # if(attr::match @$cat, @{$cats{$atomcat}}) # { # print OUTPUT "FINC ".$atomcat."\n"; # ++$nfinc; # } # } # }