Skip to content

Commit

Permalink
quz parsing stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
Annette Rios committed Jul 29, 2015
1 parent 41415f5 commit 1f989d4
Showing 1 changed file with 147 additions and 0 deletions.
147 changes: 147 additions & 0 deletions parsing/xfst2conll.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@

#!/usr/bin/perl

use strict;
use open ':utf8';
binmode ':utf8';


# Converts tab-separated morphological analyses into six-column, tab-separated
# token rows.
#
# Input, as consumed by the code below:
#   * "form<TAB>analysis" lines, one analysis per line;
#   * an empty line ends the group of analyses belonging to one word;
#   * a line containing "#EOS" ends a sentence.
# Within an analysis, "[^DB]" separates chunks; every chunk becomes one output
# row. Only the first analysis of each word is converted, further analyses of
# the same word are dropped.

# Tags that identify the root of a chunk. Capture group 1 is the tag; the
# nested groups only exist for grouping.
my $ROOT_TAG_RE = qr/\[(ALFS|CARD|FLM|NP|NRoot|NRootNUM|NRootES|NRootCMP|VRoot|VRootES|PrnDem|PrnInterr|PrnPers(\+Lim)?\+[123]\.(Sg|Pl)(\.Incl|\.Excl)?|SP|\$.|AdvES|PrepES|ConjES|Part_Affir|Part_Cond|Part_Conec|Part_Contr|Part_Disc|Part_Neg|Part_Neg_Imp|Part_Sim).?\]/;

# Suffix category tags (the "type" half of a type=tag feature pair).
my $MORPH_TYPE_RE = qr/\[(Amb|NDeriv|VDeriv|NS|VS|Cas|Tns|Num|Asp|Mod|NPers|VPers|Tns\_VPers)\]/;

# Root tags whose rows carry no feature list and use the tag itself as the
# type column. The group is required: without it the anchors would bind only
# to the first and last alternative.
my $BARE_ROOT_RE = qr/^(?:SP|CARD|ALFS|FLM)$/;

# Turns one chunk of an analysis into its output columns.
#
# Parameter: $db - the chunk text (an analysis split on "[^DB]").
# Returns:   ($surface, $type, $features) on success, where $surface is the
#            root string plus all suffix strings (prefixed with "-" when the
#            chunk starts with "[--]"), $type is the "_"-joined type label and
#            $features the "|"-joined feature list (or "_").
#            Returns the empty list when the number of "[+...]" tags differs
#            from the number of suffix category tags, because the two lists
#            can then not be paired up.
sub db_to_conll_fields {
    my ($db) = @_;

    my ($roottag)    = $db =~ $ROOT_TAG_RE;
    my ($rootstring) = $db =~ m/^([^\[]+)/;
    my ($trans)      = $db =~ m/\[=([^\]]+)\]/;

    my @morphtags    = $db =~ m/\[(\+.+?)\]/g;
    my @morphtypes   = $db =~ m/$MORPH_TYPE_RE/g;
    my @morphstrings = $db =~ m/\[--\]([^\[]+)/g;

    # Tags and types are paired by position, so their counts must agree.
    return if scalar(@morphtags) != scalar(@morphtypes);

    my $has_root = ( defined $roottag && $roottag ne '' ) ? 1 : 0;

    # Type column: "Root" (if any) followed by the suffix categories.
    my $morphtype_string = join( '_', ( $has_root ? ('Root') : () ), @morphtypes );

    # Surface column: root string followed by every suffix string.
    my $db_string = join( '', ( defined $rootstring ? $rootstring : '' ), @morphstrings );
    if ( $db =~ /^\[--\]/ ) {
        $db_string = "-" . $db_string;
    }

    # Feature column: Root=<tag>, then one <type>=<tag> pair per suffix, then
    # the translation (only recorded for chunks that have a root).
    my @features;
    push @features, "Root=$roottag" if $has_root;
    for my $i ( 0 .. $#morphtags ) {
        push @features, $morphtypes[$i] . "=" . $morphtags[$i];
    }
    if ( $has_root && defined $trans && $trans ne '' ) {
        push @features, "trans=" . $trans;
    }
    my $db_morphs = join( '|', @features );

    # Special cases: punctuation roots (tag starts with "$") and SP, CARD,
    # ALFS, FLM get no features and use the root tag as their type.
    if ( $has_root && ( $roottag =~ /^\$/ || $roottag =~ $BARE_ROOT_RE ) ) {
        $db_morphs        = "_";
        $morphtype_string = $roottag;
    }

    return ( $db_string, $morphtype_string, $db_morphs );
}

# Reads analyses from $in and writes the converted rows to $out.
#
# Parameters: $in  - input filehandle  (defaults to ARGV, i.e. "<>")
#             $out - output filehandle (defaults to STDOUT)
# Returns:    1 on success; 0 after reporting on STDERR a chunk whose tags and
#             types cannot be paired. Rows produced before the offending chunk
#             have already been written at that point.
sub convert_stream {
    my ( $in, $out ) = @_;
    $in  = \*ARGV   unless defined $in;
    $out = \*STDOUT unless defined $out;

    my $new_word  = 1;    # true until the current word's first analysis is used
    my $index     = 1;    # token number within the current sentence
    my $linecount = 0;    # input line number, for error messages
    my $s_count   = 1;    # sentence number, for error messages

    while ( defined( my $line = <$in> ) ) {
        $linecount++;

        if ( $line =~ /^$/ ) {
            # blank line: the next analysis belongs to a new word
            $new_word = 1;
        }
        elsif ( $line =~ /#EOS/ ) {
            print {$out} "$index\tVROOT\t_\t_\t_\t_\n\n";
            $index = 1;
            $s_count++;
            # a sentence boundary always starts a new word
            $new_word = 1;
        }
        elsif ($new_word) {
            my ( undef, $analysis ) = split( /\t/, $line );
            next unless defined $analysis;
            chomp($analysis);

            # Any further analysis lines of this word are alternatives that
            # must be dropped, so stop accepting lines until the next blank
            # line or sentence end.
            $new_word = 0;

            foreach my $db ( split( /\[\^DB\]/, $analysis ) ) {
                my @fields = db_to_conll_fields($db);
                unless (@fields) {
                    print STDERR "different number of morph tags and types in line $linecount in sentence $s_count, cannot convert!\n";
                    return 0;
                }
                my ( $db_string, $morphtype_string, $db_morphs ) = @fields;
                print {$out} "$index\t$db_string\t_\t$morphtype_string\t$morphtype_string\t$db_morphs\n";
                $index++;
            }
        }
        # otherwise: another analysis of an already converted word -> skip
    }

    return 1;
}

# Run the conversion only when executed as a script, so that the subs above
# can be loaded on their own. A failed conversion ends with a non-zero status.
unless (caller) {
    convert_stream( \*ARGV, \*STDOUT ) or exit(1);
}



#foreach my $word (@words){
# my $analyses = @$word[1];
# my $form = @$word[0];
#
# #in case there is still more than one analysis: take first one
# my $infohash = @$analyses[0];
#
# print $form.":\t".$infohash->{'string'}."\n";
# print
#
#
#
#}




0 comments on commit 1f989d4

Please sign in to comment.