-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrte3_baseline_runner.pl
113 lines (93 loc) · 3.47 KB
/
rte3_baseline_runner.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
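# Simple runner that calculates baseline scores for an RTE3 data XML file
# and reports the relevant values, one CSV row per text/hypothesis pair.
# This baseline code reports the following values:
# - mean word PMI
# - product of best-per-word conditional prob. (normalized and raw)
# - mean best-per-word PMI (plain and weighted)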
use warnings;
use strict;
use Benchmark qw(:all);
use condprob qw(:DEFAULT set_num_thread $DEBUG export_hash_to_file $SOLR_URL mean_allword_pmi product_best_word_condprob mean_best_wordPMI);
# PARAMETERS to set (for condprob.pm)
# $DEBUG and $SOLR_URL are named in the condprob import list, so these
# assignments are presumably aliased to the module's own variables.
our $DEBUG = 0;    # well, turn it on for quality check.
set_num_thread(4);
our $SOLR_URL = "http://localhost:9911/solr";
# NOTE(review): $APPROXIMATE_WITH_TOP_N_HITS is not in the explicit import
# list; it only reaches condprob if the module exports it via :DEFAULT -- confirm.
our $APPROXIMATE_WITH_TOP_N_HITS = 4000;
#my $lambda = 0.2;

$| = 1;    # flush always

# Command line: rte_filename start_num end_num (pair ids are 1-based, inclusive).
# Count the arguments instead of testing the truthiness of the last one, so
# that an end id of "0" is reported as out of bounds rather than as a usage error.
die "Usage: needs three arguments.\n\">perl runner.pl rte_filename start_num end_num\"\n perl runner.pl ./testdata/English_dev.xml 1 800 \n" unless (@ARGV >= 3);

my $RTEFILE = $ARGV[0];
die "unable to open file: $RTEFILE" unless (-r $RTEFILE);
my $START_ID = $ARGV[1] + 0;
my $END_ID   = $ARGV[2] + 0;
my $RUN_ID   = "rte_baseline";

# time in
my $t0 = Benchmark->new;

# read data: three index-aligned array refs (texts, hypotheses, gold labels)
my ($t_aref, $h_aref, $d_aref) = read_rte_data($RTEFILE);

# Validate the requested range against the number of pairs actually read,
# not a hard-coded 800, so a shorter file cannot yield undefined pairs.
my $num_pairs = scalar @{$t_aref};
die "start id out of bounds (valid: 1..$num_pairs)" if ($START_ID < 1 or $START_ID > $num_pairs);
die "end id out of bounds (valid: 1..$num_pairs)" if ($END_ID < 1 or $END_ID > $num_pairs);
die "end id must be bigger than start" if ($END_ID < $START_ID);

## print header (CSV format, first line as column names)
#print "id, gold, P_coll(h), P_model(h), P_coll(t), P_model(t), P_model(h|t), PMI(h;t),";
#print "\n";
print "id, gold, meanPMI, norm-prod bestCondProd, raw-prod bestCondProd, mean bestPMI, wgt-mean bestPMI";
print "\n";

# now process the selected pairs one by one
for my $pair_id ($START_ID .. $END_ID)
{
    my $id = $pair_id - 1;    # id actually is starting from 0.

    # call_splitta is not defined in this file; presumably a default
    # export of condprob -- confirm.
    my $text = call_splitta($t_aref->[$id], $RUN_ID);
    my $hypo = call_splitta($h_aref->[$id], $RUN_ID);
    print STDERR "Processing id $pair_id;\n";

    # if splitta fails: (RTE3 dev 141, hypo)
    warn "SPLITTA failed! ==> fallback to lc(string).\n" unless ($text and $hypo);
    $text = lc($t_aref->[$id]) unless ($text);
    $hypo = lc($h_aref->[$id]) unless ($hypo);
    print STDERR "text: $text\n";
    print STDERR "hypo: $hypo\n";

    my $meanPMI = mean_allword_pmi($text, $hypo);
    my ($word_logprob_norm, $word_logprob_raw) = product_best_word_condprob($text, $hypo);
    my ($mean_best_PMI, $weighted_mean_best_PMI) = mean_best_wordPMI($text, $hypo);

    # all prepared. print one CSV row matching the header above
    print "$pair_id, $d_aref->[$id], $meanPMI, $word_logprob_norm, $word_logprob_raw, $mean_best_PMI, $weighted_mean_best_PMI";
    print "\n";
}

# time stamp
my $t1 = Benchmark->new;
my $td = timediff($t1, $t0);
print STDERR "the code took:", timestr($td), "\n";
# read_rte_data($filename)
#
# Reads an RTE pair file. For every line matching
#   <pair id=... entailment="...">
# it records the entailment label and the two lines that follow, which are
# taken to be the <t> (text) and <h> (hypothesis) lines of that pair.
#
# Not a generic XML parser: it relies on the one-element-per-line layout
# (pair header line, then the <t> line, then the <h> line).
#
# Parameters:
#   $filename - path of the file to read.
# Returns:
#   A list of three index-aligned array references: (\@t, \@h, \@gold).
#   The <t>/<h> wrapper tags are stripped; the line terminator of each
#   text/hypothesis line is left in place.
# Dies:
#   - when called in void context,
#   - when the file cannot be opened,
#   - when the file ends before both lines of a pair have been read.
sub read_rte_data
{
    # Fail fast: reading the file is pointless if the result is discarded.
    die "Eh, need to be called within an array context" unless defined wantarray;

    my $filename = shift;
    open my $fh, "<", $filename or die "unable to read $filename: $!";

    my (@t, @h, @gold);

    # A lexical line variable keeps the caller's $_ untouched.
    while (my $line = <$fh>)
    {
        next unless ($line =~ /<pair id=.+ entailment="(.+?)"/);
        my $gold_decision = $1;

        # now a pair: get next two lines
        my $tline = <$fh>;
        my $hline = <$fh>;
        die "truncated pair (missing <t>/<h> line) at end of $filename"
            unless (defined $tline and defined $hline);

        # remove head / tail tags; leading indentation is optional and the
        # closing tag may be followed by any trailing whitespace (e.g. "\r\n"),
        # which is kept as-is.
        $tline =~ s/^\s*<t>//;
        $tline =~ s/<\/t>(?=\s*\z)//;
        $hline =~ s/^\s*<h>//;
        $hline =~ s/<\/h>(?=\s*\z)//;

        push @t,    $tline;
        push @h,    $hline;
        push @gold, $gold_decision;
    }
    close $fh;

    return (\@t, \@h, \@gold);
}