-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsrilm_call.pm
120 lines (99 loc) · 3.33 KB
/
srilm_call.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# A Perl module that calls the SRILM ngram tool and parses its output
# (in particular the per-word lines printed with "-debug 3").
package srilm_call;
use strict;
use warnings;
use Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(set_ngram_input_file make_ngram_input_file read_debug3_p call_ngram);
# Program name placed at the start of the command line built by call_ngram.
our $NGRAM_EXECUTABLE = "ngram";
# Debug option always added to the ngram command line by call_ngram.
our $NGRAM_DEBUGOPTION = "-debug 3"; # a must for us
# Path of the text file passed to ngram via -ppl. The value below is only the
# default; per the original note it is normally overridden (see
# set_ngram_input_file), e.g. per run-id when multiple instances are running.
our $NGRAM_INPUT_FILE = "./models/ngram_input.txt"; ## default value only
sub set_ngram_input_file($)
{
    # Point the module at a different ngram input file.
    #
    # Argument: the path to store in $NGRAM_INPUT_FILE.
    # Returns:  the path that was just stored.
    #
    # When several sentences are processed simultaneously (multiple
    # instances), each one must set its own path here. Note that the same
    # file is shared by collection-model calls and document-model calls
    # (threads, etc.).
    my ($input_path) = @_;
    return $NGRAM_INPUT_FILE = $input_path;
}
sub call_ngram($;$$) {
    # Run the SRILM ngram program and return the lines it printed on STDOUT.
    #
    # Usage: call_ngram(model_path, optional_arguments*, sentence**);
    #   model_path         - language model file; mandatory, must exist.
    #   optional_arguments - * extra command-line options given as one string
    #                        (e.g. "-order 3"); passed to the shell as-is.
    #                        If missing, no extra options are given.
    #   sentence           - ** text to write to $NGRAM_INPUT_FILE before the
    #                        call. If missing, ngram runs on whatever that
    #                        file already contains (the previous call's text).
    #
    # The command has this shape:
    #   ngram -ppl in.txt -lm modelfile -debug 3 (additional options)
    #
    # Returns: the STDOUT of ngram as a list of lines. STDERR is discarded.
    # Dies:    if the model path is undefined or does not exist, if the input
    #          file is not readable, or if ngram printed nothing on STDOUT.
    my ($model_path, $additional_options, $sentence_string) = @_;

    # sanity checks, with messages that say what is actually wrong
    die "call_ngram: no model path given" unless (defined $model_path);
    die "call_ngram: model file '$model_path' does not exist" unless (-e $model_path);
    $additional_options = "" unless (defined $additional_options);

    # generate input file, if $sentence_string is given
    if (defined $sentence_string) {
        make_ngram_input_file($sentence_string);
    }
    die "Something wrong. This is a new call without sentence, or file write failed\n" unless (-r $NGRAM_INPUT_FILE);

    # Single-quote both paths for the shell so that spaces or shell
    # metacharacters in a file name cannot split or alter the command.
    # $additional_options is intentionally left unquoted: it is a string
    # holding several separate options.
    my ($quoted_input, $quoted_model) = map {
        (my $arg = $_) =~ s/'/'\\''/g;
        "'" . $arg . "'";
    } ($NGRAM_INPUT_FILE, $model_path);

    # make command
    my $command = join(" ",
        $NGRAM_EXECUTABLE,
        "-ppl", $quoted_input,
        "-lm", $quoted_model,
        $NGRAM_DEBUGOPTION,
        $additional_options);

    # call (STDERR of ngram is thrown away)
    #print STDERR $command;
    my @result = `$command 2> /dev/null`;

    # sanity check
    die "\n ngram call fails, or problematic - no stdout from SRILM ngram. Check that commandline program ngram is in path \n" unless (@result);
    return @result;
}
sub make_ngram_input_file
{
    # Write the given text to $NGRAM_INPUT_FILE, replacing its old content.
    #
    # Argument: the sentence string to write.
    # Returns:  1 on success.
    # Dies:     if the file cannot be opened, written or closed. Failing
    #           loudly matters here: otherwise a later ngram run could
    #           silently read the text left over from an earlier call.
    my $sentence_string = $_[0];

    open(my $out_fh, ">", $NGRAM_INPUT_FILE)
        or die "make_ngram_input_file: cannot open '$NGRAM_INPUT_FILE' for writing: $!";
    print {$out_fh} $sentence_string
        or die "make_ngram_input_file: cannot write to '$NGRAM_INPUT_FILE': $!";
    # Buffered write errors only show up at close time, so check it too.
    close($out_fh)
        or die "make_ngram_input_file: cannot close '$NGRAM_INPUT_FILE': $!";
    return 1;
}
sub read_debug3_p {
    # Return the probability value itself (non-log) for each word line.
    #
    # Arguments: the output lines of an "ngram -ppl ... -debug 3" run.
    # Returns:   a list with one entry per line that starts with a tab (as
    #            selected by read_debug3): the text found between "] " and
    #            " [ " on that line, or undef if the line does not have
    #            that shape.
    my @probabilities;
    foreach my $line (read_debug3(@_)) {
        # Only use $1 after a successful match; after a failed match it may
        # still hold the capture of an earlier line.
        push @probabilities, ($line =~ /.\] (.+?) \[ /) ? $1 : undef;
    }
    return @probabilities;
}
sub read_debug3_log {
    # Return the log probability part of each word line.
    #
    # Arguments: the output lines of an "ngram -ppl ... -debug 3" run.
    # Returns:   a list with one entry per line that starts with a tab (as
    #            selected by read_debug3): the text found between " [ " and
    #            " ] / 1" on that line, or undef if the line does not have
    #            that shape.
    my @log_probabilities;
    foreach my $line (read_debug3(@_)) {
        # Only use $1 after a successful match; after a failed match it may
        # still hold the capture of an earlier line.
        push @log_probabilities, ($line =~ / \[ (.+?) \] \/ 1/) ? $1 : undef;
    }
    return @log_probabilities;
}
sub read_debug3 {
    # Select the per-word lines from "ngram -ppl [input] -debug 3" output.
    #
    # All output lines of such a run are passed in. The expected layout is:
    #   - first line: the original sentence
    #   - then one line per word, each starting with "\t"
    #   - finally some closing summary lines
    # Since a single sentence is assumed, simply keep every line that starts
    # with a tab, in the original order.
    my @tab_lines = grep { /^\t/ } @_;
    return @tab_lines;
}
1;