40 string delim,
string s1_char_delim,
41 string s2_char_delim,
bool init=
false) {
42 ifstream infile (input_file.c_str ());
45 cerr <<
"Loading input file: " << input_file << endl;
47 if (infile.is_open ()) {
48 while (infile.good ()) {
49 getline (infile, line);
56 if (tokens.size() > 1) {
68 cerr <<
"Failed to open input file: " << input_file << endl;
76 StdArc::Weight threshold,
int nbest,
77 bool fb,
bool penalize) {
94 ofstream ofile (ofile_name.c_str ());
97 for (
unsigned int i = 0; i < aligner->
fsas.size (); i++) {
99 VectorFst<StdArc>* tfst =
new VectorFst<StdArc> ();
100 Map (aligner->
fsas.at (i), tfst, LogToStdMapper ());
113 if (tfst->NumStates () > 0) {
114 StdArc::Weight weight_threshold = 99;
115 StdArc::StateId state_threshold = kNoStateId;
116 AnyArcFilter<StdArc> arc_filter;
117 vector<StdArc::Weight> distance;
118 VectorFst<StdArc> ofst;
120 AutoQueue<StdArc::StateId> state_queue (*tfst, &distance, arc_filter);
123 ShortestPathOptions<StdArc, AutoQueue<StdArc::StateId>,
124 AnyArcFilter<StdArc> >
125 opts (&state_queue, arc_filter, nbest,
false,
false,
126 kDelta,
false, weight_threshold,
129 &path_filter, 10000, opts);
130 for (
size_t i = 0; i < path_filter.
ordered_paths.size (); i++) {
132 for (
size_t j = 0; j < path.size (); j++) {
133 ofile << aligner->
isyms->Find (path [j]);
134 if (j < path.size () - 1)
147 vector<VectorFst<LogArc> > *fsts,
148 string far_name, StdArc::Weight threshold,
149 int nbest,
bool fb,
bool penalize) {
159 string key_prefix =
"";
160 string key_suffix =
"";
163 int32 generate_keys = 7;
164 bool set_syms =
false;
166 FarWriter<StdArc> *far_writer = \
167 FarWriter<StdArc>::Create (far_name, FAR_DEFAULT);
171 for (
unsigned int i = 0; i < fsts->size (); i++) {
175 if (fsts->at (i).NumStates () == 0)
continue;
177 VectorFst<StdArc>* tfst =
new VectorFst<StdArc> ();
178 VectorFst<LogArc>* lfst =
new VectorFst<LogArc> ();
179 VectorFst<LogArc>* pfst =
new VectorFst<LogArc> ();
180 VectorFst<StdArc>* ffst =
new VectorFst<StdArc> ();
183 Map (fsts->at(i), tfst, LogToStdMapper ());
187 Map (*tfst, lfst, StdToLogMapper ());
193 Push<LogArc, REWEIGHT_TO_FINAL> (*lfst, pfst, kPushWeights);
194 for (StateIterator<VectorFst<LogArc> > siter (*pfst);
195 !siter.Done (); siter.Next ()) {
196 size_t v = siter.Value();
197 if (pfst->Final(v) != LogArc::Weight::Zero ()) {
198 pfst->SetFinal (v,LogArc::Weight::One ());
204 if (pfst->NumStates () == 0)
continue;
208 Map (*pfst, ffst, LogToStdMapper ());
210 if (set_syms ==
false) {
211 ffst->SetInputSymbols (aligner->
isyms);
212 ffst->SetOutputSymbols (aligner->
isyms);
216 sprintf (keybuf,
"%0*d", generate_keys, i+1);
220 far_writer->Add (key_prefix + key + key_suffix, *ffst);
235 DEFINE_bool (seq1_del,
true,
"Allow deletions in sequence one." );
236 DEFINE_bool (seq2_del,
true,
"Allow deletions in sequence two." );
238 DEFINE_bool (penalize_em,
false,
"Penalize links during EM training." );
239 DEFINE_bool (load_model,
false,
"Load a pre-trained model for use." );
240 DEFINE_bool (lattice,
false,
"Write out the alignment lattices as an fst archive (.far)." );
241 DEFINE_bool (restrict,
true,
"Restrict links to M-1, 1-N during initialization." );
242 DEFINE_bool (mbr,
false,
"Use the LMBR decoder (not yet implemented)." );
243 DEFINE_bool (fb,
false,
"Use forward-backward pruning for the alignment lattices." );
244 DEFINE_int32 (seq1_max, 2,
"Maximum subsequence length for sequence one." );
245 DEFINE_int32 (seq2_max, 2,
"Maximum subsequence length for sequence two." );
246 DEFINE_int32 (iter, 11,
"Maximum number of EM iterations to perform." );
247 DEFINE_int32 (nbest, 1,
"Output the N-best alignments given the model." );
248 DEFINE_string (input,
"",
"Two-column input file to align." );
249 DEFINE_string (seq1_sep,
"|",
"Multi-token separator for input tokens." );
250 DEFINE_string (seq2_sep,
"|",
"Multi-token separator for output tokens." );
251 DEFINE_string (s1s2_sep,
"}",
"Token used to separate input-output subsequences in the g2p model." );
252 DEFINE_string (delim,
"\t",
"Delimiter separating entry one and entry two in the input file." );
254 DEFINE_string (skip,
"_",
"Skip token used to represent null transitions. Distinct from epsilon." );
255 DEFINE_string (ofile,
"",
"Output file to write the aligned dictionary to." );
256 DEFINE_string (s1_char_delim,
"",
"Sequence one input delimeter." );
257 DEFINE_string (s2_char_delim,
" ",
"Sequence two input delimeter." );
258 DEFINE_string (model_file,
"",
"FST-format alignment model to load." );
259 DEFINE_string (write_model,
"",
"Write out the alignment model in OpenFst format to filename." );
260 DEFINE_double (thresh, 1e-10,
"Delta threshold for EM training termination." );
261 DEFINE_double (pthresh, -99,
"Pruning threshold. Use to prune unlikely N-best candidates when using multiple alignments.");
263 int main(
int argc,
char* argv[] ){
264 string usage =
"phonetisaurus-align --input=dictionary --ofile=corpus.\n\n Usage: ";
265 set_new_handler (FailedNewHandler);
269 if( FLAGS_load_model==
true ){
270 aligner = *(
new M2MFstAligner (FLAGS_model_file, FLAGS_penalize,
271 FLAGS_penalize_em, FLAGS_restrict));
273 FLAGS_s1_char_delim, FLAGS_s2_char_delim,
276 cerr <<
"Please provide a valid input file." << endl;
281 aligner = *(
new M2MFstAligner (FLAGS_seq1_del, FLAGS_seq2_del,
282 FLAGS_seq1_max, FLAGS_seq2_max,
283 FLAGS_seq1_sep, FLAGS_seq2_sep,
284 FLAGS_s1s2_sep, FLAGS_eps, FLAGS_skip,
285 FLAGS_penalize, FLAGS_penalize_em,
289 FLAGS_s1_char_delim, FLAGS_s2_char_delim,
292 cerr <<
"Please provide a valid input file." << endl;
297 cerr <<
"Starting EM..." << endl;
299 cerr <<
"Finished first iter..." << endl;
300 for (
int i = 1; i <= FLAGS_iter; i++) {
301 cerr <<
"Iteration: " << i <<
" Change: ";
302 aligner.expectation ();
303 cerr << aligner.maximization (
false) << endl;
306 cerr <<
"Last iteration: " << endl;
307 aligner.expectation ();
308 aligner.maximization (
true);
311 StdArc::Weight pthresh = FLAGS_pthresh == -99.0
312 ? LogWeight::Zero().Value()
314 if (FLAGS_write_model.compare (
"") != 0) {
315 cerr <<
"Writing alignment model in OpenFst format to file: " 316 << FLAGS_write_model << endl;
317 aligner.write_model (FLAGS_write_model);
320 if (FLAGS_lattice ==
true)
322 FLAGS_nbest, FLAGS_fb, FLAGS_penalize);
325 FLAGS_fb, FLAGS_penalize);
void write_alignments(M2MFstAligner *aligner, string ofile_name, StdArc::Weight threshold, int nbest, bool fb, bool penalize)
DEFINE_string(input,"","Two-column input file to align.")
vector< VectorFst< LogArc > > fsas
unordered_set< int > VetoSet
DEFINE_bool(seq1_del, true,"Allow deletions in sequence one.")
void PhonetisaurusSetFlags(const char *usage, int *argc, char ***argv, bool remove_flags)
DEFINE_double(thresh, 1e-10,"Delta threshold for EM training termination.")
float maximization(bool lastiter)
int main(int argc, char *argv[])
vector< vector< int > > ordered_paths
DEFINE_int32(seq1_max, 2,"Maximum subsequence length for sequence one.")
void ShortestPathSpecialized(const Fst< Arc > &ifst, MutableFst< Arc > *ofst, vector< typename Arc::Weight > *distance, PathFilter *path_filter, size_t beam, ShortestPathOptions< Arc, Queue, ArcFilter > &opts)
void entry2alignfstnoinit(vector< string > seq1, vector< string > seq2, int nbest, string lattice="")
void prune_fst(VectorFst< StdArc > *fst)
vector< string > tokenize_utf8_string(string *utf8_string, string *delimiter)
void entry2alignfst(vector< string > seq1, vector< string > seq2)
void compileNBestFarArchive(M2MFstAligner *aligner, vector< VectorFst< LogArc > > *fsts, string far_name, StdArc::Weight threshold, int nbest, bool fb, bool penalize)
int load_input_file(M2MFstAligner *aligner, string input_file, string delim, string s1_char_delim, string s2_char_delim, bool init=false)