Phonetisaurus  1.0
FST-based Grapheme-to-Phoneme conversion
phonetisaurus-align.cc
Go to the documentation of this file.
1 /*
2  phonetisaurus-align.cc
3 
4  Copyright (c) [2012-], Josef Robert Novak
5  All rights reserved.
6 
7  Redistribution and use in source and binary forms, with or without
8  modification, are permitted #provided that the following conditions
9  are met:
10 
11  * Redistributions of source code must retain the above copyright
12  notice, this list of conditions and the following disclaimer.
13  * Redistributions in binary form must reproduce the above
14  copyright notice, this list of #conditions and the following
15  disclaimer in the documentation and/or other materials provided
16  with the distribution.
17 
18  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
23  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
27  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
29  OF THE POSSIBILITY OF SUCH DAMAGE.
30 *
31 */
32 #include <include/M2MFstAligner.h>
33 #include <include/LatticePruner.h>
34 #include <include/util.h>
36 using namespace fst;
37 
38 
39 int load_input_file (M2MFstAligner* aligner, string input_file,
40  string delim, string s1_char_delim,
41  string s2_char_delim, bool init=false) {
42  ifstream infile (input_file.c_str ());
43  string line;
44  int lines = 0;
45  cerr << "Loading input file: " << input_file << endl;
46 
47  if (infile.is_open ()) {
48  while (infile.good ()) {
49  getline (infile, line);
50  if (line.empty ())
51  continue;
52  vector<string> tokens = tokenize_utf8_string (&line, &delim);
53  vector<string> seq1 = tokenize_utf8_string (&tokens.at (0),
54  &s1_char_delim);
55  vector<string> seq2;
56  if (tokens.size() > 1) {
57  seq2 = tokenize_utf8_string (&tokens.at (1), &s2_char_delim);
58  }
59  if (init == false)
60  aligner->entry2alignfst (seq1, seq2);
61  else
62  aligner->entry2alignfstnoinit (seq1, seq2, 1, line);
63  lines++;
64  }
65  infile.close ();
66  }
67  else {
68  cerr << "Failed to open input file: " << input_file << endl;
69  return -1;
70  }
71 
72  return lines;
73 }
74 
75 void write_alignments (M2MFstAligner* aligner, string ofile_name,
76  StdArc::Weight threshold, int nbest,
77  bool fb, bool penalize) {
78  /*
79  Write the raw alignments to a file in text-based corpus format.
80 
81  NOTE: Although N-best and other pruning strategies are supported,
82  the final format is that of a standard text corpus. All relative
83  token and pronunciation scores will be stripped. In general
84  this means that, unless you are very lucky with your combined
85  pruning strategy the un-ranked N-best hypotheses will result in a
86  lower-quality joint N-gram model.
87 
88  This approach is best used with simple 1-best.
89  */
90 
91  //Build us a lattice pruner
92  LatticePruner pruner (aligner->penalties, threshold, nbest, fb, penalize);
93 
94  ofstream ofile (ofile_name.c_str ());
95  VetoSet veto_set_;
96  veto_set_.insert (0);
97  for (unsigned int i = 0; i < aligner->fsas.size (); i++) {
98  //Map to Tropical semiring
99  VectorFst<StdArc>* tfst = new VectorFst<StdArc> ();
100  Map (aligner->fsas.at (i), tfst, LogToStdMapper ());
101  pruner.prune_fst (tfst);
102  RmEpsilon (tfst);
103  //Skip empty results. This should only happen
104  // in the following situations:
105  // 1. seq1_del=false && len(seq1)<len(seq2)
106  // 2. seq2_del=false && len(seq1)>len(seq2)
107  //In both 1.and 2. the issue is that we need to
108  // insert a 'skip' in order to guarantee at least
109  // one valid alignment path through seq1*seq2, but
110  // user params didn't allow us to.
111  //Probably better to insert these where necessary
112  // during initialization, regardless of user prefs.
113  if (tfst->NumStates () > 0) {
114  StdArc::Weight weight_threshold = 99;
115  StdArc::StateId state_threshold = kNoStateId;
116  AnyArcFilter<StdArc> arc_filter;
117  vector<StdArc::Weight> distance;
118  VectorFst<StdArc> ofst;
119 
120  AutoQueue<StdArc::StateId> state_queue (*tfst, &distance, arc_filter);
121  IdentityPathFilter<StdArc> path_filter;
122 
123  ShortestPathOptions<StdArc, AutoQueue<StdArc::StateId>,
124  AnyArcFilter<StdArc> >
125  opts (&state_queue, arc_filter, nbest, false, false,
126  kDelta, false, weight_threshold,
127  state_threshold);
128  ShortestPathSpecialized (*tfst, &ofst, &distance,
129  &path_filter, 10000, opts);
130  for (size_t i = 0; i < path_filter.ordered_paths.size (); i++) {
131  const vector<int>& path = path_filter.ordered_paths[i];
132  for (size_t j = 0; j < path.size (); j++) {
133  ofile << aligner->isyms->Find (path [j]);
134  if (j < path.size () - 1)
135  ofile << " ";
136  }
137  ofile << "\n";
138  }
139  }
140  delete tfst;
141  }
142 
143  return;
144 }
145 
147  vector<VectorFst<LogArc> > *fsts,
148  string far_name, StdArc::Weight threshold,
149  int nbest, bool fb, bool penalize) {
150  /*
151  Generic method for compiling an FstARchive from a vector of FST lattices.
152  The 'nbest' and 'threshold' parameters may be used to heuristically prune
153  the individual input lattices.
154 
155  TODO: Forward-Backward pruning for the lattices for one last variation.
156  */
157 
158  //Book-keeping stuff
159  string key_prefix = "";
160  string key_suffix = "";
161  string key = "";
162  char keybuf[16];
163  int32 generate_keys = 7; //Suitable for up to several million lattices
164  bool set_syms = false; //Have we set the isyms successfully yet??
165  //Build us a FarWriter to compile the archive
166  FarWriter<StdArc> *far_writer = \
167  FarWriter<StdArc>::Create (far_name, FAR_DEFAULT);
168  //Build us a lattice pruner
169  LatticePruner pruner (aligner->penalties, threshold, nbest, fb, penalize);
170 
171  for (unsigned int i = 0; i < fsts->size (); i++) {
172  //Maybe the alignment params did not permit any
173  // valid alignment. If so, do not bother with additional
174  // post-processing, don't add to the archive, do not pass go!
175  if (fsts->at (i).NumStates () == 0) continue;
176  //There has got to be a more efficient way to do this!
177  VectorFst<StdArc>* tfst = new VectorFst<StdArc> ();
178  VectorFst<LogArc>* lfst = new VectorFst<LogArc> ();
179  VectorFst<LogArc>* pfst = new VectorFst<LogArc> ();
180  VectorFst<StdArc>* ffst = new VectorFst<StdArc> ();
181 
182  //Map to the Tropical semiring
183  Map (fsts->at(i), tfst, LogToStdMapper ());
184  pruner.prune_fst (tfst);
185 
186  //Map back to the Log semiring
187  Map (*tfst, lfst, StdToLogMapper ());
188 
189  //Perform posterior normalization of the N-best lattice by pushing weights
190  // in the log semiring and then removing the final weight.
191  //When N=1, this will also have the effect of removing all weights.
192  // The .far result here will be identical to the non-lattice 'write_alignments()'.
193  Push<LogArc, REWEIGHT_TO_FINAL> (*lfst, pfst, kPushWeights);
194  for (StateIterator<VectorFst<LogArc> > siter (*pfst);
195  !siter.Done (); siter.Next ()) {
196  size_t v = siter.Value();
197  if (pfst->Final(v) != LogArc::Weight::Zero ()) {
198  pfst->SetFinal (v,LogArc::Weight::One ());
199  }
200  }
201 
202  //Maybe we pruned everything. If so, don't add this to the archive
203  // as the empty fst will cause ngramcount to fail.
204  if (pfst->NumStates () == 0) continue;
205 
206 
207  //Finally map back to the Tropical semiring for the last time
208  Map (*pfst, ffst, LogToStdMapper ());
209 
210  if (set_syms == false) {
211  ffst->SetInputSymbols (aligner->isyms);
212  ffst->SetOutputSymbols (aligner->isyms);
213  set_syms = true;
214  }
215 
216  sprintf (keybuf, "%0*d", generate_keys, i+1);
217  key = keybuf;
218 
219  //Write the final result to the FARchive
220  far_writer->Add (key_prefix + key + key_suffix, *ffst);
221 
222  //Cleanup the temporary FSTs
223  delete lfst;
224  delete tfst;
225  delete pfst;
226  delete ffst;
227  }
228  //Cleanup the archive writer
229  delete far_writer;
230 
231  return;
232 }
233 
234 
235 DEFINE_bool (seq1_del, true, "Allow deletions in sequence one." );
236 DEFINE_bool (seq2_del, true, "Allow deletions in sequence two." );
237 DEFINE_bool (penalize, true, "Penalize scores." );
238 DEFINE_bool (penalize_em, false, "Penalize links during EM training." );
239 DEFINE_bool (load_model, false, "Load a pre-trained model for use." );
240 DEFINE_bool (lattice, false, "Write out the alignment lattices as an fst archive (.far)." );
241 DEFINE_bool (restrict, true, "Restrict links to M-1, 1-N during initialization." );
242 DEFINE_bool (mbr, false, "Use the LMBR decoder (not yet implemented)." );
243 DEFINE_bool (fb, false, "Use forward-backward pruning for the alignment lattices." );
244 DEFINE_int32 (seq1_max, 2, "Maximum subsequence length for sequence one." );
245 DEFINE_int32 (seq2_max, 2, "Maximum subsequence length for sequence two." );
246 DEFINE_int32 (iter, 11, "Maximum number of EM iterations to perform." );
247 DEFINE_int32 (nbest, 1, "Output the N-best alignments given the model." );
248 DEFINE_string (input, "", "Two-column input file to align." );
249 DEFINE_string (seq1_sep, "|", "Multi-token separator for input tokens." );
250 DEFINE_string (seq2_sep, "|", "Multi-token separator for output tokens." );
251 DEFINE_string (s1s2_sep, "}", "Token used to separate input-output subsequences in the g2p model." );
252 DEFINE_string (delim, "\t", "Delimiter separating entry one and entry two in the input file." );
253 DEFINE_string (eps, "<eps>", "Epsilon symbol." );
254 DEFINE_string (skip, "_", "Skip token used to represent null transitions. Distinct from epsilon." );
255 DEFINE_string (ofile, "", "Output file to write the aligned dictionary to." );
256 DEFINE_string (s1_char_delim, "", "Sequence one input delimeter." );
257 DEFINE_string (s2_char_delim, " ", "Sequence two input delimeter." );
258 DEFINE_string (model_file, "", "FST-format alignment model to load." );
259 DEFINE_string (write_model, "", "Write out the alignment model in OpenFst format to filename." );
260 DEFINE_double (thresh, 1e-10, "Delta threshold for EM training termination." );
261 DEFINE_double (pthresh, -99, "Pruning threshold. Use to prune unlikely N-best candidates when using multiple alignments.");
262 
263 int main( int argc, char* argv[] ){
264  string usage = "phonetisaurus-align --input=dictionary --ofile=corpus.\n\n Usage: ";
265  set_new_handler (FailedNewHandler);
266  PhonetisaurusSetFlags (usage.c_str(), &argc, &argv, false);
267  M2MFstAligner aligner;
268 
269  if( FLAGS_load_model==true ){
270  aligner = *(new M2MFstAligner (FLAGS_model_file, FLAGS_penalize,
271  FLAGS_penalize_em, FLAGS_restrict));
272  switch (load_input_file (&aligner, FLAGS_input, FLAGS_delim,
273  FLAGS_s1_char_delim, FLAGS_s2_char_delim,
274  FLAGS_load_model)) {
275  case 0:
276  cerr << "Please provide a valid input file." << endl;
277  case -1:
278  return -1;
279  }
280  }else{
281  aligner = *(new M2MFstAligner (FLAGS_seq1_del, FLAGS_seq2_del,
282  FLAGS_seq1_max, FLAGS_seq2_max,
283  FLAGS_seq1_sep, FLAGS_seq2_sep,
284  FLAGS_s1s2_sep, FLAGS_eps, FLAGS_skip,
285  FLAGS_penalize, FLAGS_penalize_em,
286  FLAGS_restrict
287  ));
288  switch (load_input_file (&aligner, FLAGS_input, FLAGS_delim,
289  FLAGS_s1_char_delim, FLAGS_s2_char_delim,
290  FLAGS_load_model)) {
291  case 0:
292  cerr << "Please provide a valid input file." << endl;
293  case -1:
294  return -1;
295  }
296 
297  cerr << "Starting EM..." << endl;
298  aligner.maximization (false);
299  cerr << "Finished first iter..." << endl;
300  for (int i = 1; i <= FLAGS_iter; i++) {
301  cerr << "Iteration: " << i << " Change: ";
302  aligner.expectation ();
303  cerr << aligner.maximization (false) << endl;
304  }
305 
306  cerr << "Last iteration: " << endl;
307  aligner.expectation ();
308  aligner.maximization (true);
309  }
310 
311  StdArc::Weight pthresh = FLAGS_pthresh == -99.0
312  ? LogWeight::Zero().Value()
313  : FLAGS_pthresh;
314  if (FLAGS_write_model.compare ("") != 0) {
315  cerr << "Writing alignment model in OpenFst format to file: "
316  << FLAGS_write_model << endl;
317  aligner.write_model (FLAGS_write_model);
318  }
319 
320  if (FLAGS_lattice == true)
321  compileNBestFarArchive (&aligner, &aligner.fsas, FLAGS_ofile, pthresh,
322  FLAGS_nbest, FLAGS_fb, FLAGS_penalize);
323  else
324  write_alignments (&aligner, FLAGS_ofile, pthresh, FLAGS_nbest,
325  FLAGS_fb, FLAGS_penalize);
326 
327  return 1;
328 }
void write_alignments(M2MFstAligner *aligner, string ofile_name, StdArc::Weight threshold, int nbest, bool fb, bool penalize)
DEFINE_string(input,"","Two-column input file to align.")
vector< VectorFst< LogArc > > fsas
Definition: M2MFstAligner.h:79
unordered_set< int > VetoSet
DEFINE_bool(seq1_del, true,"Allow deletions in sequence one.")
void PhonetisaurusSetFlags(const char *usage, int *argc, char ***argv, bool remove_flags)
Definition: util.cc:158
DEFINE_double(thresh, 1e-10,"Delta threshold for EM training termination.")
float maximization(bool lastiter)
int main(int argc, char *argv[])
SymbolTable * isyms
Definition: M2MFstAligner.h:85
vector< vector< int > > ordered_paths
DEFINE_int32(seq1_max, 2,"Maximum subsequence length for sequence one.")
void ShortestPathSpecialized(const Fst< Arc > &ifst, MutableFst< Arc > *ofst, vector< typename Arc::Weight > *distance, PathFilter *path_filter, size_t beam, ShortestPathOptions< Arc, Queue, ArcFilter > &opts)
void entry2alignfstnoinit(vector< string > seq1, vector< string > seq2, int nbest, string lattice="")
void prune_fst(VectorFst< StdArc > *fst)
vector< string > tokenize_utf8_string(string *utf8_string, string *delimiter)
Definition: util.cc:50
void entry2alignfst(vector< string > seq1, vector< string > seq2)
void compileNBestFarArchive(M2MFstAligner *aligner, vector< VectorFst< LogArc > > *fsts, string far_name, StdArc::Weight threshold, int nbest, bool fb, bool penalize)
int load_input_file(M2MFstAligner *aligner, string input_file, string delim, string s1_char_delim, string s2_char_delim, bool init=false)