Phonetisaurus  1.0
FST-based Grapheme-to-Phoneme conversion
All Classes Namespaces Files Functions Variables Typedefs Macros Modules Pages
Functions
phonetisaurus-align.cc File Reference
#include <include/M2MFstAligner.h>
#include <include/LatticePruner.h>
#include <include/util.h>
#include <include/PhonetisaurusRex.h>

Go to the source code of this file.

Functions

int load_input_file (M2MFstAligner *aligner, string input_file, string delim, string s1_char_delim, string s2_char_delim, bool init=false)
 
void write_alignments (M2MFstAligner *aligner, string ofile_name, StdArc::Weight threshold, int nbest, bool fb, bool penalize)
 
void compileNBestFarArchive (M2MFstAligner *aligner, vector< VectorFst< LogArc > > *fsts, string far_name, StdArc::Weight threshold, int nbest, bool fb, bool penalize)
 
 DEFINE_bool (seq1_del, true,"Allow deletions in sequence one.")
 
 DEFINE_bool (seq2_del, true,"Allow deletions in sequence two.")
 
 DEFINE_bool (penalize, true,"Penalize scores.")
 
 DEFINE_bool (penalize_em, false,"Penalize links during EM training.")
 
 DEFINE_bool (load_model, false,"Load a pre-trained model for use.")
 
 DEFINE_bool (lattice, false,"Write out the alignment lattices as an fst archive (.far).")
 
 DEFINE_bool (restrict, true,"Restrict links to M-1, 1-N during initialization.")
 
 DEFINE_bool (mbr, false,"Use the LMBR decoder (not yet implemented).")
 
 DEFINE_bool (fb, false,"Use forward-backward pruning for the alignment lattices.")
 
 DEFINE_int32 (seq1_max, 2,"Maximum subsequence length for sequence one.")
 
 DEFINE_int32 (seq2_max, 2,"Maximum subsequence length for sequence two.")
 
 DEFINE_int32 (iter, 11,"Maximum number of EM iterations to perform.")
 
 DEFINE_int32 (nbest, 1,"Output the N-best alignments given the model.")
 
 DEFINE_string (input,"","Two-column input file to align.")
 
 DEFINE_string (seq1_sep,"|","Multi-token separator for input tokens.")
 
 DEFINE_string (seq2_sep,"|","Multi-token separator for output tokens.")
 
 DEFINE_string (s1s2_sep,"}","Token used to separate input-output subsequences in the g2p model.")
 
 DEFINE_string (delim,"\t","Delimiter separating entry one and entry two in the input file.")
 
 DEFINE_string (eps,"<eps>","Epsilon symbol.")
 
 DEFINE_string (skip,"_","Skip token used to represent null transitions. Distinct from epsilon.")
 
 DEFINE_string (ofile,"","Output file to write the aligned dictionary to.")
 
 DEFINE_string (s1_char_delim,"","Sequence one input delimeter.")
 
 DEFINE_string (s2_char_delim," ","Sequence two input delimeter.")
 
 DEFINE_string (model_file,"","FST-format alignment model to load.")
 
 DEFINE_string (write_model,"","Write out the alignment model in OpenFst format to filename.")
 
 DEFINE_double (thresh, 1e-10,"Delta threshold for EM training termination.")
 
 DEFINE_double (pthresh,-99,"Pruning threshold. Use to prune unlikely N-best candidates when using multiple alignments.")
 
int main (int argc, char *argv[])
 

Function Documentation

void compileNBestFarArchive ( M2MFstAligner *  aligner,
vector< VectorFst< LogArc > > *  fsts,
string  far_name,
StdArc::Weight  threshold,
int  nbest,
bool  fb,
bool  penalize 
)

Definition at line 146 of file phonetisaurus-align.cc.

DEFINE_bool ( seq1_del  ,
true  ,
"Allow deletions in sequence one."   
)
DEFINE_bool ( seq2_del  ,
true  ,
"Allow deletions in sequence two."   
)
DEFINE_bool ( penalize  ,
true  ,
"Penalize scores."   
)
DEFINE_bool ( penalize_em  ,
false  ,
"Penalize links during EM training."   
)
DEFINE_bool ( load_model  ,
false  ,
"Load a pre-trained model for use."   
)
DEFINE_bool ( lattice  ,
false  ,
"Write out the alignment lattices as an fst archive (.far)."   
)
DEFINE_bool ( restrict  ,
true  ,
"Restrict links to M-1, 1-N during initialization."   
)
DEFINE_bool ( mbr  ,
false  ,
"Use the LMBR decoder (not yet implemented)."   
)
DEFINE_bool ( fb  ,
false  ,
"Use forward-backward pruning for the alignment lattices."   
)
DEFINE_double ( thresh  ,
1e-10  ,
"Delta threshold for EM training termination."   
)
DEFINE_double ( pthresh  ,
-99  ,
"Pruning threshold. Use to prune unlikely N-best candidates when using multiple alignments."   
)
DEFINE_int32 ( seq1_max  ,
2  ,
"Maximum subsequence length for sequence one."   
)
DEFINE_int32 ( seq2_max  ,
2  ,
"Maximum subsequence length for sequence two."   
)
DEFINE_int32 ( iter  ,
11  ,
"Maximum number of EM iterations to perform."   
)
DEFINE_int32 ( nbest  ,
1  ,
"Output the N-best alignments given the model."   
)
DEFINE_string ( input  ,
""  ,
"Two-column input file to align."   
)
DEFINE_string ( seq1_sep  ,
"|"  ,
"Multi-token separator for input tokens."   
)
DEFINE_string ( seq2_sep  ,
"|"  ,
"Multi-token separator for output tokens."   
)
DEFINE_string ( s1s2_sep  ,
"}"  ,
"Token used to separate input-output subsequences in the g2p model."   
)
DEFINE_string ( delim  ,
"\t"  ,
"Delimiter separating entry one and entry two in the input file."   
)
DEFINE_string ( eps  ,
"<eps>"  ,
"Epsilon symbol."   
)
DEFINE_string ( skip  ,
"_"  ,
"Skip token used to represent null transitions. Distinct from epsilon."   
)
DEFINE_string ( ofile  ,
""  ,
"Output file to write the aligned dictionary to."   
)
DEFINE_string ( s1_char_delim  ,
""  ,
"Sequence one input delimeter."   
)
DEFINE_string ( s2_char_delim  ,
" "  ,
"Sequence two input delimeter."   
)
DEFINE_string ( model_file  ,
""  ,
"FST-format alignment model to load."   
)
DEFINE_string ( write_model  ,
""  ,
"Write out the alignment model in OpenFst format to filename."   
)
int load_input_file ( M2MFstAligner *  aligner,
string  input_file,
string  delim,
string  s1_char_delim,
string  s2_char_delim,
bool  init = false 
)

Definition at line 39 of file phonetisaurus-align.cc.

int main ( int  argc,
char *  argv[] 
)

Definition at line 263 of file phonetisaurus-align.cc.

void write_alignments ( M2MFstAligner *  aligner,
string  ofile_name,
StdArc::Weight  threshold,
int  nbest,
bool  fb,
bool  penalize 
)

Definition at line 75 of file phonetisaurus-align.cc.