Phonetisaurus  1.0
FST-based Grapheme-to-Phoneme conversion
util.cc
Go to the documentation of this file.
1 /*
2  Copyright (c) [2012-], Josef Robert Novak
3  All rights reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions
7  are met:
8 
9  * Redistributions of source code must retain the above copyright
10  notice, this list of conditions and the following disclaimer.
11  * Redistributions in binary form must reproduce the above
12  copyright notice, this list of conditions and the following
13  disclaimer in the documentation and/or other materials provided
14  with the distribution.
15 
16  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20  COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
21  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
25  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
27  OF THE POSSIBILITY OF SUCH DAMAGE.
28 *
29 */
#include <include/util.h>
#ifdef __MACH__
#include <mach/mach.h>
#endif
31 using namespace fst;
32 
33 
/*
  Join the elements of 'vec' into a single string, inserting 'sep'
  between consecutive elements.  No separator is written before the
  first element or after the last one; an empty vector yields an
  empty string.
*/
std::string vec2str( std::vector<std::string> vec, std::string sep ){
  std::string joined;
  std::vector<std::string>::iterator item = vec.begin ();
  if (item == vec.end ())
    return joined;

  // The first element is copied as-is; every later one is preceded
  // by the separator.
  joined = *item;
  for (++item; item != vec.end (); ++item) {
    joined += sep;
    joined += *item;
  }
  return joined;
}
43 
/*
  Format the integer 'i' as text by streaming it into an in-memory
  output stream, and return the resulting string.
*/
std::string itoas( int i ){
  std::ostringstream formatted;
  formatted << i;
  return formatted.str ();
}
49 
50 vector<string> tokenize_utf8_string (string* utf8_string, string* delimiter) {
51  /*
52  Support for tokenizing a utf-8 string. Adapted to also
53  support a delimiter. Note that leading, trailing or multiple
54  consecutive delimiters will result in empty vector elements.
55  Normally should not be a problem but just in case. Also note
56  that any tokens that cannot be found in the model symbol table will be
57  deleted from the input word prior to grapheme-to-phoneme conversion.
58 
59  http://stackoverflow.com/questions/2852895/c-iterate-or-split-\
60  utf-8-string-into-array-of-symbols#2856241
61  */
62  char* str = (char*) utf8_string->c_str (); // utf-8 string
63  char* str_i = str; // string iterator
64  char* str_j = str;
65  char* end = str + strlen (str) + 1; // end iterator
66  vector<string> string_vec;
67  if (delimiter->compare ("") != 0)
68  string_vec.push_back ("");
69 
70  do {
71  str_j = str_i;
72  utf8::uint32_t code = utf8::next (str_i, end); // get 32 bit code
73  if (code == 0)
74  continue;
75  int start = strlen (str) - strlen (str_j);
76  int end = strlen (str) - strlen (str_i);
77  int len = end - start;
78 
79  if (delimiter->compare ("") == 0) {
80  string_vec.push_back (utf8_string->substr (start,len));
81  } else {
82  if (delimiter->compare (utf8_string->substr (start, len)) == 0)
83  string_vec.push_back ("");
84  else
85  string_vec [string_vec.size () - 1] += utf8_string->substr (start, len);
86  }
87  } while (str_i < end);
88 
89  return string_vec;
90 }
91 
92 
93 vector<string> tokenize_entry (string* testword, string* sep,
94  SymbolTable* syms) {
95  vector<string> tokens = tokenize_utf8_string (testword, sep);
96  vector<string> entry;
97  for (unsigned int i=0; i<tokens.size (); i++) {
98  if (syms->Find (tokens.at (i)) != -1) {
99  entry.push_back (tokens.at (i));
100  }else{
101  cerr << "Symbol: '" << tokens.at (i)
102  << "' not found in input symbols table." << endl
103  << "Mapping to null..." << endl;
104  }
105  }
106 
107  return entry;
108 }
109 
110 vector<int> tokenize2ints (string* testword, string* sep,
111  const SymbolTable* syms) {
112  vector<string> tokens = tokenize_utf8_string (testword, sep);
113  vector<int> entry;
114  for (unsigned int i=0; i<tokens.size(); i++) {
115  int label = syms->Find (tokens[i]);
116  if (label == -1)
117  cerr << "Symbol: '" << tokens[i]
118  << "' not found in input symbols table." << endl
119  << "Mapping to null..." << endl;
120  else
121  entry.push_back (label);
122  }
123 
124  return entry;
125 }
126 
/*
  Return the current time of the realtime clock as a timespec.

  When __MACH__ is defined the value is read from the Mach
  REALTIME_CLOCK service; otherwise clock_gettime (CLOCK_REALTIME)
  is used.  The status of the underlying calls is not inspected: the
  result is initialised to zero, so that is what comes back if a
  call fails without filling it in.
*/
#ifdef __MACH__
timespec get_time( ){
  clock_serv_t cclock;
  mach_timespec_t mts = {0, 0};
  host_get_clock_service (mach_host_self (), REALTIME_CLOCK, &cclock);
  clock_get_time (cclock, &mts);
  // host_get_clock_service hands back a port right for the clock
  // service; give it back, otherwise every call leaks one.
  mach_port_deallocate (mach_task_self (), cclock);

  timespec ts = {mts.tv_sec, mts.tv_nsec};
  return ts;
}
#else
timespec get_time( ){
  timespec ts = {0, 0};
  clock_gettime (CLOCK_REALTIME, &ts);
  return ts;
}
#endif
144 
/*
  Return end - start as a timespec.

  The seconds and nanoseconds are subtracted field by field; when the
  nanosecond difference is negative, one second is borrowed so that
  tv_nsec is brought back up by 1000000000.
*/
timespec diff(timespec start, timespec end){
  timespec elapsed;
  elapsed.tv_sec = end.tv_sec - start.tv_sec;
  elapsed.tv_nsec = end.tv_nsec - start.tv_nsec;

  if (elapsed.tv_nsec < 0) {
    // Borrow one second to cover the nanosecond deficit.
    elapsed.tv_sec -= 1;
    elapsed.tv_nsec += 1000000000;
  }
  return elapsed;
}
156 
157 DEFINE_bool (help, false, "show usage information");
158 void PhonetisaurusSetFlags (const char* usage, int* argc, char*** argv,
159  bool remove_flags) {
160  //Workaround for Apple's. It just skips all the options processing.
161 #ifdef DARWIN
162  SetFlags (usage, argc, argv, remove_flags);
163 #else
164  int index = 1;
165  for (; index < *argc; ++index) {
166  string argval = (*argv)[index];
167 
168  if (argval[0] != '-' || argval == "-")
169  break;
170  while (argval[0] == '-')
171  argval = argval.substr(1); // remove initial '-'s
172 
173  string arg = argval;
174  string val = "";
175 
176  // split argval (arg=val) into arg and val
177  size_t pos = argval.find("=");
178  if (pos != string::npos) {
179  arg = argval.substr(0, pos);
180  val = argval.substr(pos + 1);
181  }
182 
183 
184  FlagRegister<bool> *bool_register =
185  FlagRegister<bool>::GetRegister();
186  if (bool_register->SetFlag(arg, val))
187  continue;
188  FlagRegister<string> *string_register =
189  FlagRegister<string>::GetRegister();
190  if (string_register->SetFlag(arg, val))
191  continue;
192  FlagRegister<int32> *int32_register =
193  FlagRegister<int32>::GetRegister();
194  if (int32_register->SetFlag(arg, val))
195  continue;
196  FlagRegister<int64> *int64_register =
197  FlagRegister<int64>::GetRegister();
198  if (int64_register->SetFlag(arg, val))
199  continue;
200  FlagRegister<double> *double_register =
201  FlagRegister<double>::GetRegister();
202  if (double_register->SetFlag(arg, val))
203  continue;
204 
205  LOG(FATAL) << "SetFlags: Bad option: " << (*argv)[index];
206  }
207 
208  if (FLAGS_help) {
209  //Just show program flags - NOT general OpenFst flags
210  // There are too many and they are just confusing.
211  std::set< pair<string, string> > usage_set;
212 
213  cout << usage << "\n";
214 
215  FlagRegister<bool> *bool_register = FlagRegister<bool>::GetRegister();
216  bool_register->GetUsage(&usage_set);
217  FlagRegister<string> *string_register = FlagRegister<string>::GetRegister();
218  string_register->GetUsage(&usage_set);
219  FlagRegister<int32> *int32_register = FlagRegister<int32>::GetRegister();
220  int32_register->GetUsage(&usage_set);
221  FlagRegister<int64> *int64_register = FlagRegister<int64>::GetRegister();
222  int64_register->GetUsage(&usage_set);
223  FlagRegister<double> *double_register = FlagRegister<double>::GetRegister();
224  double_register->GetUsage(&usage_set);
225 
226  for (std::set< pair<string, string> >::const_iterator it =
227  usage_set.begin();
228  it != usage_set.end();
229  ++it) {
230  const string &file = it->first;
231  const string &usage = it->second;
232 
233  //if (file.compare ("flags.cc") == 0 || file.compare ("fst.cc") == 0
234  if (file.compare ("fst.cc") == 0 \
235  || file.compare ("symbol-table.cc") == 0 || \
236  file.compare ("util.cc") == 0)
237  continue;
238 
239  //Else print out the args - they are from the actual program
240  cout << usage << endl;
241  }
242  //Fake this
243  cout << " --help: type = bool, default = false" << endl;
244  cout << " show usage information" << endl;
245  exit(1);
246  }
247 #endif
248 }
249 
/*
  Append every non-empty line of the file 'filename' to 'corpus'.

  Lines are read with getline, so the newline is not stored; empty
  lines are skipped.  Entries already present in 'corpus' are kept.
  If the file cannot be opened a warning is written to stderr and
  'corpus' is left untouched.
*/
void LoadWordList (const std::string& filename,
                   std::vector<std::string>* corpus) {
  std::ifstream ifp (filename.c_str ());
  if (!ifp.is_open ()) {
    std::cerr << "LoadWordList: unable to open '" << filename
              << "' for reading." << std::endl;
    return;
  }

  // Testing the stream returned by getline stops the loop as soon as
  // a read fails, so only lines that were actually read are stored.
  std::string line;
  while (std::getline (ifp, line)) {
    if (!line.empty ())
      corpus->push_back (line);
  }
  // The stream is closed when 'ifp' goes out of scope.
}
266 
267 
/*
  Split 's' at every occurrence of 'delim' and append the pieces to
  'elems', in order.  'elems' is not cleared first.

  Adjacent delimiters produce empty pieces.  A delimiter at the very
  end of 's' does not produce a trailing empty piece, and an empty
  's' appends nothing.
*/
void Split (const std::string& s, char delim, std::vector<std::string>& elems) {
  std::string::size_type begin = 0;
  while (begin < s.size ()) {
    const std::string::size_type stop = s.find (delim, begin);
    if (stop == std::string::npos) {
      // No further delimiter: the remainder is the last piece.
      elems.push_back (s.substr (begin));
      return;
    }
    elems.push_back (s.substr (begin, stop - begin));
    begin = stop + 1;
  }
}
void LoadWordList(const std::string &filename, std::vector< std::string > *corpus)
Definition: util.cc:250
string itoas(int i)
Definition: util.cc:44
timespec diff(timespec start, timespec end)
Definition: util.cc:145
string vec2str(vector< string > vec, string sep)
Definition: util.cc:34
void PhonetisaurusSetFlags(const char *usage, int *argc, char ***argv, bool remove_flags)
Definition: util.cc:158
vector< string > tokenize_utf8_string(string *utf8_string, string *delimiter)
Definition: util.cc:50
vector< int > tokenize2ints(string *testword, string *sep, const SymbolTable *syms)
Definition: util.cc:110
timespec get_time()
Definition: util.cc:138
DEFINE_bool(help, false,"show usage information")
vector< string > tokenize_entry(string *testword, string *sep, SymbolTable *syms)
Definition: util.cc:93
void Split(const std::string &s, char delim, std::vector< std::string > &elems)
Definition: util.cc:268