tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
language_model.h
1 // File: language_model.h
3 // Description: Functions that utilize the knowledge about the properties,
4 // structure and statistics of the language to help segmentation
5 // search.
6 // Author: Daria Antonova
7 // Created: Mon Nov 11 11:26:43 PST 2009
8 //
9 // (C) Copyright 2009, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_
23 #define TESSERACT_WORDREC_LANGUAGE_MODEL_H_
24 
25 #include <cmath> // for exp
26 #include "associate.h" // for AssociateStats (ptr only), AssociateUtils
27 #include "dawg.h" // for DawgPositionVector
28 #include "dict.h" // for DawgArgs, Dict
29 #include "lm_consistency.h" // for LMConsistencyInfo
30 #include "lm_state.h" // for ViterbiStateEntry, LanguageModelFlagsType
31 #include "params.h" // for DoubleParam, double_VAR_H, IntParam, Boo...
32 #include "params_model.h" // for ParamsModel
33 #include "ratngs.h" // for BLOB_CHOICE (ptr only), BLOB_CHOICE_LIST...
34 #include "stopper.h" // for DANGERR
35 #include "strngs.h" // for STRING
36 
37 class UNICHARSET;
38 class WERD_RES;
39 
40 struct BlamerBundle;
41 
42 template <typename T> class UnicityTable;
43 
44 namespace tesseract {
45 
46 class LMPainPoints;
47 struct FontInfo;
48 
49 // This class that contains the data structures and functions necessary
50 // to represent and use the knowledge about the language.
52  public:
53  // Masks for keeping track of top choices that should not be pruned out.
57  static const LanguageModelFlagsType kDigitFlag = 0x8;
59 
60  // Denominator for normalizing per-letter ngram cost when deriving
61  // penalty adjustments.
62  static const float kMaxAvgNgramCost;
63 
64  LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict);
66 
67  // Fills the given floats array with features extracted from path represented
68  // by the given ViterbiStateEntry. See ccstruct/params_training_featdef.h
69  // for feature information.
70  // Note: the function assumes that features points to an array of size
71  // PTRAIN_NUM_FEATURE_TYPES.
72  static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse,
73  float features[]);
74 
75  // Updates data structures that are used for the duration of the segmentation
76  // search on the current word;
77  void InitForWord(const WERD_CHOICE *prev_word,
78  bool fixed_pitch, float max_char_wh_ratio,
79  float rating_cert_scale);
80 
81  // Updates language model state of the given BLOB_CHOICE_LIST (from
 82  // the ratings matrix) and its parent. Updates pain_points if new
83  // problematic points are found in the segmentation graph.
84  //
85  // At most language_model_viterbi_list_size are kept in each
86  // LanguageModelState.viterbi_state_entries list.
87  // At most language_model_viterbi_list_max_num_prunable of those are prunable
88  // (non-dictionary) paths.
89  // The entries that represent dictionary word paths are kept at the front
90  // of the list.
 91  // The list is ordered by cost that is computed collectively by several
92  // language model components (currently dawg and ngram components).
93  bool UpdateState(
94  bool just_classified,
95  int curr_col, int curr_row,
96  BLOB_CHOICE_LIST *curr_list,
97  LanguageModelState *parent_node,
98  LMPainPoints *pain_points,
99  WERD_RES *word_res,
100  BestChoiceBundle *best_choice_bundle,
101  BlamerBundle *blamer_bundle);
102 
103  // Returns true if an acceptable best choice was discovered.
105  inline void SetAcceptableChoiceFound(bool val) {
107  }
108  // Returns the reference to ParamsModel.
110 
111  protected:
112 
113  inline float CertaintyScore(float cert) {
114  if (language_model_use_sigmoidal_certainty) {
115  // cert is assumed to be between 0 and -dict_->certainty_scale.
116  // If you enable language_model_use_sigmoidal_certainty, you
117  // need to adjust language_model_ngram_nonmatch_score as well.
118  cert = -cert / dict_->certainty_scale;
119  return 1.0f / (1.0f + exp(10.0f * cert));
120  } else {
121  return (-1.0f / cert);
122  }
123  }
124 
125  inline float ComputeAdjustment(int num_problems, float penalty) {
126  if (num_problems == 0) return 0.0f;
127  if (num_problems == 1) return penalty;
128  return (penalty + (language_model_penalty_increment *
129  static_cast<float>(num_problems-1)));
130  }
131 
132  // Computes the adjustment to the ratings sum based on the given
133  // consistency_info. The paths with invalid punctuation, inconsistent
134  // case and character type are penalized proportionally to the number
135  // of inconsistencies on the path.
137  const LanguageModelDawgInfo *dawg_info,
138  const LMConsistencyInfo &consistency_info) {
139  if (dawg_info != nullptr) {
140  return ComputeAdjustment(consistency_info.NumInconsistentCase(),
141  language_model_penalty_case) +
142  (consistency_info.inconsistent_script ?
143  language_model_penalty_script : 0.0f);
144  }
145  return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
146  language_model_penalty_punc) +
147  ComputeAdjustment(consistency_info.NumInconsistentCase(),
148  language_model_penalty_case) +
149  ComputeAdjustment(consistency_info.NumInconsistentChartype(),
150  language_model_penalty_chartype) +
151  ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
152  language_model_penalty_spacing) +
153  (consistency_info.inconsistent_script ?
154  language_model_penalty_script : 0.0f) +
155  (consistency_info.inconsistent_font ?
156  language_model_penalty_font : 0.0f));
157  }
158 
159  // Returns an adjusted ratings sum that includes inconsistency penalties,
160  // penalties for non-dictionary paths and paths with dips in ngram
161  // probability.
163 
164  // Finds the first lower and upper case letter and first digit in curr_list.
165  // Uses the first character in the list in place of empty results.
166  // Returns true if both alpha and digits are found.
167  bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
168  BLOB_CHOICE **first_lower,
169  BLOB_CHOICE **first_upper,
170  BLOB_CHOICE **first_digit) const;
171  // Forces there to be at least one entry in the overall set of the
172  // viterbi_state_entries of each element of parent_node that has the
173  // top_choice_flag set for lower, upper and digit using the same rules as
174  // GetTopLowerUpperDigit, setting the flag on the first found suitable
175  // candidate, whether or not the flag is set on some other parent.
176  // Returns 1 if both alpha and digits are found among the parents, -1 if no
177  // parents are found at all (a legitimate case), and 0 otherwise.
178  int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const;
179 
180  // Finds the next ViterbiStateEntry with which the given unichar_id can
181  // combine sensibly, taking into account any mixed alnum/mixed case
182  // situation, and whether this combination has been inspected before.
184  bool just_classified, bool mixed_alnum,
185  const BLOB_CHOICE* bc, LanguageModelFlagsType blob_choice_flags,
186  const UNICHARSET& unicharset, WERD_RES* word_res,
187  ViterbiStateEntry_IT* vse_it,
188  LanguageModelFlagsType* top_choice_flags) const;
189  // Helper function that computes the cost of the path composed of the
190  // path in the given parent ViterbiStateEntry and the given BLOB_CHOICE.
191  // If the new path looks good enough, adds a new ViterbiStateEntry to the
192  // list of viterbi entries in the given BLOB_CHOICE and returns true.
194  LanguageModelFlagsType top_choice_flags, float denom, bool word_end,
195  int curr_col, int curr_row, BLOB_CHOICE *b,
196  LanguageModelState *curr_state, ViterbiStateEntry *parent_vse,
197  LMPainPoints *pain_points, WERD_RES *word_res,
198  BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);
199 
200  // Determines whether a potential entry is a true top choice and
201  // updates changed accordingly.
202  //
203  // Note: The function assumes that b, top_choice_flags and changed
204  // are not nullptr.
206  const ViterbiStateEntry *parent_vse,
207  LanguageModelState *lms);
208 
209  // Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and
210  // unichar from b.unichar_id(). Constructs and returns LanguageModelDawgInfo
211  // with updated active dawgs, constraints and permuter.
212  //
213  // Note: the caller is responsible for deleting the returned pointer.
215  int curr_col, int curr_row,
216  const BLOB_CHOICE &b,
217  const ViterbiStateEntry *parent_vse);
218 
219  // Computes p(unichar | parent context) and records it in ngram_cost.
220  // If b.unichar_id() is an unlikely continuation of the parent context
221  // sets found_small_prob to true and returns nullptr.
222  // Otherwise creates a new LanguageModelNgramInfo entry containing the
223  // updated context (that includes b.unichar_id() at the end) and returns it.
224  //
225  // Note: the caller is responsible for deleting the returned pointer.
227  const char *unichar, float certainty, float denom,
228  int curr_col, int curr_row, float outline_length,
229  const ViterbiStateEntry *parent_vse);
230 
231  // Computes -(log(prob(classifier)) + log(prob(ngram model)))
232  // for the given unichar in the given context. If there are multiple
233  // unichars at one position - takes the average of their probabilities.
234  // UNICHAR::utf8_step() is used to separate out individual UTF8 characters,
235  // since probability_in_context() can only handle one at a time (while
236  // unicharset might contain ngrams and glyphs composed from multiple UTF8
237  // characters).
238  float ComputeNgramCost(const char *unichar, float certainty, float denom,
239  const char *context, int *unichar_step_len,
240  bool *found_small_prob, float *ngram_prob);
241 
242  // Computes the normalization factors for the classifier confidences
243  // (used by ComputeNgramCost()).
244  float ComputeDenom(BLOB_CHOICE_LIST *curr_list);
245 
 246  // Fills the given consistency_info based on parent_vse.consistency_info
247  // and on the consistency of the given unichar_id with parent_vse.
248  void FillConsistencyInfo(
249  int curr_col, bool word_end, BLOB_CHOICE *b,
250  ViterbiStateEntry *parent_vse,
251  WERD_RES *word_res,
252  LMConsistencyInfo *consistency_info);
253 
254  // Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs
255  // on the path represented by the given BLOB_CHOICE and language model
256  // state entries (lmse, dse). The path is re-constructed by following
 257  // the parent pointers in the lang model state entries). If the
258  // constructed WERD_CHOICE is better than the best/raw choice recorded
259  // in the best_choice_bundle, this function updates the corresponding
 260  // fields and sets best_choice_bundle->updated to true.
262  LMPainPoints *pain_points,
263  WERD_RES *word_res,
264  BestChoiceBundle *best_choice_bundle,
265  BlamerBundle *blamer_bundle);
266 
267  // Constructs a WERD_CHOICE by tracing parent pointers starting with
268  // the given LanguageModelStateEntry. Returns the constructed word.
269  // Updates best_char_choices, certainties and state if they are not
270  // nullptr (best_char_choices and certainties are assumed to have the
271  // length equal to lmse->length).
272  // The caller is responsible for freeing memory associated with the
273  // returned WERD_CHOICE.
275  WERD_RES *word_res,
276  DANGERR *fixpt,
277  BlamerBundle *blamer_bundle,
278  bool *truth_path);
279 
280  // Wrapper around AssociateUtils::ComputeStats().
281  inline void ComputeAssociateStats(int col, int row,
282  float max_char_wh_ratio,
283  ViterbiStateEntry *parent_vse,
284  WERD_RES *word_res,
285  AssociateStats *associate_stats) {
287  col, row,
288  (parent_vse != nullptr) ? &(parent_vse->associate_stats) : nullptr,
289  (parent_vse != nullptr) ? parent_vse->length : 0,
290  fixed_pitch_, max_char_wh_ratio,
291  word_res, language_model_debug_level > 2, associate_stats);
292  }
293 
294  // Returns true if the path with such top_choice_flags and dawg_info
295  // could be pruned out (i.e. is neither a system/user/frequent dictionary
296  // nor a top choice path).
297  // In non-space delimited languages all paths can be "somewhat" dictionary
298  // words. In such languages we can not do dictionary-driven path pruning,
299  // so paths with non-empty dawg_info are considered prunable.
300  inline bool PrunablePath(const ViterbiStateEntry &vse) {
301  if (vse.top_choice_flags) return false;
302  if (vse.dawg_info != nullptr &&
303  (vse.dawg_info->permuter == SYSTEM_DAWG_PERM ||
304  vse.dawg_info->permuter == USER_DAWG_PERM ||
305  vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false;
306  return true;
307  }
308 
309  // Returns true if the given ViterbiStateEntry represents an acceptable path.
310  inline bool AcceptablePath(const ViterbiStateEntry &vse) {
311  return (vse.dawg_info != nullptr || vse.Consistent() ||
312  (vse.ngram_info != nullptr && !vse.ngram_info->pruned));
313  }
314 
315  public:
316  // Parameters.
317  INT_VAR_H(language_model_debug_level, 0, "Language model debug level");
318  BOOL_VAR_H(language_model_ngram_on, false,
319  "Turn on/off the use of character ngram model");
320  INT_VAR_H(language_model_ngram_order, 8,
321  "Maximum order of the character ngram model");
322  INT_VAR_H(language_model_viterbi_list_max_num_prunable, 10,
323  "Maximum number of prunable (those for which PrunablePath() is"
324  " true) entries in each viterbi list recorded in BLOB_CHOICEs");
325  INT_VAR_H(language_model_viterbi_list_max_size, 500,
326  "Maximum size of viterbi lists recorded in BLOB_CHOICEs");
327  double_VAR_H(language_model_ngram_small_prob, 0.000001,
328  "To avoid overly small denominators use this as the floor"
329  " of the probability returned by the ngram model");
330  double_VAR_H(language_model_ngram_nonmatch_score, -40.0,
331  "Average classifier score of a non-matching unichar");
332  BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step, false,
333  "Use only the first UTF8 step of the given string"
334  " when computing log probabilities");
335  double_VAR_H(language_model_ngram_scale_factor, 0.03,
336  "Strength of the character ngram model relative to the"
337  " character classifier ");
338  double_VAR_H(language_model_ngram_rating_factor, 16.0,
339  "Factor to bring log-probs into the same range as ratings"
340  " when multiplied by outline length ");
341  BOOL_VAR_H(language_model_ngram_space_delimited_language, true,
342  "Words are delimited by space");
343  INT_VAR_H(language_model_min_compound_length, 3,
344  "Minimum length of compound words");
345  // Penalties used for adjusting path costs and final word rating.
346  double_VAR_H(language_model_penalty_non_freq_dict_word, 0.1,
347  "Penalty for words not in the frequent word dictionary");
348  double_VAR_H(language_model_penalty_non_dict_word, 0.15,
349  "Penalty for non-dictionary words");
350  double_VAR_H(language_model_penalty_punc, 0.2,
351  "Penalty for inconsistent punctuation");
352  double_VAR_H(language_model_penalty_case, 0.1,
353  "Penalty for inconsistent case");
354  double_VAR_H(language_model_penalty_script, 0.5,
355  "Penalty for inconsistent script");
356  double_VAR_H(language_model_penalty_chartype, 0.3,
357  "Penalty for inconsistent character type");
358  double_VAR_H(language_model_penalty_font, 0.00,
359  "Penalty for inconsistent font");
360  double_VAR_H(language_model_penalty_spacing, 0.05,
361  "Penalty for inconsistent spacing");
362  double_VAR_H(language_model_penalty_increment, 0.01, "Penalty increment");
363  INT_VAR_H(wordrec_display_segmentations, 0, "Display Segmentations");
364  BOOL_VAR_H(language_model_use_sigmoidal_certainty, false,
365  "Use sigmoidal score for certainty");
366 
367 
368  protected:
369  // Member Variables.
370 
371  // Temporary DawgArgs struct that is re-used across different words to
372  // avoid dynamic memory re-allocation (should be cleared before each use).
374  // Scaling for recovering blob outline length from rating and certainty.
376 
377  // The following variables are set at construction time.
378 
379  // Pointer to fontinfo table (not owned by LanguageModel).
381 
382  // Pointer to Dict class, that is used for querying the dictionaries
383  // (the pointer is not owned by LanguageModel).
385 
386  // TODO(daria): the following variables should become LanguageModel params
387  // when the old code in bestfirst.cpp and heuristic.cpp is deprecated.
388  //
389  // Set to true if we are dealing with fixed pitch text
390  // (set to assume_fixed_pitch_char_segment).
392  // Max char width-to-height ratio allowed
393  // (set to segsearch_max_char_wh_ratio).
395 
396  // The following variables are initialized with InitForWord().
397 
398  // String representation of the classification of the previous word
399  // (since this is only used by the character ngram model component,
400  // only the last language_model_ngram_order of the word are stored).
403  // Active dawg vector.
406  // Set to true if acceptable choice was discovered.
407  // Note: it would be nice to use this to terminate the search once an
408  // acceptable choices is found. However we do not do that and once an
409  // acceptable choice is found we finish looking for alternative choices
410  // in the current segmentation graph and then exit the search (no more
411  // classifications are done after an acceptable choice is found).
412  // This is needed in order to let the search find the words very close to
413  // the best choice in rating (e.g. what/What, Cat/cat, etc) and log these
414  // choices. This way the stopper will know that the best choice is not
415  // ambiguous (i.e. there are best choices in the best choice list that have
416  // ratings close to the very best one) and will be less likely to mis-adapt.
418  // Set to true if a choice representing correct segmentation was explored.
420 
 421  // Params models containing weights for computing ViterbiStateEntry costs.
423 };
424 
425 } // namespace tesseract
426 
427 #endif // TESSERACT_WORDREC_LANGUAGE_MODEL_H_
Definition: lm_state.h:63
LanguageModelNgramInfo * ngram_info
Definition: lm_state.h:184
static const LanguageModelFlagsType kDigitFlag
Definition: language_model.h:57
PermuterType permuter
Definition: lm_state.h:67
Definition: lm_pain_points.h:57
static const LanguageModelFlagsType kSmallestRatingFlag
Definition: language_model.h:54
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
Definition: language_model.cpp:1018
bool acceptable_choice_found_
Definition: language_model.h:417
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats)
Definition: associate.cpp:34
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
Definition: language_model.cpp:1341
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:565
void SetAcceptableChoiceFound(bool val)
Definition: language_model.h:105
LanguageModel(const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
Definition: language_model.cpp:54
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
Definition: language_model.cpp:937
float max_char_wh_ratio_
Definition: language_model.h:394
float rating_cert_scale_
Definition: language_model.h:375
int NumInconsistentChartype() const
Definition: lm_consistency.h:91
Definition: lm_state.h:93
bool inconsistent_font
Definition: lm_consistency.h:130
static const LanguageModelFlagsType kXhtConsistentFlag
Definition: language_model.h:58
Struct to store information maintained by various language model components.
Definition: lm_state.h:195
Definition: unicharset.h:146
bool Consistent() const
Definition: lm_state.h:135
float CertaintyScore(float cert)
Definition: language_model.h:113
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
Definition: language_model.cpp:387
Definition: lm_consistency.h:39
Definition: baseapi.cpp:94
bool AcceptablePath(const ViterbiStateEntry &vse)
Definition: language_model.h:310
int length
Definition: lm_state.h:169
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
Definition: language_model.cpp:504
Definition: ratngs.h:273
bool inconsistent_script
Definition: lm_consistency.h:128
LanguageModelFlagsType top_choice_flags
Definition: lm_state.h:176
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
Definition: language_model.h:281
INT_VAR_H(language_model_debug_level, 0, "Language model debug level")
DawgPositionVector very_beginning_active_dawgs_
Definition: language_model.h:404
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
Definition: language_model.cpp:880
const UnicityTable< FontInfo > * fontinfo_table_
Definition: language_model.h:380
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:217
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:1241
Definition: dict.h:88
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
Definition: language_model.cpp:789
int NumInconsistentPunc() const
Definition: lm_consistency.h:85
Definition: fontinfo.h:30
Definition: blamer.h:100
Definition: dawg.h:381
bool pruned
Definition: lm_state.h:84
int NumInconsistentCase() const
Definition: lm_consistency.h:88
float ComputeAdjustment(int num_problems, float penalty)
Definition: language_model.h:125
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
Definition: language_model.cpp:997
DawgPositionVector beginning_active_dawgs_
Definition: language_model.h:405
AssociateStats associate_stats
Definition: lm_state.h:172
bool fixed_pitch_
Definition: language_model.h:391
DawgArgs dawg_args_
Definition: language_model.h:373
Dict * dict_
Definition: language_model.h:384
Definition: strngs.h:45
BOOL_VAR_H(language_model_ngram_on, false, "Turn on/off the use of character ngram model")
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
Definition: language_model.cpp:773
~LanguageModel()
Definition: language_model.cpp:138
static const float kMaxAvgNgramCost
Definition: language_model.h:62
Definition: pageres.h:169
Definition: params_model.h:32
STRING prev_word_str_
Definition: language_model.h:401
static const LanguageModelFlagsType kUpperCaseFlag
Definition: language_model.h:56
bool correct_segmentation_explored_
Definition: language_model.h:419
Definition: language_model.h:51
int prev_word_unichar_step_len_
Definition: language_model.h:402
double_VAR_H(language_model_ngram_small_prob, 0.000001, "To avoid overly small denominators use this as the floor" " of the probability returned by the ngram model")
bool AcceptableChoiceFound()
Definition: language_model.h:104
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
Definition: language_model.h:136
ParamsModel & getParamsModel()
Definition: language_model.h:109
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
Definition: language_model.cpp:140
bool PrunablePath(const ViterbiStateEntry &vse)
Definition: language_model.h:300
static const LanguageModelFlagsType kLowerCaseFlag
Definition: language_model.h:55
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
Definition: language_model.cpp:1199
ParamsModel params_model_
Definition: language_model.h:422
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
Definition: language_model.cpp:427
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:39
Definition: dict.h:77
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: language_model.cpp:257
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
Definition: language_model.cpp:1390
int NumInconsistentSpaces() const
Definition: lm_consistency.h:100
LanguageModelDawgInfo * dawg_info
Definition: lm_state.h:180
Definition: ratngs.h:49
Definition: lm_state.h:72
Definition: associate.h:36