tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
tesseract::LanguageModel Class Reference

#include <language_model.h>


Public Member Functions

 LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
 
 ~LanguageModel ()
 
void InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
 
bool UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
bool AcceptableChoiceFound ()
 
void SetAcceptableChoiceFound (bool val)
 
ParamsModel & getParamsModel ()
 
 INT_VAR_H (language_model_debug_level, 0, "Language model debug level")
 
 BOOL_VAR_H (language_model_ngram_on, false, "Turn on/off the use of character ngram model")
 
 INT_VAR_H (language_model_ngram_order, 8, "Maximum order of the character ngram model")
 
 INT_VAR_H (language_model_viterbi_list_max_num_prunable, 10, "Maximum number of prunable (those for which PrunablePath() is" " true) entries in each viterbi list recorded in BLOB_CHOICEs")
 
 INT_VAR_H (language_model_viterbi_list_max_size, 500, "Maximum size of viterbi lists recorded in BLOB_CHOICEs")
 
 double_VAR_H (language_model_ngram_small_prob, 0.000001, "To avoid overly small denominators use this as the floor" " of the probability returned by the ngram model")
 
 double_VAR_H (language_model_ngram_nonmatch_score, -40.0, "Average classifier score of a non-matching unichar")
 
 BOOL_VAR_H (language_model_ngram_use_only_first_uft8_step, false, "Use only the first UTF8 step of the given string" " when computing log probabilities")
 
 double_VAR_H (language_model_ngram_scale_factor, 0.03, "Strength of the character ngram model relative to the" " character classifier ")
 
 double_VAR_H (language_model_ngram_rating_factor, 16.0, "Factor to bring log-probs into the same range as ratings" " when multiplied by outline length ")
 
 BOOL_VAR_H (language_model_ngram_space_delimited_language, true, "Words are delimited by space")
 
 INT_VAR_H (language_model_min_compound_length, 3, "Minimum length of compound words")
 
 double_VAR_H (language_model_penalty_non_freq_dict_word, 0.1, "Penalty for words not in the frequent word dictionary")
 
 double_VAR_H (language_model_penalty_non_dict_word, 0.15, "Penalty for non-dictionary words")
 
 double_VAR_H (language_model_penalty_punc, 0.2, "Penalty for inconsistent punctuation")
 
 double_VAR_H (language_model_penalty_case, 0.1, "Penalty for inconsistent case")
 
 double_VAR_H (language_model_penalty_script, 0.5, "Penalty for inconsistent script")
 
 double_VAR_H (language_model_penalty_chartype, 0.3, "Penalty for inconsistent character type")
 
 double_VAR_H (language_model_penalty_font, 0.00, "Penalty for inconsistent font")
 
 double_VAR_H (language_model_penalty_spacing, 0.05, "Penalty for inconsistent spacing")
 
 double_VAR_H (language_model_penalty_increment, 0.01, "Penalty increment")
 
 INT_VAR_H (wordrec_display_segmentations, 0, "Display Segmentations")
 
 BOOL_VAR_H (language_model_use_sigmoidal_certainty, false, "Use sigmoidal score for certainty")
 

Static Public Member Functions

static void ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[])
 

Static Public Attributes

static const LanguageModelFlagsType kSmallestRatingFlag = 0x1
 
static const LanguageModelFlagsType kLowerCaseFlag = 0x2
 
static const LanguageModelFlagsType kUpperCaseFlag = 0x4
 
static const LanguageModelFlagsType kDigitFlag = 0x8
 
static const LanguageModelFlagsType kXhtConsistentFlag = 0x10
 
static const float kMaxAvgNgramCost = 25.0f
 

Protected Member Functions

float CertaintyScore (float cert)
 
float ComputeAdjustment (int num_problems, float penalty)
 
float ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
 
float ComputeAdjustedPathCost (ViterbiStateEntry *vse)
 
bool GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
 
int SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const
 
ViterbiStateEntry * GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
 
bool AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
 
LanguageModelDawgInfo * GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
 
LanguageModelNgramInfo * GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
 
float ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
 
float ComputeDenom (BLOB_CHOICE_LIST *curr_list)
 
void FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
 
void UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
WERD_CHOICE * ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
 
void ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
 
bool PrunablePath (const ViterbiStateEntry &vse)
 
bool AcceptablePath (const ViterbiStateEntry &vse)
 

Protected Attributes

DawgArgs dawg_args_
 
float rating_cert_scale_
 
const UnicityTable< FontInfo > * fontinfo_table_
 
Dict * dict_
 
bool fixed_pitch_
 
float max_char_wh_ratio_
 
STRING prev_word_str_
 
int prev_word_unichar_step_len_
 
DawgPositionVector very_beginning_active_dawgs_
 
DawgPositionVector beginning_active_dawgs_
 
bool acceptable_choice_found_
 
bool correct_segmentation_explored_
 
ParamsModel params_model_
 

Constructor & Destructor Documentation

◆ LanguageModel()

tesseract::LanguageModel::LanguageModel ( const UnicityTable< FontInfo > *  fontinfo_table,
Dict *  dict 
)

◆ ~LanguageModel()

tesseract::LanguageModel::~LanguageModel ( )

Member Function Documentation

◆ AcceptableChoiceFound()

bool tesseract::LanguageModel::AcceptableChoiceFound ( )
inline

◆ AcceptablePath()

bool tesseract::LanguageModel::AcceptablePath ( const ViterbiStateEntry &  vse)
inline protected

◆ AddViterbiStateEntry()

bool tesseract::LanguageModel::AddViterbiStateEntry ( LanguageModelFlagsType  top_choice_flags,
float  denom,
bool  word_end,
int  curr_col,
int  curr_row,
BLOB_CHOICE *  b,
LanguageModelState *  curr_state,
ViterbiStateEntry *  parent_vse,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

◆ BOOL_VAR_H() [1/4]

tesseract::LanguageModel::BOOL_VAR_H ( language_model_ngram_on  ,
false  ,
"Turn on/off the use of character ngram model"   
)

◆ BOOL_VAR_H() [2/4]

tesseract::LanguageModel::BOOL_VAR_H ( language_model_ngram_use_only_first_uft8_step  ,
false  ,
"Use only the first UTF8 step of the given string" " when computing log probabilities"   
)

◆ BOOL_VAR_H() [3/4]

tesseract::LanguageModel::BOOL_VAR_H ( language_model_ngram_space_delimited_language  ,
true  ,
"Words are delimited by space"   
)

◆ BOOL_VAR_H() [4/4]

tesseract::LanguageModel::BOOL_VAR_H ( language_model_use_sigmoidal_certainty  ,
false  ,
"Use sigmoidal score for certainty"   
)

◆ CertaintyScore()

float tesseract::LanguageModel::CertaintyScore ( float  cert)
inline protected

◆ ComputeAdjustedPathCost()

float tesseract::LanguageModel::ComputeAdjustedPathCost ( ViterbiStateEntry *  vse)
protected

◆ ComputeAdjustment()

float tesseract::LanguageModel::ComputeAdjustment ( int  num_problems,
float  penalty 
)
inline protected

◆ ComputeAssociateStats()

void tesseract::LanguageModel::ComputeAssociateStats ( int  col,
int  row,
float  max_char_wh_ratio,
ViterbiStateEntry *  parent_vse,
WERD_RES *  word_res,
AssociateStats *  associate_stats 
)
inline protected

◆ ComputeConsistencyAdjustment()

float tesseract::LanguageModel::ComputeConsistencyAdjustment ( const LanguageModelDawgInfo *  dawg_info,
const LMConsistencyInfo &  consistency_info 
)
inline protected

◆ ComputeDenom()

float tesseract::LanguageModel::ComputeDenom ( BLOB_CHOICE_LIST *  curr_list)
protected

◆ ComputeNgramCost()

float tesseract::LanguageModel::ComputeNgramCost ( const char *  unichar,
float  certainty,
float  denom,
const char *  context,
int *  unichar_step_len,
bool *  found_small_prob,
float *  ngram_prob 
)
protected

◆ ConstructWord()

WERD_CHOICE * tesseract::LanguageModel::ConstructWord ( ViterbiStateEntry *  vse,
WERD_RES *  word_res,
DANGERR *  fixpt,
BlamerBundle *  blamer_bundle,
bool *  truth_path 
)
protected

◆ double_VAR_H() [1/13]

tesseract::LanguageModel::double_VAR_H ( language_model_ngram_small_prob  ,
0.000001  ,
"To avoid overly small denominators use this as the floor" " of the probability returned by the ngram model"   
)

◆ double_VAR_H() [2/13]

tesseract::LanguageModel::double_VAR_H ( language_model_ngram_nonmatch_score  ,
-40.0  ,
"Average classifier score of a non-matching unichar"   
)

◆ double_VAR_H() [3/13]

tesseract::LanguageModel::double_VAR_H ( language_model_ngram_scale_factor  ,
0.03  ,
"Strength of the character ngram model relative to the" " character classifier "   
)

◆ double_VAR_H() [4/13]

tesseract::LanguageModel::double_VAR_H ( language_model_ngram_rating_factor  ,
16.0  ,
"Factor to bring log-probs into the same range as ratings" " when multiplied by outline length "   
)

◆ double_VAR_H() [5/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_non_freq_dict_word  ,
0.1  ,
"Penalty for words not in the frequent word dictionary"   
)

◆ double_VAR_H() [6/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_non_dict_word  ,
0.15  ,
"Penalty for non-dictionary words"   
)

◆ double_VAR_H() [7/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_punc  ,
0.2  ,
"Penalty for inconsistent punctuation"   
)

◆ double_VAR_H() [8/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_case  ,
0.1  ,
"Penalty for inconsistent case"   
)

◆ double_VAR_H() [9/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_script  ,
0.5  ,
"Penalty for inconsistent script"   
)

◆ double_VAR_H() [10/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_chartype  ,
0.3  ,
"Penalty for inconsistent character type"   
)

◆ double_VAR_H() [11/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_font  ,
0.00  ,
"Penalty for inconsistent font"   
)

◆ double_VAR_H() [12/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_spacing  ,
0.05  ,
"Penalty for inconsistent spacing"   
)

◆ double_VAR_H() [13/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_increment  ,
0.01  ,
"Penalty increment"   
)

◆ ExtractFeaturesFromPath()

void tesseract::LanguageModel::ExtractFeaturesFromPath ( const ViterbiStateEntry &  vse,
float  features[] 
)
static

◆ FillConsistencyInfo()

void tesseract::LanguageModel::FillConsistencyInfo ( int  curr_col,
bool  word_end,
BLOB_CHOICE *  b,
ViterbiStateEntry *  parent_vse,
WERD_RES *  word_res,
LMConsistencyInfo *  consistency_info 
)
protected

◆ GenerateDawgInfo()

LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo ( bool  word_end,
int  curr_col,
int  curr_row,
const BLOB_CHOICE &  b,
const ViterbiStateEntry *  parent_vse 
)
protected

◆ GenerateNgramInfo()

LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo ( const char *  unichar,
float  certainty,
float  denom,
int  curr_col,
int  curr_row,
float  outline_length,
const ViterbiStateEntry *  parent_vse 
)
protected

◆ GenerateTopChoiceInfo()

void tesseract::LanguageModel::GenerateTopChoiceInfo ( ViterbiStateEntry *  new_vse,
const ViterbiStateEntry *  parent_vse,
LanguageModelState *  lms 
)
protected

◆ GetNextParentVSE()

ViterbiStateEntry * tesseract::LanguageModel::GetNextParentVSE ( bool  just_classified,
bool  mixed_alnum,
const BLOB_CHOICE *  bc,
LanguageModelFlagsType  blob_choice_flags,
const UNICHARSET &  unicharset,
WERD_RES *  word_res,
ViterbiStateEntry_IT *  vse_it,
LanguageModelFlagsType *  top_choice_flags 
) const
protected

Finds the next ViterbiStateEntry with which the given unichar_id can combine sensibly, taking into account any mixed alnum/mixed case situation, and whether this combination has been inspected before.
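
A minimal sketch of how this accessor might be driven from inside the class, based only on the signature documented above and the viterbi_state_entries list referred to under SetTopParentLowerUpperDigit(); the surrounding locals (parent_node, bc, just_classified, mixed_alnum, unicharset, word_res) and the construction of ViterbiStateEntry_IT from a list pointer are assumptions, not part of this page:

  // Hypothetical fragment inside a LanguageModel member function.
  ViterbiStateEntry_IT vse_it(&parent_node->viterbi_state_entries);
  LanguageModelFlagsType top_choice_flags = blob_choice_flags;
  ViterbiStateEntry* parent_vse = nullptr;
  while ((parent_vse = GetNextParentVSE(just_classified, mixed_alnum, bc,
                                        blob_choice_flags, unicharset, word_res,
                                        &vse_it, &top_choice_flags)) != nullptr) {
    // Each parent returned here is a candidate to extend with the blob
    // choice bc, e.g. via AddViterbiStateEntry() during UpdateState().
  }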

◆ getParamsModel()

ParamsModel& tesseract::LanguageModel::getParamsModel ( )
inline

◆ GetTopLowerUpperDigit()

bool tesseract::LanguageModel::GetTopLowerUpperDigit ( BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE **  first_lower,
BLOB_CHOICE **  first_upper,
BLOB_CHOICE **  first_digit 
) const
protected

Finds the first lower and upper case letter and first digit in curr_list. For non-upper/lower languages, alpha counts as upper. Uses the first character in the list in place of empty results. Returns true if both alpha and digits are found.
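
A short usage sketch based only on the documented signature; since the method is protected, this would sit inside a LanguageModel member function, and curr_list is a hypothetical BLOB_CHOICE_LIST filled by the classifier:

  BLOB_CHOICE *first_lower = nullptr;
  BLOB_CHOICE *first_upper = nullptr;
  BLOB_CHOICE *first_digit = nullptr;
  // Returns true only if both an alpha and a digit candidate were found.
  bool mixed_alnum = GetTopLowerUpperDigit(curr_list, &first_lower,
                                           &first_upper, &first_digit);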

◆ InitForWord()

void tesseract::LanguageModel::InitForWord ( const WERD_CHOICE *  prev_word,
bool  fixed_pitch,
float  max_char_wh_ratio,
float  rating_cert_scale 
)

◆ INT_VAR_H() [1/6]

tesseract::LanguageModel::INT_VAR_H ( language_model_debug_level  ,
0  ,
"Language model debug level"   
)

◆ INT_VAR_H() [2/6]

tesseract::LanguageModel::INT_VAR_H ( language_model_ngram_order  ,
8  ,
"Maximum order of the character ngram model"   
)

◆ INT_VAR_H() [3/6]

tesseract::LanguageModel::INT_VAR_H ( language_model_viterbi_list_max_num_prunable  ,
10  ,
"Maximum number of prunable (those for which PrunablePath() is" " true) entries in each viterbi list recorded in BLOB_CHOICEs"   
)

◆ INT_VAR_H() [4/6]

tesseract::LanguageModel::INT_VAR_H ( language_model_viterbi_list_max_size  ,
500  ,
"Maximum size of viterbi lists recorded in BLOB_CHOICEs"   
)

◆ INT_VAR_H() [5/6]

tesseract::LanguageModel::INT_VAR_H ( language_model_min_compound_length  ,
3  ,
"Minimum length of compound words"   
)

◆ INT_VAR_H() [6/6]

tesseract::LanguageModel::INT_VAR_H ( wordrec_display_segmentations  ,
0  ,
"Display Segmentations"   
)

◆ PrunablePath()

bool tesseract::LanguageModel::PrunablePath ( const ViterbiStateEntry &  vse)
inline protected

◆ SetAcceptableChoiceFound()

void tesseract::LanguageModel::SetAcceptableChoiceFound ( bool  val)
inline

◆ SetTopParentLowerUpperDigit()

int tesseract::LanguageModel::SetTopParentLowerUpperDigit ( LanguageModelState *  parent_node) const
protected

Forces there to be at least one entry in the overall set of the viterbi_state_entries of each element of parent_node that has the top_choice_flag set for lower, upper and digit using the same rules as GetTopLowerUpperDigit, setting the flag on the first found suitable candidate, whether or not the flag is set on some other parent. Returns 1 if both alpha and digits are found among the parents, -1 if no parents are found at all (a legitimate case), and 0 otherwise.
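
A hedged sketch of interpreting the tri-state return value described above; parent_node is a hypothetical LanguageModelState*, and the call would be made from inside a LanguageModel member function:

  int top_result = SetTopParentLowerUpperDigit(parent_node);
  if (top_result < 0) {
    // No parents at all (documented as a legitimate case).
  } else if (top_result > 0) {
    // Both alpha and digit candidates were found among the parents.
  } else {
    // Parents exist, but not both alpha and digit were found.
  }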

◆ UpdateBestChoice()

void tesseract::LanguageModel::UpdateBestChoice ( ViterbiStateEntry *  vse,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

◆ UpdateState()

bool tesseract::LanguageModel::UpdateState ( bool  just_classified,
int  curr_col,
int  curr_row,
BLOB_CHOICE_LIST *  curr_list,
LanguageModelState *  parent_node,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)

UpdateState has the job of combining the ViterbiStateEntry lists on each of the choices on parent_list with each of the blob choices in curr_list, making a new ViterbiStateEntry for each sensible path.

This could be a huge set of combinations, creating a lot of work only to be truncated by some beam limit, but only certain kinds of paths will continue at the next step:

  • paths that are liked by the language model: either a DAWG or the n-gram model, where active.
  • paths that represent some kind of top choice. The old permuter permuted the top raw classifier score, the top upper case word and the top lower-case word. UpdateState now concentrates its top-choice paths on top lower-case, top upper-case (or caseless alpha), and top digit sequence, with allowance for continuation of these paths through blobs where such a character does not appear in the choices list.

GetNextParentVSE enforces some of these models to minimize the number of calls to AddViterbiStateEntry, even prior to looking at the language model. Thus an n-blob sequence of [l1I] will produce 3n calls to AddViterbiStateEntry instead of 3^n.

Of course it isn't quite that simple as Title Case is handled by allowing lower case to continue an upper case initial, but it has to be detected in the combiner so it knows which upper case letters are initial alphas.
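
The 3n-versus-3^n claim above can be made concrete with a small, self-contained illustration; the program below is purely arithmetic and does not touch the Tesseract API:

  #include <cmath>
  #include <cstdio>

  int main() {
    // For an n-blob sequence where every blob's choice list is {l, 1, I},
    // exhaustive combination of parents and children would explore 3^n paths,
    // while the top lower-case / upper-case / digit constraint keeps the
    // number of AddViterbiStateEntry calls linear in n.
    for (int n = 2; n <= 6; ++n) {
      const int exhaustive = static_cast<int>(std::pow(3.0, n));
      const int top_choice_calls = 3 * n;
      std::printf("n=%d  exhaustive=%d  top-choice calls=%d\n",
                  n, exhaustive, top_choice_calls);
    }
    return 0;
  }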

Member Data Documentation

◆ acceptable_choice_found_

bool tesseract::LanguageModel::acceptable_choice_found_
protected

◆ beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::beginning_active_dawgs_
protected

◆ correct_segmentation_explored_

bool tesseract::LanguageModel::correct_segmentation_explored_
protected

◆ dawg_args_

DawgArgs tesseract::LanguageModel::dawg_args_
protected

◆ dict_

Dict* tesseract::LanguageModel::dict_
protected

◆ fixed_pitch_

bool tesseract::LanguageModel::fixed_pitch_
protected

◆ fontinfo_table_

const UnicityTable<FontInfo>* tesseract::LanguageModel::fontinfo_table_
protected

◆ kDigitFlag

const LanguageModelFlagsType tesseract::LanguageModel::kDigitFlag = 0x8
static

◆ kLowerCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kLowerCaseFlag = 0x2
static

◆ kMaxAvgNgramCost

const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f
static

◆ kSmallestRatingFlag

const LanguageModelFlagsType tesseract::LanguageModel::kSmallestRatingFlag = 0x1
static

◆ kUpperCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kUpperCaseFlag = 0x4
static

◆ kXhtConsistentFlag

const LanguageModelFlagsType tesseract::LanguageModel::kXhtConsistentFlag = 0x10
static
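
Since the flag constants above are distinct single bits, they combine with bitwise OR and are tested with bitwise AND. A minimal, self-contained sketch; the underlying integral type of LanguageModelFlagsType is an assumption here, and the constants are re-declared locally only so the example compiles on its own:

  #include <cstdio>

  // Assumption: LanguageModelFlagsType is an unsigned integral type wide
  // enough to hold the five documented bits.
  using LanguageModelFlagsType = unsigned char;

  int main() {
    const LanguageModelFlagsType kSmallestRatingFlag = 0x1;
    const LanguageModelFlagsType kLowerCaseFlag = 0x2;
    const LanguageModelFlagsType kDigitFlag = 0x8;

    // Mark a choice as both the smallest-rating and top lower-case candidate.
    LanguageModelFlagsType flags = kSmallestRatingFlag | kLowerCaseFlag;

    if (flags & kLowerCaseFlag) std::printf("top lower-case candidate\n");
    if (!(flags & kDigitFlag)) std::printf("not a digit candidate\n");
    return 0;
  }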

◆ max_char_wh_ratio_

float tesseract::LanguageModel::max_char_wh_ratio_
protected

◆ params_model_

ParamsModel tesseract::LanguageModel::params_model_
protected

◆ prev_word_str_

STRING tesseract::LanguageModel::prev_word_str_
protected

◆ prev_word_unichar_step_len_

int tesseract::LanguageModel::prev_word_unichar_step_len_
protected

◆ rating_cert_scale_

float tesseract::LanguageModel::rating_cert_scale_
protected

◆ very_beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::very_beginning_active_dawgs_
protected
