#include <dict.h>

Collaboration diagram for tesseract::Dict:

[legend]

Public Member Functions
	Dict (CCUtil *image_ptr)

	~Dict ()

const CCUtil *	getCCUtil () const

CCUtil *	getCCUtil ()

const UNICHARSET &	getUnicharset () const

UNICHARSET &	getUnicharset ()

const UnicharAmbigs &	getUnicharAmbigs () const

bool	compound_marker (UNICHAR_ID unichar_id)

bool	is_apostrophe (UNICHAR_ID unichar_id)

bool	hyphenated () const
	Returns true if we've recorded the beginning of a hyphenated word. More...

int	hyphen_base_size () const
	Size of the base word (the part on the line before) of a hyphenated word. More...

void	copy_hyphen_info (WERD_CHOICE *word) const

bool	has_hyphen_end (UNICHAR_ID unichar_id, bool first_pos) const
	Check whether the word has a hyphen at the end. More...

bool	has_hyphen_end (const WERD_CHOICE &word) const
	Same as above, but check the unichar at the end of the word. More...

void	reset_hyphen_vars (bool last_word_on_line)

void	set_hyphen_word (const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)

void	update_best_choice (const WERD_CHOICE &word, WERD_CHOICE *best_choice)

void	init_active_dawgs (DawgPositionVector *active_dawgs, bool ambigs_mode) const

void	default_dawgs (DawgPositionVector *anylength_dawgs, bool suppress_patterns) const

bool	NoDangerousAmbig (WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)

void	ReplaceAmbig (int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)

int	LengthOfShortestAlphaRun (const WERD_CHOICE &WordChoice) const
	Returns the length of the shortest alpha run in WordChoice. More...

int	UniformCertainties (const WERD_CHOICE &word)

bool	AcceptableChoice (const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
	Returns true if the given best_choice is good enough to stop. More...

bool	AcceptableResult (WERD_RES *word) const

void	EndDangerousAmbigs ()

void	DebugWordChoices ()
	Prints the current choices for this word to stdout. More...

void	SettupStopperPass1 ()
	Sets up stopper variables in preparation for the first pass. More...

void	SettupStopperPass2 ()
	Sets up stopper variables in preparation for the second pass. More...

int	case_ok (const WERD_CHOICE &word, const UNICHARSET &unicharset) const
	Check a string to see if it matches a set of lexical rules. More...

bool	absolute_garbage (const WERD_CHOICE &word, const UNICHARSET &unicharset)

void	SetupForLoad (DawgCache *dawg_cache)

void	Load (const STRING &lang, TessdataManager *data_file)

void	LoadLSTM (const STRING &lang, TessdataManager *data_file)

bool	FinishLoad ()

void	End ()

void	ResetDocumentDictionary ()

int	def_letter_is_okay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const

int	LetterIsOkay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
	Calls letter_is_okay_ member function. More...

double	ProbabilityInContext (const char *context, int context_bytes, const char *character, int character_bytes)
	Calls probability_in_context_ member function. More...

double	def_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
	Default (no-op) implementation of probability in context function. More...

double	ngram_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

float	ParamsModelClassify (const char *lang, void *path)

float	CallParamsModelClassify (void *path)

void	SetWildcardID (UNICHAR_ID id)

UNICHAR_ID	WildcardID () const

int	NumDawgs () const
	Return the number of dawgs in the dawgs_ vector. More...

const Dawg *	GetDawg (int index) const
	Return i-th dawg pointer recorded in the dawgs_ vector. More...

const Dawg *	GetPuncDawg () const
	Return the pointer to the punctuation dawg. More...

const Dawg *	GetUnambigDawg () const
	Return the pointer to the unambiguous words dawg. More...

UNICHAR_ID	char_for_dawg (const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const

void	ProcessPatternEdges (const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const

int	valid_word (const WERD_CHOICE &word, bool numbers_ok) const

int	valid_word (const WERD_CHOICE &word) const

int	valid_word_or_number (const WERD_CHOICE &word) const

int	valid_word (const char *string) const
	This function is used by api/tesseract_cube_combiner.cpp. More...

bool	valid_bigram (const WERD_CHOICE &word1, const WERD_CHOICE &word2) const

bool	valid_punctuation (const WERD_CHOICE &word)

int	good_choice (const WERD_CHOICE &choice)
	Returns true if a good answer is found for the unknown blob rating. More...

void	add_document_word (const WERD_CHOICE &best_choice)
	Adds a word found on this document to the document specific dictionary. More...

void	adjust_word (WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
	Adjusts the rating of the given word. More...

void	SetWordsegRatingAdjustFactor (float f)
	Set wordseg_rating_adjust_factor_ to the given value. More...

bool	IsSpaceDelimitedLang () const
	Returns true if the language is space-delimited (i.e. not Chinese/Japanese or Thai). More...

	STRING_VAR_H (user_words_file, "", "A filename of user-provided words.")

	STRING_VAR_H (user_words_suffix, "", "A suffix of user-provided words located in tessdata.")

	STRING_VAR_H (user_patterns_file, "", "A filename of user-provided patterns.")

	STRING_VAR_H (user_patterns_suffix, "", "A suffix of user-provided patterns located in tessdata.")

	BOOL_VAR_H (load_system_dawg, true, "Load system word dawg.")

	BOOL_VAR_H (load_freq_dawg, true, "Load frequent word dawg.")

	BOOL_VAR_H (load_unambig_dawg, true, "Load unambiguous word dawg.")

	BOOL_VAR_H (load_punc_dawg, true, "Load dawg with punctuation patterns.")

	BOOL_VAR_H (load_number_dawg, true, "Load dawg with number patterns.")

	BOOL_VAR_H (load_bigram_dawg, true, "Load dawg with special word bigrams.")

	double_VAR_H (xheight_penalty_subscripts, 0.125, "Score penalty (0.1 = 10%) added if there are subscripts " "or superscripts in a word, but it is otherwise OK.")

	double_VAR_H (xheight_penalty_inconsistent, 0.25, "Score penalty (0.1 = 10%) added if an xheight is " "inconsistent.")

	double_VAR_H (segment_penalty_dict_frequent_word, 1.0, "Score multiplier for word matches which have good case and" "are frequent in the given language (lower is better).")

	double_VAR_H (segment_penalty_dict_case_ok, 1.1, "Score multiplier for word matches that have good case " "(lower is better).")

	double_VAR_H (segment_penalty_dict_case_bad, 1.3125, "Default score multiplier for word matches, which may have " "case issues (lower is better).")

	double_VAR_H (segment_penalty_dict_nonword, 1.25, "Score multiplier for glyph fragment segmentations which " "do not match a dictionary word (lower is better).")

	double_VAR_H (segment_penalty_garbage, 1.50, "Score multiplier for poorly cased strings that are not in" " the dictionary and generally look like garbage (lower is" " better).")

	STRING_VAR_H (output_ambig_words_file, "", "Output file for ambiguities found in the dictionary")

	INT_VAR_H (dawg_debug_level, 0, "Set to 1 for general debug info" ", to 2 for more details, to 3 to see all the debug messages")

	INT_VAR_H (hyphen_debug_level, 0, "Debug level for hyphenated words.")

	INT_VAR_H (max_viterbi_list_size, 10, "Maximum size of viterbi list.")

	BOOL_VAR_H (use_only_first_uft8_step, false, "Use only the first UTF8 step of the given string" " when computing log probabilities.")

	double_VAR_H (certainty_scale, 20.0, "Certainty scaling factor")

	double_VAR_H (stopper_nondict_certainty_base, -2.50, "Certainty threshold for non-dict words")

	double_VAR_H (stopper_phase2_certainty_rejection_offset, 1.0, "Reject certainty offset")

	INT_VAR_H (stopper_smallword_size, 2, "Size of dict word to be treated as non-dict word")

	double_VAR_H (stopper_certainty_per_char, -0.50, "Certainty to add for each dict char above small word size.")

	double_VAR_H (stopper_allowable_character_badness, 3.0, "Max certaintly variation allowed in a word (in sigma)")

	INT_VAR_H (stopper_debug_level, 0, "Stopper debug level")

	BOOL_VAR_H (stopper_no_acceptable_choices, false, "Make AcceptableChoice() always return false. Useful" " when there is a need to explore all segmentations")

	INT_VAR_H (tessedit_truncate_wordchoice_log, 10, "Max words to keep in list")

	STRING_VAR_H (word_to_debug, "", "Word for which stopper debug information" " should be printed to stdout")

	STRING_VAR_H (word_to_debug_lengths, "", "Lengths of unichars in word_to_debug")

	INT_VAR_H (fragments_debug, 0, "Debug character fragments")

	BOOL_VAR_H (segment_nonalphabetic_script, false, "Don't use any alphabetic-specific tricks." "Set to true in the traineddata config file for" " scripts that are cursive or inherently fixed-pitch")

	BOOL_VAR_H (save_doc_words, 0, "Save Document Words")

	double_VAR_H (doc_dict_pending_threshold, 0.0, "Worst certainty for using pending dictionary")

	double_VAR_H (doc_dict_certainty_threshold, -2.25, "Worst certainty" " for words that can be inserted into the document dictionary")

	INT_VAR_H (max_permuter_attempts, 10000, "Maximum number of different" " character choices to consider during permutation." " This limit is especially useful when user patterns" " are specified, since overly generic patterns can result in" " dawg search exploring an overly large number of options.")

go_deeper_dawg_fxn
If the choice being composed so far could be a dictionary word keep exploring choices.
WERD_CHOICE *	dawg_permute_and_select (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)

void	go_deeper_dawg_fxn (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)

void	permute_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)

void	append_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)

fragment_state
Given the current char choice and information about previously seen fragments, determines whether adjacent character fragments are present and whether they can be concatenated. The given prev_char_frag_info contains: fragment: if not nullptr, contains information about the immediately preceding fragmented character choice; num_fragments: number of fragments that have been used so far to construct a character; certainty: certainty of the current choice or minimum certainty of all fragments concatenated so far; rating: rating of the current choice or sum of fragment ratings concatenated so far. The output char_frag_info is filled in as follows: character: is set to nullptr if the choice is a non-matching or non-ending fragment piece, and is set to the unichar of the given choice if it represents a regular character or a matching ending fragment; fragment, num_fragments, certainty, rating are set as described above. Returns false if a non-matching fragment is discovered, true otherwise.
bool	fragment_state_okay (UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)

Static Public Member Functions
static DawgCache *	GlobalDawgCache ()

static NODE_REF	GetStartingNode (const Dawg *dawg, EDGE_REF edge_ref)
	Returns the appropriate next node given the EDGE_REF. More...

static bool	valid_word_permuter (uint8_t perm, bool numbers_ok)
	Check all the DAWGs to see if this word is in any of them. More...

Public Attributes
void(Dict::*	go_deeper_fxn_ )(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
	Pointer to go_deeper function. More...

int(Dict::*	letter_is_okay_ )(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const

double(Dict::*	probability_in_context_ )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
	Probability in context function used by the ngram permuter. More...

float(Dict::*	params_model_classify_ )(const char *lang, void *path)

Private Attributes
CCUtil *	ccutil_

UnicharAmbigs *	dang_ambigs_table_

UnicharAmbigs *	replace_ambigs_table_

float	reject_offset_

UNICHAR_ID	wildcard_unichar_id_

UNICHAR_ID	apostrophe_unichar_id_

UNICHAR_ID	question_unichar_id_

UNICHAR_ID	slash_unichar_id_

UNICHAR_ID	hyphen_unichar_id_

WERD_CHOICE *	hyphen_word_

DawgPositionVector	hyphen_active_dawgs_

bool	last_word_on_line_

GenericVector< GenericVectorEqEq< UNICHAR_ID > >	equivalent_symbols_

DawgCache *	dawg_cache_

bool	dawg_cache_is_ours_

DawgVector	dawgs_

SuccessorListsVector	successors_

Trie *	pending_words_

Dawg *	bigram_dawg_

Dawg *	freq_dawg_

Dawg *	unambig_dawg_

Dawg *	punc_dawg_

Trie *	document_words_

float	wordseg_rating_adjust_factor_

FILE *	output_ambig_words_file_

Constructor & Destructor Documentation

◆ Dict()

tesseract::Dict::Dict ( CCUtil * image_ptr )

◆ ~Dict()

tesseract::Dict::~Dict ( )

Member Function Documentation

◆ absolute_garbage()

bool tesseract::Dict::absolute_garbage	(	const WERD_CHOICE &	word,
		const UNICHARSET &	unicharset
	)

Returns true if the word looks like absolute garbage (e.g. an image mistakenly recognized as text).

◆ AcceptableChoice()

bool tesseract::Dict::AcceptableChoice	(	const WERD_CHOICE &	best_choice,
		XHeightConsistencyEnum	xheight_consistency
	)

Returns true if the given best_choice is good enough to stop.

◆ AcceptableResult()

bool tesseract::Dict::AcceptableResult ( WERD_RES * word ) const

Returns false if the best choice for the current word is questionable and should be tried again on the second pass or should be flagged to the user.

◆ add_document_word()

void tesseract::Dict::add_document_word ( const WERD_CHOICE & best_choice )

Adds a word found on this document to the document specific dictionary.

◆ adjust_word()

void tesseract::Dict::adjust_word	(	WERD_CHOICE *	word,
		bool	nonword,
		XHeightConsistencyEnum	xheight_consistency,
		float	additional_adjust,
		bool	modify_rating,
		bool	debug
	)

Adjusts the rating of the given word.

◆ append_choices()

void tesseract::Dict::append_choices	(	const char *	debug,
		const BLOB_CHOICE_LIST_VECTOR &	char_choices,
		const BLOB_CHOICE &	blob_choice,
		int	char_choice_index,
		const CHAR_FRAGMENT_INFO *	prev_char_frag_info,
		WERD_CHOICE *	word,
		float	certainties[],
		float *	limit,
		WERD_CHOICE *	best_choice,
		int *	attempts_left,
		void *	more_args
	)

append_choices

Checks to see whether or not the next choice is worth appending to the word being generated. If so then keeps going deeper into the word.

This function assumes that Dict::go_deeper_fxn_ is set.

◆ BOOL_VAR_H() [1/10]

tesseract::Dict::BOOL_VAR_H	(	load_system_dawg	,
		true	,
		"Load system word dawg."
	)

◆ BOOL_VAR_H() [2/10]

tesseract::Dict::BOOL_VAR_H	(	load_freq_dawg	,
		true	,
		"Load frequent word dawg."
	)

◆ BOOL_VAR_H() [3/10]

tesseract::Dict::BOOL_VAR_H	(	load_unambig_dawg	,
		true	,
		"Load unambiguous word dawg."
	)

◆ BOOL_VAR_H() [4/10]

tesseract::Dict::BOOL_VAR_H	(	load_punc_dawg	,
		true	,
		"Load dawg with punctuation patterns."
	)

◆ BOOL_VAR_H() [5/10]

tesseract::Dict::BOOL_VAR_H	(	load_number_dawg	,
		true	,
		"Load dawg with number patterns."
	)

◆ BOOL_VAR_H() [6/10]

tesseract::Dict::BOOL_VAR_H	(	load_bigram_dawg	,
		true	,
		"Load dawg with special word bigrams."
	)

◆ BOOL_VAR_H() [7/10]

tesseract::Dict::BOOL_VAR_H	(	use_only_first_uft8_step	,
		false	,
		"Use only the first UTF8 step of the given string" " when computing log probabilities."
	)

◆ BOOL_VAR_H() [8/10]

tesseract::Dict::BOOL_VAR_H	(	stopper_no_acceptable_choices	,
		false	,
		"Make AcceptableChoice() always return false. Useful" " when there is a need to explore all segmentations"
	)

◆ BOOL_VAR_H() [9/10]

tesseract::Dict::BOOL_VAR_H	(	segment_nonalphabetic_script	,
		false	,
		"Don't use any alphabetic-specific tricks." "Set to true in the traineddata config file for" " scripts that are cursive or inherently fixed-pitch"
	)

◆ BOOL_VAR_H() [10/10]

tesseract::Dict::BOOL_VAR_H	(	save_doc_words	,
		0	,
		"Save Document Words"
	)

◆ CallParamsModelClassify()

float tesseract::Dict::CallParamsModelClassify ( void * path )

inline

◆ case_ok()

int tesseract::Dict::case_ok	(	const WERD_CHOICE &	word,
		const UNICHARSET &	unicharset
	)		const

Check a string to see if it matches a set of lexical rules.

◆ char_for_dawg()

UNICHAR_ID tesseract::Dict::char_for_dawg	(	const UNICHARSET &	unicharset,
		UNICHAR_ID	ch,
		const Dawg *	dawg
	)		const

inline

◆ compound_marker()

bool tesseract::Dict::compound_marker ( UNICHAR_ID unichar_id )

inline

◆ copy_hyphen_info()

void tesseract::Dict::copy_hyphen_info ( WERD_CHOICE * word ) const

inline

If this word is hyphenated copy the base word (the part on the line before) of a hyphenated word into the given word. This function assumes that word is not nullptr.

◆ dawg_permute_and_select()

WERD_CHOICE * tesseract::Dict::dawg_permute_and_select	(	const BLOB_CHOICE_LIST_VECTOR &	char_choices,
		float	rating_limit
	)

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to explore all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

dawg_permute_and_select

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to search all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

◆ DebugWordChoices()

void tesseract::Dict::DebugWordChoices ( )

Prints the current choices for this word to stdout.

◆ def_letter_is_okay()

int tesseract::Dict::def_letter_is_okay	(	void *	void_dawg_args,
		const UNICHARSET &	unicharset,
		UNICHAR_ID	unichar_id,
		bool	word_end
	)		const

Returns the maximal permuter code (from ccstruct/ratngs.h) if in light of the current state the letter at word_index in the given word is allowed according to at least one of the dawgs in dawgs_, otherwise returns NO_PERM.

The state is described by void_dawg_args, which are interpreted as DawgArgs and contain relevant active dawg positions. Each entry in the active_dawgs vector contains an index into the dawgs_ vector and an EDGE_REF that indicates the last edge followed in the dawg. It also may contain a position in the punctuation dawg which describes surrounding punctuation (see struct DawgPosition).

Input: At word_index 0 dawg_args->active_dawgs should contain an entry for each dawg that may start at the beginning of a word, with punc_ref and edge_ref initialized to NO_EDGE. Since the punctuation dawg includes the empty pattern " " (meaning anything without surrounding punctuation), having a single entry for the punctuation dawg will cover all dawgs reachable therefrom – that includes all number and word dawgs. The only dawg non-reachable from the punctuation_dawg is the pattern dawg. If hyphen state needs to be applied, initial dawg_args->active_dawgs can be copied from the saved hyphen state (maintained by Dict). For word_index > 0 the corresponding state (active_dawgs and punc position) can be obtained from dawg_args->updated_dawgs passed to def_letter_is_okay for word_index-1. Note: the function assumes that active_dawgs, and updated_dawgs member variables of dawg_args are not nullptr.

Output: The function fills in dawg_args->updated_dawgs vector with the entries for dawgs that contain the word up to the letter at word_index.

◆ def_probability_in_context()

double tesseract::Dict::def_probability_in_context	(	const char *	lang,
		const char *	context,
		int	context_bytes,
		const char *	character,
		int	character_bytes
	)

inline

Default (no-op) implementation of probability in context function.

◆ default_dawgs()

void tesseract::Dict::default_dawgs	(	DawgPositionVector *	anylength_dawgs,
		bool	suppress_patterns
	)		const

◆ double_VAR_H() [1/14]

tesseract::Dict::double_VAR_H	(	xheight_penalty_subscripts	,
		0.125	,
		"Score penalty (0.1 = 10%) added if there are subscripts " "or superscripts in a word, but it is otherwise OK."
	)

◆ double_VAR_H() [2/14]

tesseract::Dict::double_VAR_H	(	xheight_penalty_inconsistent	,
		0.25	,
		"Score penalty (0.1 = 10%) added if an xheight is " "inconsistent."
	)

◆ double_VAR_H() [3/14]

tesseract::Dict::double_VAR_H	(	segment_penalty_dict_frequent_word	,
		1.0	,
		"Score multiplier for word matches which have good case and" "are frequent in the given language (lower is better)."
	)

◆ double_VAR_H() [4/14]

tesseract::Dict::double_VAR_H	(	segment_penalty_dict_case_ok	,
		1.1	,
		"Score multiplier for word matches that have good case " "(lower is better)."
	)

◆ double_VAR_H() [5/14]

tesseract::Dict::double_VAR_H	(	segment_penalty_dict_case_bad	,
		1.3125	,
		"Default score multiplier for word matches, which may have " "case issues (lower is better)."
	)

◆ double_VAR_H() [6/14]

tesseract::Dict::double_VAR_H	(	segment_penalty_dict_nonword	,
		1.25	,
		"Score multiplier for glyph fragment segmentations which " "do not match a dictionary word (lower is better)."
	)

◆ double_VAR_H() [7/14]

tesseract::Dict::double_VAR_H	(	segment_penalty_garbage	,
		1.50	,
		"Score multiplier for poorly cased strings that are not in" " the dictionary and generally look like garbage (lower is" " better)."
	)

◆ double_VAR_H() [8/14]

tesseract::Dict::double_VAR_H	(	certainty_scale	,
		20.0	,
		"Certainty scaling factor"
	)

◆ double_VAR_H() [9/14]

tesseract::Dict::double_VAR_H	(	stopper_nondict_certainty_base	,
		-2.50	,
		"Certainty threshold for non-dict words"
	)

◆ double_VAR_H() [10/14]

tesseract::Dict::double_VAR_H	(	stopper_phase2_certainty_rejection_offset	,
		1.0	,
		"Reject certainty offset"
	)

◆ double_VAR_H() [11/14]

tesseract::Dict::double_VAR_H	(	stopper_certainty_per_char	,
		-0.50	,
		"Certainty to add for each dict char above small word size."
	)

◆ double_VAR_H() [12/14]

tesseract::Dict::double_VAR_H	(	stopper_allowable_character_badness	,
		3.0	,
		"Max certaintly variation allowed in a word (in sigma)"
	)

◆ double_VAR_H() [13/14]

tesseract::Dict::double_VAR_H	(	doc_dict_pending_threshold	,
		0.0	,
		"Worst certainty for using pending dictionary"
	)

◆ double_VAR_H() [14/14]

tesseract::Dict::double_VAR_H	(	doc_dict_certainty_threshold	,
		-2.25	,
		"Worst certainty" " for words that can be inserted into the document dictionary"
	)

◆ End()

void tesseract::Dict::End ( )

◆ EndDangerousAmbigs()

void tesseract::Dict::EndDangerousAmbigs ( )

◆ FinishLoad()

bool tesseract::Dict::FinishLoad ( )

◆ fragment_state_okay()

bool tesseract::Dict::fragment_state_okay	(	UNICHAR_ID	curr_unichar_id,
		float	curr_rating,
		float	curr_certainty,
		const CHAR_FRAGMENT_INFO *	prev_char_frag_info,
		const char *	debug,
		int	word_ending,
		CHAR_FRAGMENT_INFO *	char_frag_info
	)

◆ getCCUtil() [1/2]

const CCUtil* tesseract::Dict::getCCUtil ( ) const

inline

◆ getCCUtil() [2/2]

CCUtil* tesseract::Dict::getCCUtil ( )

inline

◆ GetDawg()

const Dawg* tesseract::Dict::GetDawg ( int index ) const

inline

Return i-th dawg pointer recorded in the dawgs_ vector.

◆ GetPuncDawg()

const Dawg* tesseract::Dict::GetPuncDawg ( ) const

inline

Return the pointer to the punctuation dawg.

◆ GetStartingNode()

static NODE_REF tesseract::Dict::GetStartingNode	(	const Dawg *	dawg,
		EDGE_REF	edge_ref
	)

inlinestatic

Returns the appropriate next node given the EDGE_REF.

◆ GetUnambigDawg()

const Dawg* tesseract::Dict::GetUnambigDawg ( ) const

inline

Return the pointer to the unambiguous words dawg.

◆ getUnicharAmbigs()

const UnicharAmbigs& tesseract::Dict::getUnicharAmbigs ( ) const

inline

◆ getUnicharset() [1/2]

const UNICHARSET& tesseract::Dict::getUnicharset ( ) const

inline

◆ getUnicharset() [2/2]

UNICHARSET& tesseract::Dict::getUnicharset ( )

inline

◆ GlobalDawgCache()

DawgCache * tesseract::Dict::GlobalDawgCache ( )

static

Initialize Dict class - load dawgs from [lang].traineddata and user-specified wordlist and pattern list.

◆ go_deeper_dawg_fxn()

void tesseract::Dict::go_deeper_dawg_fxn	(	const char *	debug,
		const BLOB_CHOICE_LIST_VECTOR &	char_choices,
		int	char_choice_index,
		const CHAR_FRAGMENT_INFO *	prev_char_frag_info,
		bool	word_ending,
		WERD_CHOICE *	word,
		float	certainties[],
		float *	limit,
		WERD_CHOICE *	best_choice,
		int *	attempts_left,
		void *	void_more_args
	)

If the choice being composed so far could be a dictionary word and we have not reached the end of the word keep exploring the char_choices further.

◆ good_choice()

int tesseract::Dict::good_choice ( const WERD_CHOICE & choice )

Returns true if a good answer is found for the unknown blob rating.

◆ has_hyphen_end() [1/2]

bool tesseract::Dict::has_hyphen_end	(	UNICHAR_ID	unichar_id,
		bool	first_pos
	)		const

inline

Check whether the word has a hyphen at the end.

◆ has_hyphen_end() [2/2]

bool tesseract::Dict::has_hyphen_end ( const WERD_CHOICE & word ) const

inline

Same as above, but check the unichar at the end of the word.

◆ hyphen_base_size()

int tesseract::Dict::hyphen_base_size ( ) const

inline

Size of the base word (the part on the line before) of a hyphenated word.

◆ hyphenated()

bool tesseract::Dict::hyphenated ( ) const

inline

Returns true if we've recorded the beginning of a hyphenated word.

◆ init_active_dawgs()

void tesseract::Dict::init_active_dawgs	(	DawgPositionVector *	active_dawgs,
		bool	ambigs_mode
	)		const

Fill the given active_dawgs vector with dawgs that could contain the beginning of the word. If hyphenated() returns true, copy the entries from hyphen_active_dawgs_ instead.

◆ INT_VAR_H() [1/8]

tesseract::Dict::INT_VAR_H	(	dawg_debug_level	,
		0	,
		"Set to 1 for general debug info" ", to 2 for more details, to 3 to see all the debug messages"
	)

◆ INT_VAR_H() [2/8]

tesseract::Dict::INT_VAR_H	(	hyphen_debug_level	,
		0	,
		"Debug level for hyphenated words."
	)

◆ INT_VAR_H() [3/8]

tesseract::Dict::INT_VAR_H	(	max_viterbi_list_size	,
		10	,
		"Maximum size of viterbi list."
	)

◆ INT_VAR_H() [4/8]

tesseract::Dict::INT_VAR_H	(	stopper_smallword_size	,
		2	,
		"Size of dict word to be treated as non-dict word"
	)

◆ INT_VAR_H() [5/8]

tesseract::Dict::INT_VAR_H	(	stopper_debug_level	,
		0	,
		"Stopper debug level"
	)

◆ INT_VAR_H() [6/8]

tesseract::Dict::INT_VAR_H	(	tessedit_truncate_wordchoice_log	,
		10	,
		"Max words to keep in list"
	)

◆ INT_VAR_H() [7/8]

tesseract::Dict::INT_VAR_H	(	fragments_debug	,
		0	,
		"Debug character fragments"
	)

◆ INT_VAR_H() [8/8]

tesseract::Dict::INT_VAR_H	(	max_permuter_attempts	,
		10000	,
		"Maximum number of different" " character choices to consider during permutation." " This limit is especially useful when user patterns" " are specified, since overly generic patterns can result in" " dawg search exploring an overly large number of options."
	)

◆ is_apostrophe()

bool tesseract::Dict::is_apostrophe ( UNICHAR_ID unichar_id )

inline

◆ IsSpaceDelimitedLang()

bool tesseract::Dict::IsSpaceDelimitedLang ( ) const

Returns true if the language is space-delimited (i.e. not Chinese/Japanese or Thai).

◆ LengthOfShortestAlphaRun()

int tesseract::Dict::LengthOfShortestAlphaRun ( const WERD_CHOICE & WordChoice ) const

Returns the length of the shortest alpha run in WordChoice.

◆ LetterIsOkay()

int tesseract::Dict::LetterIsOkay	(	void *	void_dawg_args,
		const UNICHARSET &	unicharset,
		UNICHAR_ID	unichar_id,
		bool	word_end
	)		const

inline

Calls letter_is_okay_ member function.

◆ Load()

void tesseract::Dict::Load	(	const STRING &	lang,
		TessdataManager *	data_file
	)

◆ LoadLSTM()

void tesseract::Dict::LoadLSTM	(	const STRING &	lang,
		TessdataManager *	data_file
	)

◆ ngram_probability_in_context()

double tesseract::Dict::ngram_probability_in_context	(	const char *	lang,
		const char *	context,
		int	context_bytes,
		const char *	character,
		int	character_bytes
	)

◆ NoDangerousAmbig()

bool tesseract::Dict::NoDangerousAmbig	(	WERD_CHOICE *	BestChoice,
		DANGERR *	fixpt,
		bool	fix_replaceable,
		MATRIX *	ratings
	)

◆ NumDawgs()

int tesseract::Dict::NumDawgs ( ) const

inline

Return the number of dawgs in the dawgs_ vector.

◆ ParamsModelClassify()

float tesseract::Dict::ParamsModelClassify	(	const char *	lang,
		void *	path
	)

◆ permute_choices()

void tesseract::Dict::permute_choices	(	const char *	debug,
		const BLOB_CHOICE_LIST_VECTOR &	char_choices,
		int	char_choice_index,
		const CHAR_FRAGMENT_INFO *	prev_char_frag_info,
		WERD_CHOICE *	word,
		float	certainties[],
		float *	limit,
		WERD_CHOICE *	best_choice,
		int *	attempts_left,
		void *	more_args
	)

permute_choices

Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST with the given char_choice_index in char_choices.

◆ ProbabilityInContext()

double tesseract::Dict::ProbabilityInContext	(	const char *	context,
		int	context_bytes,
		const char *	character,
		int	character_bytes
	)

inline

Calls probability_in_context_ member function.

◆ ProcessPatternEdges()

void tesseract::Dict::ProcessPatternEdges	(	const Dawg *	dawg,
		const DawgPosition &	info,
		UNICHAR_ID	unichar_id,
		bool	word_end,
		DawgArgs *	dawg_args,
		PermuterType *	current_permuter
	)		const

For each of the character classes of the given unichar_id (and the unichar_id itself) finds the corresponding outgoing node or self-loop in the given dawg and (after checking that it is valid) records it in dawg_args->updated_dawgs. Updates current_permuter if any valid edges were found.

◆ ReplaceAmbig()

void tesseract::Dict::ReplaceAmbig	(	int	wrong_ngram_begin_index,
		int	wrong_ngram_size,
		UNICHAR_ID	correct_ngram_id,
		WERD_CHOICE *	werd_choice,
		MATRIX *	ratings
	)

◆ reset_hyphen_vars()

void tesseract::Dict::reset_hyphen_vars ( bool last_word_on_line )

Unless the previous word was the last one on the line, and the current one is not (thus it is the first one on the line), erase hyphen_word_, clear hyphen_active_dawgs_, update last_word_on_line_.

◆ ResetDocumentDictionary()

void tesseract::Dict::ResetDocumentDictionary ( )

inline

◆ set_hyphen_word()

void tesseract::Dict::set_hyphen_word	(	const WERD_CHOICE &	word,
		const DawgPositionVector &	active_dawgs
	)

Update hyphen_word_, and copy the given DawgPositionVector into hyphen_active_dawgs_.

◆ SettupStopperPass1()

void tesseract::Dict::SettupStopperPass1 ( )

Sets up stopper variables in preparation for the first pass.

◆ SettupStopperPass2()

void tesseract::Dict::SettupStopperPass2 ( )

Sets up stopper variables in preparation for the second pass.

◆ SetupForLoad()

void tesseract::Dict::SetupForLoad ( DawgCache * dawg_cache )

◆ SetWildcardID()

void tesseract::Dict::SetWildcardID ( UNICHAR_ID id )

inline

◆ SetWordsegRatingAdjustFactor()

void tesseract::Dict::SetWordsegRatingAdjustFactor ( float f )

inline

Set wordseg_rating_adjust_factor_ to the given value.

◆ STRING_VAR_H() [1/7]

tesseract::Dict::STRING_VAR_H	(	user_words_file	,
		""	,
		"A filename of user-provided words."
	)

Variable members. These have to be declared and initialized after image_ptr_, which contains the pointer to the params vector - the member of its base CCUtil class.

◆ STRING_VAR_H() [2/7]

tesseract::Dict::STRING_VAR_H	(	user_words_suffix	,
		""	,
		"A suffix of user-provided words located in tessdata."
	)

◆ STRING_VAR_H() [3/7]

tesseract::Dict::STRING_VAR_H	(	user_patterns_file	,
		""	,
		"A filename of user-provided patterns."
	)

◆ STRING_VAR_H() [4/7]

tesseract::Dict::STRING_VAR_H	(	user_patterns_suffix	,
		""	,
		"A suffix of user-provided patterns located in tessdata."
	)

◆ STRING_VAR_H() [5/7]

tesseract::Dict::STRING_VAR_H	(	output_ambig_words_file	,
		""	,
		"Output file for ambiguities found in the dictionary"
	)

◆ STRING_VAR_H() [6/7]

tesseract::Dict::STRING_VAR_H	(	word_to_debug	,
		""	,
		"Word for which stopper debug information" " should be printed to stdout"
	)

◆ STRING_VAR_H() [7/7]

tesseract::Dict::STRING_VAR_H	(	word_to_debug_lengths	,
		""	,
		"Lengths of unichars in word_to_debug"
	)

◆ UniformCertainties()

int tesseract::Dict::UniformCertainties ( const WERD_CHOICE & word )

Returns true if the certainty of the BestChoice word is within a reasonable range of the average certainties for the best choices for each character in the segmentation. This test is used to catch words in which one character is much worse than the other characters in the word (i.e. false will be returned in that case). The algorithm computes the mean and std deviation of the certainties in the word with the worst certainty thrown out.

◆ update_best_choice()

void tesseract::Dict::update_best_choice	(	const WERD_CHOICE &	word,
		WERD_CHOICE *	best_choice
	)

inline

Copies word into best_choice if its rating is smaller than that of best_choice.

◆ valid_bigram()

bool tesseract::Dict::valid_bigram	(	const WERD_CHOICE &	word1,
		const WERD_CHOICE &	word2
	)		const

◆ valid_punctuation()

bool tesseract::Dict::valid_punctuation ( const WERD_CHOICE & word )

Returns true if the word contains a valid punctuation pattern. Note: Since the domains of punctuation symbols and symbols used in numbers are not disjoint, a valid number might contain an invalid punctuation pattern (e.g. .99).

◆ valid_word() [1/3]

int tesseract::Dict::valid_word	(	const WERD_CHOICE &	word,
		bool	numbers_ok
	)		const

◆ valid_word() [2/3]

int tesseract::Dict::valid_word ( const WERD_CHOICE & word ) const

inline

◆ valid_word() [3/3]

int tesseract::Dict::valid_word ( const char * string ) const

inline

This function is used by api/tesseract_cube_combiner.cpp.

◆ valid_word_or_number()

int tesseract::Dict::valid_word_or_number ( const WERD_CHOICE & word ) const

inline

◆ valid_word_permuter()

static bool tesseract::Dict::valid_word_permuter	(	uint8_t	perm,
		bool	numbers_ok
	)

inline static

Check all the DAWGs to see if this word is in any of them.

Read/Write/Access special purpose dawgs which contain words only of a certain length (used for phrase search for non-space-delimited languages).

◆ WildcardID()

UNICHAR_ID tesseract::Dict::WildcardID ( ) const

inline

Member Data Documentation

◆ apostrophe_unichar_id_

UNICHAR_ID tesseract::Dict::apostrophe_unichar_id_

private

◆ bigram_dawg_

Dawg* tesseract::Dict::bigram_dawg_

private

The following pointers are only cached for convenience. The dawgs will be deleted when dawgs_ vector is destroyed.

◆ ccutil_

CCUtil* tesseract::Dict::ccutil_

private

Private member variables.

◆ dang_ambigs_table_

UnicharAmbigs* tesseract::Dict::dang_ambigs_table_

private

Table that stores ambiguities computed during training (loaded when NoDangerousAmbig() is called for the first time). Each entry i in the table stores a set of ambiguities whose wrong ngram starts with unichar id i.

◆ dawg_cache_

DawgCache* tesseract::Dict::dawg_cache_

private

◆ dawg_cache_is_ours_

bool tesseract::Dict::dawg_cache_is_ours_

private

◆ dawgs_

DawgVector tesseract::Dict::dawgs_

private

◆ document_words_

Trie* tesseract::Dict::document_words_

private

◆ equivalent_symbols_

GenericVector<GenericVectorEqEq<UNICHAR_ID> > tesseract::Dict::equivalent_symbols_

private

◆ freq_dawg_

Dawg* tesseract::Dict::freq_dawg_

private

◆ go_deeper_fxn_

void(Dict::* tesseract::Dict::go_deeper_fxn_) (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)

Pointer to go_deeper function.

◆ hyphen_active_dawgs_

DawgPositionVector tesseract::Dict::hyphen_active_dawgs_

private

◆ hyphen_unichar_id_

UNICHAR_ID tesseract::Dict::hyphen_unichar_id_

private

◆ hyphen_word_

WERD_CHOICE* tesseract::Dict::hyphen_word_

private

◆ last_word_on_line_

bool tesseract::Dict::last_word_on_line_

private

◆ letter_is_okay_

int(Dict::* tesseract::Dict::letter_is_okay_) (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const

◆ output_ambig_words_file_

FILE* tesseract::Dict::output_ambig_words_file_

private

◆ params_model_classify_

float(Dict::* tesseract::Dict::params_model_classify_) (const char *lang, void *path)

◆ pending_words_

Trie* tesseract::Dict::pending_words_

private

◆ probability_in_context_

double(Dict::* tesseract::Dict::probability_in_context_) (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

Probability in context function used by the ngram permuter.

◆ punc_dawg_

Dawg* tesseract::Dict::punc_dawg_

private

◆ question_unichar_id_

UNICHAR_ID tesseract::Dict::question_unichar_id_

private

◆ reject_offset_

float tesseract::Dict::reject_offset_

private

Additional certainty padding allowed before a word is rejected.

◆ replace_ambigs_table_

UnicharAmbigs* tesseract::Dict::replace_ambigs_table_

private

Same as dang_ambigs_table_, but for ambiguities with the replace flag set.

◆ slash_unichar_id_

UNICHAR_ID tesseract::Dict::slash_unichar_id_

private

◆ successors_

SuccessorListsVector tesseract::Dict::successors_

private

◆ unambig_dawg_

Dawg* tesseract::Dict::unambig_dawg_

private

◆ wildcard_unichar_id_

UNICHAR_ID tesseract::Dict::wildcard_unichar_id_

private

◆ wordseg_rating_adjust_factor_

float tesseract::Dict::wordseg_rating_adjust_factor_

private

Current segmentation cost adjust factor for word rating. See comments in incorporate_segcost.

The documentation for this class was generated from the following files:

/home/stephane/src/tesseract/src/dict/dict.h
/home/stephane/src/tesseract/src/dict/context.cpp
/home/stephane/src/tesseract/src/dict/dict.cpp
/home/stephane/src/tesseract/src/dict/hyphen.cpp
/home/stephane/src/tesseract/src/dict/permdawg.cpp
/home/stephane/src/tesseract/src/dict/stopper.cpp

Public Member Functions

Static Public Member Functions

Public Attributes

Private Attributes

Constructor & Destructor Documentation

◆ Dict()

◆ ~Dict()

Member Function Documentation

◆ absolute_garbage()

◆ AcceptableChoice()

◆ AcceptableResult()

◆ add_document_word()

◆ adjust_word()

◆ append_choices()

◆ BOOL_VAR_H() [1/10]

◆ BOOL_VAR_H() [2/10]

◆ BOOL_VAR_H() [3/10]

◆ BOOL_VAR_H() [4/10]

◆ BOOL_VAR_H() [5/10]

◆ BOOL_VAR_H() [6/10]

◆ BOOL_VAR_H() [7/10]

◆ BOOL_VAR_H() [8/10]

◆ BOOL_VAR_H() [9/10]

◆ BOOL_VAR_H() [10/10]

◆ CallParamsModelClassify()

◆ case_ok()

◆ char_for_dawg()

◆ compound_marker()

◆ copy_hyphen_info()

◆ dawg_permute_and_select()

◆ DebugWordChoices()

◆ def_letter_is_okay()

◆ def_probability_in_context()

◆ default_dawgs()

◆ double_VAR_H() [1/14]

◆ double_VAR_H() [2/14]

◆ double_VAR_H() [3/14]

◆ double_VAR_H() [4/14]

◆ double_VAR_H() [5/14]

◆ double_VAR_H() [6/14]

◆ double_VAR_H() [7/14]

◆ double_VAR_H() [8/14]

◆ double_VAR_H() [9/14]

◆ double_VAR_H() [10/14]

◆ double_VAR_H() [11/14]

◆ double_VAR_H() [12/14]

◆ double_VAR_H() [13/14]

◆ double_VAR_H() [14/14]

◆ End()

◆ EndDangerousAmbigs()

◆ FinishLoad()

◆ fragment_state_okay()

◆ getCCUtil() [1/2]

◆ getCCUtil() [2/2]

◆ GetDawg()

◆ GetPuncDawg()

◆ GetStartingNode()

◆ GetUnambigDawg()

◆ getUnicharAmbigs()

◆ getUnicharset() [1/2]

◆ getUnicharset() [2/2]

◆ GlobalDawgCache()

◆ go_deeper_dawg_fxn()

◆ good_choice()

◆ has_hyphen_end() [1/2]

◆ has_hyphen_end() [2/2]

◆ hyphen_base_size()

◆ hyphenated()

◆ init_active_dawgs()

◆ INT_VAR_H() [1/8]

◆ INT_VAR_H() [2/8]

◆ INT_VAR_H() [3/8]

◆ INT_VAR_H() [4/8]

◆ INT_VAR_H() [5/8]

◆ INT_VAR_H() [6/8]

◆ INT_VAR_H() [7/8]

◆ INT_VAR_H() [8/8]

◆ is_apostrophe()

◆ IsSpaceDelimitedLang()

◆ LengthOfShortestAlphaRun()