tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
tesseract::Trie Class Reference

#include <trie.h>

Inheritance diagram for tesseract::Trie:
Collaboration diagram for tesseract::Trie:

Public Types

enum  RTLReversePolicy { RRP_DO_NO_REVERSE, RRP_REVERSE_IF_HAS_RTL, RRP_FORCE_REVERSE }
 

Public Member Functions

 Trie (DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
 
virtual ~Trie ()
 
void clear ()
 
EDGE_REF edge_char_of (NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
 
void unichar_ids_of (NODE_REF node, NodeChildVector *vec, bool word_end) const
 
NODE_REF next_node (EDGE_REF edge_ref) const
 
bool end_of_word (EDGE_REF edge_ref) const
 
UNICHAR_ID edge_letter (EDGE_REF edge_ref) const
 
void KillEdge (EDGE_RECORD *edge_rec) const
 
bool DeadEdge (const EDGE_RECORD &edge_rec) const
 
void print_node (NODE_REF node, int max_num_edges) const
 
SquishedDawg * trie_to_dawg ()
 
bool read_and_add_word_list (const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
 
bool read_word_list (const char *filename, GenericVector< STRING > *words)
 
bool add_word_list (const GenericVector< STRING > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy)
 
bool read_pattern_list (const char *filename, const UNICHARSET &unicharset)
 
void initialize_patterns (UNICHARSET *unicharset)
 
void unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
 
virtual EDGE_REF pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
 
bool add_word_to_dawg (const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
 
bool add_word_to_dawg (const WERD_CHOICE &word)
 
- Public Member Functions inherited from tesseract::Dawg
DawgType type () const
 
const STRING & lang () const
 
PermuterType permuter () const
 
virtual ~Dawg ()
 
bool word_in_dawg (const WERD_CHOICE &word) const
 Returns true if the given word is in the Dawg. More...
 
bool prefix_in_dawg (const WERD_CHOICE &prefix, bool requires_complete) const
 
int check_for_words (const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
 
void iterate_words (const UNICHARSET &unicharset, TessCallback1< const WERD_CHOICE *> *cb) const
 
void iterate_words (const UNICHARSET &unicharset, TessCallback1< const char *> *cb) const
 

Static Public Member Functions

static const char * get_reverse_policy_name (RTLReversePolicy reverse_policy)
 

Static Public Attributes

static const int kSaneNumConcreteChars = 0
 
static const char kAlphaPatternUnicode [] = "\u2000"
 
static const char kDigitPatternUnicode [] = "\u2001"
 
static const char kAlphanumPatternUnicode [] = "\u2002"
 
static const char kPuncPatternUnicode [] = "\u2003"
 
static const char kLowerPatternUnicode [] = "\u2004"
 
static const char kUpperPatternUnicode [] = "\u2005"
 
- Static Public Attributes inherited from tesseract::Dawg
static const int16_t kDawgMagicNumber = 42
 Magic number to determine endianness when reading the Dawg from file. More...
 
static const UNICHAR_ID kPatternUnicharID = 0
 

Protected Member Functions

EDGE_RECORD * deref_edge_ref (EDGE_REF edge_ref) const
 
EDGE_REF make_edge_ref (NODE_REF node_index, EDGE_INDEX edge_index) const
 
void link_edge (EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
 
void print_edge_rec (const EDGE_RECORD &edge_rec) const
 
bool can_be_eliminated (const EDGE_RECORD &edge_rec)
 
void print_all (const char *msg, int max_num_edges)
 
bool edge_char_of (NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end, UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const
 
bool add_edge_linkage (NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
 
bool add_new_edge (NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
 
void add_word_ending (EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
 
NODE_REF new_dawg_node ()
 
void remove_edge_linkage (NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
 
void remove_edge (NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
 
bool eliminate_redundant_edges (NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
 
bool reduce_lettered_edges (EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes)
 
void sort_edges (EDGE_VECTOR *edges)
 
void reduce_node_input (NODE_REF node, NODE_MARKER reduced_nodes)
 
UNICHAR_ID character_class_to_pattern (char ch)
 
- Protected Member Functions inherited from tesseract::Dawg
 Dawg (DawgType type, const STRING &lang, PermuterType perm, int debug_level)
 
NODE_REF next_node_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the next node visited by following this edge. More...
 
bool marker_flag_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the marker flag of this edge. More...
 
int direction_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the direction flag of this edge. More...
 
bool end_of_word_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns true if this edge marks the end of a word. More...
 
UNICHAR_ID unichar_id_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns UNICHAR_ID recorded in this edge. More...
 
void set_next_node_in_edge_rec (EDGE_RECORD *edge_rec, EDGE_REF value)
 Sets the next node link for this edge in the Dawg. More...
 
void set_marker_flag_in_edge_rec (EDGE_RECORD *edge_rec)
 Sets this edge record to be the last one in a sequence of edges. More...
 
int given_greater_than_edge_rec (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
 
bool edge_rec_match (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
 
void init (int unicharset_size)
 
bool match_words (WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_ID wildcard) const
 
void iterate_words_rec (const WERD_CHOICE &word_so_far, NODE_REF to_explore, TessCallback1< const WERD_CHOICE *> *cb) const
 

Protected Attributes

TRIE_NODES nodes_
 
uint64_t num_edges_
 
uint64_t deref_direction_mask_
 
uint64_t deref_node_index_mask_
 
GenericVector< EDGE_INDEX > root_back_freelist_
 
bool initialized_patterns_
 
UNICHAR_ID alpha_pattern_
 
UNICHAR_ID digit_pattern_
 
UNICHAR_ID alphanum_pattern_
 
UNICHAR_ID punc_pattern_
 
UNICHAR_ID lower_pattern_
 
UNICHAR_ID upper_pattern_
 
- Protected Attributes inherited from tesseract::Dawg
DawgType type_
 
STRING lang_
 
PermuterType perm_
 Permuter code that should be used if the word is found in this Dawg. More...
 
int unicharset_size_
 
int flag_start_bit_
 
int next_node_start_bit_
 
uint64_t next_node_mask_
 
uint64_t flags_mask_
 
uint64_t letter_mask_
 
int debug_level_
 

Detailed Description

Concrete class for the Trie data structure (extends the Dawg base class) that stores a list of words and allows new words to be added dynamically. This class stores a vector of pointers to TRIE_NODE_RECORDs, each of which has a vector of forward and backward edges.

Member Enumeration Documentation

◆ RTLReversePolicy

Enumerator
RRP_DO_NO_REVERSE 
RRP_REVERSE_IF_HAS_RTL 
RRP_FORCE_REVERSE 

Constructor & Destructor Documentation

◆ Trie()

tesseract::Trie::Trie ( DawgType  type,
const STRING &  lang,
PermuterType  perm,
int  unicharset_size,
int  debug_level 
)
inline

◆ ~Trie()

virtual tesseract::Trie::~Trie ( )
inline virtual

Member Function Documentation

◆ add_edge_linkage()

bool tesseract::Trie::add_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
protected

◆ add_new_edge()

bool tesseract::Trie::add_new_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inline protected

◆ add_word_ending()

void tesseract::Trie::add_word_ending ( EDGE_RECORD *  edge,
NODE_REF  the_next_node,
bool  repeats,
UNICHAR_ID  unichar_id 
)
protected

◆ add_word_list()

bool tesseract::Trie::add_word_list ( const GenericVector< STRING > &  words,
const UNICHARSET &  unicharset,
Trie::RTLReversePolicy  reverse_policy 
)

◆ add_word_to_dawg() [1/2]

bool tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE &  word,
const GenericVector< bool > *  repetitions 
)

◆ add_word_to_dawg() [2/2]

bool tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE &  word)
inline

◆ can_be_eliminated()

bool tesseract::Trie::can_be_eliminated ( const EDGE_RECORD &  edge_rec)
inline protected

◆ character_class_to_pattern()

UNICHAR_ID tesseract::Trie::character_class_to_pattern ( char  ch)
protected

◆ clear()

void tesseract::Trie::clear ( )

◆ DeadEdge()

bool tesseract::Trie::DeadEdge ( const EDGE_RECORD &  edge_rec) const
inline

◆ deref_edge_ref()

EDGE_RECORD* tesseract::Trie::deref_edge_ref ( EDGE_REF  edge_ref) const
inline protected

◆ edge_char_of() [1/2]

EDGE_REF tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inline virtual

Returns the edge that corresponds to the letter out of this node.

Implements tesseract::Dawg.

◆ edge_char_of() [2/2]

bool tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
NODE_REF  next_node,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id,
EDGE_RECORD **  edge_ptr,
EDGE_INDEX *  edge_index 
) const
protected

◆ edge_letter()

UNICHAR_ID tesseract::Trie::edge_letter ( EDGE_REF  edge_ref) const
inline virtual

Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

◆ eliminate_redundant_edges()

bool tesseract::Trie::eliminate_redundant_edges ( NODE_REF  node,
const EDGE_RECORD &  edge1,
const EDGE_RECORD &  edge2 
)
protected

◆ end_of_word()

bool tesseract::Trie::end_of_word ( EDGE_REF  edge_ref) const
inline virtual

Returns true if the edge indicated by the given EDGE_REF marks the end of a word.

Implements tesseract::Dawg.

◆ get_reverse_policy_name()

const char * tesseract::Trie::get_reverse_policy_name ( RTLReversePolicy  reverse_policy)
static

◆ initialize_patterns()

void tesseract::Trie::initialize_patterns ( UNICHARSET *  unicharset)

◆ KillEdge()

void tesseract::Trie::KillEdge ( EDGE_RECORD *  edge_rec) const
inline

◆ link_edge()

void tesseract::Trie::link_edge ( EDGE_RECORD *  edge,
NODE_REF  nxt,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inline protected

Sets up this edge record to the requested values.

◆ make_edge_ref()

EDGE_REF tesseract::Trie::make_edge_ref ( NODE_REF  node_index,
EDGE_INDEX  edge_index 
) const
inline protected

Constructs EDGE_REF from the given node_index and edge_index.

◆ new_dawg_node()

NODE_REF tesseract::Trie::new_dawg_node ( )
protected

◆ next_node()

NODE_REF tesseract::Trie::next_node ( EDGE_REF  edge_ref) const
inline virtual

Returns the next node visited by following the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

◆ pattern_loop_edge()

virtual EDGE_REF tesseract::Trie::pattern_loop_edge ( EDGE_REF  edge_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inline virtual

Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD, returns NO_EDGE otherwise.

Reimplemented from tesseract::Dawg.

◆ print_all()

void tesseract::Trie::print_all ( const char *  msg,
int  max_num_edges 
)
inline protected

◆ print_edge_rec()

void tesseract::Trie::print_edge_rec ( const EDGE_RECORD &  edge_rec) const
inline protected

Prints the given EDGE_RECORD.

◆ print_node()

void tesseract::Trie::print_node ( NODE_REF  node,
int  max_num_edges 
) const
virtual

Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.

Implements tesseract::Dawg.

◆ read_and_add_word_list()

bool tesseract::Trie::read_and_add_word_list ( const char *  filename,
const UNICHARSET &  unicharset,
Trie::RTLReversePolicy  reverse 
)

◆ read_pattern_list()

bool tesseract::Trie::read_pattern_list ( const char *  filename,
const UNICHARSET &  unicharset 
)

◆ read_word_list()

bool tesseract::Trie::read_word_list ( const char *  filename,
GenericVector< STRING > *  words 
)

◆ reduce_lettered_edges()

bool tesseract::Trie::reduce_lettered_edges ( EDGE_INDEX  edge_index,
UNICHAR_ID  unichar_id,
NODE_REF  node,
EDGE_VECTOR *  backward_edges,
NODE_MARKER  reduced_nodes 
)
protected

◆ reduce_node_input()

void tesseract::Trie::reduce_node_input ( NODE_REF  node,
NODE_MARKER  reduced_nodes 
)
protected

Eliminates any redundant edges from this node in the Trie.

◆ remove_edge()

void tesseract::Trie::remove_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inline protected

◆ remove_edge_linkage()

void tesseract::Trie::remove_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
protected

◆ sort_edges()

void tesseract::Trie::sort_edges ( EDGE_VECTOR *  edges)
protected

Order num_edges of consecutive EDGE_RECORDS in the given EDGE_VECTOR in increasing order of unichar ids. This function is normally called for all edges in a single node, and since the number of edges in each node is usually quite small, selection sort is used.

◆ trie_to_dawg()

SquishedDawg * tesseract::Trie::trie_to_dawg ( )

◆ unichar_id_to_patterns()

void tesseract::Trie::unichar_id_to_patterns ( UNICHAR_ID  unichar_id,
const UNICHARSET &  unicharset,
GenericVector< UNICHAR_ID > *  vec 
) const
virtual

Fills vec with unichar ids that represent the character classes of the given unichar_id.

Reimplemented from tesseract::Dawg.

◆ unichar_ids_of()

void tesseract::Trie::unichar_ids_of ( NODE_REF  node,
NodeChildVector *  vec,
bool  word_end 
) const
inline virtual

Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.

Implements tesseract::Dawg.

Member Data Documentation

◆ alpha_pattern_

UNICHAR_ID tesseract::Trie::alpha_pattern_
protected

◆ alphanum_pattern_

UNICHAR_ID tesseract::Trie::alphanum_pattern_
protected

◆ deref_direction_mask_

uint64_t tesseract::Trie::deref_direction_mask_
protected

◆ deref_node_index_mask_

uint64_t tesseract::Trie::deref_node_index_mask_
protected

◆ digit_pattern_

UNICHAR_ID tesseract::Trie::digit_pattern_
protected

◆ initialized_patterns_

bool tesseract::Trie::initialized_patterns_
protected

◆ kAlphanumPatternUnicode

const char tesseract::Trie::kAlphanumPatternUnicode [] = "\u2002"
static

◆ kAlphaPatternUnicode

const char tesseract::Trie::kAlphaPatternUnicode [] = "\u2000"
static

◆ kDigitPatternUnicode

const char tesseract::Trie::kDigitPatternUnicode [] = "\u2001"
static

◆ kLowerPatternUnicode

const char tesseract::Trie::kLowerPatternUnicode [] = "\u2004"
static

◆ kPuncPatternUnicode

const char tesseract::Trie::kPuncPatternUnicode [] = "\u2003"
static

◆ kSaneNumConcreteChars

const int tesseract::Trie::kSaneNumConcreteChars = 0
static

◆ kUpperPatternUnicode

const char tesseract::Trie::kUpperPatternUnicode [] = "\u2005"
static

◆ lower_pattern_

UNICHAR_ID tesseract::Trie::lower_pattern_
protected

◆ nodes_

TRIE_NODES tesseract::Trie::nodes_
protected

◆ num_edges_

uint64_t tesseract::Trie::num_edges_
protected

◆ punc_pattern_

UNICHAR_ID tesseract::Trie::punc_pattern_
protected

◆ root_back_freelist_

GenericVector<EDGE_INDEX> tesseract::Trie::root_back_freelist_
protected

◆ upper_pattern_

UNICHAR_ID tesseract::Trie::upper_pattern_
protected

The documentation for this class was generated from the following files: