tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
lm_pain_points.h
1 // File: lm_pain_points.h
3 // Description: Functions that utilize the knowledge about the properties
4 // of the paths explored by the segmentation search in order
5 // to generate "pain points" - the locations in the ratings
6 // matrix which should be classified next.
7 // Author: Rika Antonova
8 // Created: Mon Jun 20 11:26:43 PST 2012
9 //
10 // (C) Copyright 2012, Google Inc.
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 // http://www.apache.org/licenses/LICENSE-2.0
15 // Unless required by applicable law or agreed to in writing, software
16 // distributed under the License is distributed on an "AS IS" BASIS,
17 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 // See the License for the specific language governing permissions and
19 // limitations under the License.
20 //
22 
23 #ifndef TESSERACT_WORDREC_PAIN_POINTS_H_
24 #define TESSERACT_WORDREC_PAIN_POINTS_H_
25 
26 #include "genericheap.h" // for GenericHeap
27 #include "matrix.h" // for MATRIX_COORD (ptr only), MatrixCoordPair
28 #include "stopper.h" // for DANGERR
29 
30 class WERD_RES;
31 
32 namespace tesseract {
33 
34 class Dict;
35 struct ViterbiStateEntry;
36 
37 // Heap of pain points used for determining where to chop/join.
39 
40 // Types of pain points (ordered in the decreasing level of importance).
46 
48 };
49 
// Printable names of the pain point types. PainPointDescription() indexes
// this table directly with an LMPainPointsType value.
// NOTE(review): the entry order presumably has to match the enumerator order
// of LMPainPointsType (the enum body is not visible in this listing) - confirm.
50 static const char * const LMPainPointsTypeName[] = {
51  "LM_PPTYPE_BLAMER",
52  "LM_PPTYPE_AMBIGS",
53  "LM_PPTYPE_PATH",
54  "LM_PPTYPE_SHAPE",
55 };
56 
// Generates and stores "pain points" - locations in the ratings matrix that
// should be classified next - keeping one heap of pain points per
// LMPainPointsType.
// NOTE(review): several numbered lines of this listing are absent (e.g. 60,
// 71, 73, 127), so some declarations of this class are not visible here.
57 class LMPainPoints {
58  public:
59 
61  // If there is a significant drop in character ngram probability or a
62  // dangerous ambiguity, make the thresholds on what blob combinations
63  // can be classified looser.
64  static const float kLooseMaxCharWhRatio;
65  // Returns a description of the type of a pain point, i.e. the entry of
   // LMPainPointsTypeName at index type. The index is not range-checked here.
66  static const char* PainPointDescription(LMPainPointsType type) {
67  return LMPainPointsTypeName[type];
68  }
69 
   // Stores d as the cached dictionary pointer and deb as the debug level.
   // NOTE(review): max, rat and fp are presumably stored in max_heap_size_,
   // max_char_wh_ratio_ and fixed_pitch_; that part of the initializer list
   // is not visible in this listing - confirm against the real header.
70  LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb) :
72  dict_(d), debug_level_(deb) {}
74 
75  // Returns true if the heap of pain points of pp_type is not empty.
76  inline bool HasPainPoints(LMPainPointsType pp_type) const {
77  return !pain_points_heaps_[pp_type].empty();
78  }
79 
80  // Dequeues the next pain point from the pain points queue and copies
81  // its contents and priority to *pp and *priority.
82  // Returns LM_PPTYPE_NUM if pain points queue is empty, otherwise the type.
83  LMPainPointsType Deque(MATRIX_COORD *pp, float *priority);
84 
85  // Clears every pain points heap (indices 0 .. LM_PPTYPE_NUM - 1).
86  void Clear() {
87  for (int h = 0; h < LM_PPTYPE_NUM; ++h) pain_points_heaps_[h].clear();
88  }
89 
90  // For each cell, generate a "pain point" if the cell is not classified
91  // and has a left or right neighbor that was classified.
92  void GenerateInitial(WERD_RES *word_res);
93 
94  // Generate pain points from the given path.
95  void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse,
96  WERD_RES *word_res);
97 
98  // Generate pain points from dangerous ambiguities in best choice.
99  void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse,
100  WERD_RES *word_res);
101 
102  // Generate a pain point for the blamer.
   // Forwards to GeneratePainPoint() with type LM_PPTYPE_BLAMER, a
   // special_priority of 0.0 and ok_to_extend set to false, and returns its
   // result. Note that max_char_wh_ratio is a double here but a float in
   // GeneratePainPoint(), so the value is narrowed on the call.
103  bool GenerateForBlamer(double max_char_wh_ratio, WERD_RES *word_res,
104  int col, int row) {
105  return GeneratePainPoint(col, row, LM_PPTYPE_BLAMER, 0.0, false,
106  max_char_wh_ratio, word_res);
107  }
108 
109  // Adds a pain point to classify chunks_record->ratings(col, row).
   // NOTE(review): there is no chunks_record parameter in this signature;
   // the comment presumably refers to the ratings matrix reachable from
   // word_res - confirm and update the wording.
110  // Returns true if a new pain point was added to an appropriate heap.
111  // Pain point priority is set to special_priority for pain points of
112  // LM_PPTYPE_AMBIGS or LM_PPTYPE_PATH; for other pain points
113  // AssociateStats::gap_sum is used.
114  bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type,
115  float special_priority, bool ok_to_extend,
116  float max_char_wh_ratio,
117  WERD_RES *word_res);
118 
119  // Adjusts the pain point coordinates to cope with expansion of the ratings
120  // matrix due to a split of the blob with the given index.
121  void RemapForSplit(int index);
122 
123  private:
124  // Priority queues containing pain points generated by the language model.
125  // The priority is set by the language model components; adjustments like
126  // seam cost and width priority are factored into the priority.
128  // Maximum number of points to keep in the heap.
130  // Maximum character width/height ratio.
132  // Set to true if fixed pitch should be assumed.
134  // Cached pointer to dictionary.
135  const Dict *dict_;
136  // Debug level for print statements.
138 };
139 
140 } // namespace tesseract
141 
142 #endif // TESSERACT_WORDREC_PAIN_POINTS_H_
bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority, bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res)
Definition: lm_pain_points.cpp:148
int max_heap_size_
Definition: lm_pain_points.h:129
Definition: lm_pain_points.h:57
static const char * PainPointDescription(LMPainPointsType type)
Definition: lm_pain_points.h:66
void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res)
Definition: lm_pain_points.cpp:70
Definition: lm_pain_points.h:47
void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res)
Definition: lm_pain_points.cpp:132
Definition: lm_state.h:93
int debug_level_
Definition: lm_pain_points.h:137
LMPainPointsType Deque(MATRIX_COORD *pp, float *priority)
Definition: lm_pain_points.cpp:39
bool GenerateForBlamer(double max_char_wh_ratio, WERD_RES *word_res, int col, int row)
Definition: lm_pain_points.h:103
Definition: baseapi.cpp:94
LMPainPointsType
Definition: lm_pain_points.h:41
const Dict * dict_
Definition: lm_pain_points.h:135
Definition: lm_pain_points.h:42
Definition: lm_pain_points.h:43
static const char *const LMPainPointsTypeName[]
Definition: lm_pain_points.h:50
Definition: dict.h:88
float max_char_wh_ratio_
Definition: lm_pain_points.h:131
Definition: lm_pain_points.h:45
LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb)
Definition: lm_pain_points.h:70
PainPointHeap pain_points_heaps_[LM_PPTYPE_NUM]
Definition: lm_pain_points.h:127
bool HasPainPoints(LMPainPointsType pp_type) const
Definition: lm_pain_points.h:76
bool fixed_pitch_
Definition: lm_pain_points.h:133
static const float kDefaultPainPointPriorityAdjustment
Definition: lm_pain_points.h:60
void GenerateInitial(WERD_RES *word_res)
Definition: lm_pain_points.cpp:50
bool empty() const
Definition: genericheap.h:68
void RemapForSplit(int index)
Definition: lm_pain_points.cpp:211
Definition: lm_pain_points.h:44
Definition: pageres.h:169
void Clear()
Definition: lm_pain_points.h:86
Definition: matrix.h:605
~LMPainPoints()
Definition: lm_pain_points.h:73
static const float kLooseMaxCharWhRatio
Definition: lm_pain_points.h:64