tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
blamer.h
1 // File: blamer.h
3 // Description: Module allowing precise error causes to be allocated.
4 // Author: Rike Antonova
5 // Refactored: Ray Smith
6 // Created: Mon Feb 04 14:37:01 PST 2013
7 //
8 // (C) Copyright 2013, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifndef TESSERACT_CCSTRUCT_BLAMER_H_
22 #define TESSERACT_CCSTRUCT_BLAMER_H_
23 
24 #include <cstdint> // for int16_t
25 #include <cstring> // for memcpy
26 #include "boxword.h" // for BoxWord
27 #include "genericvector.h" // for GenericVector
28 #include "params_training_featdef.h" // for ParamsTrainingBundle, ParamsTra...
29 #include "ratngs.h" // for BLOB_CHOICE_LIST (ptr only)
30 #include "rect.h" // for TBOX
31 #include "strngs.h" // for STRING
32 #include "tprintf.h" // for tprintf
33 #include "unichar.h" // for UNICHAR_ID
34 
35 class DENORM;
36 class MATRIX;
37 class UNICHARSET;
38 class WERD_RES;
39 
40 struct MATRIX_COORD;
41 struct TWERD;
42 
43 template <class R, class A1, class A2> class TessResultCallback2;
44 
45 static const int16_t kBlamerBoxTolerance = 5;  // NOTE(review): not referenced in the visible part of this header; presumably a tolerance for comparing boxes against truth boxes (units unconfirmed) - verify against blamer.cpp.
46 
47 // Enum for expressing the source of error.
48 // Note: Please update kIncorrectResultReasonNames when modifying this enum.
49 enum IncorrectResultReason {
50  // The text recorded in best choice == truth text.
51  IRR_CORRECT,
52  // Either: Top choice is incorrect and is a dictionary word (language model
53  // is unlikely to help correct such errors, so blame the classifier).
54  // Or: the correct unichar was not included in the shortlist produced by the
55  // classifier at all.
56  IRR_CLASSIFIER,
57  // The chopper has not found one or more splits that correspond to the correct
58  // character bounding boxes recorded in BlamerBundle::truth_word_.
59  IRR_CHOPPER,
60  // Classifier did include correct unichars for each blob in the correct
61  // segmentation, however its rating could have been too bad to allow the
62  // language model to pull out the correct choice. On the other hand the
63  // strength of the language model might have been too weak to favor the
64  // correct answer, so we call this case a classifier-language model
65  // tradeoff error.
66  IRR_CLASS_LM_TRADEOFF,
67  // Page layout failed to produce the correct bounding box. Blame page layout
68  // if the truth was not found for the word, which implies that the bounding
69  // box of the word was incorrect (no truth word had a similar bounding box).
70  IRR_PAGE_LAYOUT,
71  // SegSearch heuristic prevented one or more blobs from the correct
72  // segmentation state from being classified (e.g. the blob was too wide).
73  IRR_SEGSEARCH_HEUR,
74  // The correct segmentation state was not explored because of poor SegSearch
75  // pain point prioritization. We blame SegSearch pain point prioritization
76  // if the best rating of a choice constructed from correct segmentation is
77  // better than that of the best choice (i.e. if we got to explore the correct
78  // segmentation state, language model would have picked the correct choice).
79  IRR_SEGSEARCH_PP,
80  // Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
81  // and thus use the old language model (permuters).
82  // TODO(antonova): integrate the new language model with the chopper.
83  IRR_CLASS_OLD_LM_TRADEOFF,
84  // If there is an incorrect adaptive template match with a better score than
85  // a correct one (either pre-trained or adapted), mark this as adaption error.
86  IRR_ADAPTION,
87  // split_and_recog_word() failed to find a suitable split in truth.
88  IRR_NO_TRUTH_SPLIT,
89  // Truth is not available for this word (e.g. when words in corrected content
90  // file are turned into ~~~~ because an appropriate alignment was not found).
91  IRR_NO_TRUTH,
92  // The text recorded in best choice != truth text, but none of the above
93  // reasons are set.
94  IRR_UNKNOWN,
95 
96  IRR_NUM_REASONS  // Count of the reasons listed above; not a reason itself.
97 };
98 
99 // Blamer-related information to determine the source of errors.
100 struct BlamerBundle {
101  static const char *IncorrectReasonName(IncorrectResultReason irr);
103  incorrect_result_reason_(IRR_CORRECT),
104  lattice_data_(nullptr) { ClearResults(); }
105  BlamerBundle(const BlamerBundle &other) {  // Copies via CopyTruth() followed by CopyResults().
106  this->CopyTruth(other);
107  this->CopyResults(other);
108  }
109  ~BlamerBundle() { delete[] lattice_data_; }  // lattice_data_ is allocated with new[] in set_lattice_data()/CopyResults().
110 
111  // Accessors.
112  STRING TruthString() const {  // Concatenates every truth_text_ entry in index order.
113  STRING truth_str;
114  for (int i = 0; i < truth_text_.length(); ++i)
115  truth_str += truth_text_[i];
116  return truth_str;
117  }
118  IncorrectResultReason incorrect_result_reason() const {
120  }
121  bool NoTruth() const {  // True when the reason is IRR_NO_TRUTH or IRR_PAGE_LAYOUT.
122  return incorrect_result_reason_ == IRR_NO_TRUTH ||
123  incorrect_result_reason_ == IRR_PAGE_LAYOUT;
124  }
125  bool HasDebugInfo() const {  // True when either debug string is non-empty.
126  return debug_.length() > 0 || misadaption_debug_.length() > 0;
127  }
128  const STRING& debug() const {
129  return debug_;
130  }
131  const STRING& misadaption_debug() const {
132  return misadaption_debug_;
133  }
134  void UpdateBestRating(float rating) {
137  }
140  }
141  // Returns true if the given ratings matrix col,row position is included
142  // in the correct segmentation path at the given index.
143  bool MatrixPositionCorrect(int index, const MATRIX_COORD& coord) {
144  return correct_segmentation_cols_[index] == coord.col &&
145  correct_segmentation_rows_[index] == coord.row;
146  }
149  }
150  const char* lattice_data() const {  // Returns the internal buffer without copying; may be nullptr.
151  return lattice_data_;
152  }
153  int lattice_size() const {
154  return lattice_size_; // size of lattice_data_ in bytes
155  }
156  void set_lattice_data(const char* data, int size) {  // Stores a private copy of size bytes read from data.
157  lattice_size_ = size;
158  delete [] lattice_data_;  // The old buffer is freed before the copy, so data must not point into it.
159  lattice_data_ = new char[lattice_size_];
160  memcpy(lattice_data_, data, lattice_size_);
161  }
164  }
165  // Adds a new ParamsTrainingHypothesis to the current hypothesis list.
168  }
169 
170  // Functions to set up the blamer.
171  // Whole word string, whole word bounding box.
172  void SetWordTruth(const UNICHARSET& unicharset,
173  const char* truth_str, const TBOX& word_box);
174  // Single "character" string, "character" bounding box.
175  // May be called multiple times to indicate the characters in a word.
176  void SetSymbolTruth(const UNICHARSET& unicharset,
177  const char* char_str, const TBOX& char_box);
178  // Marks that there is something wrong with the truth text, like it contains
179  // reject characters.
180  void SetRejectedTruth();
181 
182  // Returns true if the provided word_choice is correct.
183  bool ChoiceIsCorrect(const WERD_CHOICE* word_choice) const;
184 
185  void ClearResults() {
188  if (!NoTruth()) incorrect_result_reason_ = IRR_CORRECT;  // A no-truth reason is kept; any other reason is reset.
189  debug_ = "";
195  delete[] lattice_data_;  // Release the lattice buffer and mark it empty.
196  lattice_data_ = nullptr;
197  lattice_size_ = 0;
198  }
199  void CopyTruth(const BlamerBundle &other) {
201  truth_word_ = other.truth_word_;
202  truth_text_ = other.truth_text_;
204  (other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT);
205  }
206  void CopyResults(const BlamerBundle &other) {
216  if (other.lattice_data_ != nullptr) {
217  lattice_data_ = new char[other.lattice_size_];  // Deep copy: each bundle has its own buffer.
218  memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);
220  } else {
221  lattice_data_ = nullptr;
222  }
223  }
224  const char *IncorrectReason() const;
225 
226  // Appends choice and truth details to the given debug string.
227  void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
228  STRING *debug);
229 
230  // Sets up norm_truth_word_ from truth_word_ using the given DENORM.
231  void SetupNormTruthWord(const DENORM& denorm);
232 
233  // Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
234  // bundles) where the right edge of the left-hand word is word1_right,
235  // and the left edge of the right-hand word is word2_left.
236  void SplitBundle(int word1_right, int word2_left, bool debug,
237  BlamerBundle* bundle1, BlamerBundle* bundle2) const;
238  // "Joins" the blames from bundle1 and bundle2 into *this.
239  void JoinBlames(const BlamerBundle& bundle1, const BlamerBundle& bundle2,
240  bool debug);
241 
242  // If a blob with the same bounding box as one of the truth character
243  // bounding boxes is not classified as the corresponding truth character,
244  // blames the character classifier for the incorrect answer.
245  void BlameClassifier(const UNICHARSET& unicharset,
246  const TBOX& blob_box,
247  const BLOB_CHOICE_LIST& choices,
248  bool debug);
249 
250 
251  // Checks whether chops were made at all the character bounding box
252  // boundaries in word->truth_word. If not - blames the chopper for an
253  // incorrect answer.
254  void SetChopperBlame(const WERD_RES* word, bool debug);
255  // Blames the classifier or the language model if, after running only the
256  // chopper, best_choice is incorrect and no blame has yet been set.
257  // Blames the classifier if best_choice is classifier's top choice and is a
258  // dictionary word (i.e. language model could not have helped).
259  // Otherwise, blames the language model (formerly permuter word adjustment).
261  const WERD_RES* word,
262  const UNICHARSET& unicharset, bool valid_permuter, bool debug);
263  // Sets up the correct_segmentation_* to mark the correct bounding boxes.
264  void SetupCorrectSegmentation(const TWERD* word, bool debug);
265 
266  // Returns true if a guided segmentation search is needed.
267  bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
268  // Sets up to guide the segmentation search to the correct segmentation.
269  // The callback pp_cb is used to avoid a cyclic dependency.
270  // It calls into LMPainPoints::GenerateForBlamer by pre-binding the
271  // WERD_RES, and the LMPainPoints itself.
272  // pp_cb must be a permanent callback, and should be deleted by the caller.
273  void InitForSegSearch(const WERD_CHOICE *best_choice,
274  MATRIX* ratings, UNICHAR_ID wildcard_id,
275  bool debug, STRING *debug_str,
277  // Returns true if the guided segsearch is in progress.
278  bool GuidedSegsearchStillGoing() const;
279  // The segmentation search has ended. Sets the blame appropriately.
280  void FinishSegSearch(const WERD_CHOICE *best_choice,
281  bool debug, STRING *debug_str);
282 
283  // If the bundle is null or still does not indicate the correct result,
284  // fix it and use some backup reason for the blame.
285  static void LastChanceBlame(bool debug, WERD_RES* word);
286 
287  // Sets the misadaption debug if this word is incorrect, as this word is
288  // being adapted to.
289  void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);
290 
291  private:
292  // Copy assignment operator (currently unused, therefore private).
293  BlamerBundle& operator=(const BlamerBundle& other);
294  void SetBlame(IncorrectResultReason irr, const STRING &msg,
295  const WERD_CHOICE *choice, bool debug) {
298  debug_ += " to blame: ";
299  FillDebugString(msg, choice, &debug_);  // Appends msg/choice details to debug_.
300  if (debug) tprintf("SetBlame(): %s", debug_.string());
301  }
302 
303  private:
304  // Set to true when bounding boxes for individual unichars are recorded.
306  // The truth_word_ (in the original image coordinate space) contains ground
307  // truth bounding boxes for this WERD_RES.
309  // Same as above, but in normalized coordinates
310  // (filled in by WERD_RES::SetupForRecognition()).
312  // Tolerance for bounding box comparisons in normalized space.
314  // Contains ground truth unichar for each of the bounding boxes in truth_word_.
316  // The reason for incorrect OCR result.
317  IncorrectResultReason incorrect_result_reason_;
318  // Debug text associated with the blame.
320  // Misadaption debug information (filled in if this word was misadapted to).
322  // Variables used by the segmentation search when looking for the blame.
323  // Set to true while segmentation search is continued after the usual
324  // termination condition in order to look for the blame.
326  // Best rating for correctly segmented path
327  // (set and used by SegSearch when looking for blame).
329  // Vectors populated by SegSearch to indicate column and row indices that
330  // correspond to blobs with correct bounding boxes.
333  // Set to true if best choice is a dictionary word and
334  // classifier's top choice.
336  // Serialized segmentation search lattice.
338  int lattice_size_; // size of lattice_data_ in bytes
339  // Information about hypotheses (paths) explored by the segmentation search.
341 };
342 
343 
344 #endif // TESSERACT_CCSTRUCT_BLAMER_H_
void CopyResults(const BlamerBundle &other)
Definition: blamer.h:206
char * lattice_data_
Definition: blamer.h:337
Definition: params_training_featdef.h:106
void SetWordTruth(const UNICHARSET &unicharset, const char *truth_str, const TBOX &word_box)
Definition: blamer.cpp:71
STRING misadaption_debug_
Definition: blamer.h:321
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const
Definition: blamer.cpp:466
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:116
BlamerBundle()
Definition: blamer.h:102
int lattice_size_
Definition: blamer.h:338
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:118
tesseract::BoxWord norm_truth_word_
Definition: blamer.h:311
GenericVector< int > correct_segmentation_rows_
Definition: blamer.h:332
tesseract::ParamsTrainingBundle params_training_bundle_
Definition: blamer.h:340
float best_correctly_segmented_rating_
Definition: blamer.h:328
void set_lattice_data(const char *data, int size)
Definition: blamer.h:156
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
Definition: blamer.cpp:230
int col
Definition: matrix.h:633
void set_best_choice_is_dict_and_top_choice(bool value)
Definition: blamer.h:147
STRING TruthString() const
Definition: blamer.h:112
Definition: rect.h:34
bool truth_has_char_boxes_
Definition: blamer.h:305
Definition: unicharset.h:146
tesseract::BoxWord truth_word_
Definition: blamer.h:308
Definition: params_training_featdef.h:132
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:199
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, STRING *debug_str)
Definition: blamer.cpp:511
void SetupNormTruthWord(const DENORM &denorm)
Definition: blamer.cpp:150
~BlamerBundle()
Definition: blamer.h:109
Definition: matrix.h:575
int32_t length() const
Definition: strngs.cpp:191
int correct_segmentation_length() const
Definition: blamer.h:138
void clear()
Definition: genericvector.h:868
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:61
Definition: ratngs.h:273
const char * IncorrectReason() const
Definition: blamer.cpp:65
bool NoTruth() const
Definition: blamer.h:121
IncorrectResultReason incorrect_result_reason_
Definition: blamer.h:317
const char * string() const
Definition: strngs.cpp:196
void SetupCorrectSegmentation(const TWERD *word, bool debug)
Definition: blamer.cpp:412
Definition: blobs.h:402
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:579
bool best_choice_is_dict_and_top_choice_
Definition: blamer.h:335
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, bool debug, STRING *debug_str, TessResultCallback2< bool, int, int > *pp_cb)
Definition: blamer.cpp:478
Definition: blamer.h:100
void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box, const BLOB_CHOICE_LIST &choices, bool debug)
Definition: blamer.cpp:262
void UpdateBestRating(float rating)
Definition: blamer.h:134
bool HasDebugInfo() const
Definition: blamer.h:125
ParamsTrainingHypothesis & AddHypothesis(const ParamsTrainingHypothesis &other)
Definition: params_training_featdef.h:142
const STRING & debug() const
Definition: blamer.h:128
int lattice_size() const
Definition: blamer.h:153
void SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str, const TBOX &char_box)
Definition: blamer.cpp:91
void SetChopperBlame(const WERD_RES *word, bool debug)
Definition: blamer.cpp:315
GenericVector< STRING > truth_text_
Definition: blamer.h:315
void DeleteAllBoxes()
Definition: boxword.cpp:174
Definition: strngs.h:45
Definition: boxword.h:37
int norm_box_tolerance_
Definition: blamer.h:313
BlamerBundle & operator=(const BlamerBundle &other)
int row
Definition: matrix.h:634
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:166
Definition: normalis.h:50
Definition: pageres.h:169
void SetBlame(IncorrectResultReason irr, const STRING &msg, const WERD_CHOICE *choice, bool debug)
Definition: blamer.h:294
const STRING & misadaption_debug() const
Definition: blamer.h:131
void FillDebugString(const STRING &msg, const WERD_CHOICE *choice, STRING *debug)
Definition: blamer.cpp:128
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:552
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:143
void ClearResults()
Definition: blamer.h:185
GenericVector< int > correct_segmentation_cols_
Definition: blamer.h:331
Definition: matrix.h:605
void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1, BlamerBundle *bundle2) const
Definition: blamer.cpp:174
Definition: blamer.h:43
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, bool valid_permuter, bool debug)
Definition: blamer.cpp:374
int length() const
Definition: genericvector.h:85
const tesseract::ParamsTrainingBundle & params_training_bundle() const
Definition: blamer.h:162
const char * lattice_data() const
Definition: blamer.h:150
static const float kBadRating
Definition: ratngs.h:275
void SetRejectedTruth()
Definition: blamer.cpp:110
bool segsearch_is_looking_for_blame_
Definition: blamer.h:325
STRING debug_
Definition: blamer.h:319
BlamerBundle(const BlamerBundle &other)
Definition: blamer.h:105
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:506