tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
classify.h
1 // File: classify.h
3 // Description: classify class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_CLASSIFY_CLASSIFY_H_
20 #define TESSERACT_CLASSIFY_CLASSIFY_H_
21 
22 // Include automatically generated configuration file if running autoconf.
23 #ifdef HAVE_CONFIG_H
24 #include "config_auto.h"
25 #endif
26 
27 
28 #ifdef DISABLED_LEGACY_ENGINE
29 
30 #include "ccstruct.h"
31 #include "dict.h"
32 
33 namespace tesseract {
34 
// Reduced Classify variant compiled when DISABLED_LEGACY_ENGINE is defined.
// It derives from CCStruct and exposes only the dictionary accessor and the
// four parameters declared below.
35 class Classify : public CCStruct {
36  public:
37  Classify();
38  virtual ~Classify();
  // Returns a mutable reference to the dict_ member held by this object.
39  virtual Dict& getDict() {
40  return dict_;
41  }
42 
43  // Member variables.
44 
  // Parameter declarations: each macro takes (name, default value, help text).
45  INT_VAR_H(classify_debug_level, 0, "Classify debug level");
46 
47  BOOL_VAR_H(classify_bln_numeric_mode, 0,
48  "Assume the input is numbers [0-9].");
49 
50  double_VAR_H(classify_max_rating_ratio, 1.5,
51  "Veto ratio between classifier ratings");
52 
53  double_VAR_H(classify_max_certainty_margin, 5.5,
54  "Veto difference between classifier certainties");
55 
56  private:
  // Dictionary instance returned by getDict().
57  Dict dict_;
58 };
59 
60 } // namespace tesseract
61 
62 
63 #else // DISABLED_LEGACY_ENGINE not defined
64 
65 #include "adaptive.h"
66 #include "ccstruct.h"
67 #include "dict.h"
68 #include "featdefs.h"
69 #include "fontinfo.h"
70 #include "imagedata.h"
71 #include "intfx.h"
72 #include "intmatcher.h"
73 #include "normalis.h"
74 #include "ratngs.h"
75 #include "ocrfeatures.h"
76 #include "unicity_table.h"
77 
78 class ScrollView;
79 class WERD_CHOICE;
80 class WERD_RES;
81 struct ADAPT_RESULTS;
82 struct NORM_PROTOS;
83 
84 static const int kUnknownFontinfoId = -1;
85 static const int kBlankFontinfoId = -2;
86 
87 namespace tesseract {
88 
89 class ShapeClassifier;
90 struct ShapeRating;
91 class ShapeTable;
92 struct UnicharRating;
93 
94 // How segmented is a blob. In this enum, character refers to a classifiable
95 // unit, but that is too long and character is usually easier to understand.
97  CST_FRAGMENT, // A partial character.
98  CST_WHOLE, // A correctly segmented character.
99  CST_IMPROPER, // More than one but less than 2 characters.
100  CST_NGRAM // Multiple characters.
101 };
102 
103 class Classify : public CCStruct {
104  public:
105  Classify();
106  virtual ~Classify();
107  virtual Dict& getDict() {
108  return dict_;
109  }
110 
111  const ShapeTable* shape_table() const {
112  return shape_table_;
113  }
114 
115  // Takes ownership of the given classifier, and uses it for future calls
116  // to CharNormClassifier.
117  void SetStaticClassifier(ShapeClassifier* static_classifier);
118 
119  // Adds a noise classification result that is a bit worse than the worst
120  // current result, or the worst possible result if no current results.
121  void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices);
122 
123  // Returns true if the blob is small enough to be a large speckle.
124  bool LargeSpeckle(const TBLOB &blob);
125 
126  /* adaptive.cpp ************************************************************/
127  ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset);
128  int GetFontinfoId(ADAPT_CLASS Class, uint8_t ConfigId);
129  // Runs the class pruner from int_templates on the given features, returning
130  // the number of classes output in results.
131  // int_templates Class pruner tables
132  // num_features Number of features in blob
133  // features Array of features
134  // normalization_factors (input) Array of int_templates->NumClasses fudge
135  // factors from blob normalization process.
136  // (Indexed by CLASS_INDEX)
137  // expected_num_features (input) Array of int_templates->NumClasses
138  // expected number of features for each class.
139  // (Indexed by CLASS_INDEX)
140  // results (output) Sorted Array of pruned classes.
141  // Array must be sized to take the maximum possible
142  // number of outputs : int_templates->NumClasses.
143  int PruneClasses(const INT_TEMPLATES_STRUCT* int_templates, int num_features,
144  int keep_this, const INT_FEATURE_STRUCT* features,
145  const uint8_t* normalization_factors,
146  const uint16_t* expected_num_features,
148  void ReadNewCutoffs(TFile* fp, CLASS_CUTOFF_ARRAY Cutoffs);
149  void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
150  void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
152  /* normmatch.cpp ************************************************************/
153  float ComputeNormMatch(CLASS_ID ClassId,
154  const FEATURE_STRUCT& feature, bool DebugMatch);
155  void FreeNormProtos();
157  /* protos.cpp ***************************************************************/
158  void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class);
160  const UNICHARSET& target_unicharset);
161  /* adaptmatch.cpp ***********************************************************/
162 
163  // Learns the given word using its chopped_word, seam_array, denorm,
164  // box_word, best_state, and correct_text to learn both correctly and
165  // incorrectly segmented blobs. If fontname is not nullptr, then LearnBlob
166  // is called and the data will be saved in an internal buffer.
167  // Otherwise AdaptToBlob is called for adaption within a document.
168  void LearnWord(const char* fontname, WERD_RES* word);
169 
170  // Builds a blob of length fragments, from the word, starting at start,
171  // and then learns it, as having the given correct_text.
172  // If fontname is not nullptr, then LearnBlob is called and the data will be
173  // saved in an internal buffer for static training.
174  // Otherwise AdaptToBlob is called for adaption within a document.
175  // threshold is a magic number required by AdaptToChar and generated by
176  // ComputeAdaptionThresholds.
177  // Although it can be partly inferred from the string, segmentation is
178  // provided to explicitly clarify the character segmentation.
179  void LearnPieces(const char* fontname, int start, int length, float threshold,
180  CharSegmentationType segmentation, const char* correct_text,
181  WERD_RES* word);
183  void InitAdaptedClass(TBLOB *Blob,
184  CLASS_ID ClassId,
185  int FontinfoId,
186  ADAPT_CLASS Class,
187  ADAPT_TEMPLATES Templates);
188  void AmbigClassifier(const GenericVector<INT_FEATURE_STRUCT>& int_features,
189  const INT_FX_RESULT_STRUCT& fx_info,
190  const TBLOB *blob,
191  INT_TEMPLATES templates,
192  ADAPT_CLASS *classes,
193  UNICHAR_ID *ambiguities,
194  ADAPT_RESULTS *results);
195  void MasterMatcher(INT_TEMPLATES templates,
196  int16_t num_features,
197  const INT_FEATURE_STRUCT* features,
198  const uint8_t* norm_factors,
199  ADAPT_CLASS* classes,
200  int debug,
201  int matcher_multiplier,
202  const TBOX& blob_box,
203  const GenericVector<CP_RESULT_STRUCT>& results,
204  ADAPT_RESULTS* final_results);
205  // Converts configs to fonts, and if the result is not adapted, and a
206  // shape_table_ is present, the shape is expanded to include all
207  // unichar_ids represented, before applying a set of corrections to the
208  // distance rating in int_result, (see ComputeCorrectedRating.)
209  // The results are added to the final_results output.
211  bool debug,
212  int class_id,
213  int bottom, int top,
214  float cp_rating,
215  int blob_length,
216  int matcher_multiplier,
217  const uint8_t* cn_factors,
218  UnicharRating* int_result,
219  ADAPT_RESULTS* final_results);
220  // Applies a set of corrections to the distance im_rating,
221  // including the cn_correction, miss penalty and additional penalty
222  // for non-alnums being vertical misfits. Returns the corrected distance.
223  double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating,
224  double im_rating, int feature_misses,
225  int bottom, int top,
226  int blob_length, int matcher_multiplier,
227  const uint8_t* cn_factors);
228  void ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
229  ADAPT_RESULTS *Results,
230  BLOB_CHOICE_LIST *Choices);
231  void AddNewResult(const UnicharRating& new_result, ADAPT_RESULTS *results);
232  int GetAdaptiveFeatures(TBLOB *Blob,
233  INT_FEATURE_ARRAY IntFeatures,
234  FEATURE_SET *FloatFeatures);
235 
236 #ifndef GRAPHICS_DISABLED
237  void DebugAdaptiveClassifier(TBLOB *Blob,
238  ADAPT_RESULTS *Results);
239 #endif
240  PROTO_ID MakeNewTempProtos(FEATURE_SET Features,
241  int NumBadFeat,
242  FEATURE_ID BadFeat[],
243  INT_CLASS IClass,
244  ADAPT_CLASS Class,
245  BIT_VECTOR TempProtoMask);
247  CLASS_ID ClassId,
248  int FontinfoId,
249  int NumFeatures,
250  INT_FEATURE_ARRAY Features,
251  FEATURE_SET FloatFeatures);
252  void MakePermanent(ADAPT_TEMPLATES Templates,
253  CLASS_ID ClassId,
254  int ConfigId,
255  TBLOB *Blob);
256  void PrintAdaptiveMatchResults(const ADAPT_RESULTS& results);
257  void RemoveExtraPuncs(ADAPT_RESULTS *Results);
258  void RemoveBadMatches(ADAPT_RESULTS *Results);
259  void SetAdaptiveThreshold(float Threshold);
260  void ShowBestMatchFor(int shape_id,
261  const INT_FEATURE_STRUCT* features,
262  int num_features);
263  // Returns a string for the classifier class_id: either the corresponding
264  // unicharset debug_str or the shape_table_ debug str.
266  int class_id, int config_id) const;
267  // Converts a classifier class_id index with a config ID to:
268  // shape_table_ present: a shape_table_ index OR
269  // No shape_table_: a font ID.
270  // Without shape training, each class_id, config pair represents a single
271  // unichar id/font combination, so this function looks up the corresponding
272  // font id.
273  // With shape training, each class_id, config pair represents a single
274  // shape table index, so the fontset_table stores the shape table index,
275  // and the shape_table_ must be consulted to obtain the actual unichar_id/
276  // font combinations that the shape represents.
277  int ClassAndConfigIDToFontOrShapeID(int class_id,
278  int int_result_config) const;
279  // Converts a shape_table_ index to a classifier class_id index (not a
280  // unichar-id!). Uses a search, so not fast.
281  int ShapeIDToClassID(int shape_id) const;
282  UNICHAR_ID *BaselineClassifier(
283  TBLOB *Blob, const GenericVector<INT_FEATURE_STRUCT>& int_features,
284  const INT_FX_RESULT_STRUCT& fx_info,
285  ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results);
286  int CharNormClassifier(TBLOB *blob,
287  const TrainingSample& sample,
288  ADAPT_RESULTS *adapt_results);
289 
290  // As CharNormClassifier, but operates on a TrainingSample and outputs to
291  // a GenericVector of ShapeRating without conversion to classes.
292  int CharNormTrainingSample(bool pruner_only, int keep_this,
293  const TrainingSample& sample,
295  UNICHAR_ID *GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass);
296  void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results);
297  void AdaptToChar(TBLOB* Blob, CLASS_ID ClassId, int FontinfoId,
298  float Threshold, ADAPT_TEMPLATES adaptive_templates);
299  void DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class);
300  bool AdaptableWord(WERD_RES* word);
301  void EndAdaptiveClassifier();
302  void SettupPass1();
303  void SettupPass2();
304  void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices);
305  void ClassifyAsNoise(ADAPT_RESULTS *Results);
309 
310  int GetCharNormFeature(const INT_FX_RESULT_STRUCT& fx_info,
311  INT_TEMPLATES templates,
312  uint8_t* pruner_norm_array,
313  uint8_t* char_norm_array);
314  // Computes the char_norm_array for the unicharset and, if not nullptr, the
315  // pruner_array as appropriate according to the existence of the shape_table.
316  // The norm_feature is deleted as it is almost certainly no longer needed.
317  void ComputeCharNormArrays(FEATURE_STRUCT* norm_feature,
318  INT_TEMPLATES_STRUCT* templates,
319  uint8_t* char_norm_array,
320  uint8_t* pruner_array);
321 
322  bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config);
323  void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob);
324 
325  bool AdaptiveClassifierIsFull() const { return NumAdaptationsFailed > 0; }
327  return AdaptedTemplates->NumPermClasses == 0;
328  }
329  bool LooksLikeGarbage(TBLOB *blob);
330  void RefreshDebugWindow(ScrollView **win, const char *msg,
331  int y_offset, const TBOX &wbox);
332  // intfx.cpp
333  // Computes the DENORMS for bl(baseline) and cn(character) normalization
334  // during feature extraction. The input denorm describes the current state
335  // of the blob, which is usually a baseline-normalized word.
336  // The Transforms setup are as follows:
337  // Baseline Normalized (bl) Output:
338  // We center the grapheme by aligning the x-coordinate of its centroid with
339  // x=128 and leaving the already-baseline-normalized y as-is.
340  //
341  // Character Normalized (cn) Output:
342  // We align the grapheme's centroid at the origin and scale it
343  // asymmetrically in x and y so that the 2nd moments are a standard value
344  // (51.2), i.e. the result is vaguely square.
345  // If classify_nonlinear_norm is true:
346  // A non-linear normalization is setup that attempts to evenly distribute
347  // edges across x and y.
348  //
349  // Some of the fields of fx_info are also setup:
350  // Length: Total length of outline.
351  // Rx: Rounded y second moment. (Reversed by convention.)
352  // Ry: Rounded x second moment.
353  // Xmean: Rounded x center of mass of the blob.
354  // Ymean: Rounded y center of mass of the blob.
355  static void SetupBLCNDenorms(const TBLOB& blob, bool nonlinear_norm,
356  DENORM* bl_denorm, DENORM* cn_denorm,
357  INT_FX_RESULT_STRUCT* fx_info);
358 
359  // Extracts sets of 3-D features of length kStandardFeatureLength (=12.8), as
360  // (x,y) position and angle as measured counterclockwise from the vector
361  // <-1, 0>, from blob using two normalizations defined by bl_denorm and
362  // cn_denorm. See SetupBLCNDenorms for definitions.
363  // If outline_cn_counts is not nullptr, on return it contains the cumulative
364  // number of cn features generated for each outline in the blob (in order).
365  // Thus after the first outline, there were (*outline_cn_counts)[0] features,
366  // after the second outline, there were (*outline_cn_counts)[1] features etc.
367  static void ExtractFeatures(const TBLOB& blob,
368  bool nonlinear_norm,
371  INT_FX_RESULT_STRUCT* results,
372  GenericVector<int>* outline_cn_counts);
373  /* float2int.cpp ************************************************************/
374  void ClearCharNormArray(uint8_t* char_norm_array);
375  void ComputeIntCharNormArray(const FEATURE_STRUCT& norm_feature,
376  uint8_t* char_norm_array);
377  void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);
378  /* intproto.cpp *************************************************************/
380  void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
381  const UNICHARSET& target_unicharset);
382  CLASS_ID GetClassToDebug(const char *Prompt, bool* adaptive_on,
383  bool* pretrained_on, int* shape_id);
384  void ShowMatchDisplay();
385  /* font detection ***********************************************************/
387  return fontinfo_table_;
388  }
390  return fontinfo_table_;
391  }
393  return fontset_table_;
394  }
395  /* mfoutline.cpp ***********************************************************/
396  void NormalizeOutlines(LIST Outlines, float *XScale, float *YScale);
397  /* outfeat.cpp ***********************************************************/
399  /* picofeat.cpp ***********************************************************/
402  const INT_FX_RESULT_STRUCT& fx_info);
404  const INT_FX_RESULT_STRUCT& fx_info);
405  /* blobclass.cpp ***********************************************************/
406  // Extracts features from the given blob and saves them in the tr_file_data_
407  // member variable.
408  // fontname: Name of font that this blob was printed in.
409  // cn_denorm: Character normalization transformation to apply to the blob.
410  // fx_info: Character normalization parameters computed with cn_denorm.
411  // blob_text: Ground truth text for the blob.
412  void LearnBlob(const STRING& fontname, TBLOB* Blob, const DENORM& cn_denorm,
413  const INT_FX_RESULT_STRUCT& fx_info, const char* blob_text);
414  // Writes stored training data to a .tr file based on the given filename.
415  // Returns false on error.
416  bool WriteTRFile(const STRING& filename);
417 
418  // Member variables.
419 
420  // Parameters.
421  // Set during training (in lang.config) to indicate whether the divisible
422  // blobs chopper should be used (true for latin script.)
423  BOOL_VAR_H(allow_blob_division, true, "Use divisible blobs chopping");
424  // Set during training (in lang.config) to indicate whether the divisible
425  // blobs chopper should be used in preference to chopping. Set to true for
426  // southern Indic scripts.
427  BOOL_VAR_H(prioritize_division, FALSE,
428  "Prioritize blob division over chopping");
429  INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP");
430  BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier");
431  INT_VAR_H(classify_debug_level, 0, "Classify debug level");
432 
433  /* mfoutline.cpp ***********************************************************/
434  /* control knobs used to control normalization of outlines */
435  INT_VAR_H(classify_norm_method, character, "Normalization Method ...");
436  double_VAR_H(classify_char_norm_range, 0.2,
437  "Character Normalization Range ...");
438  double_VAR_H(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...");
439  double_VAR_H(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...");
440  double_VAR_H(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...");
441  double_VAR_H(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...");
442  double_VAR_H(classify_max_rating_ratio, 1.5,
443  "Veto ratio between classifier ratings");
444  double_VAR_H(classify_max_certainty_margin, 5.5,
445  "Veto difference between classifier certainties");
446 
447  /* adaptmatch.cpp ***********************************************************/
448  BOOL_VAR_H(tess_cn_matching, 0, "Character Normalized Matching");
449  BOOL_VAR_H(tess_bn_matching, 0, "Baseline Normalized Matching");
450  BOOL_VAR_H(classify_enable_adaptive_matcher, 1, "Enable adaptive classifier");
451  BOOL_VAR_H(classify_use_pre_adapted_templates, 0,
452  "Use pre-adapted classifier templates");
453  BOOL_VAR_H(classify_save_adapted_templates, 0,
454  "Save adapted templates to a file");
455  BOOL_VAR_H(classify_enable_adaptive_debugger, 0, "Enable match debugger");
456  BOOL_VAR_H(classify_nonlinear_norm, 0,
457  "Non-linear stroke-density normalization");
458  INT_VAR_H(matcher_debug_level, 0, "Matcher Debug Level");
459  INT_VAR_H(matcher_debug_flags, 0, "Matcher Debug Flags");
460  INT_VAR_H(classify_learning_debug_level, 0, "Learning Debug Level: ");
461  double_VAR_H(matcher_good_threshold, 0.125, "Good Match (0-1)");
462  double_VAR_H(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)");
463  double_VAR_H(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)");
464  double_VAR_H(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)");
465  double_VAR_H(matcher_rating_margin, 0.1, "New template margin (0-1)");
466  double_VAR_H(matcher_avg_noise_size, 12.0, "Avg. noise blob length: ");
467  INT_VAR_H(matcher_permanent_classes_min, 1, "Min # of permanent classes");
468  INT_VAR_H(matcher_min_examples_for_prototyping, 3,
469  "Reliable Config Threshold");
470  INT_VAR_H(matcher_sufficient_examples_for_prototyping, 5,
471  "Enable adaption even if the ambiguities have not been seen");
472  double_VAR_H(matcher_clustering_max_angle_delta, 0.015,
473  "Maximum angle delta for prototype clustering");
474  double_VAR_H(classify_misfit_junk_penalty, 0.0,
475  "Penalty to apply when a non-alnum is vertically out of "
476  "its expected textline position");
477  double_VAR_H(rating_scale, 1.5, "Rating scaling factor");
478  double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
479  double_VAR_H(tessedit_class_miss_scale, 0.00390625,
480  "Scale factor for features not used");
481  double_VAR_H(classify_adapted_pruning_factor, 2.5,
482  "Prune poor adapted results this much worse than best result");
483  double_VAR_H(classify_adapted_pruning_threshold, -1.0,
484  "Threshold at which classify_adapted_pruning_factor starts");
485  INT_VAR_H(classify_adapt_proto_threshold, 230,
486  "Threshold for good protos during adaptive 0-255");
487  INT_VAR_H(classify_adapt_feature_threshold, 230,
488  "Threshold for good features during adaptive 0-255");
489  BOOL_VAR_H(disable_character_fragments, TRUE,
490  "Do not include character fragments in the"
491  " results of the classifier");
492  double_VAR_H(classify_character_fragments_garbage_certainty_threshold, -3.0,
493  "Exclude fragments that do not match any whole character"
494  " with at least this certainty");
495  BOOL_VAR_H(classify_debug_character_fragments, FALSE,
496  "Bring up graphical debugging windows for fragments training");
497  BOOL_VAR_H(matcher_debug_separate_windows, FALSE,
498  "Use two different windows for debugging the matching: "
499  "One for the protos and one for the features.");
500  STRING_VAR_H(classify_learn_debug_str, "", "Class str to debug learning");
501 
502  /* intmatcher.cpp **********************************************************/
503  INT_VAR_H(classify_class_pruner_threshold, 229,
504  "Class Pruner Threshold 0-255");
505  INT_VAR_H(classify_class_pruner_multiplier, 15,
506  "Class Pruner Multiplier 0-255: ");
507  INT_VAR_H(classify_cp_cutoff_strength, 7,
508  "Class Pruner CutoffStrength: ");
509  INT_VAR_H(classify_integer_matcher_multiplier, 10,
510  "Integer Matcher Multiplier 0-255: ");
511 
512  // Use class variables to hold onto built-in templates and adapted templates.
515  // The backup adapted templates are created from the previous page (only)
516  // so they are always ready and reasonably well trained if the primary
517  // adapted templates become full.
519 
520  // Create dummy proto and config masks for use with the built-in templates.
521  BIT_VECTOR AllProtosOn;
522  BIT_VECTOR AllConfigsOn;
523  BIT_VECTOR AllConfigsOff;
524  BIT_VECTOR TempProtoMask;
526  /* normmatch.cpp */
528  /* font detection ***********************************************************/
530  // Without shape training, each class_id, config pair represents a single
531  // unichar id/font combination, so each fontset_table_ entry holds font ids
532  // for each config in the class.
533  // With shape training, each class_id, config pair represents a single
534  // shape_table_ index, so the fontset_table_ stores the shape_table_ index,
535  // and the shape_table_ must be consulted to obtain the actual unichar_id/
536  // font combinations that the shape represents.
538 
539  INT_VAR_H(il1_adaption_test, 0, "Don't adapt to i/I at beginning of word");
540  BOOL_VAR_H(classify_bln_numeric_mode, 0,
541  "Assume the input is numbers [0-9].");
542  double_VAR_H(speckle_large_max_size, 0.30, "Max large speckle size");
543  double_VAR_H(speckle_rating_penalty, 10.0,
544  "Penalty to add to worst rating for noise");
545 
546  protected:
549  // If a shape_table_ is present, it is used to remap classifier output in
550  // ExpandShapesAndApplyCorrections. font_ids referenced by configs actually
551  // mean an index to the shape_table_ and the choices returned are *all* the
552  // shape_table_ entries at that index.
554 
555  private:
557  // The currently active static classifier.
559 
560  /* variables used to hold performance statistics */
562 
563  // Training data gathered here for all the images in a document.
565 
566  // Expected number of features in the class pruner, used to penalize
567  // unknowns that have too few features (like a c being classified as e) so
568  // it doesn't recognize everything as '@' or '#'.
569  // CharNormCutoffs is for the static classifier (with no shapetable).
570  // BaselineCutoffs gets a copy of CharNormCutoffs as an estimate of the real
571  // value in the adaptive classifier. Both are indexed by unichar_id.
572  // shapetable_cutoffs_ provides a similar value for each shape in the
573  // shape_table_
574  uint16_t CharNormCutoffs[MAX_NUM_CLASSES];
575  uint16_t BaselineCutoffs[MAX_NUM_CLASSES];
580 };
581 } // namespace tesseract
582 
583 #endif // DISABLED_LEGACY_ENGINE
584 
585 #endif // TESSERACT_CLASSIFY_CLASSIFY_H_
double_VAR_H(classify_char_norm_range, 0.2, "Character Normalization Range ...")
void ReadNewCutoffs(TFile *fp, CLASS_CUTOFF_ARRAY Cutoffs)
Definition: cutoffs.cpp:46
STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
Definition: adaptmatch.cpp:2199
Definition: classify.h:99
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:529
static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
Definition: intfx.cpp:444
bool LooksLikeGarbage(TBLOB *blob)
Definition: adaptmatch.cpp:1637
void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob)
Definition: adaptmatch.cpp:2278
BIT_VECTOR AllProtosOn
Definition: classify.h:521
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:614
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:513
Definition: adaptive.h:39
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config)
Definition: adaptmatch.cpp:2241
int NumAdaptationsFailed
Definition: classify.h:561
PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
Definition: adaptmatch.cpp:1839
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:518
uint8_t NumPermClasses
Definition: adaptive.h:78
BIT_VECTOR TempProtoMask
Definition: classify.h:524
Classify()
Definition: classify.cpp:60
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:1030
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:630
int CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
Definition: adaptmatch.cpp:1333
GenericVector< uint16_t > shapetable_cutoffs_
Definition: classify.h:576
double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
Definition: adaptmatch.cpp:1206
Definition: intproto.h:118
NORM_PROTOS * ReadNormProtos(TFile *fp)
Definition: normmatch.cpp:235
INT_TEMPLATES ReadIntTemplates(TFile *fp)
Definition: intproto.cpp:728
virtual Dict & getDict()
Definition: classify.h:107
UNICHAR_ID * BaselineClassifier(TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
Definition: adaptmatch.cpp:1269
int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
Definition: adaptmatch.cpp:1315
ADAPT_TEMPLATES ReadAdaptedTemplates(TFile *File)
Definition: adaptive.cpp:333
bool LargeSpeckle(const TBLOB &blob)
Definition: classify.cpp:256
const ShapeTable * shape_table() const
Definition: classify.h:111
void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
Definition: float2int.cpp:62
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class)
Definition: intproto.cpp:496
Definition: cluster.h:32
ScrollView * learn_fragments_debug_win_
Definition: classify.h:579
float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
Definition: normmatch.cpp:83
void SetStaticClassifier(ShapeClassifier *static_classifier)
Definition: classify.cpp:225
Definition: classify.h:103
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:514
Definition: rect.h:34
void SettupPass2()
Definition: adaptmatch.cpp:670
Definition: unicharset.h:146
INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP")
Definition: classify.h:98
uint16_t BaselineCutoffs[MAX_NUM_CLASSES]
Definition: classify.h:575
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:192
Definition: intfx.h:35
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:599
Definition: intmatcher.h:83
int GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
Definition: adaptmatch.cpp:1682
void SetAdaptiveThreshold(float Threshold)
Definition: adaptmatch.cpp:2146
Definition: serialis.h:77
FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob)
Definition: outfeat.cpp:42
Definition: adaptive.h:74
Definition: baseapi.cpp:94
void ClassifyAsNoise(ADAPT_RESULTS *Results)
Definition: adaptmatch.cpp:1403
UnicityTable< FontSet > fontset_table_
Definition: classify.h:537
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:548
void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
Definition: adaptmatch.cpp:694
UnicityTable< FontSet > & get_fontset_table()
Definition: classify.h:392
void MakePermanent(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
Definition: adaptmatch.cpp:1925
Definition: classify.h:97
Definition: ratngs.h:273
BIT_VECTOR AllConfigsOff
Definition: classify.h:523
void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class)
Definition: adaptmatch.cpp:950
void FreeNormProtos()
Definition: normmatch.cpp:157
Definition: ocrfeatures.h:60
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:245
void LearnPieces(const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
Definition: adaptmatch.cpp:375
Definition: protos.h:42
Definition: fileio.h:29
void ExpandShapesAndApplyCorrections(ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
Definition: adaptmatch.cpp:1132
Definition: classify.h:100
Definition: ccstruct.h:25
ScrollView * learn_debug_win_
Definition: classify.h:577
uint16_t CharNormCutoffs[MAX_NUM_CLASSES]
Definition: classify.h:574
Definition: intproto.h:105
void MasterMatcher(INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
Definition: adaptmatch.cpp:1092
FEATURE_SET ExtractIntGeoFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:249
void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates)
Definition: adaptmatch.cpp:857
bool EnableLearning
Definition: classify.h:525
INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:535
int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
Definition: adaptmatch.cpp:1744
void DebugAdaptiveClassifier(TBLOB *Blob, ADAPT_RESULTS *Results)
Definition: adaptmatch.cpp:1501
ShapeClassifier * static_classifier_
Definition: classify.h:558
NORM_PROTOS * NormProtos
Definition: classify.h:527
void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices)
Definition: classify.cpp:233
Definition: dict.h:88
bool WriteTRFile(const STRING &filename)
Definition: blobclass.cpp:102
Definition: scrollview.h:102
void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
Definition: adaptmatch.cpp:227
Definition: adaptive.h:62
Definition: fontinfo.h:30
FEATURE_SET ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:219
int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
Definition: intmatcher.cpp:409
Definition: normmatch.cpp:35
void NormalizeOutlines(LIST Outlines, float *XScale, float *YScale)
Definition: mfoutline.cpp:284
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:460
Definition: featdefs.h:46
void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:1417
void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
Definition: adaptmatch.cpp:1702
void ClearCharNormArray(uint8_t *char_norm_array)
Definition: float2int.cpp:44
Definition: tessdatamanager.h:126
STRING_VAR_H(classify_learn_debug_str, "", "Class str to debug learning")
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:386
int GetFontinfoId(ADAPT_CLASS Class, uint8_t ConfigId)
Definition: adaptive.cpp:174
void RemoveExtraPuncs(ADAPT_RESULTS *Results)
Definition: adaptmatch.cpp:2098
STRING tr_file_data_
Definition: classify.h:564
Definition: strngs.h:45
IntegerMatcher im_
Definition: classify.h:547
BIT_VECTOR AllConfigsOn
Definition: classify.h:522
Definition: shapetable.h:262
void PrintAdaptiveMatchResults(const ADAPT_RESULTS &results)
Definition: adaptmatch.cpp:2018
CharSegmentationType
Definition: classify.h:96
Definition: normalis.h:50
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
Definition: float2int.cpp:90
UNICHAR_ID * GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass)
Definition: adaptmatch.cpp:1596
Definition: protos.h:53
Definition: pageres.h:169
Definition: shapeclassifier.h:43
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:326
Dict dict_
Definition: classify.h:556
Definition: ocrfeatures.h:66
void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results)
Definition: adaptmatch.cpp:998
Definition: oldlist.h:124
int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
Definition: adaptmatch.cpp:787
void SettupPass1()
Definition: adaptmatch.cpp:653
virtual ~Classify()
Definition: classify.cpp:215
Definition: intproto.h:132
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:251
void ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
Definition: adaptmatch.cpp:2164
void ShowMatchDisplay()
Definition: intproto.cpp:973
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:454
Definition: blobs.h:268
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:823
int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const
Definition: adaptmatch.cpp:2212
void RemoveBadMatches(ADAPT_RESULTS *Results)
Definition: adaptmatch.cpp:2038
Definition: trainingsample.h:53
void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results)
Definition: adaptmatch.cpp:1534
void InitAdaptiveClassifier(TessdataManager *mgr)
Definition: adaptmatch.cpp:528
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset)
Definition: adaptive.cpp:152
ShapeTable * shape_table_
Definition: classify.h:553
void LearnBlob(const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
Definition: blobclass.cpp:74
Definition: adaptmatch.cpp:92
void AmbigClassifier(const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
Definition: adaptmatch.cpp:1049
bool AdaptiveClassifierIsFull() const
Definition: classify.h:325
Definition: shapetable.h:41
int ShapeIDToClassID(int shape_id) const
Definition: adaptmatch.cpp:2225
static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
Definition: intfx.cpp:131
ScrollView * learn_fragmented_word_debug_win_
Definition: classify.h:578
FEATURE_SET ExtractPicoFeatures(TBLOB *Blob)
Definition: picofeat.cpp:64
const UnicityTable< FontInfo > & get_fontinfo_table() const
Definition: classify.h:389
BOOL_VAR_H(allow_blob_division, true, "Use divisible blobs chopping")
CLASS_ID GetClassToDebug(const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
Definition: intproto.cpp:1274