tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
blobbox.h
1 /**********************************************************************
2  * File: blobbox.h (Formerly blobnbox.h)
3  * Description: Code for the textord blob class.
4  * Author: Ray Smith
5  * Created: Thu Jul 30 09:08:51 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifndef BLOBBOX_H
21 #define BLOBBOX_H
22 
23 #include <cinttypes> // for PRId32
24 #include <cmath> // for sqrt
25 #include <cstdint> // for int16_t, int32_t
26 #include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
27 #include "elst2.h" // for ELIST2_ITERATOR, ELIST2IZEH, ELIST2_LINK
28 #include "errcode.h" // for ASSERT_HOST
29 #include "ocrblock.h" // for BLOCK
30 #include "params.h" // for DoubleParam, double_VAR_H
31 #include "pdblock.h" // for PDBLK
32 #include "points.h" // for FCOORD, ICOORD, ICOORDELT_LIST
33 #include "quspline.h" // for QSPLINE
34 #include "rect.h" // for TBOX
35 #include "scrollview.h" // for ScrollView, ScrollView::Color
36 #include "statistc.h" // for STATS
37 #include "stepblob.h" // for C_BLOB
38 #include "tprintf.h" // for tprintf
39 #include "werd.h" // for WERD_LIST
40 
41 class C_OUTLINE;
42 
43 struct Pix;
44 
// Outcome of a fixed-pitch vs. proportional pitch decision.
// Enumerator order (and therefore numeric value) is unchanged.
enum PITCH_TYPE {
  PITCH_DUNNO,        // insufficient data
  PITCH_DEF_FIXED,    // definitely fixed
  PITCH_MAYBE_FIXED,  // could be
  PITCH_DEF_PROP,     // NOTE(review): presumably "definitely proportional" - confirm
  PITCH_MAYBE_PROP,   // NOTE(review): presumably "could be proportional" - confirm
  PITCH_CORR_FIXED,   // NOTE(review): meaning not documented here - confirm
  PITCH_CORR_PROP     // NOTE(review): meaning not documented here - confirm
};
55 
// Tab-stop classification for one side of a BLOBNBOX.
// The numeric ordering is significant: it is used for deleting dead-ends in
// the search, so ALIGNED, CONFIRMED and VLINE must remain greater than the
// non-aligned, unset, or deleted members.
enum TabType {
  TT_NONE,           // Not a tab.
  TT_DELETED,        // Not a tab after detailed analysis.
  TT_MAYBE_RAGGED,   // Initial designation of a tab-stop candidate.
  TT_MAYBE_ALIGNED,  // Initial designation of a tab-stop candidate.
  TT_CONFIRMED,      // Aligned with neighbours.
  TT_VLINE           // Detected as a vertical line.
};
68 
// Region classification of a BLOBNBOX.
// Ordering contract: every text type is greater than BRT_UNKNOWN and every
// image type is less than it.
// Keep in sync with kBlobTypes in colpartition.cpp, with BoxColor, and with
// the *Type static predicates on BLOBNBOX.
enum BlobRegionType {
  BRT_NOISE,      // Neither text nor image.
  BRT_HLINE,      // Horizontal separator line.
  BRT_VLINE,      // Vertical separator line.
  BRT_RECTIMAGE,  // Rectangular image.
  BRT_POLYIMAGE,  // Non-rectangular image.
  BRT_UNKNOWN,    // Not determined yet.
  BRT_VERT_TEXT,  // Vertical alignment, not necessarily vertically oriented.
  BRT_TEXT,       // Convincing text.

  BRT_COUNT       // Number of possibilities.
};
85 
// Index into arrays that hold one entry per neighbour direction.
// NOTE: keep in this order. Opposite directions differ by exactly 2
// (LEFT=0/RIGHT=2, BELOW=1/ABOVE=3), so `dir ^ 2` flips a direction.
enum BlobNeighbourDir {
  BND_LEFT,
  BND_BELOW,
  BND_RIGHT,
  BND_ABOVE,
  BND_COUNT  // Number of directions; not itself a direction.
};
95 
// Special kinds of text character, such as math symbols or italics.
enum BlobSpecialTextType {
  BSTT_NONE,     // No special.
  BSTT_ITALIC,   // Italic style.
  BSTT_DIGIT,    // Digit symbols.
  BSTT_MATH,     // Mathematical symbols (not including digits).
  BSTT_UNCLEAR,  // Characters with low recognition rate.
  BSTT_SKIP,     // Characters that we skip labeling (usually too small).
  BSTT_COUNT     // Number of special text types.
};
106 
107 inline BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir) {
108  return static_cast<BlobNeighbourDir>(dir ^ 2);
109 }
110 
// BlobTextFlowType indicates the quality of neighbouring information
// related to a chain of connected components, either horizontally or
// vertically. Also used by ColPartition for the collection of blobs
// within, which should all have the same value in most cases.
// The enumerator order is significant: DominatesInMerge compares values.
enum BlobTextFlowType {
  BTFT_NONE,           // No text flow set yet.
  BTFT_NONTEXT,        // Flow too poor to be likely text.
  BTFT_NEIGHBOURS,     // Neighbours support flow in this direction.
  BTFT_CHAIN,          // There is a weak chain of text in this direction.
  BTFT_STRONG_CHAIN,   // There is a strong chain of text in this direction.
  BTFT_TEXT_ON_IMAGE,  // There is a strong chain of text on an image.
  BTFT_LEADER,         // Leader dots/dashes etc.
  BTFT_COUNT           // Number of flow types.
};
125 
126 // Returns true if type1 dominates type2 in a merge. Mostly determined by the
127 // ordering of the enum, LEADER is weak and dominates nothing.
128 // The function is anti-symmetric (t1 > t2) === !(t2 > t1), except that
129 // this cannot be true if t1 == t2, so the result is undefined.
130 inline bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2) {
131  // LEADER always loses.
132  if (type1 == BTFT_LEADER) return false;
133  if (type2 == BTFT_LEADER) return true;
134  // With those out of the way, the ordering of the enum determines the result.
135  return type1 >= type2;
136 }
137 
138 namespace tesseract {
139 class ColPartition;
140 }
141 
142 class BLOBNBOX;
143 ELISTIZEH (BLOBNBOX)
144 class BLOBNBOX:public ELIST_LINK
145 {
146  public:
149  }
150  explicit BLOBNBOX(C_BLOB *srcblob) {
151  box = srcblob->bounding_box();
153  cblob_ptr = srcblob;
154  area = static_cast<int>(srcblob->area());
155  }
157  if (owns_cblob_) delete cblob_ptr;
158  }
159  static BLOBNBOX* RealBlob(C_OUTLINE* outline) {
160  C_BLOB* blob = new C_BLOB(outline);
161  return new BLOBNBOX(blob);
162  }
163 
164  // Rotates the box and the underlying blob.
165  void rotate(FCOORD rotation);
166 
167  // Methods that act on the box without touching the underlying blob.
168  // Reflect the box in the y-axis, leaving the underlying blob untouched.
169  void reflect_box_in_y_axis();
170  // Rotates the box by the angle given by rotation.
171  // If the blob is a diacritic, then only small rotations for skew
172  // correction can be applied.
173  void rotate_box(FCOORD rotation);
174  // Moves just the box by the given vector.
176  if (IsDiacritic()) {
177  box.move(v);
178  base_char_top_ += v.y();
179  base_char_bottom_ += v.y();
180  } else {
181  box.move(v);
183  }
184  }
185  void merge(BLOBNBOX *nextblob);
186  void really_merge(BLOBNBOX* other);
187  void chop( // fake chop blob
188  BLOBNBOX_IT *start_it, // location of this
189  BLOBNBOX_IT *blob_it, // iterator
190  FCOORD rotation, // for landscape
191  float xheight); // line height
192 
193  void NeighbourGaps(int gaps[BND_COUNT]) const;
194  void MinMaxGapsClipped(int* h_min, int* h_max,
195  int* v_min, int* v_max) const;
196  void CleanNeighbours();
197  // Returns positive if there is at least one side neighbour that has a
198  // similar stroke width and is not on the other side of a rule line.
199  int GoodTextBlob() const;
200  // Returns the number of side neighbours that are of type BRT_NOISE.
201  int NoisyNeighbours() const;
202 
203  // Returns true if the blob is noise and has no owner.
204  bool DeletableNoise() const {
205  return owner() == nullptr && region_type() == BRT_NOISE;
206  }
207 
208  // Returns true, and sets vert_possible/horz_possible if the blob has some
209  // feature that makes it individually appear to flow one way.
210  // eg if it has a high aspect ratio, yet has a complex shape, such as a
211  // joined word in Latin, Arabic, or Hindi, rather than being a -, I, l, 1.
212  bool DefiniteIndividualFlow();
213 
214  // Returns true if there is no tabstop violation in merging this and other.
215  bool ConfirmNoTabViolation(const BLOBNBOX& other) const;
216 
217  // Returns true if other has a similar stroke width to this.
218  bool MatchingStrokeWidth(const BLOBNBOX& other,
219  double fractional_tolerance,
220  double constant_tolerance) const;
221 
222  // Returns a bounding box of the outline contained within the
223  // given horizontal range.
224  TBOX BoundsWithinLimits(int left, int right);
225 
226  // Estimates and stores the baseline position based on the shape of the
227  // outline.
229 
230  // Simple accessors.
231  const TBOX& bounding_box() const {
232  return box;
233  }
234  // Set the bounding box. Use with caution.
235  // Normally use compute_bounding_box instead.
236  void set_bounding_box(const TBOX& new_box) {
237  box = new_box;
238  base_char_top_ = box.top();
240  }
243  base_char_top_ = box.top();
245  baseline_y_ = box.bottom();
246  }
247  const TBOX& reduced_box() const {
248  return red_box;
249  }
250  void set_reduced_box(TBOX new_box) {
251  red_box = new_box;
252  reduced = true;
253  }
254  int32_t enclosed_area() const {
255  return area;
256  }
257  bool joined_to_prev() const {
258  return joined != 0;
259  }
260  bool red_box_set() const {
261  return reduced != 0;
262  }
263  int repeated_set() const {
264  return repeated_set_;
265  }
266  void set_repeated_set(int set_id) {
267  repeated_set_ = set_id;
268  }
269  C_BLOB *cblob() const {
270  return cblob_ptr;
271  }
272  TabType left_tab_type() const {
273  return left_tab_type_;
274  }
275  void set_left_tab_type(TabType new_type) {
276  left_tab_type_ = new_type;
277  }
278  TabType right_tab_type() const {
279  return right_tab_type_;
280  }
281  void set_right_tab_type(TabType new_type) {
282  right_tab_type_ = new_type;
283  }
284  BlobRegionType region_type() const {
285  return region_type_;
286  }
287  void set_region_type(BlobRegionType new_type) {
288  region_type_ = new_type;
289  }
290  BlobSpecialTextType special_text_type() const {
291  return spt_type_;
292  }
293  void set_special_text_type(BlobSpecialTextType new_type) {
294  spt_type_ = new_type;
295  }
296  BlobTextFlowType flow() const {
297  return flow_;
298  }
299  void set_flow(BlobTextFlowType value) {
300  flow_ = value;
301  }
302  bool vert_possible() const {
303  return vert_possible_;
304  }
305  void set_vert_possible(bool value) {
306  vert_possible_ = value;
307  }
308  bool horz_possible() const {
309  return horz_possible_;
310  }
311  void set_horz_possible(bool value) {
312  horz_possible_ = value;
313  }
314  int left_rule() const {
315  return left_rule_;
316  }
317  void set_left_rule(int new_left) {
318  left_rule_ = new_left;
319  }
320  int right_rule() const {
321  return right_rule_;
322  }
323  void set_right_rule(int new_right) {
324  right_rule_ = new_right;
325  }
326  int left_crossing_rule() const {
327  return left_crossing_rule_;
328  }
329  void set_left_crossing_rule(int new_left) {
330  left_crossing_rule_ = new_left;
331  }
332  int right_crossing_rule() const {
333  return right_crossing_rule_;
334  }
335  void set_right_crossing_rule(int new_right) {
336  right_crossing_rule_ = new_right;
337  }
338  float horz_stroke_width() const {
339  return horz_stroke_width_;
340  }
341  void set_horz_stroke_width(float width) {
342  horz_stroke_width_ = width;
343  }
344  float vert_stroke_width() const {
345  return vert_stroke_width_;
346  }
347  void set_vert_stroke_width(float width) {
348  vert_stroke_width_ = width;
349  }
350  float area_stroke_width() const {
351  return area_stroke_width_;
352  }
354  return owner_;
355  }
357  owner_ = new_owner;
358  }
359  bool leader_on_left() const {
360  return leader_on_left_;
361  }
362  void set_leader_on_left(bool flag) {
363  leader_on_left_ = flag;
364  }
365  bool leader_on_right() const {
366  return leader_on_right_;
367  }
368  void set_leader_on_right(bool flag) {
369  leader_on_right_ = flag;
370  }
371  BLOBNBOX* neighbour(BlobNeighbourDir n) const {
372  return neighbours_[n];
373  }
374  bool good_stroke_neighbour(BlobNeighbourDir n) const {
375  return good_stroke_neighbours_[n];
376  }
377  void set_neighbour(BlobNeighbourDir n, BLOBNBOX* neighbour, bool good) {
378  neighbours_[n] = neighbour;
379  good_stroke_neighbours_[n] = good;
380  }
381  bool IsDiacritic() const {
382  return base_char_top_ != box.top() || base_char_bottom_ != box.bottom();
383  }
384  int base_char_top() const {
385  return base_char_top_;
386  }
387  int base_char_bottom() const {
388  return base_char_bottom_;
389  }
390  int baseline_position() const {
391  return baseline_y_;
392  }
393  int line_crossings() const {
394  return line_crossings_;
395  }
396  void set_line_crossings(int value) {
397  line_crossings_ = value;
398  }
399  void set_diacritic_box(const TBOX& diacritic_box) {
400  base_char_top_ = diacritic_box.top();
401  base_char_bottom_ = diacritic_box.bottom();
402  }
404  return base_char_blob_;
405  }
407  base_char_blob_ = blob;
408  }
409  void set_owns_cblob(bool value) { owns_cblob_ = value; }
410 
411  bool UniquelyVertical() const {
412  return vert_possible_ && !horz_possible_;
413  }
414  bool UniquelyHorizontal() const {
415  return horz_possible_ && !vert_possible_;
416  }
417 
418  // Returns true if the region type is text.
419  static bool IsTextType(BlobRegionType type) {
420  return type == BRT_TEXT || type == BRT_VERT_TEXT;
421  }
422  // Returns true if the region type is image.
423  static bool IsImageType(BlobRegionType type) {
424  return type == BRT_RECTIMAGE || type == BRT_POLYIMAGE;
425  }
426  // Returns true if the region type is line.
427  static bool IsLineType(BlobRegionType type) {
428  return type == BRT_HLINE || type == BRT_VLINE;
429  }
430  // Returns true if the region type cannot be merged.
431  static bool UnMergeableType(BlobRegionType type) {
432  return IsLineType(type) || IsImageType(type);
433  }
434  // Helper to call CleanNeighbours on all blobs on the list.
435  static void CleanNeighbours(BLOBNBOX_LIST* blobs);
436  // Helper to delete all the deletable blobs on the list.
437  static void DeleteNoiseBlobs(BLOBNBOX_LIST* blobs);
438  // Helper to compute edge offsets for all the blobs on the list.
439  // See coutln.h for an explanation of edge offsets.
440  static void ComputeEdgeOffsets(Pix* thresholds, Pix* grey,
441  BLOBNBOX_LIST* blobs);
442 
443 #ifndef GRAPHICS_DISABLED
444  // Helper to draw all the blobs on the list in the given body_colour,
445  // with child outlines in the child_colour.
446  static void PlotBlobs(BLOBNBOX_LIST* list,
447  ScrollView::Color body_colour,
448  ScrollView::Color child_colour,
449  ScrollView* win);
450  // Helper to draw only DeletableNoise blobs (unowned, BRT_NOISE) on the
451  // given list in the given body_colour, with child outlines in the
452  // child_colour.
453  static void PlotNoiseBlobs(BLOBNBOX_LIST* list,
454  ScrollView::Color body_colour,
455  ScrollView::Color child_colour,
456  ScrollView* win);
457 
458  static ScrollView::Color TextlineColor(BlobRegionType region_type,
459  BlobTextFlowType flow_type);
460 
461  // Keep in sync with BlobRegionType.
462  ScrollView::Color BoxColor() const;
463 
464  void plot(ScrollView* window, // window to draw in
465  ScrollView::Color blob_colour, // for outer bits
466  ScrollView::Color child_colour); // for holes
467 #endif
468 
469  // Initializes the bulk of the members to default values for use at
470  // construction time.
472  cblob_ptr = nullptr;
473  owns_cblob_ = false;
474  area = 0;
475  area_stroke_width_ = 0.0f;
476  horz_stroke_width_ = 0.0f;
477  vert_stroke_width_ = 0.0f;
478  ReInit();
479  }
480  // Initializes members set by StrokeWidth and beyond, without discarding
481  // stored area and strokewidth values, which are expensive to calculate.
482  void ReInit() {
483  joined = false;
484  reduced = false;
485  repeated_set_ = 0;
486  left_tab_type_ = TT_NONE;
487  right_tab_type_ = TT_NONE;
488  region_type_ = BRT_UNKNOWN;
489  flow_ = BTFT_NONE;
490  spt_type_ = BSTT_SKIP;
491  left_rule_ = 0;
492  right_rule_ = 0;
495  if (area_stroke_width_ == 0.0f && area > 0 && cblob() != nullptr
496  && cblob()->perimeter()!=0)
497  area_stroke_width_ = 2.0f * area / cblob()->perimeter();
498  owner_ = nullptr;
499  base_char_top_ = box.top();
501  baseline_y_ = box.bottom();
502  line_crossings_ = 0;
503  base_char_blob_ = nullptr;
504  horz_possible_ = false;
505  vert_possible_ = false;
506  leader_on_left_ = false;
507  leader_on_right_ = false;
508  ClearNeighbours();
509  }
510 
512  for (int n = 0; n < BND_COUNT; ++n) {
513  neighbours_[n] = nullptr;
514  good_stroke_neighbours_[n] = false;
515  }
516  }
517 
518  private:
519  C_BLOB *cblob_ptr; // edgestep blob
520  TBOX box; // bounding box
521  TBOX red_box; // bounding box
522  signed int area:30; // enclosed area
523  unsigned joined : 1; // joined to prev
524  unsigned reduced : 1; // reduced box set
525  int repeated_set_; // id of the set of repeated blobs
526  TabType left_tab_type_; // Indicates tab-stop assessment
527  TabType right_tab_type_; // Indicates tab-stop assessment
528  BlobRegionType region_type_; // Type of region this blob belongs to
529  BlobTextFlowType flow_; // Quality of text flow.
530  int16_t left_rule_; // x-coord of nearest but not crossing rule line
531  int16_t right_rule_; // x-coord of nearest but not crossing rule line
532  int16_t left_crossing_rule_; // x-coord of nearest or crossing rule line
533  int16_t right_crossing_rule_; // x-coord of nearest or crossing rule line
534  int16_t base_char_top_; // y-coord of top/bottom of diacritic base,
535  int16_t base_char_bottom_; // if it exists else top/bottom of this blob.
536  int16_t baseline_y_; // Estimate of baseline position.
537  int line_crossings_; // Number of line intersections touched.
538  BLOBNBOX* base_char_blob_; // The blob that was the base char.
539  float horz_stroke_width_; // Median horizontal stroke width
540  float vert_stroke_width_; // Median vertical stroke width
541  float area_stroke_width_; // Stroke width from area/perimeter ratio.
542  tesseract::ColPartition* owner_; // Who will delete me when I am not needed
543  BlobSpecialTextType spt_type_; // Special text type.
544  BLOBNBOX* neighbours_[BND_COUNT];
545  bool good_stroke_neighbours_[BND_COUNT];
546  bool horz_possible_; // Could be part of horizontal flow.
547  bool vert_possible_; // Could be part of vertical flow.
548  bool leader_on_left_; // There is a leader to the left.
549  bool leader_on_right_; // There is a leader to the right.
550  // Iff true, then the destructor should delete the cblob_ptr.
551  // TODO(rays) migrate all uses to correctly setting this flag instead of
552  // deleting the C_BLOB before deleting the BLOBNBOX.
554 };
555 
556 class TO_ROW: public ELIST2_LINK
557 {
558  public:
559  static const int kErrorWeight = 3;
560 
561  TO_ROW() {
562  clear();
563  } //empty
564  TO_ROW( //constructor
565  BLOBNBOX *blob, //from first blob
566  float top, //of row //target height
567  float bottom,
568  float row_size);
569 
570  void print() const;
571  float max_y() const { //access function
572  return y_max;
573  }
574  float min_y() const {
575  return y_min;
576  }
577  float mean_y() const {
578  return (y_min + y_max) / 2.0f;
579  }
580  float initial_min_y() const {
581  return initial_y_min;
582  }
583  float line_m() const { //access to line fit
584  return m;
585  }
586  float line_c() const {
587  return c;
588  }
589  float line_error() const {
590  return error;
591  }
592  float parallel_c() const {
593  return para_c;
594  }
595  float parallel_error() const {
596  return para_error;
597  }
598  float believability() const { //baseline goodness
599  return credibility;
600  }
601  float intercept() const { //real parallel_c
602  return y_origin;
603  }
604  void add_blob( //put in row
605  BLOBNBOX *blob, //blob to add
606  float top, //of row //target height
607  float bottom,
608  float row_size);
609  void insert_blob( //put in row in order
610  BLOBNBOX *blob);
611 
612  BLOBNBOX_LIST *blob_list() { //get list
613  return &blobs;
614  }
615 
616  void set_line( //set line spec
617  float new_m, //line to set
618  float new_c,
619  float new_error) {
620  m = new_m;
621  c = new_c;
622  error = new_error;
623  }
624  void set_parallel_line( //set fixed gradient line
625  float gradient, //page gradient
626  float new_c,
627  float new_error) {
628  para_c = new_c;
629  para_error = new_error;
630  credibility =
631  (float) (blobs.length () - kErrorWeight * new_error);
632  y_origin = (float) (new_c / sqrt (1 + gradient * gradient));
633  //real intercept
634  }
635  void set_limits( //set min,max
636  float new_min, //bottom and
637  float new_max) { //top of row
638  y_min = new_min;
639  y_max = new_max;
640  }
641  void compute_vertical_projection();
642  //get projection
643 
644  bool rep_chars_marked() const {
645  return num_repeated_sets_ != -1;
646  }
648  num_repeated_sets_ = -1;
649  }
650  int num_repeated_sets() const {
651  return num_repeated_sets_;
652  }
653  void set_num_repeated_sets(int num_sets) {
654  num_repeated_sets_ = num_sets;
655  }
656 
657  // true when dead
658  bool merged;
659  bool all_caps; // had no ascenders
660  bool used_dm_model; // in guessing pitch
661  int16_t projection_left; // start of projection
662  int16_t projection_right; // start of projection
663  PITCH_TYPE pitch_decision; // how strong is decision
664  float fixed_pitch; // pitch or 0
665  float fp_space; // sp if fixed pitch
666  float fp_nonsp; // nonsp if fixed pitch
667  float pr_space; // sp if prop
668  float pr_nonsp; // non sp if prop
669  float spacing; // to "next" row
670  float xheight; // of line
671  int xheight_evidence; // number of blobs of height xheight
672  float ascrise; // ascenders
673  float descdrop; // descenders
674  float body_size; // of CJK characters. Assumed to be
675  // xheight+ascrise for non-CJK text.
676  int32_t min_space; // min size for real space
677  int32_t max_nonspace; // max size of non-space
678  int32_t space_threshold; // space vs nonspace
679  float kern_size; // average non-space
680  float space_size; // average space
681  WERD_LIST rep_words; // repeated chars
682  ICOORDELT_LIST char_cells; // fixed pitch cells
683  QSPLINE baseline; // curved baseline
684  STATS projection; // vertical projection
685 
686  private:
687  void clear(); // clear all values to reasonable defaults
688 
689  BLOBNBOX_LIST blobs; //blobs in row
690  float y_min; //coords
691  float y_max;
693  float m, c; //line spec
694  float error; //line error
695  float para_c; //constrained fit
696  float para_error;
697  float y_origin; //rotated para_c;
698  float credibility; //baseline believability
699  int num_repeated_sets_; // number of sets of repeated blobs
700  // set to -1 if we have not searched
701  // for repeated blobs in this row yet
702 };
703 
704 ELIST2IZEH (TO_ROW)
705 class TO_BLOCK:public ELIST_LINK
706 {
707  public:
708  TO_BLOCK() : pitch_decision(PITCH_DUNNO) {
709  clear();
710  } //empty
711  TO_BLOCK( //constructor
712  BLOCK *src_block); //real block
713  ~TO_BLOCK();
714 
715  void clear(); // clear all scalar members.
716 
717  TO_ROW_LIST *get_rows() { //access function
718  return &row_list;
719  }
720 
721  // Rotate all the blobnbox lists and the underlying block. Then update the
722  // median size statistic from the blobs list.
723  void rotate(const FCOORD& rotation) {
724  BLOBNBOX_LIST* blobnbox_list[] = {&blobs, &underlines, &noise_blobs,
725  &small_blobs, &large_blobs, nullptr};
726  for (BLOBNBOX_LIST** list = blobnbox_list; *list != nullptr; ++list) {
727  BLOBNBOX_IT it(*list);
728  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
729  it.data()->rotate(rotation);
730  }
731  }
732  // Rotate the block
733  ASSERT_HOST(block->pdblk.poly_block() != nullptr);
734  block->rotate(rotation);
735  // Update the median size statistic from the blobs list.
736  STATS widths(0, block->pdblk.bounding_box().width());
737  STATS heights(0, block->pdblk.bounding_box().height());
738  BLOBNBOX_IT blob_it(&blobs);
739  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
740  widths.add(blob_it.data()->bounding_box().width(), 1);
741  heights.add(blob_it.data()->bounding_box().height(), 1);
742  }
743  block->set_median_size(static_cast<int>(widths.median() + 0.5),
744  static_cast<int>(heights.median() + 0.5));
745  }
746 
747  void print_rows() { //debug info
748  TO_ROW_IT row_it = &row_list;
749  TO_ROW *row;
750 
751  for (row_it.mark_cycle_pt(); !row_it.cycled_list();
752  row_it.forward()) {
753  row = row_it.data();
754  tprintf("Row range (%g,%g), para_c=%g, blobcount=%" PRId32 "\n",
755  row->min_y(), row->max_y(), row->parallel_c(),
756  row->blob_list()->length());
757  }
758  }
759 
760  // Reorganizes the blob lists with a different definition of small, medium
761  // and large, compared to the original definition.
762  // Height is still the primary filter key, but medium width blobs of small
763  // height become medium, and very wide blobs of small height stay small.
764  void ReSetAndReFilterBlobs();
765 
766  // Deletes noise blobs from all lists where not owned by a ColPartition.
767  void DeleteUnownedNoise();
768 
769  // Computes and stores the edge offsets on each blob for use in feature
770  // extraction, using greyscale if the supplied grey and thresholds pixes
771  // are 8-bit or otherwise (if nullptr or not 8 bit) the original binary
772  // edge step outlines.
773  // Thresholds must either be the same size as grey or an integer down-scale
774  // of grey.
775  // See coutln.h for an explanation of edge offsets.
776  void ComputeEdgeOffsets(Pix* thresholds, Pix* grey);
777 
778 #ifndef GRAPHICS_DISABLED
779  // Draw the noise blobs from all lists in red.
780  void plot_noise_blobs(ScrollView* to_win);
781  // Draw the blobs on on the various lists in the block in different colors.
782  void plot_graded_blobs(ScrollView* to_win);
783 #endif
784 
785  BLOBNBOX_LIST blobs; //medium size
786  BLOBNBOX_LIST underlines; //underline blobs
787  BLOBNBOX_LIST noise_blobs; //very small
788  BLOBNBOX_LIST small_blobs; //fairly small
789  BLOBNBOX_LIST large_blobs; //big blobs
790  BLOCK *block; //real block
791  PITCH_TYPE pitch_decision; //how strong is decision
792  float line_spacing; //estimate
793  // line_size is a lower-bound estimate of the font size in pixels of
794  // the text in the block (with ascenders and descenders), being a small
795  // (1.25) multiple of the median height of filtered blobs.
796  // In most cases the font size will be bigger, but it will be closer
797  // if the text is allcaps, or in a no-x-height script.
798  float line_size; //estimate
799  float max_blob_size; //line assignment limit
800  float baseline_offset; //phase shift
801  float xheight; //median blob size
802  float fixed_pitch; //pitch or 0
803  float kern_size; //average non-space
804  float space_size; //average space
805  int32_t min_space; //min definite space
806  int32_t max_nonspace; //max definite
807  float fp_space; //sp if fixed pitch
808  float fp_nonsp; //nonsp if fixed pitch
809  float pr_space; //sp if prop
810  float pr_nonsp; //non sp if prop
811  TO_ROW *key_row; //starting row
812 
813  private:
814  TO_ROW_LIST row_list; //temporary rows
815 };
816 
817 ELISTIZEH (TO_BLOCK)
818 extern double_VAR_H (textord_error_weight, 3,
819 "Weighting for error in believability");
820 void find_cblob_limits( //get y limits
821  C_BLOB *blob, //blob to search
822  float leftx, //x limits
823  float rightx,
824  FCOORD rotation, //for landscape
825  float &ymin, //output y limits
826  float &ymax);
827 void find_cblob_vlimits( //get y limits
828  C_BLOB *blob, //blob to search
829  float leftx, //x limits
830  float rightx,
831  float &ymin, //output y limits
832  float &ymax);
833 void find_cblob_hlimits( //get x limits
834  C_BLOB *blob, //blob to search
835  float bottomy, //y limits
836  float topy,
837  float &xmin, //output x limits
838  float &xymax);
839 C_BLOB *crotate_cblob( //rotate it
840  C_BLOB *blob, //blob to search
841  FCOORD rotation //for landscape
842  );
843 TBOX box_next( //get bounding box
844  BLOBNBOX_IT *it //iterator to blobds
845  );
846 TBOX box_next_pre_chopped( //get bounding box
847  BLOBNBOX_IT *it //iterator to blobds
848  );
849 void vertical_cblob_projection( //project outlines
850  C_BLOB *blob, //blob to project
851  STATS *stats //output
852  );
853 void vertical_coutline_projection( //project outlines
854  C_OUTLINE *outline, //outline to project
855  STATS *stats //output
856  );
857 #ifndef GRAPHICS_DISABLED
858 void plot_blob_list(ScrollView* win, // window to draw in
859  BLOBNBOX_LIST *list, // blob list
860  ScrollView::Color body_colour, // colour to draw
861  ScrollView::Color child_colour); // colour of child
862 #endif // GRAPHICS_DISABLED
863 #endif
void set_reduced_box(TBOX new_box)
Definition: blobbox.h:250
void plot(ScrollView *window, ScrollView::Color blob_colour, ScrollView::Color child_colour)
Definition: blobbox.cpp:486
void CleanNeighbours()
Definition: blobbox.cpp:215
void set_repeated_set(int set_id)
Definition: blobbox.h:266
static bool UnMergeableType(BlobRegionType type)
Definition: blobbox.h:431
void set_bounding_box(const TBOX &new_box)
Definition: blobbox.h:236
void set_region_type(BlobRegionType new_type)
Definition: blobbox.h:287
float fixed_pitch
Definition: blobbox.h:802
void set_num_repeated_sets(int num_sets)
Definition: blobbox.h:653
void set_owns_cblob(bool value)
Definition: blobbox.h:409
void translate_box(ICOORD v)
Definition: blobbox.h:175
BlobSpecialTextType special_text_type() const
Definition: blobbox.h:290
const TBOX & bounding_box() const
Definition: blobbox.h:231
void ReInit()
Definition: blobbox.h:482
void set_leader_on_right(bool flag)
Definition: blobbox.h:368
int16_t right_rule_
Definition: blobbox.h:531
bool joined_to_prev() const
Definition: blobbox.h:257
BLOBNBOX_LIST blobs
Definition: blobbox.h:785
float line_m() const
Definition: blobbox.h:583
float fp_space
Definition: blobbox.h:665
float line_spacing
Definition: blobbox.h:792
const TBOX & reduced_box() const
Definition: blobbox.h:247
float spacing
Definition: blobbox.h:669
void set_right_crossing_rule(int new_right)
Definition: blobbox.h:335
bool DeletableNoise() const
Definition: blobbox.h:204
BLOBNBOX * neighbours_[BND_COUNT]
Definition: blobbox.h:544
void set_right_tab_type(TabType new_type)
Definition: blobbox.h:281
int32_t min_space
Definition: blobbox.h:805
int32_t enclosed_area() const
Definition: blobbox.h:254
BlobTextFlowType flow() const
Definition: blobbox.h:296
float fp_nonsp
Definition: blobbox.h:666
float intercept() const
Definition: blobbox.h:601
bool IsDiacritic() const
Definition: blobbox.h:381
float mean_y() const
Definition: blobbox.h:577
float vert_stroke_width() const
Definition: blobbox.h:344
float xheight
Definition: blobbox.h:801
void print_rows()
Definition: blobbox.h:747
bool leader_on_right() const
Definition: blobbox.h:365
tesseract::ColPartition * owner() const
Definition: blobbox.h:353
static bool IsImageType(BlobRegionType type)
Definition: blobbox.h:423
void set_line_crossings(int value)
Definition: blobbox.h:396
float min_y() const
Definition: blobbox.h:574
float body_size
Definition: blobbox.h:674
BLOCK * block
Definition: blobbox.h:790
void NeighbourGaps(int gaps[BND_COUNT]) const
Definition: blobbox.cpp:182
float believability() const
Definition: blobbox.h:598
void reflect_box_in_y_axis()
Definition: blobbox.cpp:63
float parallel_c() const
Definition: blobbox.h:592
TabType left_tab_type_
Definition: blobbox.h:526
float parallel_error() const
Definition: blobbox.h:595
void set_line(float new_m, float new_c, float new_error)
Definition: blobbox.h:616
Color
Definition: scrollview.h:105
WERD_LIST rep_words
Definition: blobbox.h:681
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:788
float para_c
Definition: blobbox.h:695
BlobRegionType region_type_
Definition: blobbox.h:528
int line_crossings_
Definition: blobbox.h:537
Definition: rect.h:34
float initial_min_y() const
Definition: blobbox.h:580
TBOX bounding_box() const
Definition: stepblob.cpp:255
int32_t perimeter()
Definition: stepblob.cpp:294
QSPLINE baseline
Definition: blobbox.h:683
static bool IsTextType(BlobRegionType type)
Definition: blobbox.h:419
float y_origin
Definition: blobbox.h:697
float fp_space
Definition: blobbox.h:807
float pr_nonsp
Definition: blobbox.h:668
void move(const ICOORD vec)
Definition: rect.h:157
TO_ROW * key_row
Definition: blobbox.h:811
void compute_bounding_box()
Definition: blobbox.h:241
Definition: quspline.h:32
void merge(BLOBNBOX *nextblob)
Definition: blobbox.cpp:93
bool rep_chars_marked() const
Definition: blobbox.h:644
int16_t base_char_bottom_
Definition: blobbox.h:535
signed int area
Definition: blobbox.h:522
TO_ROW_LIST * get_rows()
Definition: blobbox.h:717
C_BLOB * cblob_ptr
Definition: blobbox.h:519
Definition: baseapi.cpp:94
int num_repeated_sets() const
Definition: blobbox.h:650
void set_left_tab_type(TabType new_type)
Definition: blobbox.h:275
float horz_stroke_width_
Definition: blobbox.h:539
BLOBNBOX_LIST underlines
Definition: blobbox.h:786
tesseract::ColPartition * owner_
Definition: blobbox.h:542
static void ComputeEdgeOffsets(Pix *thresholds, Pix *grey, BLOBNBOX_LIST *blobs)
Definition: blobbox.cpp:386
int32_t area()
Definition: stepblob.cpp:275
bool red_box_set() const
Definition: blobbox.h:260
int base_char_top() const
Definition: blobbox.h:384
int xheight_evidence
Definition: blobbox.h:671
float ascrise
Definition: blobbox.h:672
void set_left_crossing_rule(int new_left)
Definition: blobbox.h:329
TO_ROW_LIST row_list
Definition: blobbox.h:814
float area_stroke_width() const
Definition: blobbox.h:350
ICOORDELT_LIST char_cells
Definition: blobbox.h:682
float max_y() const
Definition: blobbox.h:571
int16_t bottom() const
Definition: rect.h:65
BlobTextFlowType flow_
Definition: blobbox.h:529
TO_BLOCK()
Definition: blobbox.h:708
void clear_rep_chars_marked()
Definition: blobbox.h:647
void chop(BLOBNBOX_IT *start_it, BLOBNBOX_IT *blob_it, FCOORD rotation, float xheight)
Definition: blobbox.cpp:121
int16_t projection_left
Definition: blobbox.h:661
TabType right_tab_type_
Definition: blobbox.h:527
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type)
Definition: blobbox.cpp:445
int32_t max_nonspace
Definition: blobbox.h:806
float initial_y_min
Definition: blobbox.h:692
float pr_space
Definition: blobbox.h:809
PITCH_TYPE pitch_decision
Definition: blobbox.h:663
void set_limits(float new_min, float new_max)
Definition: blobbox.h:635
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:612
bool leader_on_left_
Definition: blobbox.h:548
bool good_stroke_neighbours_[BND_COUNT]
Definition: blobbox.h:545
static bool IsLineType(BlobRegionType type)
Definition: blobbox.h:427
BLOBNBOX(C_BLOB *srcblob)
Definition: blobbox.h:150
int right_crossing_rule() const
Definition: blobbox.h:332
double_VAR_H(textord_tabvector_vertical_gap_fraction, 0.5, "Max fraction of mean blob width allowed for vertical gaps in vertical text")
bool owns_cblob_
Definition: blobbox.h:553
Definition: blobbox.h:144
bool vert_possible_
Definition: blobbox.h:547
Definition: ocrblock.h:30
Definition: scrollview.h:102
void set_right_rule(int new_right)
Definition: blobbox.h:323
TabType right_tab_type() const
Definition: blobbox.h:278
void set_diacritic_box(const TBOX &diacritic_box)
Definition: blobbox.h:399
float fp_nonsp
Definition: blobbox.h:808
void set_parallel_line(float gradient, float new_c, float new_error)
Definition: blobbox.h:624
void set_leader_on_left(bool flag)
Definition: blobbox.h:362
int16_t y() const
access_function
Definition: points.h:57
ScrollView::Color BoxColor() const
Definition: blobbox.cpp:482
float line_c() const
Definition: blobbox.h:586
float area_stroke_width_
Definition: blobbox.h:541
BLOBNBOX * base_char_blob_
Definition: blobbox.h:538
void ConstructionInit()
Definition: blobbox.h:471
static void PlotBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour, ScrollView::Color child_colour, ScrollView *win)
Definition: blobbox.cpp:420
void rotate(FCOORD rotation)
Definition: blobbox.cpp:56
bool used_dm_model
Definition: blobbox.h:660
float space_size
Definition: blobbox.h:680
float kern_size
Definition: blobbox.h:679
float pr_space
Definition: blobbox.h:667
BLOBNBOX * neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:371
Definition: stepblob.h:37
TBOX BoundsWithinLimits(int left, int right)
Definition: blobbox.cpp:334
bool vert_possible() const
Definition: blobbox.h:302
bool good_stroke_neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:374
unsigned joined
Definition: blobbox.h:523
int num_repeated_sets_
Definition: blobbox.h:699
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:787
bool UniquelyHorizontal() const
Definition: blobbox.h:414
unsigned reduced
Definition: blobbox.h:524
int line_crossings() const
Definition: blobbox.h:393
int right_rule() const
Definition: blobbox.h:320
void set_vert_stroke_width(float width)
Definition: blobbox.h:347
float credibility
Definition: blobbox.h:698
Definition: blobbox.h:556
int16_t projection_right
Definition: blobbox.h:662
bool all_caps
Definition: blobbox.h:659
Definition: statistc.h:33
integer coordinate
Definition: points.h:32
float m
Definition: blobbox.h:693
float kern_size
Definition: blobbox.h:803
int16_t top() const
Definition: rect.h:58
BLOBNBOX * base_char_blob() const
Definition: blobbox.h:403
bool MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance, double constant_tolerance) const
Definition: blobbox.cpp:306
bool leader_on_left() const
Definition: blobbox.h:359
int baseline_position() const
Definition: blobbox.h:390
int32_t space_threshold
Definition: blobbox.h:678
bool ConfirmNoTabViolation(const BLOBNBOX &other) const
Definition: blobbox.cpp:293
~BLOBNBOX()
Definition: blobbox.h:156
void rotate(const FCOORD &rotation)
Definition: blobbox.h:723
float descdrop
Definition: blobbox.h:673
static void PlotNoiseBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour, ScrollView::Color child_colour, ScrollView *win)
Definition: blobbox.cpp:433
STATS projection
Definition: blobbox.h:684
int left_crossing_rule() const
Definition: blobbox.h:326
int16_t baseline_y_
Definition: blobbox.h:536
void EstimateBaselinePosition()
Definition: blobbox.cpp:358
int repeated_set_
Definition: blobbox.h:525
int NoisyNeighbours() const
Definition: blobbox.cpp:238
void set_base_char_blob(BLOBNBOX *blob)
Definition: blobbox.h:406
BLOBNBOX_LIST blobs
Definition: blobbox.h:689
float horz_stroke_width() const
Definition: blobbox.h:338
int base_char_bottom() const
Definition: blobbox.h:387
float vert_stroke_width_
Definition: blobbox.h:540
int16_t left_crossing_rule_
Definition: blobbox.h:532
float fixed_pitch
Definition: blobbox.h:664
float para_error
Definition: blobbox.h:696
void set_vert_possible(bool value)
Definition: blobbox.h:305
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:789
float pr_nonsp
Definition: blobbox.h:810
float y_min
Definition: blobbox.h:690
PITCH_TYPE pitch_decision
Definition: blobbox.h:791
Definition: blobbox.h:705
int repeated_set() const
Definition: blobbox.h:263
void rotate_box(FCOORD rotation)
Definition: blobbox.cpp:72
int32_t max_nonspace
Definition: blobbox.h:677
void set_left_rule(int new_left)
Definition: blobbox.h:317
int32_t min_space
Definition: blobbox.h:676
int GoodTextBlob() const
Definition: blobbox.cpp:227
bool UniquelyVertical() const
Definition: blobbox.h:411
void really_merge(BLOBNBOX *other)
Definition: blobbox.cpp:104
void ClearNeighbours()
Definition: blobbox.h:511
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:299
void set_horz_possible(bool value)
Definition: blobbox.h:311
void MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const
Definition: blobbox.cpp:201
bool horz_possible_
Definition: blobbox.h:546
float error
Definition: blobbox.h:694
void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good)
Definition: blobbox.h:377
bool DefiniteIndividualFlow()
Definition: blobbox.cpp:253
bool horz_possible() const
Definition: blobbox.h:308
int16_t base_char_top_
Definition: blobbox.h:534
bool leader_on_right_
Definition: blobbox.h:549
BlobSpecialTextType spt_type_
Definition: blobbox.h:543
C_BLOB * cblob() const
Definition: blobbox.h:269
static void DeleteNoiseBlobs(BLOBNBOX_LIST *blobs)
Definition: blobbox.cpp:373
float baseline_offset
Definition: blobbox.h:800
TO_ROW()
Definition: blobbox.h:561
TBOX box
Definition: blobbox.h:520
int left_rule() const
Definition: blobbox.h:314
TBOX red_box
Definition: blobbox.h:521
int16_t left_rule_
Definition: blobbox.h:530
bool merged
Definition: blobbox.h:658
float y_max
Definition: blobbox.h:691
Definition: colpartition.h:68
Definition: points.h:189
static BLOBNBOX * RealBlob(C_OUTLINE *outline)
Definition: blobbox.h:159
float max_blob_size
Definition: blobbox.h:799
float line_error() const
Definition: blobbox.h:589
TabType left_tab_type() const
Definition: blobbox.h:272
float line_size
Definition: blobbox.h:798
void set_horz_stroke_width(float width)
Definition: blobbox.h:341
BlobRegionType region_type() const
Definition: blobbox.h:284
float xheight
Definition: blobbox.h:670
void set_owner(tesseract::ColPartition *new_owner)
Definition: blobbox.h:356
void set_special_text_type(BlobSpecialTextType new_type)
Definition: blobbox.h:293
int16_t right_crossing_rule_
Definition: blobbox.h:533
Definition: coutln.h:72
float space_size
Definition: blobbox.h:804
BLOBNBOX()
Definition: blobbox.h:147