tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
colpartitionset.h
1 // File: colpartitionset.h
3 // Description: Class to hold a list of ColPartitions of the page that
4 // correspond roughly to columns.
5 // Author: Ray Smith
6 // Created: Thu Aug 14 10:50:01 PDT 2008
7 //
8 // (C) Copyright 2008, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifndef TESSERACT_TEXTORD_COLPARTITIONSET_H_
22 #define TESSERACT_TEXTORD_COLPARTITIONSET_H_
23 
24 #include "colpartition.h" // For ColPartition_LIST.
25 #include "genericvector.h" // For GenericVector.
26 #include "rect.h" // For TBOX.
27 #include "tabvector.h" // For BLOBNBOX_CLIST.
28 
29 namespace tesseract {
30 
31 class WorkingPartSet_LIST; // Forward declaration; used by ChangeWorkColumns.
32 class ColSegment_LIST; // Forward declaration; used by GetColumnBoxes.
33 class ColPartitionSet; // Forward declaration of the class defined below. NOTE(review): listing line 34 is absent here; PartSetVector (used by AddToColumnSetsIfUnique) is presumably declared on it - confirm against the original header.
35 
36 // ColPartitionSet is a class that holds a list of ColPartitions.
37 // Its main use is in holding a candidate partitioning of the width of the
38 // image into columns, where each member ColPartition is a single column.
39 // ColPartitionSets are used in building the column layout of a page.
40 class ColPartitionSet : public ELIST_LINK {
41  public:
42  ColPartitionSet() = default;
43  explicit ColPartitionSet(ColPartition_LIST* partitions);
44  explicit ColPartitionSet(ColPartition* partition);
45 
46  ~ColPartitionSet() = default;
47 
48  // Simple accessors.
49  const TBOX& bounding_box() const {
50  return bounding_box_;
51  }
52  bool Empty() const {
53  return parts_.empty();
54  }
55  int ColumnCount() const {
56  return parts_.length();
57  }
58 
59  // Returns the number of columns of good width.
60  int GoodColumnCount() const;
61 
62  // Return an element of the parts_ list from its index.
63  ColPartition* GetColumnByIndex(int index);
64 
65  // Return the ColPartition that contains the given coords, if any, else nullptr.
66  ColPartition* ColumnContaining(int x, int y);
67 
68  // Return the bounding boxes of columns at the given y-range
69  void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments);
70 
71  // Extract all the parts from the list, relinquishing ownership.
72  void RelinquishParts();
73 
74  // Attempt to improve this by adding partitions or expanding partitions.
76 
77  // If this set is good enough to represent a new partitioning into columns,
78  // add it to the vector of sets, otherwise delete it.
79  void AddToColumnSetsIfUnique(PartSetVector* column_sets, WidthCallback* cb);
80 
81  // Return true if the partitions in other are all compatible with the columns
82  // in this.
83  bool CompatibleColumns(bool debug, ColPartitionSet* other, WidthCallback* cb);
84 
85  // Returns the total width of all blobs in the part_set that do not lie
86  // within an approved column. Used as a cost measure for using this
87  // column set over another that might be compatible.
88  int UnmatchedWidth(ColPartitionSet* part_set);
89 
90  // Return true if this ColPartitionSet makes a legal column candidate by
91  // having legal individual partitions and non-overlapping adjacent pairs.
92  bool LegalColumnCandidate();
93 
94  // Return a copy of this. If good_only will only copy the Good ColPartitions.
95  ColPartitionSet* Copy(bool good_only);
96 
97  // Display the edges of the columns at the given y coords.
98  void DisplayColumnEdges(int y_bottom, int y_top, ScrollView* win);
99 
100  // Return the ColumnSpanningType that best explains the columns overlapped
101  // by the given coords(left,right,y), with the given margins.
102  // Also return the first and last column index touched by the coords and
103  // the leftmost spanned column.
104  // Column indices are 2n + 1 for real columns (0 based) and even values
105  // represent the gaps in between columns, with 0 being left of the leftmost.
106  // resolution refers to the ppi resolution of the image. It may be 0 if only
107  // the first_col and last_col are required.
108  ColumnSpanningType SpanningType(int resolution,
109  int left, int right, int height, int y,
110  int left_margin, int right_margin,
111  int* first_col, int* last_col,
112  int* first_spanned_col);
113 
114  // The column_set has changed. Close down all in-progress WorkingPartSets in
115  // columns that do not match and start new ones for the new columns in this.
116  // As ColPartitions are turned into BLOCKs, the used ones are put in
117  // used_parts, as they still need to be referenced in the grid.
118  void ChangeWorkColumns(const ICOORD& bleft, const ICOORD& tright,
119  int resolution, ColPartition_LIST* used_parts,
120  WorkingPartSet_LIST* working_set);
121 
122  // Accumulate the widths and gaps into the given variables.
123  void AccumulateColumnWidthsAndGaps(int* total_width, int* width_samples,
124  int* total_gap, int* gap_samples);
125 
126  // Provide debug output for this ColPartitionSet and all the ColPartitions.
127  void Print();
128 
129  private:
130  // Add the given partition to the list in the appropriate place.
131  void AddPartition(ColPartition* new_part, ColPartition_IT* it);
132 
133  // Compute the coverage and good column count. Coverage is the amount of the
134  // width of the page (in pixels) that is covered by ColPartitions, which are
135  // used to provide candidate column layouts.
136  // Coverage is split into good and bad. Good coverage is provided by
137  // ColPartitions of a frequent width (according to the callback function
138  // provided by TabFinder::WidthCB, which accesses stored statistics on the
139  // widths of ColParititions) and bad coverage is provided by all other
140  // ColPartitions, even if they have tab vectors at both sides. Thus:
141  // |-----------------------------------------------------------------|
142  // | Double width heading |
143  // |-----------------------------------------------------------------|
144  // |-------------------------------| |-------------------------------|
145  // | Common width ColParition | | Common width ColPartition |
146  // |-------------------------------| |-------------------------------|
147  // the layout with two common-width columns has better coverage than the
148  // double width heading, because the coverage is "good," even though less in
149  // total coverage than the heading, because the heading coverage is "bad."
150  void ComputeCoverage();
151 
152  // Adds the coverage, column count and box for a single partition,
153  // without adding it to the list. (Helper factored from ComputeCoverage.)
154  void AddPartitionCoverageAndBox(const ColPartition& part);
155 
156  // The partitions in this column candidate.
157  ColPartition_LIST parts_;
158  // The number of partitions that have a frequent column width.
160  // Total width of all the good ColPartitions.
162  // Total width of all the bad ColPartitions.
164  // Bounding box of all partitions in the set.
166 };
167 
168 ELISTIZEH(ColPartitionSet) // NOTE(review): presumably declares the ColPartitionSet_LIST / ColPartitionSet_IT list types for this ELIST_LINK-derived class; the macro is defined elsewhere - confirm.
169 
170 } // namespace tesseract.
171 
172 #endif // TESSERACT_TEXTORD_COLPARTITIONSET_H_
void AccumulateColumnWidthsAndGaps(int *total_width, int *width_samples, int *total_gap, int *gap_samples)
Definition: colpartitionset.cpp:572
bool CompatibleColumns(bool debug, ColPartitionSet *other, WidthCallback *cb)
Definition: colpartitionset.cpp:223
bool Empty() const
Definition: colpartitionset.h:52
int ColumnCount() const
Definition: colpartitionset.h:55
Definition: colpartitionset.h:40
bool LegalColumnCandidate()
Definition: colpartitionset.cpp:331
void AddPartitionCoverageAndBox(const ColPartition &part)
Definition: colpartitionset.cpp:651
Definition: rect.h:34
ColPartition * ColumnContaining(int x, int y)
Definition: colpartitionset.cpp:70
int good_coverage_
Definition: colpartitionset.h:161
const TBOX & bounding_box() const
Definition: colpartitionset.h:49
Definition: baseapi.cpp:94
ColPartition * GetColumnByIndex(int index)
Definition: colpartitionset.cpp:60
void AddToColumnSetsIfUnique(PartSetVector *column_sets, WidthCallback *cb)
Definition: colpartitionset.cpp:175
Definition: scrollview.h:102
void ComputeCoverage()
Definition: colpartitionset.cpp:636
void DisplayColumnEdges(int y_bottom, int y_top, ScrollView *win)
Definition: colpartitionset.cpp:386
Definition: baseapi.h:37
void ImproveColumnCandidate(WidthCallback *cb, PartSetVector *src_sets)
Definition: colpartitionset.cpp:90
Definition: tesscallback.h:1673
int good_column_count_
Definition: colpartitionset.h:159
integer coordinate
Definition: points.h:32
void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments)
Definition: colpartitionset.cpp:370
int GoodColumnCount() const
Definition: colpartitionset.cpp:49
void ChangeWorkColumns(const ICOORD &bleft, const ICOORD &tright, int resolution, ColPartition_LIST *used_parts, WorkingPartSet_LIST *working_set)
Definition: colpartitionset.cpp:500
ColPartition_LIST parts_
Definition: colpartitionset.h:157
void Print()
Definition: colpartitionset.cpp:593
void AddPartition(ColPartition *new_part, ColPartition_IT *it)
Definition: colpartitionset.cpp:609
ColPartitionSet * Copy(bool good_only)
Definition: colpartitionset.cpp:354
void RelinquishParts()
Definition: colpartitionset.cpp:81
ColumnSpanningType
Definition: colpartition.h:48
int UnmatchedWidth(ColPartitionSet *part_set)
Definition: colpartitionset.cpp:306
Definition: colpartition.h:68
ColumnSpanningType SpanningType(int resolution, int left, int right, int height, int y, int left_margin, int right_margin, int *first_col, int *last_col, int *first_spanned_col)
Definition: colpartitionset.cpp:405
int bad_coverage_
Definition: colpartitionset.h:163
TBOX bounding_box_
Definition: colpartitionset.h:165