tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
ocrpara.h
1 // File: ocrpara.h
3 // Description: OCR Paragraph Output Type
4 // Author: David Eger
5 // Created: 2010-11-15
6 //
7 // (C) Copyright 2010, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_CCSTRUCT_OCRPARA_H_
21 #define TESSERACT_CCSTRUCT_OCRPARA_H_
22 
23 #include "publictypes.h"
24 #include "elst.h"
25 #include "strngs.h"
26 
27 class ParagraphModel;
28 
29 struct PARA : public ELIST_LINK {
30  public:
31  PARA() : model(nullptr), is_list_item(false),
33 
34  // We do not own the model, we just reference it.
35  // model may be nullptr if there is not a good model for this paragraph.
37 
39 
40  // The first paragraph on a page often lacks a first line indent, but should
41  // still be modeled by the same model as other body text paragraphs on the
42  // page.
44 
45  // Does this paragraph begin with a drop cap?
47 };
48 
49 ELISTIZEH(PARA)
50 
51 // A geometric model of paragraph indentation and alignment.
52 //
53 // Measurements are in pixels. The meaning of the integer arguments changes
54 // depending upon the value of justification. Distances less than or equal
55 // to tolerance apart we take as "equivalent" for the purpose of model
56 // matching, and in the examples below, we assume tolerance is zero.
57 //
58 // justification = LEFT:
59 // margin the "ignored" margin to the left block edge.
60 // first_indent indent from the left margin to a typical first text line.
61 // body_indent indent from the left margin of a typical body text line.
62 //
63 // justification = RIGHT:
64 // margin the "ignored" margin to the right block edge.
65 // first_indent indent from the right margin to a typical first text line.
66 // body_indent indent from the right margin of a typical body text line.
67 //
68 // justification = CENTER:
69 // margin ignored
70 // first_indent ignored
71 // body_indent ignored
72 //
73 // ====== Extended example, assuming each letter is ten pixels wide: =======
74 //
75 // +--------------------------------+
76 // | Awesome | ParagraphModel(CENTER, 0, 0, 0)
77 // | Centered Title |
78 // | Paragraph Detection |
79 // | OCR TEAM |
80 // | 10 November 2010 |
81 // | |
82 // | Look here, I have a paragraph.| ParagraphModel(LEFT, 0, 20, 0)
83 // |This paragraph starts at the top|
84 // |of the page and takes 3 lines. |
85 // | Here I have a second paragraph| ParagraphModel(LEFT, 0, 20, 0)
86 // |which indicates that the first |
87 // |paragraph is not a continuation |
88 // |from a previous page, as it is |
89 // |indented just like this second |
90 // |paragraph. |
91 // | Here is a block quote. It | ParagraphModel(LEFT, 30, 0, 0)
92 // | looks like the prior text |
93 // | but it is indented more |
94 // | and is fully justified. |
95 // | So how does one deal with | ParagraphModel(LEFT, 0, 20, 0)
96 // |centered text, block quotes, |
97 // |normal paragraphs, and lists |
98 // |like what follows? |
99 // |1. Make a plan. | ParagraphModel(LEFT, 0, 0, 30)
100 // |2. Use a heuristic, for example,| ParagraphModel(LEFT, 0, 0, 30)
101 // | looking for lines where the |
102 // | first word of the next line |
103 // | would fit on the previous |
104 // | line. |
105 // |8. Try to implement the plan in | ParagraphModel(LEFT, 0, 0, 30)
106 // | Python and try it out. |
107 // |4. Determine how to fix the | ParagraphModel(LEFT, 0, 0, 30)
108 // | mistakes. |
109 // |5. Repeat. | ParagraphModel(LEFT, 0, 0, 30)
110 // | For extra painful penalty work| ParagraphModel(LEFT, 0, 20, 0)
111 // |you can try to identify source |
112 // |code. Ouch! |
113 // +--------------------------------+
115  public:
117  int margin,
118  int first_indent,
119  int body_indent,
120  int tolerance)
121  : justification_(justification),
122  margin_(margin),
123  first_indent_(first_indent),
124  body_indent_(body_indent),
125  tolerance_(tolerance) {
126  // Make sure one of {first_indent, body_indent} is 0 by moving the smaller of the two into margin_.
127  int added_margin = first_indent;
128  if (body_indent < added_margin)
129  added_margin = body_indent;
130  margin_ += added_margin;
131  first_indent_ -= added_margin;
132  body_indent_ -= added_margin;
133  }
134 
136  : justification_(tesseract::JUSTIFICATION_UNKNOWN),
137  margin_(0),
138  first_indent_(0),
139  body_indent_(0),
140  tolerance_(0) { }
141 
142  // ValidFirstLine() and ValidBodyLine() take arguments describing a text line
143  // in a block of text which we are trying to model:
144  // lmargin, lindent: these add up to the distance from the leftmost ink
145  // in the text line to the surrounding text block's left
146  // edge.
147  // rmargin, rindent: these add up to the distance from the rightmost ink
148  // in the text line to the surrounding text block's right
149  // edge.
150  // The caller determines the division between "margin" and "indent", which
151  // only actually affects whether we think the line may be centered.
152  //
153  // If the amount of whitespace matches the amount of whitespace expected on
154  // the relevant side of the line (within tolerance_) we say it matches.
155 
156  // Return whether a given text line could be a first paragraph line according
157  // to this paragraph model.
158  bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const;
159 
160  // Return whether a given text line could be a body (non-first) paragraph line
161  // according to this paragraph model.
162  bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const;
163 
165  return justification_;
166  }
167  int margin() const { return margin_; }
168  int first_indent() const { return first_indent_; }
169  int body_indent() const { return body_indent_; }
170  int tolerance() const { return tolerance_; }
171  bool is_flush() const {
172  return (justification_ == tesseract::JUSTIFICATION_LEFT ||
173  justification_ == tesseract::JUSTIFICATION_RIGHT) &&
174  abs(first_indent_ - body_indent_) <= tolerance_;
175  }
176 
177  // Return whether this model is likely to agree with the other model on most
178  // paragraphs that are marked with them.
179  bool Comparable(const ParagraphModel &other) const;
180 
181  STRING ToString() const;
182 
183  private:
185  int margin_;
189 };
190 
191 #endif // TESSERACT_CCSTRUCT_OCRPARA_H_
const ParagraphModel * model
Definition: ocrpara.h:36
Definition: ocrpara.h:114
int first_indent() const
Definition: ocrpara.h:168
bool is_very_first_or_continuation
Definition: ocrpara.h:43
tesseract::ParagraphJustification justification_
Definition: ocrpara.h:184
Definition: baseapi.cpp:94
bool is_list_item
Definition: ocrpara.h:38
ParagraphModel()
Definition: ocrpara.h:135
ParagraphJustification
Definition: publictypes.h:251
Definition: strngs.h:45
int first_indent_
Definition: ocrpara.h:186
Definition: publictypes.h:255
int margin_
Definition: ocrpara.h:185
int body_indent() const
Definition: ocrpara.h:169
bool has_drop_cap
Definition: ocrpara.h:46
PARA()
Definition: ocrpara.h:31
ParagraphModel(tesseract::ParagraphJustification justification, int margin, int first_indent, int body_indent, int tolerance)
Definition: ocrpara.h:116
int tolerance_
Definition: ocrpara.h:188
bool is_flush() const
Definition: ocrpara.h:171
Definition: publictypes.h:253
int body_indent_
Definition: ocrpara.h:187
int tolerance() const
Definition: ocrpara.h:170
int margin() const
Definition: ocrpara.h:167
tesseract::ParagraphJustification justification() const
Definition: ocrpara.h:164
Definition: ocrpara.h:29