tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
paragraphs.h
1 /**********************************************************************
2  * File: paragraphs.h
3  * Description: Paragraph Detection data structures.
4  * Author: David Eger
5  * Created: 25 February 2011
6  *
7  * (C) Copyright 2011, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
21 #define TESSERACT_CCMAIN_PARAGRAPHS_H_
22 
23 #include "rect.h" // for TBOX
24 #include "strngs.h" // for STRING
25 
26 class PARA_LIST;
27 class ParagraphModel;
28 
29 struct PARA;
30 
31 template <typename T> class GenericVector;
32 
33 namespace tesseract {
34 
35 class MutableIterator;
36 
37 // This structure captures all information needed about a text line for the
38 // purposes of paragraph detection. It is meant to be exceedingly light-weight
39 // so that we can easily test paragraph detection independent of the rest of
40 // Tesseract.
41 class RowInfo {
42  public:
43  // Constant data derived from Tesseract output.
44  STRING text; // the full UTF-8 text of the line.
45  bool ltr; // whether the majority of the text is left-to-right
46  // TODO(eger) make this more fine-grained.
47 
48  bool has_leaders; // does the line contain leader dots (.....)?
49  bool has_drop_cap; // does the line have a drop cap?
50  int pix_ldistance; // distance to the left pblock boundary in pixels
51  int pix_rdistance; // distance to the right pblock boundary in pixels
52  float pix_xheight; // guessed xheight for the line
53  int average_interword_space; // average space between words in pixels.
54 
55  int num_words;
56  TBOX lword_box; // in normalized (horiz text rows) space
57  TBOX rword_box; // in normalized (horiz text rows) space
58 
59  STRING lword_text; // the UTF-8 text of the leftmost werd
60  STRING rword_text; // the UTF-8 text of the rightmost werd
61 
62  // The text of a paragraph typically starts with the start of an idea and
63  // ends with the end of an idea. Here we define paragraph as something that
64  // may have a first line indent and a body indent which may be different.
65  // Typical words that start an idea are:
66  // 1. Words in western scripts that start with
67  // a capital letter, for example "The"
68  // 2. Bulleted or numbered list items, for
69  // example "2."
70  // Typical words which end an idea are words ending in punctuation marks. In
71  // this vocabulary, each list item is represented as a paragraph.
75 
79 };
80 
81 // Main entry point for Paragraph Detection Algorithm.
82 //
83 // Given a set of equally spaced textlines (described by row_infos),
84 // Split them into paragraphs. See http://goto/paragraphstalk
85 //
86 // Output:
87 // row_owners - one pointer for each row, to the paragraph it belongs to.
88 // paragraphs - this is the actual list of PARA objects.
89 // models - the list of paragraph models referenced by the PARA objects.
90 // caller is responsible for deleting the models.
91 void DetectParagraphs(int debug_level,
92  GenericVector<RowInfo> *row_infos,
93  GenericVector<PARA *> *row_owners,
94  PARA_LIST *paragraphs,
96 
97 // Given a MutableIterator to the start of a block, run DetectParagraphs on
98 // that block and commit the results to the underlying ROW and BLOCK structs,
99 // saving the ParagraphModels in models. Caller owns the models.
100 // We use unicharset during the function to answer questions such as "is the
101 // first letter of this word upper case?"
102 void DetectParagraphs(int debug_level,
103  bool after_text_recognition,
104  const MutableIterator *block_start,
106 
107 } // namespace
108 
109 #endif // TESSERACT_CCMAIN_PARAGRAPHS_H_
void DetectParagraphs(int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA *> *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel *> *models)
Definition: paragraphs.cpp:2271
Definition: ocrpara.h:114
bool lword_likely_starts_idea
Definition: paragraphs.h:73
int pix_rdistance
Definition: paragraphs.h:51
bool rword_likely_ends_idea
Definition: paragraphs.h:78
int pix_ldistance
Definition: paragraphs.h:50
Definition: paragraphs.h:41
Definition: rect.h:34
STRING lword_text
Definition: paragraphs.h:59
STRING text
Definition: paragraphs.h:44
bool lword_indicates_list_item
Definition: paragraphs.h:72
Definition: baseapi.cpp:94
TBOX rword_box
Definition: paragraphs.h:57
bool rword_indicates_list_item
Definition: paragraphs.h:76
bool has_leaders
Definition: paragraphs.h:48
int num_words
Definition: paragraphs.h:55
bool ltr
Definition: paragraphs.h:45
bool lword_likely_ends_idea
Definition: paragraphs.h:74
int average_interword_space
Definition: paragraphs.h:53
Definition: baseapi.h:37
bool rword_likely_starts_idea
Definition: paragraphs.h:77
Definition: strngs.h:45
bool has_drop_cap
Definition: paragraphs.h:49
Definition: mutableiterator.h:44
TBOX lword_box
Definition: paragraphs.h:56
STRING rword_text
Definition: paragraphs.h:60
float pix_xheight
Definition: paragraphs.h:52
Definition: ocrpara.h:29