tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
resultiterator.h
1 // File: resultiterator.h
3 // Description: Iterator for tesseract results that is capable of
4 // iterating in proper reading order over Bi Directional
5 // (e.g. mixed Hebrew and English) text.
6 // Author: David Eger
7 // Created: Fri May 27 13:58:06 PST 2011
8 //
9 // (C) Copyright 2011, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
23 #define TESSERACT_CCMAIN_RESULT_ITERATOR_H_
24 
25 #include <set> // for std::pair
26 #include <vector> // for std::vector
27 #include "ltrresultiterator.h" // for LTRResultIterator
28 #include "platform.h" // for TESS_API, TESS_LOCAL
29 #include "publictypes.h" // for PageIteratorLevel
30 #include "unichar.h" // for StrongScriptDirection
31 
32 template <typename T> class GenericVector;
33 template <typename T> class GenericVectorEqEq;
34 
35 class STRING;
36 
37 namespace tesseract {
38 
39 class Tesseract;
40 
41 class TESS_API ResultIterator : public LTRResultIterator {
42  public:
43  static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);
44 
49  virtual ~ResultIterator() = default;
50 
51  // ============= Moving around within the page ============.
56  virtual void Begin();
57 
70  virtual bool Next(PageIteratorLevel level);
71 
78  virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
79 
85  virtual bool IsAtFinalElement(PageIteratorLevel level,
86  PageIteratorLevel element) const;
87 
88  // ============= Functions that refer to words only ============.
89  // Returns the number of blanks before the current word.
90  int BlanksBeforeWord() const;
91 
92  // ============= Accessing data ==============.
93 
98  virtual char* GetUTF8Text(PageIteratorLevel level) const;
99 
103  virtual std::vector<std::vector<std::pair<const char*, float>>>* GetBestLSTMSymbolChoices() const;
104 
109  bool ParagraphIsLtr() const;
110 
111  // ============= Exposed only for testing =============.
112 
135  static void CalculateTextlineOrder(
136  bool paragraph_is_ltr,
137  const GenericVector<StrongScriptDirection> &word_dirs,
138  GenericVectorEqEq<int> *reading_order);
139 
140  static const int kMinorRunStart;
141  static const int kMinorRunEnd;
142  static const int kComplexWord;
143 
144  protected:
151  TESS_LOCAL explicit ResultIterator(const LTRResultIterator &resit);
152 
153  private:
158  bool CurrentParagraphIsLtr() const;
159 
171  void CalculateTextlineOrder(bool paragraph_is_ltr,
172  const LTRResultIterator &resit,
173  GenericVectorEqEq<int> *indices) const;
175  void CalculateTextlineOrder(bool paragraph_is_ltr,
176  const LTRResultIterator &resit,
178  GenericVectorEqEq<int> *indices) const;
179 
184  int LTRWordIndex() const;
185 
190  void CalculateBlobOrder(GenericVector<int> *blob_indices) const;
191 
193  void MoveToLogicalStartOfTextline();
194 
199  void MoveToLogicalStartOfWord();
200 
202  bool IsAtFinalSymbolOfWord() const;
203 
205  bool IsAtFirstSymbolOfWord() const;
206 
211  void AppendSuffixMarks(STRING *text) const;
212 
214  void AppendUTF8WordText(STRING *text) const;
215 
223  void IterateAndAppendUTF8TextlineText(STRING *text);
224 
231  void AppendUTF8ParagraphText(STRING *text) const;
232 
234  bool BidiDebug(int min_level) const;
235 
237 
243 
246 
252 };
253 
254 } // namespace tesseract.
255 
256 #endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_
GenericVectorEqEq
Definition: resultiterator.h:33
bool current_paragraph_is_ltr_
Definition: resultiterator.h:236
static const int kMinorRunEnd
Definition: resultiterator.h:141
static const int kMinorRunStart
Definition: resultiterator.h:140
static const int kComplexWord
Definition: resultiterator.h:142
bool preserve_interword_spaces_
Definition: resultiterator.h:251
tesseract::ResultIterator
Definition: resultiterator.h:41
Definition: baseapi.cpp:94
PageIteratorLevel
Definition: publictypes.h:219
Definition: baseapi.h:37
Definition: strngs.h:45
Definition: ltrresultiterator.h:48
bool in_minor_direction_
Definition: resultiterator.h:245
bool at_beginning_of_minor_run_
Definition: resultiterator.h:242