tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
pageiterator.h
1 // File: pageiterator.h
3 // Description: Iterator for tesseract page structure that avoids using
4 // tesseract internal data structures.
5 // Author: Ray Smith
6 // Created: Fri Feb 26 11:01:06 PST 2010
7 //
8 // (C) Copyright 2010, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
22 #define TESSERACT_CCMAIN_PAGEITERATOR_H_
23 
24 #include "publictypes.h"
25 #include "platform.h"
26 
// Forward declarations only: within this listing these types appear solely
// behind pointers (parameters, return values or data members), so their
// defining headers do not have to be included here.
// NOTE(review): Pix and Pta look like the Leptonica image and point-array
// structs - confirm.
27 struct BlamerBundle;
28 class C_BLOB_IT;
29 class PAGE_RES;
30 class PAGE_RES_IT;
31 class WERD;
32 struct Pix;
33 struct Pta;
34 
35 namespace tesseract {
36 
37 class Tesseract;
38 
// PageIterator: per the file header, an iterator over the tesseract page
// structure that avoids using tesseract internal data structures.
// NOTE(review): this listing is an extract of the header. The embedded
// original line numbers jump (e.g. 53 -> 69 and 325 -> 349), so the original
// per-method documentation, the opening of the first declaration and most
// data members are not shown. Notes below that are inferred from names or
// signatures alone are marked "presumably" and need confirming against the
// implementation.
52 class TESS_API PageIterator {
53  public:
    // NOTE(review): the start of this declaration (original lines 54-68) is
    // missing from the listing. These look like the trailing parameters of the
    // constructor; scale_ below and the scaled_yres_ / rect_left_ / rect_top_ /
    // rect_width_ / rect_height_ entries in the listing's cross-reference
    // index suggest each one is kept in a member - confirm.
69  int scale, int scaled_yres,
70  int rect_left, int rect_top,
71  int rect_width, int rect_height);
    // Virtual destructor (the class also declares virtual methods and a
    // protected section).
72  virtual ~PageIterator();
73 
    // Copy construction and copy assignment. Note that operator= returns a
    // const reference, so the result of an assignment cannot be modified
    // through it.
80  PageIterator(const PageIterator& src);
81  const PageIterator& operator=(const PageIterator& src);
82 
    // Presumably true when this iterator and other are positioned on the same
    // word - confirm in the implementation.
84  bool PositionedAtSameWord(const PAGE_RES_IT* other) const;
85 
86  // ============= Moving around within the page ============.
87 
    // Presumably repositions the iterator at the start of the page - confirm.
92  virtual void Begin();
93 
    // Presumably repositions the iterator at the start of the current
    // paragraph - confirm.
99  virtual void RestartParagraph();
100 
    // Const query; by its name, whether the current position lies within the
    // first text line of its paragraph - confirm.
105  bool IsWithinFirstTextlineOfParagraph() const;
106 
    // Presumably repositions the iterator at the start of the current row -
    // confirm.
112  virtual void RestartRow();
113 
    // Presumably advances to the next element at the given level. The meaning
    // of the bool result is not visible here (likely false once there is
    // nothing further to visit) - confirm.
125  virtual bool Next(PageIteratorLevel level);
126 
    // Const query; presumably whether the current position is the first
    // element of the enclosing unit at the given level - confirm.
140  virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
141 
    // Const query taking two levels; presumably whether the current `element`
    // is the last one inside the enclosing `level` - confirm.
158  virtual bool IsAtFinalElement(PageIteratorLevel level,
159  PageIteratorLevel element) const;
160 
    // Compares this iterator with other and returns an int. NOTE(review): the
    // sign convention and what exactly is compared are not visible here -
    // confirm before relying on the ordering.
167  int Cmp(const PageIterator &other) const;
168 
169  // ============= Accessing data ==============.
170  // Coordinate system:
171  // Integer coordinates are at the cracks between the pixels.
172  // The top-left corner of the top-left pixel in the image is at (0,0).
173  // The bottom-right corner of the bottom-right pixel in the image is at
174  // (width, height).
175  // Every bounding box goes from the top-left of the top-left contained
176  // pixel to the bottom-right of the bottom-right contained pixel, so
177  // the bounding box of the single top-left pixel in the image is:
178  // (0,0)->(1,1).
179  // If an image rectangle has been set in the API, then returned coordinates
180  // relate to the original (full) image, rather than the rectangle.
181 
    // Stores the two flags in include_upper_dots_ and include_lower_dots_;
    // nothing else happens here. Judging by the name they influence later
    // bounding-box results - confirm where the members are read.
191  void SetBoundingBoxComponents(bool include_upper_dots,
192  bool include_lower_dots) {
193  include_upper_dots_ = include_upper_dots;
194  include_lower_dots_ = include_lower_dots;
195  }
196 
    // Two BoundingBox overloads that differ only in the extra padding
    // argument. The box is delivered through the left/top/right/bottom
    // pointers; presumably the bool result reports whether a box was
    // available - confirm.
206  bool BoundingBox(PageIteratorLevel level,
207  int* left, int* top, int* right, int* bottom) const;
208  bool BoundingBox(PageIteratorLevel level, const int padding,
209  int* left, int* top, int* right, int* bottom) const;
    // Same signature as the unpadded BoundingBox overload. NOTE(review): how
    // its result differs (e.g. which coordinate space it uses) is not visible
    // in this listing - confirm.
215  bool BoundingBoxInternal(PageIteratorLevel level,
216  int* left, int* top, int* right, int* bottom) const;
217 
    // Const query; presumably true when there is nothing at the given level at
    // the current position - confirm.
219  bool Empty(PageIteratorLevel level) const;
220 
    // Returns a PolyBlockType, presumably for the block at the current
    // position - confirm.
225  PolyBlockType BlockType() const;
226 
    // Returns a Pta*. NOTE(review): ownership of the returned object and
    // whether the result can be null are not visible here - confirm.
234  Pta* BlockPolygon() const;
235 
    // Returns a Pix* for the given level. NOTE(review): ownership of the
    // returned image is not visible here - confirm who must release it.
242  Pix* GetBinaryImage(PageIteratorLevel level) const;
243 
    // Returns a Pix* and additionally writes through the left/top pointers;
    // takes a padding amount and a caller-supplied original_img.
    // NOTE(review): ownership of the result and the exact meaning of left/top
    // are not visible here - confirm.
255  Pix* GetImage(PageIteratorLevel level, int padding, Pix* original_img,
256  int* left, int* top) const;
257 
    // Delivers two points through (x1, y1) and (x2, y2); presumably the end
    // points of the baseline at the given level, with the bool result
    // reporting success - confirm.
264  bool Baseline(PageIteratorLevel level,
265  int* x1, int* y1, int* x2, int* y2) const;
266 
    // Reports four values through out-pointers and returns nothing.
    // NOTE(review): whether null pointers are tolerated is not visible here.
275  void Orientation(tesseract::Orientation *orientation,
276  tesseract::WritingDirection *writing_direction,
277  tesseract::TextlineOrder *textline_order,
278  float *deskew_angle) const;
279 
    // Reports four paragraph properties through out-pointers and returns
    // nothing. NOTE(review): units of first_line_indent and null-pointer
    // handling are not visible here - confirm.
308  void ParagraphInfo(tesseract::ParagraphJustification *justification,
309  bool *is_list_item,
310  bool *is_crown,
311  int *first_line_indent) const;
312 
313  // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
314  // of the current word to the given pointer (takes ownership of the pointer)
315  // and returns true.
316  // Can only be used when iterating on the word level.
317  bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
318 
319  protected:
    // Protected helper taking an int offset. NOTE(review): TESS_LOCAL is
    // defined outside this listing (presumably in platform.h) and likely
    // restricts symbol visibility - confirm.
324  TESS_LOCAL void BeginWord(int offset);
325 
    // Only two data members survive in this listing. Its cross-reference index
    // additionally names page_res_, tesseract_, it_, word_, word_length_,
    // blob_index_, include_upper_dots_, include_lower_dots_, scaled_yres_,
    // rect_left_, rect_top_, rect_width_ and rect_height_ as members declared
    // in this header.
349  C_BLOB_IT* cblob_it_;
354  int scale_;
360 };
361 
362 } // namespace tesseract.
363 
364 #endif // TESSERACT_CCMAIN_PAGEITERATOR_H_
WritingDirection
Definition: publictypes.h:135
bool include_upper_dots_
Definition: pageiterator.h:351
Definition: werd.h:59
C_BLOB_IT * cblob_it_
Definition: pageiterator.h:349
bool include_lower_dots_
Definition: pageiterator.h:352
int rect_width_
Definition: pageiterator.h:358
WERD * word_
Definition: pageiterator.h:339
PAGE_RES * page_res_
Definition: pageiterator.h:327
Definition: baseapi.cpp:94
int rect_top_
Definition: pageiterator.h:357
PageIteratorLevel
Definition: publictypes.h:219
void SetBoundingBoxComponents(bool include_upper_dots, bool include_lower_dots)
Definition: pageiterator.h:191
Definition: pageres.h:675
Definition: blamer.h:100
ParagraphJustification
Definition: publictypes.h:251
int rect_height_
Definition: pageiterator.h:359
int rect_left_
Definition: pageiterator.h:356
Orientation
Definition: publictypes.h:120
Tesseract * tesseract_
Definition: pageiterator.h:329
Definition: tesseractclass.h:173
PAGE_RES_IT * it_
Definition: pageiterator.h:334
Definition: pageres.h:77
int blob_index_
Definition: pageiterator.h:343
TextlineOrder
Definition: publictypes.h:152
int scaled_yres_
Definition: pageiterator.h:355
int word_length_
Definition: pageiterator.h:341
int scale_
Definition: pageiterator.h:354
PageIterator
Definition: pageiterator.h:52