tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
tesseract::ResultIterator Class Reference

#include <resultiterator.h>

Inheritance diagram for tesseract::ResultIterator:
Collaboration diagram for tesseract::ResultIterator:

Public Member Functions

virtual ~ResultIterator ()=default
 
virtual void Begin ()
 
virtual bool Next (PageIteratorLevel level)
 
virtual bool IsAtBeginningOf (PageIteratorLevel level) const
 
virtual bool IsAtFinalElement (PageIteratorLevel level, PageIteratorLevel element) const
 
int BlanksBeforeWord () const
 
virtual char * GetUTF8Text (PageIteratorLevel level) const
 
virtual std::vector< std::vector< std::pair< const char *, float > > > * GetBestLSTMSymbolChoices () const
 
bool ParagraphIsLtr () const
 
- Public Member Functions inherited from tesseract::LTRResultIterator
 LTRResultIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
 
virtual ~LTRResultIterator ()
 
char * GetUTF8Text (PageIteratorLevel level) const
 
void SetLineSeparator (const char *new_line)
 
void SetParagraphSeparator (const char *new_para)
 
float Confidence (PageIteratorLevel level) const
 
void RowAttributes (float *row_height, float *descenders, float *ascenders) const
 
const char * WordFontAttributes (bool *is_bold, bool *is_italic, bool *is_underlined, bool *is_monospace, bool *is_serif, bool *is_smallcaps, int *pointsize, int *font_id) const
 
const char * WordRecognitionLanguage () const
 
StrongScriptDirection WordDirection () const
 
bool WordIsFromDictionary () const
 
int BlanksBeforeWord () const
 
bool WordIsNumeric () const
 
bool HasBlamerInfo () const
 
const void * GetParamsTrainingBundle () const
 
const char * GetBlamerDebug () const
 
const char * GetBlamerMisadaptionDebug () const
 
bool HasTruthString () const
 
bool EquivalentToTruth (const char *str) const
 
char * WordTruthUTF8Text () const
 
char * WordNormedUTF8Text () const
 
const char * WordLattice (int *lattice_size) const
 
bool SymbolIsSuperscript () const
 
bool SymbolIsSubscript () const
 
bool SymbolIsDropcap () const
 
- Public Member Functions inherited from tesseract::PageIterator
 PageIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
 
virtual ~PageIterator ()
 
 PageIterator (const PageIterator &src)
 
const PageIterator & operator= (const PageIterator &src)
 
bool PositionedAtSameWord (const PAGE_RES_IT *other) const
 
virtual void RestartParagraph ()
 
bool IsWithinFirstTextlineOfParagraph () const
 
virtual void RestartRow ()
 
int Cmp (const PageIterator &other) const
 
void SetBoundingBoxComponents (bool include_upper_dots, bool include_lower_dots)
 
bool BoundingBox (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
 
bool BoundingBox (PageIteratorLevel level, const int padding, int *left, int *top, int *right, int *bottom) const
 
bool BoundingBoxInternal (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
 
bool Empty (PageIteratorLevel level) const
 
PolyBlockType BlockType () const
 
Pta * BlockPolygon () const
 
Pix * GetBinaryImage (PageIteratorLevel level) const
 
Pix * GetImage (PageIteratorLevel level, int padding, Pix *original_img, int *left, int *top) const
 
bool Baseline (PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const
 
void Orientation (tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const
 
void ParagraphInfo (tesseract::ParagraphJustification *justification, bool *is_list_item, bool *is_crown, int *first_line_indent) const
 
bool SetWordBlamerBundle (BlamerBundle *blamer_bundle)
 

Static Public Member Functions

static ResultIterator * StartOfParagraph (const LTRResultIterator &resit)
 
static void CalculateTextlineOrder (bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)
 

Static Public Attributes

static const int kMinorRunStart = -1
 
static const int kMinorRunEnd = -2
 
static const int kComplexWord = -3
 

Protected Member Functions

TESS_LOCAL ResultIterator (const LTRResultIterator &resit)
 
- Protected Member Functions inherited from tesseract::PageIterator
TESS_LOCAL void BeginWord (int offset)
 

Private Member Functions

bool CurrentParagraphIsLtr () const
 
void CalculateTextlineOrder (bool paragraph_is_ltr, const LTRResultIterator &resit, GenericVectorEqEq< int > *indices) const
 
void CalculateTextlineOrder (bool paragraph_is_ltr, const LTRResultIterator &resit, GenericVector< StrongScriptDirection > *ssd, GenericVectorEqEq< int > *indices) const
 
int LTRWordIndex () const
 
void CalculateBlobOrder (GenericVector< int > *blob_indices) const
 
void MoveToLogicalStartOfTextline ()
 
void MoveToLogicalStartOfWord ()
 
bool IsAtFinalSymbolOfWord () const
 
bool IsAtFirstSymbolOfWord () const
 
void AppendSuffixMarks (STRING *text) const
 
void AppendUTF8WordText (STRING *text) const
 
void IterateAndAppendUTF8TextlineText (STRING *text)
 
void AppendUTF8ParagraphText (STRING *text) const
 
bool BidiDebug (int min_level) const
 

Private Attributes

bool current_paragraph_is_ltr_
 
bool at_beginning_of_minor_run_
 
bool in_minor_direction_
 
bool preserve_interword_spaces_
 

Additional Inherited Members

- Protected Attributes inherited from tesseract::LTRResultIterator
const char * line_separator_
 
const char * paragraph_separator_
 
- Protected Attributes inherited from tesseract::PageIterator
PAGE_RES * page_res_
 
Tesseract * tesseract_
 
PAGE_RES_IT * it_
 
WERD * word_
 
int word_length_
 
int blob_index_
 
C_BLOB_IT * cblob_it_
 
bool include_upper_dots_
 
bool include_lower_dots_
 
int scale_
 
int scaled_yres_
 
int rect_left_
 
int rect_top_
 
int rect_width_
 
int rect_height_
 

Constructor & Destructor Documentation

◆ ~ResultIterator()

virtual tesseract::ResultIterator::~ResultIterator ( )
virtual default

ResultIterator is copy constructible! The default copy constructor works just fine for us.

◆ ResultIterator()

tesseract::ResultIterator::ResultIterator ( const LTRResultIterator &  resit)
explicit protected

We presume the data associated with the given iterator will outlive us. NB: This is protected because it does something that is non-obvious: it resets to the beginning of the paragraph instead of staying wherever resit might have pointed.

Member Function Documentation

◆ AppendSuffixMarks()

void tesseract::ResultIterator::AppendSuffixMarks ( STRING *  text) const
private

Append any extra marks that should be appended to this word when printed. Mostly, these are Unicode BiDi control characters.

◆ AppendUTF8ParagraphText()

void tesseract::ResultIterator::AppendUTF8ParagraphText ( STRING *  text) const
private

Appends the text of the current paragraph in reading order to the given buffer. Each textline is terminated in a single newline character, and the paragraph gets an extra newline at the end.

◆ AppendUTF8WordText()

void tesseract::ResultIterator::AppendUTF8WordText ( STRING *  text) const
private

Appends the current word in reading order to the given buffer.

◆ Begin()

void tesseract::ResultIterator::Begin ( )
virtual

Moves the iterator to point to the start of the page to begin an iteration.

Reimplemented from tesseract::PageIterator.

◆ BidiDebug()

bool tesseract::ResultIterator::BidiDebug ( int  min_level) const
private

Returns whether the bidi_debug flag is set to at least min_level.

◆ BlanksBeforeWord()

int tesseract::ResultIterator::BlanksBeforeWord ( ) const

◆ CalculateBlobOrder()

void tesseract::ResultIterator::CalculateBlobOrder ( GenericVector< int > *  blob_indices) const
private

Given an iterator pointing at a word, returns the logical reading order of blob indices for the word.

◆ CalculateTextlineOrder() [1/3]

void tesseract::ResultIterator::CalculateTextlineOrder ( bool  paragraph_is_ltr,
const GenericVector< StrongScriptDirection > &  word_dirs,
GenericVectorEqEq< int > *  reading_order 
)
static

Yields the reading order as a sequence of indices and (optional) meta-marks for a set of words (given left-to-right). The meta marks are passed as negative values: kMinorRunStart (start of minor direction text), kMinorRunEnd (end of minor direction text), and kComplexWord (the next indexed word contains both left-to-right and right-to-left characters and was treated as neutral).

For example, suppose we have five words in a text line, indexed [0,1,2,3,4] from the leftmost side of the text line. The following are all believable reading_orders:

- Left-to-Right (in ltr paragraph): { 0, 1, 2, 3, 4 }
- Left-to-Right (in rtl paragraph): { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
- Right-to-Left (in rtl paragraph): { 4, 3, 2, 1, 0 }
- Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }

◆ CalculateTextlineOrder() [2/3]

void tesseract::ResultIterator::CalculateTextlineOrder ( bool  paragraph_is_ltr,
const LTRResultIterator &  resit,
GenericVectorEqEq< int > *  indices 
) const
private

Returns word indices as measured from resit->RestartRow() = index 0 for the reading order of words within a textline given an iterator into the middle of the text line. In addition to non-negative word indices, the following negative values may be inserted: kMinorRunStart (start of minor direction text), kMinorRunEnd (end of minor direction text), and kComplexWord (the previous word contains both left-to-right and right-to-left characters and was treated as neutral).

◆ CalculateTextlineOrder() [3/3]

void tesseract::ResultIterator::CalculateTextlineOrder ( bool  paragraph_is_ltr,
const LTRResultIterator &  resit,
GenericVector< StrongScriptDirection > *  ssd,
GenericVectorEqEq< int > *  indices 
) const
private

Same as above, but the caller's ssd gets filled in if ssd != nullptr.

◆ CurrentParagraphIsLtr()

bool tesseract::ResultIterator::CurrentParagraphIsLtr ( ) const
private

Calculates the current paragraph's dominant writing direction. Typically, members should use current_paragraph_is_ltr_ instead.

◆ GetBestLSTMSymbolChoices()

std::vector< std::vector< std::pair< const char *, float > > > * tesseract::ResultIterator::GetBestLSTMSymbolChoices ( ) const
virtual

Returns the LSTM choices for every LSTM timestep for the current word.

◆ GetUTF8Text()

char * tesseract::ResultIterator::GetUTF8Text ( PageIteratorLevel  level) const
virtual

Returns the null terminated UTF-8 encoded text string for the current object at the given level. Use delete [] to free after use.

◆ IsAtBeginningOf()

bool tesseract::ResultIterator::IsAtBeginningOf ( PageIteratorLevel  level) const
virtual

IsAtBeginningOf() returns whether we're at the logical beginning of the given level (as opposed to PageIterator's left-to-right top-to-bottom order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). For a full description, see pageiterator.h.

Reimplemented from tesseract::PageIterator.

◆ IsAtFinalElement()

bool tesseract::ResultIterator::IsAtFinalElement ( PageIteratorLevel  level,
PageIteratorLevel  element 
) const
virtual

Implement PageIterator's IsAtFinalElement correctly in a BiDi context. For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we point at the last word in a paragraph. See PageIterator for full comment.

NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the change that the variable next is now a ResultIterator instead of a PageIterator.

Reimplemented from tesseract::PageIterator.

◆ IsAtFinalSymbolOfWord()

bool tesseract::ResultIterator::IsAtFinalSymbolOfWord ( ) const
private

Are we pointing at the final (reading order) symbol of the word?

◆ IsAtFirstSymbolOfWord()

bool tesseract::ResultIterator::IsAtFirstSymbolOfWord ( ) const
private

Are we pointing at the first (reading order) symbol of the word?

◆ IterateAndAppendUTF8TextlineText()

void tesseract::ResultIterator::IterateAndAppendUTF8TextlineText ( STRING *  text)
private

Appends the text of the current text line, assuming this iterator is positioned at the beginning of the text line. This function updates the iterator to point to the first position past the text line. Each textline is terminated in a single newline character. If the textline ends a paragraph, it gets a second terminal newline.

◆ LTRWordIndex()

int tesseract::ResultIterator::LTRWordIndex ( ) const
private

What is the index of the current word in a strict left-to-right reading of the row?

◆ MoveToLogicalStartOfTextline()

void tesseract::ResultIterator::MoveToLogicalStartOfTextline ( )
private

Precondition: current_paragraph_is_ltr_ is set.

◆ MoveToLogicalStartOfWord()

void tesseract::ResultIterator::MoveToLogicalStartOfWord ( )
private

Precondition: current_paragraph_is_ltr_ and in_minor_direction_ are set.

◆ Next()

bool tesseract::ResultIterator::Next ( PageIteratorLevel  level)
virtual

Moves to the start of the next object at the given level in the page hierarchy in the appropriate reading order and returns false if the end of the page was reached. NOTE that RIL_SYMBOL will skip non-text blocks, but all other PageIteratorLevel level values will visit each non-text block once. Think of non-text blocks as containing a single para, with a single line, with a single imaginary word. Calls to Next with different levels may be freely intermixed. This function iterates words in right-to-left scripts correctly, if the appropriate language has been loaded into Tesseract.

Reimplemented from tesseract::PageIterator.

◆ ParagraphIsLtr()

bool tesseract::ResultIterator::ParagraphIsLtr ( ) const

Return whether the current paragraph's dominant reading direction is left-to-right (as opposed to right-to-left).

◆ StartOfParagraph()

ResultIterator * tesseract::ResultIterator::StartOfParagraph ( const LTRResultIterator &  resit)
static

Member Data Documentation

◆ at_beginning_of_minor_run_

bool tesseract::ResultIterator::at_beginning_of_minor_run_
private

Is the currently pointed-at character at the beginning of a minor-direction run?

◆ current_paragraph_is_ltr_

bool tesseract::ResultIterator::current_paragraph_is_ltr_
private

◆ in_minor_direction_

bool tesseract::ResultIterator::in_minor_direction_
private

Is the currently pointed-at character in a minor-direction sequence?

◆ kComplexWord

const int tesseract::ResultIterator::kComplexWord = -3
static

◆ kMinorRunEnd

const int tesseract::ResultIterator::kMinorRunEnd = -2
static

◆ kMinorRunStart

const int tesseract::ResultIterator::kMinorRunStart = -1
static

◆ preserve_interword_spaces_

bool tesseract::ResultIterator::preserve_interword_spaces_
private

Should detected inter-word spaces be preserved, or "compressed" to a single space character (default behavior).


The documentation for this class was generated from the following files: