tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
werd.h
1 /**********************************************************************
2  * File: werd.h
3  * Description: Code for the WERD class.
4  * Author: Ray Smith
5  * Created: Tue Oct 08 14:32:12 BST 1991
6  *
7  * (C) Copyright 1991, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifndef WERD_H
21 #define WERD_H
22 
23 #include "params.h"
24 #include "bits16.h"
25 #include "elst2.h"
26 #include "strngs.h"
27 #include "stepblob.h"
28 
/// Bit numbers of the per-word flags. Each enumerator is the bit index that
/// WERD::flag() / WERD::set_flag() pass on to the word's BITS16 flag set, so
/// the order (and therefore the numeric value) of the entries must not change.
enum WERD_FLAGS
{
  W_SEGMENTED,           ///< correctly segmented
  W_ITALIC,              ///< italic text
  W_BOLD,                ///< bold text
  W_BOL,                 ///< start of line
  W_EOL,                 ///< end of line
  W_NORMALIZED,          ///< flags
  W_SCRIPT_HAS_XHEIGHT,  ///< x-height concept makes sense.
  W_SCRIPT_IS_LATIN,     ///< Special case latin for y. splitting.
  W_DONT_CHOP,           ///< fixed pitch chopped
  W_REP_CHAR,            ///< repeated character
  W_FUZZY_SP,            ///< fuzzy space
  W_FUZZY_NON,           ///< fuzzy nonspace
  W_INVERSE              ///< white on black
};
45 
/// Display flags bit number allocations. Each enumerator is a bit index
/// accepted by WERD::display_flag() / WERD::set_display_flag(), so the order
/// of the entries must not change.
enum DISPLAY_FLAGS
{
  DF_BOX,           ///< Bounding box
  DF_TEXT,          ///< Correct ASCII text
  DF_POLYGONAL,     ///< Polygonal approximation
  DF_EDGE_STEP,     ///< Edge steps
  DF_BN_POLYGONAL,  ///< BL (presumably baseline) normalised polygonal approx
  DF_BLAMER         ///< Blamer information
};
56 
57 class ROW; //forward decl
58 
59 class WERD : public ELIST2_LINK {
60  public:
61  WERD() = default;
62  // WERD constructed with:
63  // blob_list - blobs of the word (we take this list's contents)
64  // blanks - number of blanks before the word
65  // text - correct text (outlives WERD)
66  WERD(C_BLOB_LIST *blob_list, uint8_t blanks, const char *text);
67 
68  // WERD constructed from:
69  // blob_list - blobs in the word
70  // clone - werd to clone flags, etc from.
71  WERD(C_BLOB_LIST *blob_list, WERD *clone);
72 
73  // Construct a WERD from a single_blob and clone the flags from this.
74  // W_BOL and W_EOL flags are set according to the given values.
75  WERD* ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob);
76 
77  ~WERD() = default;
78 
79  // assignment
80  WERD & operator= (const WERD &source);
81 
82  // This method returns a new werd constructed using the blobs in the input
83  // all_blobs list, which correspond to the blobs in this werd object. The
84  // blobs used to construct the new word are consumed and removed from the
85  // input all_blobs list.
86  // Returns nullptr if the word couldn't be constructed.
87  // Returns original blobs for which no matches were found in the output list
88  // orphan_blobs (appends).
89  WERD *ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs,
90  C_BLOB_LIST *orphan_blobs);
91 
92  // Accessors for reject / DUFF blobs in various formats
93  C_BLOB_LIST *rej_cblob_list() { // compact format
94  return &rej_cblobs;
95  }
96 
97  // Accessors for good blobs in various formats.
98  C_BLOB_LIST *cblob_list() { // get compact blobs
99  return &cblobs;
100  }
101 
102  uint8_t space() { // access function
103  return blanks;
104  }
105  void set_blanks(uint8_t new_blanks) {
106  blanks = new_blanks;
107  }
108  int script_id() const {
109  return script_id_;
110  }
111  void set_script_id(int id) {
112  script_id_ = id;
113  }
114 
115  // Returns the (default) bounding box including all the dots.
116  TBOX bounding_box() const; // compute bounding box
117  // Returns the bounding box including the desired combination of upper and
118  // lower noise/diacritic elements.
119  TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
120  // Returns the bounding box of only the good blobs.
121  TBOX true_bounding_box() const;
122 
123  const char *text() const { return correct.string(); }
124  void set_text(const char *new_text) { correct = new_text; }
125 
126  bool flag(WERD_FLAGS mask) const { return flags.bit(mask); }
127  void set_flag(WERD_FLAGS mask, bool value) { flags.set_bit(mask, value); }
128 
129  bool display_flag(uint8_t flag) const { return disp_flags.bit(flag); }
130  void set_display_flag(uint8_t flag, bool value) {
131  disp_flags.set_bit(flag, value);
132  }
133 
134  WERD *shallow_copy(); // shallow copy word
135 
136  // reposition word by vector
137  void move(const ICOORD vec);
138 
139  // join other's blobs onto this werd, emptying out other.
140  void join_on(WERD* other);
141 
142  // copy other's blobs onto this word, leaving other intact.
143  void copy_on(WERD* other);
144 
145  // tprintf word metadata (but not blob innards)
146  void print();
147 
148  #ifndef GRAPHICS_DISABLED
149  // plot word on window in a uniform colour
150  void plot(ScrollView *window, ScrollView::Color colour);
151 
152  // Get the next color in the (looping) rainbow.
154 
155  // plot word on window in a rainbow of colours
156  void plot(ScrollView *window);
157 
158  // plot rejected blobs in a rainbow of colours
159  void plot_rej_blobs(ScrollView *window);
160  #endif // GRAPHICS_DISABLED
161 
162  // Removes noise from the word by moving small outlines to the rej_cblobs
163  // list, based on the size_threshold.
164  void CleanNoise(float size_threshold);
165 
166  // Extracts all the noise outlines and stuffs the pointers into the given
167  // vector of outlines. Afterwards, the outlines vector owns the pointers.
169  // Adds the selected outlines to the indcated real blobs, and puts the rest
170  // back in rej_cblobs where they came from. Where the target_blobs entry is
171  // nullptr, a run of wanted outlines is put into a single new blob.
172  // Ownership of the outlines is transferred back to the word. (Hence
173  // GenericVector and not PointerVector.)
174  // Returns true if any new blob was added to the start of the word, which
175  // suggests that it might need joining to the word before it, and likewise
176  // sets make_next_word_fuzzy true if any new blob was added to the end.
177  bool AddSelectedOutlines(const GenericVector<bool> &wanted,
178  const GenericVector<C_BLOB *> &target_blobs,
179  const GenericVector<C_OUTLINE *> &outlines,
180  bool *make_next_word_fuzzy);
181 
182  private:
183  uint8_t blanks; // no of blanks
184  uint8_t dummy; // padding
185  BITS16 flags; // flags about word
186  BITS16 disp_flags; // display flags
187  int16_t script_id_; // From unicharset.
188  STRING correct; // correct text
189  C_BLOB_LIST cblobs; // compacted blobs
190  C_BLOB_LIST rej_cblobs; // DUFF blobs
191 };
192 
193 ELIST2IZEH (WERD)
194 #include "ocrrow.h" // placed here due to
// Comparator with the signature required by qsort(3): compares words by
// increasing order of left edge.
// word1p / word2p are the untyped element pointers handed in by qsort.
// NOTE(review): the concrete element type they point at is fixed by the
// implementation, which is not visible in this header - confirm before
// calling this directly.
int word_comparator(const void *word1p, const void *word2p);
197 #endif
static ScrollView::Color NextColor(ScrollView::Color colour)
Definition: werd.cpp:305
STRING correct
Definition: werd.h:188
void move(const ICOORD vec)
Definition: werd.cpp:197
uint8_t space()
Definition: werd.h:102
WERD
Definition: werd.h:59
void GetNoiseOutlines(GenericVector< C_OUTLINE *> *outlines)
Definition: werd.cpp:529
WERD & operator=(const WERD &source)
Definition: werd.cpp:368
const char * text() const
Definition: werd.h:123
Definition: bits16.h:25
void join_on(WERD *other)
Definition: werd.cpp:210
void set_blanks(uint8_t new_blanks)
Definition: werd.h:105
Color
Definition: scrollview.h:105
Definition: rect.h:34
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:93
bool display_flag(uint8_t flag) const
Definition: werd.h:129
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:136
C_BLOB_LIST rej_cblobs
Definition: werd.h:190
uint8_t blanks
Definition: werd.h:183
TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const
Definition: werd.cpp:163
int16_t script_id_
Definition: werd.h:187
void CleanNoise(float size_threshold)
Definition: werd.cpp:505
void set_display_flag(uint8_t flag, bool value)
Definition: werd.h:130
const char * string() const
Definition: strngs.cpp:196
bool bit(uint8_t bit_num) const
Definition: bits16.h:57
Definition: scrollview.h:102
void set_script_id(int id)
Definition: werd.h:111
void copy_on(WERD *other)
Definition: werd.cpp:233
WERD * shallow_copy()
Definition: werd.cpp:351
~WERD()=default
void plot(ScrollView *window, ScrollView::Color colour)
Definition: werd.cpp:296
Definition: baseapi.h:37
Definition: stepblob.h:37
BITS16 disp_flags
Definition: werd.h:186
C_BLOB_LIST cblobs
Definition: werd.h:189
Definition: strngs.h:45
uint8_t dummy
Definition: werd.h:184
void plot_rej_blobs(ScrollView *window)
Definition: werd.cpp:336
integer coordinate
Definition: points.h:32
Definition: ocrrow.h:36
C_BLOB_LIST * cblob_list()
Definition: werd.h:98
BITS16 flags
Definition: werd.h:185
int script_id() const
Definition: werd.h:108
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:127
void print()
Definition: werd.cpp:265
void set_bit(uint8_t bit_num, bool value)
Definition: bits16.h:48
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB *> &target_blobs, const GenericVector< C_OUTLINE *> &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:547
TBOX bounding_box() const
Definition: werd.cpp:159
WERD * ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs)
Definition: werd.cpp:411
WERD()=default
TBOX true_bounding_box() const
Definition: werd.cpp:180
bool flag(WERD_FLAGS mask) const
Definition: werd.h:126
void set_text(const char *new_text)
Definition: werd.h:124