tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
recodebeam.h
1 // File: recodebeam.h
3 // Description: Beam search to decode from the re-encoded CJK as a sequence of
4 // smaller numbers in place of a single large code.
5 // Author: Ray Smith
6 // Created: Fri Mar 13 09:12:01 PDT 2015
7 //
8 // (C) Copyright 2015, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifndef THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_
22 #define THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_
23 
24 #include "dawg.h"
25 #include "dict.h"
26 #include "genericheap.h"
27 #include "kdpair.h"
28 #include "networkio.h"
29 #include "ratngs.h"
30 #include "unicharcompress.h"
31 #include <deque>
32 #include <set>
33 #include <vector>
34 
35 namespace tesseract {
36 
37 // Enum describing what can follow the current node.
38 // Consider the following softmax outputs:
39 // Timestep 0 1 2 3 4 5 6 7 8
40 // X-score 0.01 0.55 0.98 0.42 0.01 0.01 0.40 0.95 0.01
41 // Y-score 0.00 0.01 0.01 0.01 0.01 0.97 0.59 0.04 0.01
42 // Null-score 0.99 0.44 0.01 0.57 0.98 0.02 0.01 0.01 0.98
43 // Then the correct CTC decoding (in which adjacent equal classes are folded,
44 // and then all nulls are dropped) is clearly XYX, but simple decoding (taking
45 // the max at each timestep) leads to:
46 // Null@0.99 X@0.55 X@0.98 Null@0.57 Null@0.98 Y@0.97 Y@0.59 X@0.95 Null@0.98,
47 // which folds to the correct XYX. The conversion to Tesseract rating and
48 // certainty uses the sum of the log probs (log of the product of probabilities)
49 // for the Rating and the minimum log prob for the certainty, but that yields a
50 // minimum certainty of log(0.55), which is poor for such an obvious case.
51 // CTC says that the probability of the result is the SUM of the products of the
52 // probabilities over ALL PATHS that decode to the same result, which includes:
53 // NXXNNYYXN, NNXNNYYN, NXXXNYYXN, NNXXNYXXN, and others including XXXXXYYXX.
54 // That is intractable, so some compromise between simple and ideal is needed.
55 // Observing that evenly split timesteps rarely happen next to each other, we
56 // allow scores at a transition between classes to be added for decoding thus:
57 // N@0.99 (N+X)@0.99 X@0.98 (N+X)@0.99 N@0.98 Y@0.97 (X+Y+N)@1.00 X@0.95 N@0.98.
58 // This works because NNX and NXX both decode to X, so in the middle we can use
59 // N+X. Note that the classes either side of a sum must stand alone, i.e. use a
60 // single score, to force all paths to pass through them and decode to the same
61 // result. Also in the special case of a transition from X to Y, with only one
62 // timestep between, it is possible to add X+Y+N, since XXY, XYY, and XNY all
63 // decode to XY.
64 // An important condition is that we cannot combine X and Null between two
65 // stand-alone Xs, since that can decode as XNX->XX or XXX->X, so the scores for
66 // X and Null have to go in separate paths. Combining scores in this way
67 // provides a much better minimum certainty of log(0.95).
68 // In the implementation of the beam search, we have to place the possibilities
69 // X, X+N and X+Y+N in the beam under appropriate conditions of the previous
70 // node, and constrain what can follow, to enforce the rules explained above.
71 // We therefore have 3 different types of node determined by what can follow:
73  NC_ANYTHING, // This node used just its own score, so anything can follow.
74  NC_ONLY_DUP, // The current node combined another score with the score for
75  // itself, without a stand-alone duplicate before, so must be
76  // followed by a stand-alone duplicate.
77  NC_NO_DUP, // The current node combined another score with the score for
78  // itself, after a stand-alone, so can only be followed by
79  // something other than a duplicate of the current node.
81 };
82 
83 // Enum describing the top-n status of a code.
84 enum TopNState {
85  TN_TOP2, // Winner or 2nd.
86  TN_TOPN, // Runner up in top-n, but not 1st or 2nd.
87  TN_ALSO_RAN, // Not in the top-n.
89 };
90 
91 // Lattice element for Re-encode beam search.
92 struct RecodeNode {
94  : code(-1),
95  unichar_id(INVALID_UNICHAR_ID),
96  permuter(TOP_CHOICE_PERM),
97  start_of_dawg(false),
98  start_of_word(false),
99  end_of_word(false),
100  duplicate(false),
101  certainty(0.0f),
102  score(0.0f),
103  prev(nullptr),
104  dawgs(nullptr),
105  code_hash(0) {}
106  RecodeNode(int c, int uni_id, PermuterType perm, bool dawg_start,
107  bool word_start, bool end, bool dup, float cert, float s,
108  const RecodeNode* p, DawgPositionVector* d, uint64_t hash)
109  : code(c),
110  unichar_id(uni_id),
111  permuter(perm),
112  start_of_dawg(dawg_start),
113  start_of_word(word_start),
114  end_of_word(end),
115  duplicate(dup),
116  certainty(cert),
117  score(s),
118  prev(p),
119  dawgs(d),
120  code_hash(hash) {}
121  // NOTE: If we could use C++11, then this would be a move constructor.
122  // Instead we have copy constructor that does a move!! This is because we
123  // don't want to copy the whole DawgPositionVector each time, and true
124  // copying isn't necessary for this struct. It does get moved around a lot
125  // though inside the heap and during heap push, hence the move semantics.
126  RecodeNode(RecodeNode& src) : dawgs(nullptr) {
127  *this = src;
128  ASSERT_HOST(src.dawgs == nullptr);
129  }
131  delete dawgs;
132  memcpy(this, &src, sizeof(src));
133  src.dawgs = nullptr;
134  return *this;
135  }
136  ~RecodeNode() { delete dawgs; }
137  // Prints details of the node.
138  void Print(int null_char, const UNICHARSET& unicharset, int depth) const;
139 
140  // The re-encoded code here = index to network output.
141  int code;
142  // The decoded unichar_id is only valid for the final code of a sequence.
144  // The type of permuter active at this point. Intervals between start_of_word
145  // and end_of_word make valid words of type given by permuter where
146  // end_of_word is true. These aren't necessarily delimited by spaces.
147  PermuterType permuter;
148  // True if this is the initial dawg state. May be attached to a space or,
149  // in a non-space-delimited lang, the end of the previous word.
151  // True if this is the first node in a dictionary word.
153  // True if this represents a valid candidate end of word position. Does not
154  // necessarily mark the end of a word, since a word can be extended beyond a
155  // candidate end by a continuation, eg 'the' continues to 'these'.
157  // True if this->code is a duplicate of prev->code. Some training modes
158  // allow the network to output duplicate characters and crush them with CTC,
159  // but that would mess up the dictionary search, so we just smash them
160  // together on the fly using the duplicate flag.
161  bool duplicate;
162  // Certainty (log prob) of (just) this position.
163  float certainty;
164  // Total certainty of the path to this position.
165  float score;
166  // The previous node in this chain. Borrowed pointer.
167  const RecodeNode* prev;
168  // The currently active dawgs at this position. Owned pointer.
170  // A hash of all codes in the prefix and this->code as well. Used for
171  // duplicate path removal.
172  uint64_t code_hash;
173 };
174 
177 
178 // Class that holds the entire beam search for recognition of a text line.
180  public:
181  // Borrows the pointer, which is expected to survive until *this is deleted.
182  RecodeBeamSearch(const UnicharCompress& recoder, int null_char,
183  bool simple_text, Dict* dict);
184 
185  // Decodes the set of network outputs, storing the lattice internally.
186  // If charset is not null, it enables detailed debugging of the beam search.
187  void Decode(const NetworkIO& output, double dict_ratio, double cert_offset,
188  double worst_dict_cert, const UNICHARSET* charset,
189  int lstm_choice_mode = 0);
190  void Decode(const GENERIC_2D_ARRAY<float>& output, double dict_ratio,
191  double cert_offset, double worst_dict_cert,
192  const UNICHARSET* charset);
193 
194  // Returns the best path as labels/scores/xcoords similar to simple CTC.
195  void ExtractBestPathAsLabels(GenericVector<int>* labels,
196  GenericVector<int>* xcoords) const;
197  // Returns the best path as unichar-ids/certs/ratings/xcoords skipping
198  // duplicates, nulls and intermediate parts.
199  void ExtractBestPathAsUnicharIds(bool debug, const UNICHARSET* unicharset,
200  GenericVector<int>* unichar_ids,
201  GenericVector<float>* certs,
202  GenericVector<float>* ratings,
203  GenericVector<int>* xcoords) const;
204 
205  // Returns the best path as a set of WERD_RES.
206  void ExtractBestPathAsWords(const TBOX& line_box, float scale_factor,
207  bool debug, const UNICHARSET* unicharset,
209  int lstm_choice_mode = 0);
210 
211  // Generates debug output of the content of the beams after a Decode.
212  void DebugBeams(const UNICHARSET& unicharset) const;
213 
214  // Stores the alternative characters of every timestep together with their
215  // probability.
216  std::vector< std::vector<std::pair<const char*, float>>> timesteps;
217 
218  // Clipping value for certainty inside Tesseract. Reflects the minimum value
219  // of certainty that will be returned by ExtractBestPathAsUnicharIds.
220  // Supposedly on a uniform scale that can be compared across languages and
221  // engines.
222  static const float kMinCertainty;
223  // Number of different code lengths for which we have a separate beam.
224  static const int kNumLengths = RecodedCharID::kMaxCodeLen + 1;
225  // Total number of beams: dawg/nodawg * number of NodeContinuation * number
226  // of different lengths.
227  static const int kNumBeams = 2 * NC_COUNT * kNumLengths;
228  // Returns the relevant factor in the beams_ index.
229  static int LengthFromBeamsIndex(int index) { return index % kNumLengths; }
231  return static_cast<NodeContinuation>((index / kNumLengths) % NC_COUNT);
232  }
233  static bool IsDawgFromBeamsIndex(int index) {
234  return index / (kNumLengths * NC_COUNT) > 0;
235  }
236  // Computes a beams_ index from the given factors.
237  static int BeamIndex(bool is_dawg, NodeContinuation cont, int length) {
238  return (is_dawg * NC_COUNT + cont) * kNumLengths + length;
239  }
240 
241  private:
242  // Struct for the Re-encode beam search. This struct holds the data for
243  // a single time-step position of the output. Use a PointerVector<RecodeBeam>
244  // to hold all the timesteps and prevent reallocation of the individual heaps.
245  struct RecodeBeam {
246  // Resets to the initial state without deleting all the memory.
247  void Clear() {
248  for (int i = 0; i < kNumBeams; ++i) {
249  beams_[i].clear();
250  }
251  RecodeNode empty;
252  for (int i = 0; i < NC_COUNT; ++i) {
253  best_initial_dawgs_[i] = empty;
254  }
255  }
256 
257  // A separate beam for each combination of code length,
258  // NodeContinuation, and dictionary flag. Separating out all these types
259  // allows the beam to be quite narrow, and yet still have a low chance of
260  // losing the best path.
261  // We have to keep all these beams separate, since the highest scoring paths
262  // come from the paths that are most likely to dead-end at any time, like
263  // dawg paths, NC_ONLY_DUP etc.
264  // Each heap is stored with the WORST result at the top, so we can quickly
265  // get the top-n values.
266  RecodeHeap beams_[kNumBeams];
267  // While the language model is only a single word dictionary, we can use
268  // word starts as a choke point in the beam, and keep only a single dict
269  // start node at each step (for each NodeContinuation type), so we find the
270  // best one here and push it on the heap, if it qualifies, after processing
271  // all of the step.
272  RecodeNode best_initial_dawgs_[NC_COUNT];
273  };
275 
276  // Generates debug output of the content of a single beam position.
277  void DebugBeamPos(const UNICHARSET& unicharset, const RecodeHeap& heap) const;
278 
279  // Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping
280  // duplicates, nulls and intermediate parts.
281  static void ExtractPathAsUnicharIds(
282  const GenericVector<const RecodeNode*>& best_nodes,
283  GenericVector<int>* unichar_ids, GenericVector<float>* certs,
284  GenericVector<float>* ratings, GenericVector<int>* xcoords,
285  std::deque<std::pair<int,int>>* best_choices = nullptr);
286 
287  // Sets up a word with the ratings matrix and fake blobs with boxes in the
288  // right places.
289  WERD_RES* InitializeWord(bool leading_space, const TBOX& line_box,
290  int word_start, int word_end, float space_certainty,
291  const UNICHARSET* unicharset,
292  const GenericVector<int>& xcoords,
293  float scale_factor);
294 
295  // Fills top_n_flags_ with bools that are true iff the corresponding output
296  // is one of the top_n.
297  void ComputeTopN(const float* outputs, int num_outputs, int top_n);
298 
299  // Adds the computation for the current time-step to the beam. Call at each
300  // time-step in sequence from left to right. outputs is the activation vector
301  // for the current timestep.
302  void DecodeStep(const float* outputs, int t, double dict_ratio,
303  double cert_offset, double worst_dict_cert,
304  const UNICHARSET* charset, bool debug = false);
305 
306  //Saves the most certain choices for the current time-step
307  void SaveMostCertainChoices(const float* outputs, int num_outputs, const UNICHARSET* charset, int xCoord);
308 
309  // Adds to the appropriate beams the legal (according to recoder)
310  // continuations of context prev, which is from the given index to beams_,
311  // using the given network outputs to provide scores to the choices. Uses only
312  // those choices for which top_n_flags[code] == top_n_flag.
313  void ContinueContext(const RecodeNode* prev, int index, const float* outputs,
314  TopNState top_n_flag, double dict_ratio,
315  double cert_offset, double worst_dict_cert,
316  RecodeBeam* step);
317  // Continues for a new unichar, using dawg or non-dawg as per flag.
318  void ContinueUnichar(int code, int unichar_id, float cert,
319  float worst_dict_cert, float dict_ratio, bool use_dawgs,
320  NodeContinuation cont, const RecodeNode* prev,
321  RecodeBeam* step);
322  // Adds a RecodeNode composed of the args to the correct heap in step if
323  // unichar_id is a valid dictionary continuation of whatever is in prev.
324  void ContinueDawg(int code, int unichar_id, float cert, NodeContinuation cont,
325  const RecodeNode* prev, RecodeBeam* step);
326  // Sets the correct best_initial_dawgs_ with a RecodeNode composed of the args
327  // if better than what is already there.
328  void PushInitialDawgIfBetter(int code, int unichar_id, PermuterType permuter,
329  bool start, bool end, float cert,
330  NodeContinuation cont, const RecodeNode* prev,
331  RecodeBeam* step);
332  // Adds a RecodeNode composed of the args to the correct heap in step for
333  // partial unichar or duplicate if there is room or if better than the
334  // current worst element if already full.
335  void PushDupOrNoDawgIfBetter(int length, bool dup, int code, int unichar_id,
336  float cert, float worst_dict_cert,
337  float dict_ratio, bool use_dawgs,
338  NodeContinuation cont, const RecodeNode* prev,
339  RecodeBeam* step);
340  // Adds a RecodeNode composed of the args to the correct heap in step if there
341  // is room or if better than the current worst element if already full.
342  void PushHeapIfBetter(int max_size, int code, int unichar_id,
343  PermuterType permuter, bool dawg_start, bool word_start,
344  bool end, bool dup, float cert, const RecodeNode* prev,
345  DawgPositionVector* d, RecodeHeap* heap);
346  // Adds a RecodeNode to heap if there is room
347  // or if better than the current worst element if already full.
348  void PushHeapIfBetter(int max_size, RecodeNode* node, RecodeHeap* heap);
349  // Searches the heap for an entry matching new_node, and updates the entry
350  // with reshuffle if needed. Returns true if there was a match.
351  bool UpdateHeapIfMatched(RecodeNode* new_node, RecodeHeap* heap);
352  // Computes and returns the code-hash for the given code and prev.
353  uint64_t ComputeCodeHash(int code, bool dup, const RecodeNode* prev) const;
354  // Backtracks to extract the best path through the lattice that was built
355  // during Decode. On return the best_nodes vector essentially contains the set
356  // of code, score pairs that make the optimal path with the constraint that
357  // the recoder can decode the code sequence back to a sequence of unichar-ids.
358  void ExtractBestPaths(GenericVector<const RecodeNode*>* best_nodes,
359  GenericVector<const RecodeNode*>* second_nodes) const;
360  // Helper backtracks through the lattice from the given node, storing the
361  // path and reversing it.
362  void ExtractPath(const RecodeNode* node,
364  // Helper prints debug information on the given lattice path.
365  void DebugPath(const UNICHARSET* unicharset,
366  const GenericVector<const RecodeNode*>& path) const;
367  // Helper prints debug information on the given unichar path.
368  void DebugUnicharPath(const UNICHARSET* unicharset,
370  const GenericVector<int>& unichar_ids,
371  const GenericVector<float>& certs,
372  const GenericVector<float>& ratings,
373  const GenericVector<int>& xcoords) const;
374 
375  static const int kBeamWidths[RecodedCharID::kMaxCodeLen + 1];
376 
377  // The encoder/decoder that we will be using.
379  // The beam for each timestep in the output.
381  // The number of timesteps valid in beam_;
383  // A flag to indicate which outputs are the top-n choices. Current timestep
384  // only.
386  // A record of the highest and second scoring codes.
389  // Heap used to compute the top_n_flags_.
391  // Borrowed pointer to the dictionary to use in the search.
393  // True if the language is space-delimited, which is true for most languages
394  // except chi*, jpn, tha.
396  // True if the input is simple text, ie adjacent equal chars are not to be
397  // eliminated.
399  // The encoded (class label) of the null/reject character.
401 };
402 
403 } // namespace tesseract.
404 
405 #endif // THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_
Definition: recodebeam.h:87
RecodeNode & operator=(RecodeNode &src)
Definition: recodebeam.h:130
int beam_size_
Definition: recodebeam.h:382
float score
Definition: recodebeam.h:165
bool start_of_dawg
Definition: recodebeam.h:150
RecodeNode(int c, int uni_id, PermuterType perm, bool dawg_start, bool word_start, bool end, bool dup, float cert, float s, const RecodeNode *p, DawgPositionVector *d, uint64_t hash)
Definition: recodebeam.h:106
int second_code_
Definition: recodebeam.h:388
int code
Definition: recodebeam.h:141
int top_code_
Definition: recodebeam.h:387
bool is_simple_text_
Definition: recodebeam.h:398
void Clear()
Definition: recodebeam.h:247
PermuterType permuter
Definition: recodebeam.h:147
Definition: kdpair.h:51
Definition: rect.h:34
Definition: unicharset.h:146
DawgPositionVector * dawgs
Definition: recodebeam.h:169
static int BeamIndex(bool is_dawg, NodeContinuation cont, int length)
Definition: recodebeam.h:237
RecodeNode(RecodeNode &src)
Definition: recodebeam.h:126
Definition: recodebeam.h:179
GenericVector< TopNState > top_n_flags_
Definition: recodebeam.h:385
NodeContinuation
Definition: recodebeam.h:72
Definition: baseapi.cpp:94
Definition: recodebeam.h:88
Definition: recodebeam.h:92
Dict * dict_
Definition: recodebeam.h:392
Definition: recodebeam.h:80
void Print(int null_char, const UNICHARSET &unicharset, int depth) const
Definition: recodebeam.cpp:48
Definition: unicharcompress.h:128
std::vector< std::vector< std::pair< const char *, float > > > timesteps
Definition: recodebeam.h:216
Definition: recodebeam.h:77
TopNState
Definition: recodebeam.h:84
GenericHeap< TopPair > top_heap_
Definition: recodebeam.h:390
Definition: dict.h:88
Definition: recodebeam.h:245
Definition: dawg.h:381
int null_char_
Definition: recodebeam.h:400
bool duplicate
Definition: recodebeam.h:161
static int LengthFromBeamsIndex(int index)
Definition: recodebeam.h:229
uint64_t code_hash
Definition: recodebeam.h:172
static const int kMaxCodeLen
Definition: unicharcompress.h:37
bool start_of_word
Definition: recodebeam.h:152
bool space_delimited_
Definition: recodebeam.h:395
int unichar_id
Definition: recodebeam.h:143
~RecodeNode()
Definition: recodebeam.h:136
const RecodeNode * prev
Definition: recodebeam.h:167
Definition: pageres.h:169
Definition: recodebeam.h:73
Definition: recodebeam.h:74
Definition: recodebeam.h:85
Definition: networkio.h:39
static const float kMinCertainty
Definition: recodebeam.h:222
Definition: recodebeam.h:86
RecodeNode()
Definition: recodebeam.h:93
static bool IsDawgFromBeamsIndex(int index)
Definition: recodebeam.h:233
const UnicharCompress & recoder_
Definition: recodebeam.h:378
static NodeContinuation ContinuationFromBeamsIndex(int index)
Definition: recodebeam.h:230
bool end_of_word
Definition: recodebeam.h:156
PointerVector< RecodeBeam > beam_
Definition: recodebeam.h:380
float certainty
Definition: recodebeam.h:163