tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
validator.h
1 /**********************************************************************
2  * File: validator.h
3  * Description: Base class for various text validators. Intended mainly for
4  * scripts that use a virama character.
5  * Author: Ray Smith
6  * Created: Tue May 23 2017
7  *
8  * (C) Copyright 2017, Google Inc.
9  * Licensed under the Apache License, Version 2.0 (the "License");
10  * you may not use this file except in compliance with the License.
11  * You may obtain a copy of the License at
12  * http://www.apache.org/licenses/LICENSE-2.0
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  **********************************************************************/
20 
21 #ifndef TESSERACT_TRAINING_VALIDATOR_H_
22 #define TESSERACT_TRAINING_VALIDATOR_H_
23 
24 #include <memory>
25 #include <vector>
26 #include "unichar.h"
27 
28 namespace tesseract {
29 
// Different kinds of grapheme normalization - not just for Indic!
// A grapheme is a syllable unit in Indic and can be several unicodes.
// In other scripts, a grapheme is a base character and accent/diacritic
// combination, as not all accented characters have a single composed form.
//
// The mode selects how finely validated text is split into output units,
// from coarsest (kSingleString) to finest (kIndividualUnicodes).
enum class GraphemeNormMode {
  // Validation result is a single string, even if input is multi-word.
  kSingleString,
  // Standard unicode graphemes are validated and output as grapheme units.
  kCombined,
  // Graphemes are validated and sub-divided. For virama-using scripts, units
  // that correspond to repeatable glyphs are generated. (Mostly single unicodes
  // but viramas and joiners are paired with the most sensible neighbor.)
  // For non-virama scripts, this means that base/accent pairs are separated,
  // ie the output is individual unicodes.
  kGlyphSplit,
  // The output is always single unicodes, regardless of the script.
  kIndividualUnicodes,
};
48 
49 // An enum representing the scripts that use a virama character. It is
50 // guaranteed that the value of any element, (except kNonVirama) can be cast
51 // to a unicode (char32) value that represents the start of the unicode range
52 // of the corresponding script.
53 enum class ViramaScript : char32 {
54  kNonVirama = 0,
55  kDevanagari = 0x900,
56  kBengali = 0x980,
57  kGurmukhi = 0xa00,
58  kGujarati = 0xa80,
59  kOriya = 0xb00,
60  kTamil = 0xb80,
61  kTelugu = 0xc00,
62  kKannada = 0xc80,
63  kMalayalam = 0xd00,
64  kSinhala = 0xd80,
65  kMyanmar = 0x1000,
66  kKhmer = 0x1780,
67  kJavanese = 0xa980,
68 };
69 
70 // Base class offers a validation API and protected methods to allow subclasses
71 // to easily build the validated/segmented output.
72 class Validator {
73  public:
74  // Validates and cleans the src vector of unicodes to the *dest, according to
75  // g_mode. In the case of kSingleString, a single vector containing the whole
76  // result is added to *dest. With kCombined, multiple vectors are added to
77  // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
78  // added to *dest with a smaller unit representing a glyph in each.
79  // In case of validation error, returns false and as much as possible of the
80  // input, without discarding invalid text.
81  static bool ValidateCleanAndSegment(GraphemeNormMode g_mode,
82  bool report_errors,
83  const std::vector<char32>& src,
84  std::vector<std::vector<char32>>* dest);
85 
86  // Returns true if the unicode ch is a non-printing zero-width mark of no
87  // significance to OCR training or evaluation.
88  static bool IsZeroWidthMark(char32 ch) {
89  return ch == kZeroWidthSpace || ch == kLeftToRightMark ||
90  ch == kRightToLeftMark || ch == kInvalid;
91  }
92  virtual ~Validator();
93 
94  // Some specific but universally useful unicodes.
95  static const char32 kZeroWidthSpace;
97  static const char32 kZeroWidthJoiner;
98  static const char32 kLeftToRightMark;
99  static const char32 kRightToLeftMark;
100  static const char32 kInvalid;
101 
102  protected:
103  // These are more or less the character class identifiers in the ISCII
104  // standard, section 8. They have been augmented with the Unicode meta
105  // characters Zero Width Joiner and Zero Width Non Joiner, and the
106  // Unicode Vedic Marks.
107  // The best sources of information on Unicode and Indic scripts are:
108  // http://varamozhi.sourceforge.net/iscii91.pdf
109  // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
110  // http://unicode.org/faq/indic.html
111  // http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
112  enum class CharClass {
113  // NOTE: The values of the enum members are meaningless and arbitrary, ie
114  // they are not used for sorting, or any other risky application.
115  // The reason they are what they are is they are a single character
116  // abbreviation that can be used in a regexp/BNF definition of a grammar,
117  // IN A COMMENT, and still not relied upon in the code.
118  kConsonant = 'C',
119  kVowel = 'V',
120  kVirama = 'H', // (aka Halant)
121  kMatra = 'M', // (aka Dependent Vowel)
122  kMatraPiece = 'P', // unicode provides pieces of Matras.
123  kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks)
124  kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C
125  kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D
126  kVedicMark = 'v', // Modifiers can come modify any indic syllable.
127  kNukta = 'N', // Occurs only immediately after consonants.
128  kRobat = 'R', // Khmer only.
129  kOther = 'O', // (digits, measures, non-Indic, etc)
130  // Additional classes used only by ValidateGrapheme.
131  kWhitespace = ' ',
132  kCombiner = 'c', // Combiners other than virama.
133  };
134  using IndicPair = std::pair<CharClass, char32>;
135 
136  Validator(ViramaScript script, bool report_errors)
137  : script_(script),
138  codes_used_(0),
139  output_used_(0),
140  report_errors_(report_errors) {}
141 
142  // Factory method that understands how to map script to the right subclass.
143  static std::unique_ptr<Validator> ScriptValidator(ViramaScript script,
144  bool report_errors);
145 
146  // Internal version of the public static ValidateCleanAndSegment.
147  // Validates and cleans the src vector of unicodes to the *dest, according to
148  // its type and the given g_mode.
149  // In case of validation error, returns false and returns as much as possible
150  // of the input, without discarding invalid text.
151  bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode,
152  const std::vector<char32>& src,
153  std::vector<std::vector<char32>>* dest);
154  // Moves the results from parts_ or output_ to dest according to g_mode.
155  void MoveResultsToDest(GraphemeNormMode g_mode,
156  std::vector<std::vector<char32>>* dest);
157 
158  // Computes and returns the ViramaScript corresponding to the most frequent
159  // virama-using script in the input, or kNonVirama if none are present.
160  static ViramaScript MostFrequentViramaScript(
161  const std::vector<char32>& utf32);
162  // Returns true if the given UTF-32 unicode is a "virama" character.
163  static bool IsVirama(char32 unicode);
164  // Returns true if the given UTF-32 unicode is a vedic accent.
165  static bool IsVedicAccent(char32 unicode);
166  // Returns true if the script is one that uses subscripts for conjuncts.
167  bool IsSubscriptScript() const;
168 
169  // Helper function appends the next element of codes_ only to output_,
170  // without touching parts_
171  // Returns true at the end of codes_.
173  output_.push_back(codes_[codes_used_].second);
174  return ++codes_used_ == codes_.size();
175  }
176 
177  // Helper function adds a length-element vector to parts_ from the last length
178  // elements of output_. If there are more than length unused elements in
179  // output_, adds unicodes as single-element vectors to parts_ to catch
180  // output_used_ up to output->size() - length before adding the length-element
181  // vector.
182  void MultiCodePart(int length) {
183  while (output_used_ + length < output_.size()) {
184  parts_.emplace_back(
185  std::initializer_list<char32>{output_[output_used_++]});
186  }
187  parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
188  while (++output_used_ < output_.size()) {
189  parts_.back().push_back(output_[output_used_]);
190  }
191  }
192 
193  // Helper function appends the next element of codes_ to output_, and then
194  // calls MultiCodePart to add the appropriate components to parts_.
195  // Returns true at the end of codes_.
196  bool UseMultiCode(int length) {
197  output_.push_back(codes_[codes_used_].second);
198  MultiCodePart(length);
199  return ++codes_used_ == codes_.size();
200  }
201 
202  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
203  // parts_ and output_. Returns true if a valid Grapheme was consumed,
204  // otherwise does not increment codes_used_.
205  virtual bool ConsumeGraphemeIfValid() = 0;
206  // Sets codes_ to the class codes for the given unicode text.
207  void ComputeClassCodes(const std::vector<char32>& text);
208  // Returns the CharClass corresponding to the given Unicode ch.
209  virtual CharClass UnicodeToCharClass(char32 ch) const = 0;
210  // Resets to the initial state.
211  void Clear();
212 
213  // Number of unicodes in each Indic codepage.
214  static const int kIndicCodePageSize = 128;
215  // Lowest unicode value of any Indic script. (Devanagari).
216  static const char32 kMinIndicUnicode = 0x900;
217  // Highest unicode value of any consistent (ISCII-based) Indic script.
218  static const char32 kMaxSinhalaUnicode = 0xdff;
219  // Highest unicode value of any virama-using script. (Khmer).
220  static const char32 kMaxViramaScriptUnicode = 0x17ff;
221  // Some special unicodes.
222  static const char32 kSinhalaVirama = 0xdca;
223  static const char32 kMyanmarVirama = 0x1039;
224  static const char32 kKhmerVirama = 0x17d2;
225  // Javanese Script - aksarajawa
226  static const char32 kJavaneseVirama = 0xa9c0;
227  static const char32 kMaxJavaneseUnicode = 0xa9df;
228 
229  // Script we are operating on.
231  // Input unicodes with assigned CharClass is the data to be validated.
232  std::vector<IndicPair> codes_;
233  // Glyph-like components of the input.
234  std::vector<std::vector<char32>> parts_;
235  // Copied validated unicodes from codes_ that are OK to output.
236  std::vector<char32> output_;
237  // The number of elements of codes_ that have been processed so far.
239  // The number of elements of output_ that have already been added to parts_.
241  // Log error messages for reasons why text is invalid.
243 };
244 
245 } // namespace tesseract
246 
247 #endif // TESSERACT_TRAINING_VALIDATOR_H_
bool UseMultiCode(int length)
Definition: validator.h:196
signed int char32
Definition: unichar.h:52
int output_used_
Definition: validator.h:240
std::vector< IndicPair > codes_
Definition: validator.h:232
void MultiCodePart(int length)
Definition: validator.h:182
static const char32 kLeftToRightMark
Definition: validator.h:98
GraphemeNormMode
Definition: validator.h:34
Definition: baseapi.cpp:94
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:134
std::vector< std::vector< char32 > > parts_
Definition: validator.h:234
ViramaScript
Definition: validator.h:53
std::vector< char32 > output_
Definition: validator.h:236
CharClass
Definition: validator.h:112
static const char32 kRightToLeftMark
Definition: validator.h:99
static const char32 kInvalid
Definition: validator.h:100
static bool IsZeroWidthMark(char32 ch)
Definition: validator.h:88
static const char32 kZeroWidthJoiner
Definition: validator.h:97
bool CodeOnlyToOutput()
Definition: validator.h:172
bool report_errors_
Definition: validator.h:242
int codes_used_
Definition: validator.h:238
Validator(ViramaScript script, bool report_errors)
Definition: validator.h:136
ViramaScript script_
Definition: validator.h:230
static const char32 kZeroWidthNonJoiner
Definition: validator.h:96
static const char32 kZeroWidthSpace
Definition: validator.h:95
tesseract::Validator
Definition: validator.h:72