doxygen/tesseract/validator_8h_source.html

 /**********************************************************************
  * File:        validator.h
  * Description: Base class for various text validators. Intended mainly for
  *              scripts that use a virama character.
  * Author:      Ray Smith
  * Created:     Tue May 23 2017
  *
  * (C) Copyright 2017, Google Inc.
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *
  **********************************************************************/

 #ifndef TESSERACT_TRAINING_VALIDATOR_H_
 #define TESSERACT_TRAINING_VALIDATOR_H_

 #include <memory>
 #include <utility>
 #include <vector>
 #include "unichar.h"

 namespace tesseract {

 // Selects how validated text is divided into output units. This applies to
 // every script, not only the Indic ones.
 // In Indic scripts a grapheme is a syllable unit that may span several
 // unicodes. In other scripts a grapheme is a base character together with its
 // accents/diacritics, because not every accented character has a single
 // composed form.
 enum class GraphemeNormMode {
   // The whole validated result is a single string, even for multi-word input.
   kSingleString = 0,
   // Standard unicode graphemes are validated and emitted one per unit.
   kCombined = 1,
   // Graphemes are validated and then sub-divided.
   // Virama-using scripts: units corresponding to repeatable glyphs are
   // produced (mostly single unicodes, with viramas and joiners attached to
   // the most sensible neighbor).
   // Other scripts: base/accent pairs are separated, ie the output is
   // individual unicodes.
   kGlyphSplit = 2,
   // Every output unit is a single unicode, whatever the script.
   kIndividualUnicodes = 3,
 };

 // Identifies the scripts that use a virama character.
 // Every enumerator other than kNonVirama is guaranteed to be castable to a
 // unicode (char32) value that is the first code point of the unicode range
 // of the corresponding script.
 enum class ViramaScript : char32 {
   kNonVirama = 0x0000,
   kDevanagari = 0x0900,
   kBengali = 0x0980,
   kGurmukhi = 0x0a00,
   kGujarati = 0x0a80,
   kOriya = 0x0b00,
   kTamil = 0x0b80,
   kTelugu = 0x0c00,
   kKannada = 0x0c80,
   kMalayalam = 0x0d00,
   kSinhala = 0x0d80,
   kMyanmar = 0x1000,
   kKhmer = 0x1780,
   kJavanese = 0xa980,
 };

 // Base class offers a validation API and protected methods to allow subclasses
 // to easily build the validated/segmented output.
 class Validator {
  public:
   // Validates and cleans the src vector of unicodes to the *dest, according to
   // g_mode. In the case of kSingleString, a single vector containing the whole
   // result is added to *dest. With kCombined, multiple vectors are added to
   // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
   // added to *dest with a smaller unit representing a glyph in each.
   // In case of validation error, returns false and as much as possible of the
   // input, without discarding invalid text.
   static bool ValidateCleanAndSegment(GraphemeNormMode g_mode,
                                       bool report_errors,
                                       const std::vector<char32>& src,
                                       std::vector<std::vector<char32>>* dest);

   // Returns true if the unicode ch is a non-printing zero-width mark of no
   // significance to OCR training or evaluation.
   static bool IsZeroWidthMark(char32 ch) {
     return ch == kZeroWidthSpace || ch == kLeftToRightMark ||
            ch == kRightToLeftMark || ch == kInvalid;
   }
   virtual ~Validator();

   // Some specific but universally useful unicodes.
   static const char32 kZeroWidthSpace;
   static const char32 kZeroWidthNonJoiner;
   static const char32 kZeroWidthJoiner;
   static const char32 kLeftToRightMark;
   static const char32 kRightToLeftMark;
   static const char32 kInvalid;

  protected:
   // These are more or less the character class identifiers in the ISCII
   // standard, section 8.  They have been augmented with the Unicode meta
   // characters Zero Width Joiner and Zero Width Non Joiner, and the
   // Unicode Vedic Marks.
   // The best sources of information on Unicode and Indic scripts are:
   //   http://varamozhi.sourceforge.net/iscii91.pdf
   //   http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
   //   http://unicode.org/faq/indic.html
   //   http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
   enum class CharClass {
     // NOTE: The values of the enum members are meaningless and arbitrary, ie
     // they are not used for sorting, or any other risky application.
     // The reason they are what they are is they are a single character
     // abbreviation that can be used in a regexp/BNF definition of a grammar,
     // IN A COMMENT, and still not relied upon in the code.
     kConsonant = 'C',
     kVowel = 'V',
     kVirama = 'H',              // (aka Halant)
     kMatra = 'M',               // (aka Dependent Vowel)
     kMatraPiece = 'P',          // unicode provides pieces of Matras.
     kVowelModifier = 'D',       // (candrabindu, anusvara, visarga, other marks)
     kZeroWidthNonJoiner = 'z',  // Unicode Zero Width Non-Joiner U+200C
     kZeroWidthJoiner = 'Z',     // Unicode Zero Width Joiner U+200D
     kVedicMark = 'v',           // Modifiers can come modify any indic syllable.
     kNukta = 'N',               // Occurs only immediately after consonants.
     kRobat = 'R',               // Khmer only.
     kOther = 'O',               // (digits, measures, non-Indic, etc)
     // Additional classes used only by ValidateGrapheme.
     kWhitespace = ' ',
     kCombiner = 'c',  // Combiners other than virama.
   };
   using IndicPair = std::pair<CharClass, char32>;

   Validator(ViramaScript script, bool report_errors)
       : script_(script),
         codes_used_(0),
         output_used_(0),
         report_errors_(report_errors) {}

   // Factory method that understands how to map script to the right subclass.
   static std::unique_ptr<Validator> ScriptValidator(ViramaScript script,
                                                     bool report_errors);

   // Internal version of the public static ValidateCleanAndSegment.
   // Validates and cleans the src vector of unicodes to the *dest, according to
   // its type and the given g_mode.
   // In case of validation error, returns false and returns as much as possible
   // of the input, without discarding invalid text.
   bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode,
                                        const std::vector<char32>& src,
                                        std::vector<std::vector<char32>>* dest);
   // Moves the results from parts_ or output_ to dest according to g_mode.
   void MoveResultsToDest(GraphemeNormMode g_mode,
                          std::vector<std::vector<char32>>* dest);

   // Computes and returns the ViramaScript corresponding to the most frequent
   // virama-using script in the input, or kNonVirama if none are present.
   static ViramaScript MostFrequentViramaScript(
       const std::vector<char32>& utf32);
   // Returns true if the given UTF-32 unicode is a "virama" character.
   static bool IsVirama(char32 unicode);
   // Returns true if the given UTF-32 unicode is a vedic accent.
   static bool IsVedicAccent(char32 unicode);
   // Returns true if the script is one that uses subscripts for conjuncts.
   bool IsSubscriptScript() const;

   // Helper function appends the next element of codes_ only to output_,
   // without touching parts_
   // Returns true at the end of codes_.
   bool CodeOnlyToOutput() {
     output_.push_back(codes_[codes_used_].second);
     return ++codes_used_ == codes_.size();
   }

   // Helper function adds a length-element vector to parts_ from the last length
   // elements of output_. If there are more than length unused elements in
   // output_, adds unicodes as single-element vectors to parts_ to catch
   // output_used_ up to output->size() - length before adding the length-element
   // vector.
   void MultiCodePart(int length) {
     while (output_used_ + length < output_.size()) {
       parts_.emplace_back(
           std::initializer_list<char32>{output_[output_used_++]});
     }
     parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
     while (++output_used_ < output_.size()) {
       parts_.back().push_back(output_[output_used_]);
     }
   }

   // Helper function appends the next element of codes_ to output_, and then
   // calls MultiCodePart to add the appropriate components to parts_.
   // Returns true at the end of codes_.
   bool UseMultiCode(int length) {
     output_.push_back(codes_[codes_used_].second);
     MultiCodePart(length);
     return ++codes_used_ == codes_.size();
   }

   // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
   // parts_ and output_. Returns true if a valid Grapheme was consumed,
   // otherwise does not increment codes_used_.
   virtual bool ConsumeGraphemeIfValid() = 0;
   // Sets codes_ to the class codes for the given unicode text.
   void ComputeClassCodes(const std::vector<char32>& text);
   // Returns the CharClass corresponding to the given Unicode ch.
   virtual CharClass UnicodeToCharClass(char32 ch) const = 0;
   // Resets to the initial state.
   void Clear();

   // Number of unicodes in each Indic codepage.
   static const int kIndicCodePageSize = 128;
   // Lowest unicode value of any Indic script. (Devanagari).
   static const char32 kMinIndicUnicode = 0x900;
   // Highest unicode value of any consistent (ISCII-based) Indic script.
   static const char32 kMaxSinhalaUnicode = 0xdff;
   // Highest unicode value of any virama-using script. (Khmer).
   static const char32 kMaxViramaScriptUnicode = 0x17ff;
   // Some special unicodes.
   static const char32 kSinhalaVirama = 0xdca;
   static const char32 kMyanmarVirama = 0x1039;
   static const char32 kKhmerVirama = 0x17d2;
   // Javanese Script - aksarajawa
   static const char32 kJavaneseVirama = 0xa9c0;
   static const char32 kMaxJavaneseUnicode = 0xa9df;

   // Script we are operating on.
   ViramaScript script_;
   // Input unicodes with assigned CharClass is the data to be validated.
   std::vector<IndicPair> codes_;
   // Glyph-like components of the input.
   std::vector<std::vector<char32>> parts_;
   // Copied validated unicodes from codes_ that are OK to output.
   std::vector<char32> output_;
   // The number of elements of codes_ that have been processed so far.
   int codes_used_;
   // The number of elements of output_ that have already been added to parts_.
   int output_used_;
   // Log error messages for reasons why text is invalid.
   bool report_errors_;
 };

 }  // namespace tesseract

 #endif  // TESSERACT_TRAINING_VALIDATOR_H_
tesseract::ViramaScript::kGurmukhi

tesseract::ViramaScript::kMalayalam

tesseract::ViramaScript::kOriya

tesseract::ViramaScript::kDevanagari

tesseract::Validator::UseMultiCode
bool UseMultiCode(int length)
Definition: validator.h:196

tesseract::char32
signed int char32
Definition: unichar.h:52

tesseract::GraphemeNormMode::kGlyphSplit

tesseract::Validator::output_used_
int output_used_
Definition: validator.h:240

tesseract::Validator::codes_
std::vector< IndicPair > codes_
Definition: validator.h:232

tesseract::Validator::MultiCodePart
void MultiCodePart(int length)
Definition: validator.h:182

tesseract::ViramaScript::kKannada

tesseract::ViramaScript::kBengali

tesseract::ViramaScript::kKhmer

tesseract::Validator::kLeftToRightMark
static const char32 kLeftToRightMark
Definition: validator.h:98

tesseract::GraphemeNormMode
GraphemeNormMode
Definition: validator.h:34

tesseract
Definition: baseapi.cpp:94

tesseract::Validator::IndicPair
std::pair< CharClass, char32 > IndicPair
Definition: validator.h:134

tesseract::ViramaScript::kTelugu

tesseract::Validator::parts_
std::vector< std::vector< char32 > > parts_
Definition: validator.h:234

tesseract::GraphemeNormMode::kSingleString

tesseract::GraphemeNormMode::kIndividualUnicodes

tesseract::ViramaScript
ViramaScript
Definition: validator.h:53

tesseract::Validator::output_
std::vector< char32 > output_
Definition: validator.h:236

tesseract::Validator::CharClass
CharClass
Definition: validator.h:112

tesseract::ViramaScript::kGujarati

tesseract::ViramaScript::kJavanese

tesseract::Validator::kRightToLeftMark
static const char32 kRightToLeftMark
Definition: validator.h:99

tesseract::Validator::kInvalid
static const char32 kInvalid
Definition: validator.h:100

tesseract::ViramaScript::kSinhala

tesseract::Validator::IsZeroWidthMark
static bool IsZeroWidthMark(char32 ch)
Definition: validator.h:88

tesseract::Validator::kZeroWidthJoiner
static const char32 kZeroWidthJoiner
Definition: validator.h:97

tesseract::Validator::CodeOnlyToOutput
bool CodeOnlyToOutput()
Definition: validator.h:172

tesseract::GraphemeNormMode::kCombined

tesseract::ViramaScript::kMyanmar

tesseract::Validator::report_errors_
bool report_errors_
Definition: validator.h:242

tesseract::Validator::codes_used_
int codes_used_
Definition: validator.h:238

tesseract::Validator::Validator
Validator(ViramaScript script, bool report_errors)
Definition: validator.h:136

tesseract::Validator::script_
ViramaScript script_
Definition: validator.h:230

tesseract::ViramaScript::kNonVirama

tesseract::ViramaScript::kTamil

tesseract::Validator::kZeroWidthNonJoiner
static const char32 kZeroWidthNonJoiner
Definition: validator.h:96

tesseract::Validator::kZeroWidthSpace
static const char32 kZeroWidthSpace
Definition: validator.h:95

tesseract::Validator
Definition: validator.h:72