tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
unicharset.h
1 // File: unicharset.h
3 // Description: Unicode character/ligature set class.
4 // Author: Thomas Kielbus
5 // Created: Wed Jun 28 17:05:01 PDT 2006
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_CCUTIL_UNICHARSET_H_
21 #define TESSERACT_CCUTIL_UNICHARSET_H_
22 
23 #include "errcode.h"
24 #include "genericvector.h"
25 #include "helpers.h"
26 #include "serialis.h"
27 #include "strngs.h"
28 #include "tesscallback.h"
29 #include "unichar.h"
30 #include "unicharmap.h"
31 
32 // Enum holding special values of unichar_id. Every unicharset has these.
33 // Warning! Keep in sync with kSpecialUnicharCodes.
enum SpecialUnicharCodes {
  UNICHAR_SPACE = 0,
  UNICHAR_JOINED = 1,
  UNICHAR_BROKEN = 2,

  // Trailing sentinel: equals the number of special codes listed above.
  SPECIAL_UNICHAR_CODES_COUNT = 3
};
41 
42 // Boolean flag for unichar_insert. It's a bit of a double negative to allow
43 // the default value to be false.
enum class OldUncleanUnichars {
  kFalse = 0,  // First enumerator, hence the value-initialized default.
  kTrue = 1,
};
48 
50  public:
51  // Minimum number of characters used for fragment representation.
52  static const int kMinLen = 6;
53  // Maximum number of characters used for fragment representation.
54  static const int kMaxLen = 3 + UNICHAR_LEN + 2;
55  // Maximum number of fragments per character.
56  static const int kMaxChunks = 5;
57 
58  // Setters and Getters.
59  inline void set_all(const char *unichar, int pos, int total, bool natural) {
60  set_unichar(unichar);
61  set_pos(pos);
62  set_total(total);
63  set_natural(natural);
64  }
65  inline void set_unichar(const char *uch) {
66  strncpy(this->unichar, uch, UNICHAR_LEN);
67  this->unichar[UNICHAR_LEN] = '\0';
68  }
69  inline void set_pos(int p) { this->pos = p; }
70  inline void set_total(int t) { this->total = t; }
71  inline const char* get_unichar() const { return this->unichar; }
72  inline int get_pos() const { return this->pos; }
73  inline int get_total() const { return this->total; }
74 
75  // Returns the string that represents a fragment
76  // with the given unichar, pos and total.
77  static STRING to_string(const char *unichar, int pos, int total,
78  bool natural);
79  // Returns the string that represents this fragment.
80  STRING to_string() const {
81  return to_string(unichar, pos, total, natural);
82  }
83 
84  // Checks whether a fragment has the same unichar,
85  // position and total as the given inputs.
86  inline bool equals(const char *other_unichar,
87  int other_pos, int other_total) const {
88  return (strcmp(this->unichar, other_unichar) == 0 &&
89  this->pos == other_pos && this->total == other_total);
90  }
91  inline bool equals(const CHAR_FRAGMENT *other) const {
92  return this->equals(other->get_unichar(),
93  other->get_pos(),
94  other->get_total());
95  }
96 
97  // Checks whether a given fragment is a continuation of this fragment.
98  // Assumes that the given fragment pointer is not nullptr.
99  inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
100  return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
101  this->total == fragment->get_total() &&
102  this->pos == fragment->get_pos() + 1);
103  }
104 
105  // Returns true if this fragment is a beginning fragment.
106  inline bool is_beginning() const { return this->pos == 0; }
107 
108  // Returns true if this fragment is an ending fragment.
109  inline bool is_ending() const { return this->pos == this->total-1; }
110 
111  // Returns true if the fragment was a separate component to begin with,
112  // ie did not need chopping to be isolated, but may have been separated
113  // out from a multi-outline blob.
114  inline bool is_natural() const { return natural; }
115  void set_natural(bool value) { natural = value; }
116 
117  // Parses the string to see whether it represents a character fragment
118  // (rather than a regular character). If so, allocates memory for a new
119  // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
120  // information. Fragments are of the form:
121  // |m|1|2, meaning chunk 1 of 2 of character m, or
122  // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
123  // to divide the parts, as they were already separate connected components.
124  //
125  // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
126  // instance, otherwise (if the string does not represent a fragment or it
127  // looks like it does, but parsing it as a fragment fails) returns nullptr.
128  //
129  // Note: The caller is responsible for deallocating memory
130  // associated with the returned pointer.
131  static CHAR_FRAGMENT *parse_from_string(const char *str);
132 
133  private:
134  char unichar[UNICHAR_LEN + 1];
135  // True if the fragment was a separate component to begin with,
136  // ie did not need chopping to be isolated, but may have been separated
137  // out from a multi-outline blob.
138  bool natural;
139  int16_t pos; // fragment position in the character
140  int16_t total; // total number of fragments in the character
141 };
142 
143 // The UNICHARSET class is a utility class for Tesseract that holds the
144 // set of characters that are used by the engine. Each character is identified
145 // by a unique number, from 0 to (size - 1).
146 class UNICHARSET {
147  public:
148  // Custom list of characters and their ligature forms (UTF8)
149  // These map to unicode values in the private use area (PUC) and are supported
150  // by only few font families (eg. Wyld, Adobe Caslon Pro).
151  static TESS_API const char* kCustomLigatures[][2];
152 
153  // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
154  static TESS_API const char* kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];
155 
156  // ICU 2.0 UCharDirection enum (from third_party/icu/include/unicode/uchar.h)
enum Direction {
  // Strong types.
  U_LEFT_TO_RIGHT = 0,
  U_RIGHT_TO_LEFT = 1,
  // Number-related types.
  U_EUROPEAN_NUMBER = 2,
  U_EUROPEAN_NUMBER_SEPARATOR = 3,
  U_EUROPEAN_NUMBER_TERMINATOR = 4,
  U_ARABIC_NUMBER = 5,
  U_COMMON_NUMBER_SEPARATOR = 6,
  // Separators and neutrals.
  U_BLOCK_SEPARATOR = 7,
  U_SEGMENT_SEPARATOR = 8,
  U_WHITE_SPACE_NEUTRAL = 9,
  U_OTHER_NEUTRAL = 10,
  // Embeddings, overrides and the remaining values.
  U_LEFT_TO_RIGHT_EMBEDDING = 11,
  U_LEFT_TO_RIGHT_OVERRIDE = 12,
  U_RIGHT_TO_LEFT_ARABIC = 13,
  U_RIGHT_TO_LEFT_EMBEDDING = 14,
  U_RIGHT_TO_LEFT_OVERRIDE = 15,
  U_POP_DIRECTIONAL_FORMAT = 16,
  U_DIR_NON_SPACING_MARK = 17,
  U_BOUNDARY_NEUTRAL = 18,
  // Trailing sentinel: one past the last direction value.
  U_CHAR_DIRECTION_COUNT = 19
};
179 
180  // Create an empty UNICHARSET
181  UNICHARSET();
182 
183  ~UNICHARSET();
184 
185  // Return the UNICHAR_ID of a given unichar representation within the
186  // UNICHARSET.
187  UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
188 
189  // Return the UNICHAR_ID of a given unichar representation within the
190  // UNICHARSET. Only the first length characters from unichar_repr are used.
191  UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;
192 
193  // Return the minimum number of bytes that matches a legal UNICHAR_ID,
194  // while leaving the rest of the string encodable. Returns 0 if the
195  // beginning of the string is not encodable.
196  // WARNING: this function now encodes the whole string for precision.
197  // Use encode_string in preference to repeatedly calling step.
198  int step(const char* str) const;
199 
200  // Returns true if the given UTF-8 string is encodable with this UNICHARSET.
201  // If not encodable, write the first byte offset which cannot be converted
202  // into the second (return) argument.
203  bool encodable_string(const char *str, int *first_bad_position) const;
204 
205  // Encodes the given UTF-8 string with this UNICHARSET.
206  // Any part of the string that cannot be encoded (because the utf8 can't
207  // be broken up into pieces that are in the unicharset) then:
208  // if give_up_on_failure, stops and returns a partial encoding,
209  // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.
210  // Returns true if the encoding succeeds completely, false if there is at
211  // least one failure.
212  // If lengths is not nullptr, then it is filled with the corresponding
213  // byte length of each encoded UNICHAR_ID.
214  // If encoded_length is not nullptr then on return it contains the length of
215  // str that was encoded. (if give_up_on_failure the location of the first
216  // failure, otherwise strlen(str).)
217  // WARNING: Caller must guarantee that str has already been cleaned of codes
218  // that do not belong in the unicharset, or encoding may fail.
219  // Use CleanupString to perform the cleaning.
220  bool encode_string(const char* str, bool give_up_on_failure,
221  GenericVector<UNICHAR_ID>* encoding,
222  GenericVector<char>* lengths,
223  int* encoded_length) const;
224 
225  // Return the unichar representation corresponding to the given UNICHAR_ID
226  // within the UNICHARSET.
227  const char* id_to_unichar(UNICHAR_ID id) const;
228 
229  // Return the UTF8 representation corresponding to the given UNICHAR_ID after
230  // resolving any private encodings internal to Tesseract. This method is
231  // preferable to id_to_unichar for outputting text that will be visible to
232  // external applications.
233  const char* id_to_unichar_ext(UNICHAR_ID id) const;
234 
235  // Return a STRING that reformats the utf8 str into the str followed
236  // by its hex unicodes.
237  static STRING debug_utf8_str(const char* str);
238 
239  // Removes/replaces content that belongs in rendered text, but not in the
240  // unicharset.
241  static std::string CleanupString(const char* utf8_str) {
242  return CleanupString(utf8_str, strlen(utf8_str));
243  }
244  static std::string CleanupString(const char* utf8_str, size_t length);
245 
246  // Return a STRING containing debug information on the unichar, including
247  // the id_to_unichar, its hex unicodes and the properties.
248  STRING debug_str(UNICHAR_ID id) const;
249  STRING debug_str(const char * unichar_repr) const {
250  return debug_str(unichar_to_id(unichar_repr));
251  }
252 
253  // Adds a unichar representation to the set. If old_style is true, then
254  // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL
255  // characters are ignored/skipped as if they don't exist and n-grams that
256  // can already be encoded are not added.
257  void unichar_insert(const char* const unichar_repr,
258  OldUncleanUnichars old_style);
259  void unichar_insert(const char* const unichar_repr) {
260  unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
261  }
262  // Adds a unichar representation to the set. Avoids setting old_style to true,
263  // unless it is necessary to make the new unichar get added.
264  void unichar_insert_backwards_compatible(const char* const unichar_repr) {
265  std::string cleaned = CleanupString(unichar_repr);
266  if (cleaned != unichar_repr) {
267  unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
268  } else {
269  int old_size = size();
270  unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
271  if (size() == old_size) {
272  unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
273  }
274  }
275  }
276 
277  // Return true if the given unichar id exists within the set.
278  // Relies on the fact that unichar ids are contiguous in the unicharset.
279  bool contains_unichar_id(UNICHAR_ID unichar_id) const {
280  return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
281  unichar_id >= 0;
282  }
283 
284  // Return true if the given unichar representation exists within the set.
285  bool contains_unichar(const char* const unichar_repr) const;
286  bool contains_unichar(const char* const unichar_repr, int length) const;
287 
288  // Return true if the given unichar representation corresponds to the given
289  // UNICHAR_ID within the set.
290  bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
291 
292  // Delete CHAR_FRAGMENTs stored in properties of unichars array.
294  for (int i = 0; i < size_used; ++i) {
295  delete unichars[i].properties.fragment;
296  unichars[i].properties.fragment = nullptr;
297  }
298  }
299 
300  // Clear the UNICHARSET (all the previous data is lost).
301  void clear() {
302  if (script_table != nullptr) {
303  for (int i = 0; i < script_table_size_used; ++i)
304  delete[] script_table[i];
305  delete[] script_table;
306  script_table = nullptr;
307  script_table_size_used = 0;
308  }
309  if (unichars != nullptr) {
310  delete_pointers_in_unichars();
311  delete[] unichars;
312  unichars = nullptr;
313  }
314  script_table_size_reserved = 0;
315  size_reserved = 0;
316  size_used = 0;
317  ids.clear();
318  top_bottom_set_ = false;
319  script_has_upper_lower_ = false;
320  script_has_xheight_ = false;
321  old_style_included_ = false;
322  null_sid_ = 0;
323  common_sid_ = 0;
324  latin_sid_ = 0;
325  cyrillic_sid_ = 0;
326  greek_sid_ = 0;
327  han_sid_ = 0;
328  hiragana_sid_ = 0;
329  katakana_sid_ = 0;
330  thai_sid_ = 0;
331  hangul_sid_ = 0;
332  default_sid_ = 0;
333  }
334 
335  // Return the size of the set (the number of different UNICHAR it holds).
336  int size() const {
337  return size_used;
338  }
339 
340  // Reserve enough memory space for the given number of UNICHARS
341  void reserve(int unichars_number);
342 
343  // Opens the file indicated by filename and saves unicharset to that file.
344  // Returns true if the operation is successful.
345  bool save_to_file(const char * const filename) const {
346  FILE* file = fopen(filename, "w+b");
347  if (file == nullptr) return false;
348  bool result = save_to_file(file);
349  fclose(file);
350  return result;
351  }
352 
353  // Saves the content of the UNICHARSET to the given file.
354  // Returns true if the operation is successful.
355  bool save_to_file(FILE *file) const {
356  STRING str;
357  return save_to_string(&str) &&
358  tesseract::Serialize(file, &str[0], str.length());
359  }
360 
361  bool save_to_file(tesseract::TFile *file) const {
362  STRING str;
363  return save_to_string(&str) && file->Serialize(&str[0], str.length());
364  }
365 
366  // Saves the content of the UNICHARSET to the given STRING.
367  // Returns true if the operation is successful.
368  bool save_to_string(STRING *str) const;
369 
370  // Load a unicharset from a unicharset file that has been loaded into
371  // the given memory buffer.
372  // Returns true if the operation is successful.
373  bool load_from_inmemory_file(const char* const memory, int mem_size,
374  bool skip_fragments);
375  // Returns true if the operation is successful.
376  bool load_from_inmemory_file(const char* const memory, int mem_size) {
377  return load_from_inmemory_file(memory, mem_size, false);
378  }
379 
380  // Opens the file indicated by filename and loads the UNICHARSET
381  // from the given file. The previous data is lost.
382  // Returns true if the operation is successful.
383  bool load_from_file(const char* const filename, bool skip_fragments) {
384  FILE* file = fopen(filename, "rb");
385  if (file == nullptr) return false;
386  bool result = load_from_file(file, skip_fragments);
387  fclose(file);
388  return result;
389  }
390  // returns true if the operation is successful.
391  bool load_from_file(const char* const filename) {
392  return load_from_file(filename, false);
393  }
394 
395  // Loads the UNICHARSET from the given file. The previous data is lost.
396  // Returns true if the operation is successful.
397  bool load_from_file(FILE *file, bool skip_fragments);
398  bool load_from_file(FILE *file) { return load_from_file(file, false); }
399  bool load_from_file(tesseract::TFile *file, bool skip_fragments);
400 
401 
402  // Sets up internal data after loading the file, based on the char
403  // properties. Called from load_from_file, but also needs to be run
404  // during set_unicharset_properties.
405  void post_load_setup();
406 
407  // Returns true if right_to_left scripts are significant in the unicharset,
408  // but without being so sensitive that "universal" unicharsets containing
409  // characters from many scripts, like orientation and script detection,
410  // look like they are right_to_left.
411  bool major_right_to_left() const;
412 
413  // Set a whitelist and/or blacklist of characters to recognize.
414  // An empty or nullptr whitelist enables everything (minus any blacklist).
415  // An empty or nullptr blacklist disables nothing.
416  // An empty or nullptr unblacklist has no effect.
417  // The blacklist overrides the whitelist.
418  // The unblacklist overrides the blacklist.
419  // Each list is a string of utf8 character strings. Boundaries between
420  // unicharset units are worked out automatically, and characters not in
421  // the unicharset are silently ignored.
422  void set_black_and_whitelist(const char* blacklist, const char* whitelist,
423  const char* unblacklist);
424 
425  // Set the isalpha property of the given unichar to the given value.
426  void set_isalpha(UNICHAR_ID unichar_id, bool value) {
427  unichars[unichar_id].properties.isalpha = value;
428  }
429 
430  // Set the islower property of the given unichar to the given value.
431  void set_islower(UNICHAR_ID unichar_id, bool value) {
432  unichars[unichar_id].properties.islower = value;
433  }
434 
435  // Set the isupper property of the given unichar to the given value.
436  void set_isupper(UNICHAR_ID unichar_id, bool value) {
437  unichars[unichar_id].properties.isupper = value;
438  }
439 
440  // Set the isdigit property of the given unichar to the given value.
441  void set_isdigit(UNICHAR_ID unichar_id, bool value) {
442  unichars[unichar_id].properties.isdigit = value;
443  }
444 
445  // Set the ispunctuation property of the given unichar to the given value.
446  void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
447  unichars[unichar_id].properties.ispunctuation = value;
448  }
449 
450  // Set the isngram property of the given unichar to the given value.
451  void set_isngram(UNICHAR_ID unichar_id, bool value) {
452  unichars[unichar_id].properties.isngram = value;
453  }
454 
455  // Set the script name of the given unichar to the given value.
456  // Value is copied and thus can be a temporary;
457  void set_script(UNICHAR_ID unichar_id, const char* value) {
458  unichars[unichar_id].properties.script_id = add_script(value);
459  }
460 
461  // Set other_case unichar id in the properties for the given unichar id.
462  void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
463  unichars[unichar_id].properties.other_case = other_case;
464  }
465 
466  // Set the direction property of the given unichar to the given value.
467  void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) {
468  unichars[unichar_id].properties.direction = value;
469  }
470 
471  // Set mirror unichar id in the properties for the given unichar id.
472  void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
473  unichars[unichar_id].properties.mirror = mirror;
474  }
475 
476  // Record normalized version of unichar with the given unichar_id.
477  void set_normed(UNICHAR_ID unichar_id, const char* normed) {
478  unichars[unichar_id].properties.normed = normed;
479  unichars[unichar_id].properties.normed_ids.truncate(0);
480  }
481  // Sets the normed_ids vector from the normed string. normed_ids is not
482  // stored in the file, and needs to be set when the UNICHARSET is loaded.
483  void set_normed_ids(UNICHAR_ID unichar_id);
484 
485  // Return the isalpha property of the given unichar.
486  bool get_isalpha(UNICHAR_ID unichar_id) const {
487  if (INVALID_UNICHAR_ID == unichar_id) return false;
488  ASSERT_HOST(contains_unichar_id(unichar_id));
489  return unichars[unichar_id].properties.isalpha;
490  }
491 
492  // Return the islower property of the given unichar.
493  bool get_islower(UNICHAR_ID unichar_id) const {
494  if (INVALID_UNICHAR_ID == unichar_id) return false;
495  ASSERT_HOST(contains_unichar_id(unichar_id));
496  return unichars[unichar_id].properties.islower;
497  }
498 
499  // Return the isupper property of the given unichar.
500  bool get_isupper(UNICHAR_ID unichar_id) const {
501  if (INVALID_UNICHAR_ID == unichar_id) return false;
502  ASSERT_HOST(contains_unichar_id(unichar_id));
503  return unichars[unichar_id].properties.isupper;
504  }
505 
506  // Return the isdigit property of the given unichar.
507  bool get_isdigit(UNICHAR_ID unichar_id) const {
508  if (INVALID_UNICHAR_ID == unichar_id) return false;
509  ASSERT_HOST(contains_unichar_id(unichar_id));
510  return unichars[unichar_id].properties.isdigit;
511  }
512 
513  // Return the ispunctuation property of the given unichar.
514  bool get_ispunctuation(UNICHAR_ID unichar_id) const {
515  if (INVALID_UNICHAR_ID == unichar_id) return false;
516  ASSERT_HOST(contains_unichar_id(unichar_id));
517  return unichars[unichar_id].properties.ispunctuation;
518  }
519 
520  // Return the isngram property of the given unichar.
521  bool get_isngram(UNICHAR_ID unichar_id) const {
522  if (INVALID_UNICHAR_ID == unichar_id) return false;
523  ASSERT_HOST(contains_unichar_id(unichar_id));
524  return unichars[unichar_id].properties.isngram;
525  }
526 
527  // Returns whether the unichar id represents a unicode value in the private
528  // use area.
529  bool get_isprivate(UNICHAR_ID unichar_id) const;
530 
531  // Returns true if the ids have useful min/max top/bottom values.
532  bool top_bottom_useful() const {
533  return top_bottom_set_;
534  }
535  // Sets all ranges to empty, so they can be expanded to set the values.
536  void set_ranges_empty();
537  // Sets all the properties for this unicharset given a src_unicharset with
538  // everything set. The unicharsets don't have to be the same, and graphemes
539  // are correctly accounted for.
541  PartialSetPropertiesFromOther(0, src);
542  }
543  // Sets properties from Other, starting only at the given index.
544  void PartialSetPropertiesFromOther(int start_index, const UNICHARSET& src);
545  // Expands the tops and bottoms and widths for this unicharset given a
546  // src_unicharset with ranges in it. The unicharsets don't have to be the
547  // same, and graphemes are correctly accounted for.
548  void ExpandRangesFromOther(const UNICHARSET& src);
549  // Makes this a copy of src. Clears this completely first, so the automatic
550  // ids will not be present in this if not in src.
551  void CopyFrom(const UNICHARSET& src);
552  // For each id in src, if it does not occur in this, add it, as in
553  // SetPropertiesFromOther, otherwise expand the ranges, as in
554  // ExpandRangesFromOther.
555  void AppendOtherUnicharset(const UNICHARSET& src);
556  // Returns true if the acceptable ranges of the tops of the characters do
557  // not overlap, making their x-height calculations distinct.
558  bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;
559  // Returns the min and max bottom and top of the given unichar in
560  // baseline-normalized coordinates, ie, where the baseline is
561  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
562  // (See normalis.h for the definitions).
563  void get_top_bottom(UNICHAR_ID unichar_id,
564  int* min_bottom, int* max_bottom,
565  int* min_top, int* max_top) const {
566  if (INVALID_UNICHAR_ID == unichar_id) {
567  *min_bottom = *min_top = 0;
568  *max_bottom = *max_top = 256; // kBlnCellHeight
569  return;
570  }
571  ASSERT_HOST(contains_unichar_id(unichar_id));
572  *min_bottom = unichars[unichar_id].properties.min_bottom;
573  *max_bottom = unichars[unichar_id].properties.max_bottom;
574  *min_top = unichars[unichar_id].properties.min_top;
575  *max_top = unichars[unichar_id].properties.max_top;
576  }
577  void set_top_bottom(UNICHAR_ID unichar_id,
578  int min_bottom, int max_bottom,
579  int min_top, int max_top) {
580  unichars[unichar_id].properties.min_bottom =
581  ClipToRange<int>(min_bottom, 0, UINT8_MAX);
582  unichars[unichar_id].properties.max_bottom =
583  ClipToRange<int>(max_bottom, 0, UINT8_MAX);
584  unichars[unichar_id].properties.min_top =
585  ClipToRange<int>(min_top, 0, UINT8_MAX);
586  unichars[unichar_id].properties.max_top =
587  ClipToRange<int>(max_top, 0, UINT8_MAX);
588  }
589  // Returns the width stats (as mean, sd) of the given unichar relative to the
590  // median advance of all characters in the character set.
591  void get_width_stats(UNICHAR_ID unichar_id,
592  float* width, float* width_sd) const {
593  if (INVALID_UNICHAR_ID == unichar_id) {
594  *width = 0.0f;
595  *width_sd = 0.0f;;
596  return;
597  }
598  ASSERT_HOST(contains_unichar_id(unichar_id));
599  *width = unichars[unichar_id].properties.width;
600  *width_sd = unichars[unichar_id].properties.width_sd;
601  }
602  void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) {
603  unichars[unichar_id].properties.width = width;
604  unichars[unichar_id].properties.width_sd = width_sd;
605  }
606  // Returns the stats of the x-bearing (as mean, sd) of the given unichar
607  // relative to the median advance of all characters in the character set.
608  void get_bearing_stats(UNICHAR_ID unichar_id,
609  float* bearing, float* bearing_sd) const {
610  if (INVALID_UNICHAR_ID == unichar_id) {
611  *bearing = *bearing_sd = 0.0f;
612  return;
613  }
614  ASSERT_HOST(contains_unichar_id(unichar_id));
615  *bearing = unichars[unichar_id].properties.bearing;
616  *bearing_sd = unichars[unichar_id].properties.bearing_sd;
617  }
618  void set_bearing_stats(UNICHAR_ID unichar_id,
619  float bearing, float bearing_sd) {
620  unichars[unichar_id].properties.bearing = bearing;
621  unichars[unichar_id].properties.bearing_sd = bearing_sd;
622  }
623  // Returns the stats of the x-advance of the given unichar (as mean, sd)
624  // relative to the median advance of all characters in the character set.
625  void get_advance_stats(UNICHAR_ID unichar_id,
626  float* advance, float* advance_sd) const {
627  if (INVALID_UNICHAR_ID == unichar_id) {
628  *advance = *advance_sd = 0;
629  return;
630  }
631  ASSERT_HOST(contains_unichar_id(unichar_id));
632  *advance = unichars[unichar_id].properties.advance;
633  *advance_sd = unichars[unichar_id].properties.advance_sd;
634  }
635  void set_advance_stats(UNICHAR_ID unichar_id,
636  float advance, float advance_sd) {
637  unichars[unichar_id].properties.advance = advance;
638  unichars[unichar_id].properties.advance_sd = advance_sd;
639  }
640  // Returns true if the font metrics properties are empty.
641  bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {
642  return unichars[unichar_id].properties.AnyRangeEmpty();
643  }
644 
645  // Returns true if the script of the given id is space delimited.
646  // Returns false for Han, Thai, Hangul, Hiragana and Katakana scripts.
647  bool IsSpaceDelimited(UNICHAR_ID unichar_id) const {
648  if (INVALID_UNICHAR_ID == unichar_id) return true;
649  int script_id = get_script(unichar_id);
650  return script_id != han_sid_ && script_id != thai_sid_ &&
651  script_id != hangul_sid_ && script_id != hiragana_sid_ &&
652  script_id != katakana_sid_;
653  }
654 
655  // Return the script id of the given unichar, or null_sid_ if unichar_id
656  // is INVALID_UNICHAR_ID. The result is an integer id, not a pointer, so
657  // there is nothing for the caller to delete.
658  int get_script(UNICHAR_ID unichar_id) const {
659  if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
660  ASSERT_HOST(contains_unichar_id(unichar_id));
661  return unichars[unichar_id].properties.script_id;
662  }
663 
664  // Return the character properties, eg. alpha/upper/lower/digit/punct,
665  // as a bit field of unsigned int.
666  unsigned int get_properties(UNICHAR_ID unichar_id) const;
667 
668  // Return the character property as a single char. If a character has
669  // multiple attributes, the main property is defined by the following order:
670  // upper_case : 'A'
671  // lower_case : 'a'
672  // alpha : 'x'
673  // digit : '0'
674  // punctuation: 'p'
675  char get_chartype(UNICHAR_ID unichar_id) const;
676 
677  // Get other_case unichar id in the properties for the given unichar id.
678  UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
679  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
680  ASSERT_HOST(contains_unichar_id(unichar_id));
681  return unichars[unichar_id].properties.other_case;
682  }
683 
684  // Returns the direction property of the given unichar.
685  Direction get_direction(UNICHAR_ID unichar_id) const {
686  if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
687  ASSERT_HOST(contains_unichar_id(unichar_id));
688  return unichars[unichar_id].properties.direction;
689  }
690 
691  // Get mirror unichar id in the properties for the given unichar id.
692  UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
693  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
694  ASSERT_HOST(contains_unichar_id(unichar_id));
695  return unichars[unichar_id].properties.mirror;
696  }
697 
698  // Returns UNICHAR_ID of the corresponding lower-case unichar.
699  UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
700  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
701  ASSERT_HOST(contains_unichar_id(unichar_id));
702  if (unichars[unichar_id].properties.islower) return unichar_id;
703  return unichars[unichar_id].properties.other_case;
704  }
705 
706  // Returns UNICHAR_ID of the corresponding upper-case unichar.
707  UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
708  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
709  ASSERT_HOST(contains_unichar_id(unichar_id));
710  if (unichars[unichar_id].properties.isupper) return unichar_id;
711  return unichars[unichar_id].properties.other_case;
712  }
713 
714  // Returns true if this UNICHARSET has the special codes in
715  // SpecialUnicharCodes available. If false then there are normal unichars
716  // at these codes and they should not be used.
717  bool has_special_codes() const {
718  return get_fragment(UNICHAR_BROKEN) != nullptr &&
719  strcmp(id_to_unichar(UNICHAR_BROKEN),
720  kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
721  }
722 
723  // Returns true if there are any repeated unicodes in the normalized
724  // text of any unichar-id in the unicharset.
725  bool AnyRepeatedUnicodes() const;
726 
727  // Return a pointer to the CHAR_FRAGMENT class if the given
728  // unichar id represents a character fragment.
729  const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
730  if (INVALID_UNICHAR_ID == unichar_id) return nullptr;
731  ASSERT_HOST(contains_unichar_id(unichar_id));
732  return unichars[unichar_id].properties.fragment;
733  }
734 
735  // Return the isalpha property of the given unichar representation.
736  bool get_isalpha(const char* const unichar_repr) const {
737  return get_isalpha(unichar_to_id(unichar_repr));
738  }
739 
740  // Return the islower property of the given unichar representation.
741  bool get_islower(const char* const unichar_repr) const {
742  return get_islower(unichar_to_id(unichar_repr));
743  }
744 
745  // Return the isupper property of the given unichar representation.
746  bool get_isupper(const char* const unichar_repr) const {
747  return get_isupper(unichar_to_id(unichar_repr));
748  }
749 
750  // Return the isdigit property of the given unichar representation.
751  bool get_isdigit(const char* const unichar_repr) const {
752  return get_isdigit(unichar_to_id(unichar_repr));
753  }
754 
755  // Return the ispunctuation property of the given unichar representation.
756  bool get_ispunctuation(const char* const unichar_repr) const {
757  return get_ispunctuation(unichar_to_id(unichar_repr));
758  }
759 
760  // Return the character properties, eg. alpha/upper/lower/digit/punct,
761  // of the given unichar representation
762  unsigned int get_properties(const char* const unichar_repr) const {
763  return get_properties(unichar_to_id(unichar_repr));
764  }
765 
766  char get_chartype(const char* const unichar_repr) const {
767  return get_chartype(unichar_to_id(unichar_repr));
768  }
769 
770  // Return the script name of the given unichar representation.
771  // The returned pointer will always be the same for the same script, it's
772  // managed by unicharset and thus MUST NOT be deleted
773  int get_script(const char* const unichar_repr) const {
774  return get_script(unichar_to_id(unichar_repr));
775  }
776 
777  // Return a pointer to the CHAR_FRAGMENT class struct if the given
778  // unichar representation represents a character fragment.
779  const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
780  if (unichar_repr == nullptr || unichar_repr[0] == '\0' ||
781  !ids.contains(unichar_repr, false)) {
782  return nullptr;
783  }
784  return get_fragment(unichar_to_id(unichar_repr));
785  }
786 
787  // Return the isalpha property of the given unichar representation.
788  // Only the first length characters from unichar_repr are used.
789  bool get_isalpha(const char* const unichar_repr,
790  int length) const {
791  return get_isalpha(unichar_to_id(unichar_repr, length));
792  }
793 
794  // Return the islower property of the given unichar representation.
795  // Only the first length characters from unichar_repr are used.
796  bool get_islower(const char* const unichar_repr,
797  int length) const {
798  return get_islower(unichar_to_id(unichar_repr, length));
799  }
800 
801  // Return the isupper property of the given unichar representation.
802  // Only the first length characters from unichar_repr are used.
803  bool get_isupper(const char* const unichar_repr,
804  int length) const {
805  return get_isupper(unichar_to_id(unichar_repr, length));
806  }
807 
808  // Return the isdigit property of the given unichar representation.
809  // Only the first length characters from unichar_repr are used.
810  bool get_isdigit(const char* const unichar_repr,
811  int length) const {
812  return get_isdigit(unichar_to_id(unichar_repr, length));
813  }
814 
815  // Return the ispunctuation property of the given unichar representation.
816  // Only the first length characters from unichar_repr are used.
817  bool get_ispunctuation(const char* const unichar_repr,
818  int length) const {
819  return get_ispunctuation(unichar_to_id(unichar_repr, length));
820  }
821 
822  // Returns normalized version of unichar with the given unichar_id.
823  const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
824  if (unichar_id == UNICHAR_SPACE) return " ";
825  return unichars[unichar_id].properties.normed.string();
826  }
827  // Returns a vector of UNICHAR_IDs that represent the ids of the normalized
828  // version of the given id. There may be more than one UNICHAR_ID in the
829  // vector if unichar_id represents a ligature.
830  const GenericVector<UNICHAR_ID>& normed_ids(UNICHAR_ID unichar_id) const {
831  return unichars[unichar_id].properties.normed_ids;
832  }
833 
834  // Return the script name of the given unichar representation.
835  // Only the first length characters from unichar_repr are used.
836  // The returned pointer will always be the same for the same script, it's
837  // managed by unicharset and thus MUST NOT be deleted
838  int get_script(const char* const unichar_repr,
839  int length) const {
840  return get_script(unichar_to_id(unichar_repr, length));
841  }
842 
843  // Return the (current) number of scripts in the script table
844  int get_script_table_size() const {
845  return script_table_size_used;
846  }
847 
848  // Return the script string from its id
849  const char* get_script_from_script_id(int id) const {
850  if (id >= script_table_size_used || id < 0)
851  return null_script;
852  return script_table[id];
853  }
854 
855  // Returns the id of the script named script_name, or 0 if it is not found.
856  // Note that this is an expensive operation since it involves iteratively
857  // comparing strings in the script table. To avoid a dependency on STL, no
858  // hash is used. Instead, the calling function can use this to look up
859  // and save the ID for relevant scripts for fast comparisons later.
860  int get_script_id_from_name(const char* script_name) const;
861 
862  // Return true if the given script is the null script
863  bool is_null_script(const char* script) const {
864  return script == null_script;
865  }
866 
867  // Uniquify the given script: for two scripts a and b with strcmp(a, b) == 0
868  // the same int is returned (presumably the script id; TODO confirm in .cpp).
869  // The script parameter is copied and thus can be a temporary.
870  int add_script(const char* script);
871 
872  // Return the enabled property of the given unichar.
873  bool get_enabled(UNICHAR_ID unichar_id) const {
874  return unichars[unichar_id].properties.enabled;
875  }
876 
877 
878  int null_sid() const { return null_sid_; }
879  int common_sid() const { return common_sid_; }
880  int latin_sid() const { return latin_sid_; }
881  int cyrillic_sid() const { return cyrillic_sid_; }
882  int greek_sid() const { return greek_sid_; }
883  int han_sid() const { return han_sid_; }
884  int hiragana_sid() const { return hiragana_sid_; }
885  int katakana_sid() const { return katakana_sid_; }
886  int thai_sid() const { return thai_sid_; }
887  int hangul_sid() const { return hangul_sid_; }
888  int default_sid() const { return default_sid_; }
889 
890  // Returns true if the unicharset has the concept of upper/lower case.
891  bool script_has_upper_lower() const {
892  return script_has_upper_lower_;
893  }
894  // Returns true if the unicharset has the concept of x-height.
895  // script_has_xheight can be true even if script_has_upper_lower is not,
896  // when the script has a sufficiently predominant top line with ascenders,
897  // such as Devanagari and Thai.
898  bool script_has_xheight() const {
899  return script_has_xheight_;
900  }
901 
902  private:
903 
906  // Initializes all properties to sensible default values.
907  void Init();
908  // Sets all ranges wide open. Initialization default in case there are
909  // no useful values available.
910  void SetRangesOpen();
911  // Sets all ranges to empty. Used before expanding with font-based data.
912  void SetRangesEmpty();
913  // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
914  // is empty.
915  bool AnyRangeEmpty() const;
916  // Expands the ranges with the ranges from the src properties.
917  void ExpandRangesFrom(const UNICHAR_PROPERTIES& src);
918  // Copies the properties from src into this.
919  void CopyFrom(const UNICHAR_PROPERTIES& src);
920 
921  bool isalpha;
922  bool islower;
923  bool isupper;
924  bool isdigit;
926  bool isngram;
927  bool enabled;
928  // Possible limits of the top and bottom of the bounding box in
929  // baseline-normalized coordinates, ie, where the baseline is
930  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
931  // (See normalis.h for the definitions).
932  uint8_t min_bottom;
933  uint8_t max_bottom;
934  uint8_t min_top;
935  uint8_t max_top;
936  // Statistics of the widths of bounding box, relative to the median advance.
937  float width;
938  float width_sd;
939  // Stats of the x-bearing and advance, also relative to the median advance.
940  float bearing;
941  float bearing_sd;
942  float advance;
943  float advance_sd;
945  UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
946  Direction direction; // direction of this unichar
947  // Mirror property is useful for reverse DAWG lookup for words in
948  // right-to-left languages (e.g. "(word)" would be in
949  // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string.
950  // However, what we want in our DAWG is
951  // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
952  // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
953  UNICHAR_ID mirror;
954  // A string of unichar_ids that represent the corresponding normed string.
955  // For awkward characters like em-dash, this gives hyphen.
956  // For ligatures, this gives the string of normal unichars.
958  STRING normed; // normalized version of this unichar
959  // Contains meta information about the fragment if a unichar represents
960  // a fragment of a character, otherwise should be set to nullptr.
961  // It is assumed that character fragments are added to the unicharset
962  // after the corresponding 'base' characters.
964  };
965 
// Element type of the unichars array. representation is a char buffer of
// UNICHAR_LEN + 1 bytes (presumably the unichar string plus terminator).
// NOTE(review): this listing skips original line 968 between the two lines
// below, so a member may be missing from this view -- confirm in the header.
966  struct UNICHAR_SLOT {
967  char representation[UNICHAR_LEN + 1];
969  };
970 
971  // Internal recursive version of encode_string above.
972  // str is the start of the whole string.
973  // str_index is the current position in str.
974  // str_length is the length of str.
975  // encoding is a working encoding of str.
976  // lengths is a working set of lengths of each element of encoding.
977  // best_total_length is the longest length of str that has been successfully
978  // encoded so far (in/out: read and updated through the pointer).
979  // On return:
980  // best_encoding contains the encoding that used the longest part of str.
981  // best_lengths (may be null) contains the lengths of best_encoding.
982  void encode_string(const char* str, int str_index, int str_length,
983  GenericVector<UNICHAR_ID>* encoding,
984  GenericVector<char>* lengths,
985  int* best_total_length,
986  GenericVector<UNICHAR_ID>* best_encoding,
987  GenericVector<char>* best_lengths) const;
988 
989  // Gets the properties for the grapheme string utf8_str into *props,
990  // combining properties for multiple characters in a meaningful way where
991  // possible. Returns false if no valid match was found in the unicharset.
992  // NOTE that script_id, mirror, and other_case refer to this unicharset on
993  // return and will need redirecting if the target unicharset is different.
994  bool GetStrProperties(const char* utf8_str,
995  UNICHAR_PROPERTIES* props) const;
996 
997  // Loads this unicharset from a "file" reachable only through fgets_cb, an
998  // implementation of fgets(). This is the parsing primitive accessed by
999  // the public routines load_from_file() and load_from_inmemory_file().
// skip_fragments: presumably makes the loader skip fragment entries --
// TODO confirm against the definition in unicharset.cpp.
1000  bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
1001  bool skip_fragments);
1002 
1003  // List of mappings to make when ingesting strings from the outside.
1004  // The substitutions clean up text that should exist for rendering of
1005  // synthetic data, but not in the recognition set.
1006  static const char* kCleanupMaps[][2];
1007  static TESS_API const char* null_script;
1008 
1016  // True if the unichars have their tops/bottoms set.
1018  // True if the unicharset has significant upper/lower case chars.
1020  // True if the unicharset has a significant mean-line with significant
1021  // ascenders above that.
1023  // True if the set contains chars that would be changed by the cleanup.
1025 
1026  // A few convenient script name-to-id mappings without using hash.
1027  // These are initialized when unicharset file is loaded. Anything
1028  // missing from this list can be looked up using get_script_id_from_name.
1039  // The most frequently occurring script in the charset.
1041 };
1042 
1043 #endif // TESSERACT_CCUTIL_UNICHARSET_H_
Direction direction
Definition: unicharset.h:946
int size_reserved
Definition: unicharset.h:1012
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
Definition: unicharset.h:635
UNICHAR_ID mirror
Definition: unicharset.h:953
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
Definition: unicharset.h:618
bool top_bottom_useful() const
Definition: unicharset.h:532
void delete_pointers_in_unichars()
Definition: unicharset.h:293
bool script_has_xheight_
Definition: unicharset.h:1022
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:467
bool get_isalpha(const char *const unichar_repr, int length) const
Definition: unicharset.h:789
bool islower
Definition: unicharset.h:922
STRING debug_str(const char *unichar_repr) const
Definition: unicharset.h:249
bool get_isdigit(const char *const unichar_repr, int length) const
Definition: unicharset.h:810
bool get_isalpha(const char *const unichar_repr) const
Definition: unicharset.h:736
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:625
float width
Definition: unicharset.h:937
int size_used
Definition: unicharset.h:1011
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:279
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:514
bool get_ispunctuation(const char *const unichar_repr, int length) const
Definition: unicharset.h:817
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:477
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:849
bool has_special_codes() const
Definition: unicharset.h:717
bool script_has_upper_lower() const
Definition: unicharset.h:891
int default_sid_
Definition: unicharset.h:1040
int script_table_size_used
Definition: unicharset.h:1014
uint8_t min_top
Definition: unicharset.h:934
bool script_has_upper_lower_
Definition: unicharset.h:1019
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:99
bool load_from_inmemory_file(const char *const memory, int mem_size)
Definition: unicharset.h:376
bool ispunctuation
Definition: unicharset.h:925
int han_sid() const
Definition: unicharset.h:883
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:699
bool equals(const char *other_unichar, int other_pos, int other_total) const
Definition: unicharset.h:86
bool get_isupper(const char *const unichar_repr, int length) const
Definition: unicharset.h:803
uint8_t min_bottom
Definition: unicharset.h:932
bool old_style_included_
Definition: unicharset.h:1024
bool get_isdigit(const char *const unichar_repr) const
Definition: unicharset.h:751
Direction
Definition: unicharset.h:157
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:441
bool save_to_file(tesseract::TFile *file) const
Definition: unicharset.h:361
int null_sid() const
Definition: unicharset.h:878
void set_unichar(const char *uch)
Definition: unicharset.h:65
uint8_t max_top
Definition: unicharset.h:935
const CHAR_FRAGMENT * get_fragment(const char *const unichar_repr) const
Definition: unicharset.h:779
int thai_sid_
Definition: unicharset.h:1037
Definition: unicharset.h:146
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:431
float bearing_sd
Definition: unicharset.h:941
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.h:259
int get_script(const char *const unichar_repr) const
Definition: unicharset.h:773
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:873
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
Definition: unicharset.h:602
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:446
int size() const
Definition: unicharset.h:336
int script_table_size_reserved
Definition: unicharset.h:1015
int32_t length() const
Definition: strngs.cpp:191
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:563
bool isdigit
Definition: unicharset.h:924
Definition: serialis.h:77
float advance
Definition: unicharset.h:942
void SetPropertiesFromOther(const UNICHARSET &src)
Definition: unicharset.h:540
void set_total(int t)
Definition: unicharset.h:70
int cyrillic_sid() const
Definition: unicharset.h:881
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:823
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:500
int null_sid_
Definition: unicharset.h:1029
int hiragana_sid_
Definition: unicharset.h:1035
int get_script_table_size() const
Definition: unicharset.h:844
int script_id
Definition: unicharset.h:944
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:830
bool load_from_file(FILE *file)
Definition: unicharset.h:398
char ** script_table
Definition: unicharset.h:1013
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:241
bool isupper
Definition: unicharset.h:923
float bearing
Definition: unicharset.h:940
int get_total() const
Definition: unicharset.h:73
char get_chartype(const char *const unichar_repr) const
Definition: unicharset.h:766
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:591
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:608
float advance_sd
Definition: unicharset.h:943
int16_t pos
Definition: unicharset.h:139
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:462
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:486
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
Definition: unicharset.h:641
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:678
bool get_islower(const char *const unichar_repr) const
Definition: unicharset.h:741
bool get_isupper(const char *const unichar_repr) const
Definition: unicharset.h:746
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:685
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
Definition: unicharset.h:647
int get_pos() const
Definition: unicharset.h:72
bool equals(const CHAR_FRAGMENT *other) const
Definition: unicharset.h:91
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:472
bool get_ispunctuation(const char *const unichar_repr) const
Definition: unicharset.h:756
int cyrillic_sid_
Definition: unicharset.h:1032
bool get_islower(const char *const unichar_repr, int length) const
Definition: unicharset.h:796
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:451
void set_all(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.h:59
Definition: unicharset.h:168
bool is_natural() const
Definition: unicharset.h:114
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:507
bool Serialize(FILE *fp, const char *data, size_t n)
Definition: serialis.cpp:59
bool script_has_xheight() const
Definition: unicharset.h:898
bool enabled
Definition: unicharset.h:927
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:426
bool top_bottom_set_
Definition: unicharset.h:1017
int latin_sid() const
Definition: unicharset.h:880
Definition: unicharmap.h:27
unsigned int get_properties(const char *const unichar_repr) const
Definition: unicharset.h:762
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:457
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:692
UNICHAR_PROPERTIES properties
Definition: unicharset.h:968
uint8_t max_bottom
Definition: unicharset.h:933
int common_sid_
Definition: unicharset.h:1030
bool natural
Definition: unicharset.h:138
int common_sid() const
Definition: unicharset.h:879
STRING normed
Definition: unicharset.h:958
const char * get_unichar() const
Definition: unicharset.h:71
Definition: strngs.h:45
int default_sid() const
Definition: unicharset.h:888
void set_natural(bool value)
Definition: unicharset.h:115
int greek_sid_
Definition: unicharset.h:1033
bool is_null_script(const char *script) const
Definition: unicharset.h:863
CHAR_FRAGMENT * fragment
Definition: unicharset.h:963
bool isalpha
Definition: unicharset.h:921
UNICHAR_SLOT * unichars
Definition: unicharset.h:1009
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:707
bool is_beginning() const
Definition: unicharset.h:106
int greek_sid() const
Definition: unicharset.h:882
void set_pos(int p)
Definition: unicharset.h:69
STRING to_string() const
Definition: unicharset.h:80
int hangul_sid_
Definition: unicharset.h:1038
bool save_to_file(const char *const filename) const
Definition: unicharset.h:345
bool is_ending() const
Definition: unicharset.h:109
UNICHAR_ID other_case
Definition: unicharset.h:945
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:729
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:521
int katakana_sid() const
Definition: unicharset.h:885
Definition: unicharset.h:966
int get_script(const char *const unichar_repr, int length) const
Definition: unicharset.h:838
int katakana_sid_
Definition: unicharset.h:1036
int thai_sid() const
Definition: unicharset.h:886
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:658
UNICHARMAP ids
Definition: unicharset.h:1010
bool Serialize(const char *data, size_t count=1)
Definition: serialis.cpp:147
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:493
static const char * null_script
Definition: unicharset.h:1007
void clear()
Definition: unicharset.h:301
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:264
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:436
int16_t total
Definition: unicharset.h:140
Definition: blamer.h:43
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:383
bool isngram
Definition: unicharset.h:926
bool save_to_file(FILE *file) const
Definition: unicharset.h:355
GenericVector< UNICHAR_ID > normed_ids
Definition: unicharset.h:957
int hangul_sid() const
Definition: unicharset.h:887
Definition: unicharset.h:904
Definition: unicharset.h:49
int hiragana_sid() const
Definition: unicharset.h:884
float width_sd
Definition: unicharset.h:938
int han_sid_
Definition: unicharset.h:1034
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
Definition: unicharset.h:577
bool load_from_file(const char *const filename)
Definition: unicharset.h:391
int latin_sid_
Definition: unicharset.h:1031