tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
pango_font_info.h
1 /**********************************************************************
2  * File: pango_font_info.h
3  * Description: Font-related objects and helper functions
4  * Author: Ranjith Unnikrishnan
5  * Created: Mon Nov 18 2013
6  *
7  * (C) Copyright 2013, Google Inc.
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  * http://www.apache.org/licenses/LICENSE-2.0
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifndef TESSERACT_TRAINING_PANGO_FONT_INFO_H_
21 #define TESSERACT_TRAINING_PANGO_FONT_INFO_H_
22 
23 #include <string>
24 #include <unordered_map>
25 #include <utility>
26 #include <vector>
27 
28 #include "commandlineflags.h"
29 #include "host.h"
30 #include "pango/pango-font.h"
31 #include "pango/pango.h"
32 #include "pango/pangocairo.h"
33 #include "util.h"
34 
35 DECLARE_STRING_PARAM_FLAG(fonts_dir);
36 DECLARE_STRING_PARAM_FLAG(fontconfig_tmpdir);
37 
38 using char32 = signed int;
39 
40 namespace tesseract {
41 
42 // Data holder class for a font, intended to avoid having to work with Pango or
43 // FontConfig-specific objects directly.
45  public:
46  enum FontTypeEnum {
51  };
52  PangoFontInfo();
54  // Initialize from parsing a font description name, defined as a string of the
55  // format:
56  // "FamilyName [FaceName] [PointSize]"
57  // where a missing FaceName implies the default regular face.
58  // eg. "Arial Italic 12", "Verdana"
59  //
60  // FaceName is a combination of:
61  // [StyleName] [Variant] [Weight] [Stretch]
62  // with (all optional) Pango-defined values of:
63  // StyleName: Oblique, Italic
64  // Variant : Small-Caps
65  // Weight : Ultra-Light, Light, Medium, Semi-Bold, Bold, Ultra-Bold, Heavy
66  // Stretch : Ultra-Condensed, Extra-Condensed, Condensed, Semi-Condensed,
67  // Semi-Expanded, Expanded, Extra-Expanded, Ultra-Expanded.
68  explicit PangoFontInfo(const std::string& name);
69  bool ParseFontDescriptionName(const std::string& name);
70 
71  // Returns true if the font has codepoint coverage for the specified text.
72  bool CoversUTF8Text(const char* utf8_text, int byte_length) const;
73  // Modifies the string to remove Unicode code points that are not covered by
74  // the font. Returns the number of characters dropped.
75  int DropUncoveredChars(std::string* utf8_text) const;
76 
77  // Returns true if the entire string can be rendered by the font with full
78  // character coverage and no unknown glyph or dotted-circle glyph
79  // substitutions on encountering a badly formed unicode sequence.
80  // If true, returns individual graphemes. Any whitespace characters in the
81  // original string are also included in the list.
82  bool CanRenderString(const char* utf8_word, int len,
83  std::vector<std::string>* graphemes) const;
84  bool CanRenderString(const char* utf8_word, int len) const;
85 
86  // Retrieves the x_bearing and x_advance for the given utf8 character in the
87  // font. Returns false if the glyph for the character could not be found in
88  // the font.
89  // Ref: http://freetype.sourceforge.net/freetype2/docs/glyphs/glyphs-3.html
90  bool GetSpacingProperties(const std::string& utf8_char,
91  int* x_bearing, int* x_advance) const;
92 
93  // If not already initialized, initializes FontConfig by setting its
94  // environment variable and creating a fonts.conf file that points to the
95  // FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir.
96  static void SoftInitFontConfig();
97  // Re-initializes font config, whether or not already initialized.
98  // If already initialized, any existing cache is deleted, just to be sure.
99  static void HardInitFontConfig(const std::string& fonts_dir,
100  const std::string& cache_dir);
101 
102  // Accessors
103  std::string DescriptionName() const;
104  // Font Family name eg. "Arial"
105  const std::string& family_name() const { return family_name_; }
106  // Size in points (1/72"), rounded to the nearest integer.
107  int font_size() const { return font_size_; }
108  FontTypeEnum font_type() const { return font_type_; }
109 
110  int resolution() const { return resolution_; }
111  void set_resolution(const int resolution) {
113  }
114 
115  private:
116  friend class FontUtils;
117  void Clear();
118  bool ParseFontDescription(const PangoFontDescription* desc);
119  // Returns the PangoFont structure corresponding to the closest available font
120  // in the font map.
121  PangoFont* ToPangoFont() const;
122 
123  // Font properties set automatically from parsing the font description name.
124  std::string family_name_;
127  // The Pango description that was used to initialize the instance.
128  PangoFontDescription* desc_;
129  // Default output resolution to assume for GetSpacingProperties() and any
130  // other methods that return pixel values.
132  // Fontconfig operates through an environment variable, so it intrinsically
133  // cannot be thread-friendly, but you can serialize multiple independent
134  // font configurations by calling HardInitFontConfig(fonts_dir, cache_dir).
135  // These hold the last initialized values set by HardInitFontConfig or
136  // the first call to SoftInitFontConfig.
137  // Directory to be scanned for font files.
138  static std::string fonts_dir_;
139  // Directory to store the cache of font information. (Can be the same as
140  // fonts_dir_)
141  static std::string cache_dir_;
142 
143  private:
145  void operator=(const PangoFontInfo&);
146 };
147 
148 // Static utility methods for querying font availability and font-selection
149 // based on codepoint coverage.
150 class FontUtils {
151  public:
152  // Returns true if the font of the given description name is available in the
153  // target directory specified by --fonts_dir. Forwards to the two-argument overload with a null best_match.
154  static bool IsAvailableFont(const char* font_desc) {
155  return IsAvailableFont(font_desc, nullptr);
156  }
157  // Returns true if the font of the given description name is available in the
158  // target directory specified by --fonts_dir. If false is returned, and
159  // best_match is not nullptr, the closest matching font is returned there.
160  static bool IsAvailableFont(const char* font_desc, std::string* best_match);
161  // Returns a reference to the list of description names of available fonts.
162  static const std::vector<std::string>& ListAvailableFonts();
163 
164  // Picks font among available fonts that covers and can render the given word,
165  // and returns the font description name and the decomposition of the word to
166  // graphemes. Returns false if no suitable font was found.
167  static bool SelectFont(const char* utf8_word, const int utf8_len,
168  std::string* font_name, std::vector<std::string>* graphemes);
169 
170  // Picks font among all_fonts that covers and can render the given word,
171  // and returns the font description name and the decomposition of the word to
172  // graphemes. Returns false if no suitable font was found.
173  static bool SelectFont(const char* utf8_word, const int utf8_len,
174  const std::vector<std::string>& all_fonts,
175  std::string* font_name, std::vector<std::string>* graphemes);
176 
177  // Outputs, via unichar_bitmap, a bitmask where the value of true at index 'n'
178  // implies that unicode value 'n' is renderable by at least one available font.
179  static void GetAllRenderableCharacters(std::vector<bool>* unichar_bitmap);
180  // Variants of the above that take the font(s) to inspect: a list of font names, or a single font name.
181  static void GetAllRenderableCharacters(const std::vector<std::string>& font_names,
182  std::vector<bool>* unichar_bitmap);
183  static void GetAllRenderableCharacters(const std::string& font_name,
184  std::vector<bool>* unichar_bitmap);
185 
186  // NOTE: The following utilities were written to be backward compatible with
187  // StringRender.
188 
189  // BestFonts returns a font name and a bit vector of the characters it
190  // can render for the fonts that score within some fraction of the best
191  // font on the characters in the given hash map.
192  // In the flags vector, each flag is set according to whether the
193  // corresponding character (in order of iterating ch_map) can be rendered.
194  // The return string is a list of the acceptable fonts that were used.
195  static std::string BestFonts(
196  const std::unordered_map<char32, int64_t>& ch_map,
197  std::vector<std::pair<const char*, std::vector<bool> > >* font_flag);
198 
199  // FontScore returns the weighted renderability score of the given
200  // hash map character table in the given font. The unweighted score
201  // is also returned in raw_score.
202  // The values in the bool vector ch_flags correspond to whether the
203  // corresponding character (in order of iterating ch_map) can be rendered.
204  static int FontScore(const std::unordered_map<char32, int64_t>& ch_map,
205  const std::string& fontname, int* raw_score,
206  std::vector<bool>* ch_flags);
207 
208  // PangoFontInfo is reinitialized, so clear the static list of fonts.
209  static void ReInit();
210  static void PangoFontTypeInfo();  // NOTE(review): undocumented; purpose is not evident from this header - confirm against the .cpp.
211 
212  private:
213  static std::vector<std::string> available_fonts_; // Cached list of fonts; presumably what ListAvailableFonts() returns and ReInit() clears - TODO confirm.
214 };
215 } // namespace tesseract
216 
217 #endif // TESSERACT_TRAINING_PANGO_FONT_INFO_H_
int resolution_
Definition: pango_font_info.h:131
bool ParseFontDescription(const PangoFontDescription *desc)
Definition: pango_font_info.cpp:171
FontTypeEnum
Definition: pango_font_info.h:46
bool CoversUTF8Text(const char *utf8_text, int byte_length) const
Definition: pango_font_info.cpp:218
static std::vector< std::string > available_fonts_
Definition: pango_font_info.h:213
void set_resolution(const int resolution)
Definition: pango_font_info.h:111
int resolution() const
Definition: pango_font_info.h:110
const std::string & family_name() const
Definition: pango_font_info.h:105
PangoFont * ToPangoFont() const
Definition: pango_font_info.cpp:203
bool ParseFontDescriptionName(const std::string &name)
Definition: pango_font_info.cpp:193
static std::string fonts_dir_
Definition: pango_font_info.h:138
FontTypeEnum font_type() const
Definition: pango_font_info.h:108
static bool IsAvailableFont(const char *font_desc)
Definition: pango_font_info.h:154
static std::string cache_dir_
Definition: pango_font_info.h:141
static void HardInitFontConfig(const std::string &fonts_dir, const std::string &cache_dir)
Definition: pango_font_info.cpp:123
Definition: pango_font_info.h:47
int font_size_
Definition: pango_font_info.h:125
Definition: baseapi.cpp:94
static void SoftInitFontConfig()
Definition: pango_font_info.cpp:113
int DropUncoveredChars(std::string *utf8_text) const
Definition: pango_font_info.cpp:261
Definition: pango_font_info.h:48
Definition: pango_font_info.h:44
PangoFontInfo()
Definition: pango_font_info.cpp:76
Definition: pango_font_info.h:150
std::string family_name_
Definition: pango_font_info.h:124
PangoFontDescription * desc_
Definition: pango_font_info.h:128
Definition: pango_font_info.h:49
std::string DescriptionName() const
Definition: pango_font_info.cpp:101
void Clear()
Definition: pango_font_info.cpp:89
void operator=(const PangoFontInfo &)
bool CanRenderString(const char *utf8_word, int len, std::vector< std::string > *graphemes) const
Definition: pango_font_info.cpp:349
bool GetSpacingProperties(const std::string &utf8_char, int *x_bearing, int *x_advance) const
Definition: pango_font_info.cpp:304
Definition: pango_font_info.h:50
int font_size() const
Definition: pango_font_info.h:107
~PangoFontInfo()
Definition: pango_font_info.cpp:99
FontTypeEnum font_type_
Definition: pango_font_info.h:126