tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
imagedata.h
1 // File: imagedata.h
3 // Description: Class to hold information about a single image and its
4 // corresponding boxes or text file.
5 // Author: Ray Smith
6 // Created: Mon Jul 22 14:17:06 PDT 2013
7 //
8 // (C) Copyright 2013, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
19 
20 #ifndef TESSERACT_IMAGE_IMAGEDATA_H_
21 #define TESSERACT_IMAGE_IMAGEDATA_H_
22 
23 #include "genericvector.h" // for GenericVector, PointerVector, FileReader
24 #include "points.h" // for FCOORD
25 #include "strngs.h" // for STRING
26 #include "svutil.h" // for SVAutoLock, SVMutex
27 
28 class ScrollView;
29 class TBOX;
30 struct Pix;
31 
32 namespace tesseract {
33 
34 class TFile;
35 
// Amount of padding to apply in output pixels in feature mode.
const int kFeaturePadding = 2;
// Number of pixels to pad around text boxes.
const int kImagePadding = 4;
40 
// Enum to determine the caching and data sequencing strategy.
enum CachingStrategy {
  // Reads all of one file before moving on to the next. Requires samples to be
  // shuffled across files. Uses the count of samples in the first file as
  // the count in all the files to achieve high-speed random access. As a
  // consequence, if subsequent files are smaller, they get entries used more
  // than once, and if subsequent files are larger, some entries are not used.
  // Best for larger data sets that don't fit in memory.
  CS_SEQUENTIAL,
  // Reads one sample from each file in rotation. Does not require shuffled
  // samples, but is extremely disk-intensive. Samples in smaller files also
  // get used more often than samples in larger files.
  // Best for smaller data sets that mostly fit in memory.
  CS_ROUND_ROBIN,
};
56 
57 class WordFeature {
58  public:
59  WordFeature();
60  WordFeature(const FCOORD& fcoord, uint8_t dir);
61 
62  // Computes the maximum x and y value in the features.
63  static void ComputeSize(const GenericVector<WordFeature>& features,
64  int* max_x, int* max_y);
65  // Draws the features in the given window.
66  static void Draw(const GenericVector<WordFeature>& features,
67  ScrollView* window);
68 
69  // Accessors.
70  int x() const { return x_; }
71  int y() const { return y_; }
72  int dir() const { return dir_; }
73 
74  // Writes to the given file. Returns false in case of error.
75  bool Serialize(FILE* fp) const;
76  // Reads from the given file. Returns false in case of error.
77  // If swap is true, assumes a big/little-endian swap is needed.
78  bool DeSerialize(bool swap, FILE* fp);
79 
80  private:
81  int16_t x_;
82  uint8_t y_;
83  uint8_t dir_;
84 };
85 
86 // A floating-point version of WordFeature, used as an intermediate during
87 // scaling.
89  static void FromWordFeatures(const GenericVector<WordFeature>& word_features,
90  GenericVector<FloatWordFeature>* float_features);
91  // Sort function to sort first by x-bucket, then by y.
92  static int SortByXBucket(const void*, const void*);
93 
94  float x;
95  float y;
96  float dir;
97  int x_bucket;
98 };
99 
100 // Class to hold information on a single image:
101 // Filename, cached image as a Pix*, character boxes, text transcription.
102 // The text transcription is the ground truth UTF-8 text for the image.
103 // Character boxes are optional and indicate the desired segmentation of
104 // the text into recognition units.
105 class ImageData {
106  public:
107  ImageData();
108  // Takes ownership of the pix.
109  ImageData(bool vertical, Pix* pix);
110  ~ImageData();
111 
112  // Builds and returns an ImageData from the basic data. Note that imagedata,
113  // truth_text, and box_text are all the actual file data, NOT filenames.
114  static ImageData* Build(const char* name, int page_number, const char* lang,
115  const char* imagedata, int imagedatasize,
116  const char* truth_text, const char* box_text);
117 
118  // Writes to the given file. Returns false in case of error.
119  bool Serialize(TFile* fp) const;
120  // Reads from the given file. Returns false in case of error.
121  bool DeSerialize(TFile* fp);
122  // As DeSerialize, but only seeks past the data - hence a static method.
123  static bool SkipDeSerialize(TFile* fp);
124 
125  // Other accessors.
126  const STRING& imagefilename() const {
127  return imagefilename_;
128  }
129  void set_imagefilename(const STRING& name) {
130  imagefilename_ = name;
131  }
132  int page_number() const {
133  return page_number_;
134  }
135  void set_page_number(int num) {
136  page_number_ = num;
137  }
139  return image_data_;
140  }
141  const STRING& language() const {
142  return language_;
143  }
144  void set_language(const STRING& lang) {
145  language_ = lang;
146  }
147  const STRING& transcription() const {
148  return transcription_;
149  }
150  const GenericVector<TBOX>& boxes() const {
151  return boxes_;
152  }
154  return box_texts_;
155  }
156  const STRING& box_text(int index) const {
157  return box_texts_[index];
158  }
159  // Saves the given Pix as a PNG-encoded string and destroys it.
160  void SetPix(Pix* pix);
161  // Returns the Pix image for *this. Must be pixDestroyed after use.
162  Pix* GetPix() const;
163  // Gets anything and everything with a non-nullptr pointer, prescaled to a
164  // given target_height (if 0, then the original image height), and aligned.
165  // Also returns (if not nullptr) the width and height of the scaled image.
166  // The return value is the scaled Pix, which must be pixDestroyed after use,
167  // and scale_factor (if not nullptr) is set to the scale factor that was applied
168  // to the image to achieve the target_height.
169  Pix* PreScale(int target_height, int max_height, float* scale_factor,
170  int* scaled_width, int* scaled_height,
171  GenericVector<TBOX>* boxes) const;
172 
173  int MemoryUsed() const;
174 
175  // Draws the data in a new window.
176  void Display() const;
177 
178  // Adds the supplied boxes and transcriptions that correspond to the correct
179  // page number.
180  void AddBoxes(const GenericVector<TBOX>& boxes,
181  const GenericVector<STRING>& texts,
182  const GenericVector<int>& box_pages);
183 
184  private:
185  // Saves the given Pix as a PNG-encoded string and destroys it.
186  static void SetPixInternal(Pix* pix, GenericVector<char>* image_data);
187  // Returns the Pix image for the image_data. Must be pixDestroyed after use.
188  static Pix* GetPixInternal(const GenericVector<char>& image_data);
189  // Parses the text string as a box file and adds any discovered boxes that
190  // match the page number. Returns false on error.
191  bool AddBoxes(const char* box_text);
192 
193  private:
194  STRING imagefilename_; // File to read image from.
195  int32_t page_number_; // Page number if multi-page tif or -1.
196  GenericVector<char> image_data_; // PNG file data.
197  STRING language_; // Language code for image.
198  STRING transcription_; // UTF-8 ground truth of image.
199  GenericVector<TBOX> boxes_; // If non-empty boxes of the image.
200  GenericVector<STRING> box_texts_; // String for text in each box.
201  bool vertical_text_; // Image has been rotated from vertical.
202 };
203 
204 // A collection of ImageData that knows roughly how much memory it is using.
206  friend void* ReCachePagesFunc(void* data);
207 
208  public:
209  explicit DocumentData(const STRING& name);
210  ~DocumentData();
211 
212  // Reads all the pages in the given lstmf filename to the cache. The reader
213  // is used to read the file.
214  bool LoadDocument(const char* filename, int start_page, int64_t max_memory,
215  FileReader reader);
216  // Sets up the document, without actually loading it.
217  void SetDocument(const char* filename, int64_t max_memory, FileReader reader);
218  // Writes all the pages to the given filename. Returns false on error.
219  bool SaveDocument(const char* filename, FileWriter writer);
220  bool SaveToBuffer(GenericVector<char>* buffer);
221 
222  // Adds the given page data to this document, counting up memory.
223  void AddPageToDocument(ImageData* page);
224 
225  const STRING& document_name() const {
226  SVAutoLock lock(&general_mutex_);
227  return document_name_;
228  }
229  int NumPages() const {
230  SVAutoLock lock(&general_mutex_);
231  return total_pages_;
232  }
233  int64_t memory_used() const {
234  SVAutoLock lock(&general_mutex_);
235  return memory_used_;
236  }
237  // If the given index is not currently loaded, loads it using a separate
238  // thread. Note: there are 4 cases:
239  // Document uncached: IsCached() returns false, total_pages_ < 0.
240  // Required page is available: IsPageAvailable returns true. In this case,
241  // total_pages_ > 0 and
242  // pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size()
243  // Pages are loaded, but the required one is not.
244  // The requested page is being loaded by LoadPageInBackground. In this case,
245  // index == pages_offset_. Once the loading starts, the pages lock is held
246  // until it completes, at which point IsPageAvailable will unblock and return
247  // true.
248  void LoadPageInBackground(int index);
249  // Returns a pointer to the page with the given index, modulo the total
250  // number of pages. Blocks until the background load is completed.
251  const ImageData* GetPage(int index);
252  // Returns true if the requested page is available, and provides a pointer,
253  // which may be nullptr if the document is empty. May block, even though it
254  // doesn't guarantee to return true.
255  bool IsPageAvailable(int index, ImageData** page);
256  // Takes ownership of the given page index. The page is made nullptr in *this.
257  ImageData* TakePage(int index) {
258  SVAutoLock lock(&pages_mutex_);
259  ImageData* page = pages_[index];
260  pages_[index] = nullptr;
261  return page;
262  }
263  // Returns true if the document is currently loaded or in the process of
264  // loading.
265  bool IsCached() const { return NumPages() >= 0; }
266  // Removes all pages from memory and frees the memory, but does not forget
267  // the document metadata. Returns the memory saved.
268  int64_t UnCache();
269  // Shuffles all the pages in the document.
270  void Shuffle();
271 
272  private:
273  // Sets the value of total_pages_ behind a mutex.
274  void set_total_pages(int total) {
275  SVAutoLock lock(&general_mutex_);
276  total_pages_ = total;
277  }
278  void set_memory_used(int64_t memory_used) {
279  SVAutoLock lock(&general_mutex_);
280  memory_used_ = memory_used;
281  }
282  // Locks the pages_mutex_ and Loads as many pages can fit in max_memory_
283  // starting at index pages_offset_.
284  bool ReCachePages();
285 
286  private:
287  // A name for this document.
289  // A group of pages that corresponds in some loose way to a document.
291  // Page number of the first index in pages_.
293  // Total number of pages in document (may exceed size of pages_.)
295  // Total of all pix sizes in the document.
296  int64_t memory_used_;
297  // Max memory to use at any time.
298  int64_t max_memory_;
299  // Saved reader from LoadDocument to allow re-caching.
301  // Mutex that protects pages_ and pages_offset_ against multiple parallel
302  // loads, and provides a wait for page.
304  // Mutex that protects other data members that callers want to access without
305  // waiting for a load operation.
307 };
308 
309 // A collection of DocumentData that knows roughly how much memory it is using.
310 // Note that while it supports background read-ahead, it assumes that a single
311 // thread is accessing documents, ie it is not safe for multiple threads to
312 // access different documents in parallel, as one may de-cache the other's
313 // content.
315  public:
316  explicit DocumentCache(int64_t max_memory);
317  ~DocumentCache();
318 
319  // Deletes all existing documents from the cache.
320  void Clear() {
321  documents_.clear();
322  num_pages_per_doc_ = 0;
323  }
324  // Adds all the documents in the list of filenames, counting memory.
325  // The reader is used to read the files.
326  bool LoadDocuments(const GenericVector<STRING>& filenames,
327  CachingStrategy cache_strategy, FileReader reader);
328 
329  // Adds document to the cache.
330  bool AddToCache(DocumentData* data);
331 
332  // Finds and returns a document by name.
333  DocumentData* FindDocument(const STRING& document_name) const;
334 
335  // Returns a page by serial number using the current cache_strategy_ to
336  // determine the mapping from serial number to page.
337  const ImageData* GetPageBySerial(int serial) {
338  if (cache_strategy_ == CS_SEQUENTIAL)
339  return GetPageSequential(serial);
340  else
341  return GetPageRoundRobin(serial);
342  }
343 
345  return documents_;
346  }
347  // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
348  // strategy, could take a long time.
349  int TotalPages();
350 
351  private:
352  // Returns a page by serial number, selecting them in a round-robin fashion
353  // from all the documents. Highly disk-intensive, but doesn't need samples
354  // to be shuffled between files to begin with.
355  const ImageData* GetPageRoundRobin(int serial);
356  // Returns a page by serial number, selecting them in sequence from each file.
357  // Requires the samples to be shuffled between the files to give a random or
358  // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
359  const ImageData* GetPageSequential(int serial);
360 
361  // Helper counts the number of adjacent cached neighbour documents_ of index
362  // looking in direction dir, ie index+dir, index+2*dir etc.
363  int CountNeighbourDocs(int index, int dir);
364 
365  // A group of pages that corresponds in some loose way to a document.
367  // Strategy to use for caching and serializing data samples.
369  // Number of pages in the first document, used as a divisor in
370  // GetPageSequential to determine the document index.
372  // Max memory allowed in this cache.
373  int64_t max_memory_;
374 };
375 
376 } // namespace tesseract
377 
378 
379 #endif // TESSERACT_IMAGE_IMAGEDATA_H_
GenericVector< char > image_data_
Definition: imagedata.h:196
void Clear()
Definition: imagedata.h:320
int64_t max_memory_
Definition: imagedata.h:373
const STRING & document_name() const
Definition: imagedata.h:225
const STRING & transcription() const
Definition: imagedata.h:147
void * ReCachePagesFunc(void *data)
Definition: imagedata.cpp:369
ImageData * TakePage(int index)
Definition: imagedata.h:257
int NumPages() const
Definition: imagedata.h:229
PointerVector< DocumentData > documents_
Definition: imagedata.h:366
int16_t x_
Definition: imagedata.h:81
STRING document_name_
Definition: imagedata.h:288
STRING language_
Definition: imagedata.h:197
CachingStrategy
Definition: imagedata.h:42
Definition: imagedata.h:105
GenericVector< STRING > box_texts_
Definition: imagedata.h:200
const GenericVector< char > & image_data() const
Definition: imagedata.h:138
const GenericVector< TBOX > & boxes() const
Definition: imagedata.h:150
bool Serialize(FILE *fp) const
Definition: imagedata.cpp:86
int page_number() const
Definition: imagedata.h:132
void set_page_number(int num)
Definition: imagedata.h:135
Definition: rect.h:34
const STRING & box_text(int index) const
Definition: imagedata.h:156
Definition: imagedata.h:314
float x
Definition: imagedata.h:94
GenericVector< TBOX > boxes_
Definition: imagedata.h:199
const STRING & imagefilename() const
Definition: imagedata.h:126
bool vertical_text_
Definition: imagedata.h:201
bool(* FileReader)(const STRING &filename, GenericVector< char > *data)
Definition: genericvector.h:360
Definition: serialis.h:77
int x() const
Definition: imagedata.h:70
const int kImagePadding
Definition: imagedata.h:39
const GenericVector< STRING > & box_texts() const
Definition: imagedata.h:153
Definition: baseapi.cpp:94
const int kFeaturePadding
Definition: imagedata.h:37
PointerVector< ImageData > pages_
Definition: imagedata.h:290
int32_t page_number_
Definition: imagedata.h:195
int total_pages_
Definition: imagedata.h:294
bool(* FileWriter)(const GenericVector< char > &data, const STRING &filename)
Definition: genericvector.h:363
SVMutex general_mutex_
Definition: imagedata.h:306
Definition: scrollview.h:102
bool DeSerialize(bool swap, FILE *fp)
Definition: imagedata.cpp:93
int dir() const
Definition: imagedata.h:72
uint8_t dir_
Definition: imagedata.h:83
bool IsCached() const
Definition: imagedata.h:265
int64_t memory_used_
Definition: imagedata.h:296
const PointerVector< DocumentData > & documents() const
Definition: imagedata.h:344
Definition: imagedata.h:54
Definition: baseapi.h:37
Definition: svutil.h:78
STRING imagefilename_
Definition: imagedata.h:194
float y
Definition: imagedata.h:95
void set_language(const STRING &lang)
Definition: imagedata.h:144
static void ComputeSize(const GenericVector< WordFeature > &features, int *max_x, int *max_y)
Definition: imagedata.cpp:58
Definition: strngs.h:45
Definition: imagedata.h:88
void set_memory_used(int64_t memory_used)
Definition: imagedata.h:278
float dir
Definition: imagedata.h:96
const STRING & language() const
Definition: imagedata.h:141
FileReader reader_
Definition: imagedata.h:300
Definition: genericvector.h:457
Definition: svutil.h:96
int64_t memory_used() const
Definition: imagedata.h:233
Definition: imagedata.h:49
const ImageData * GetPageBySerial(int serial)
Definition: imagedata.h:337
int x_bucket
Definition: imagedata.h:97
Definition: imagedata.h:205
STRING transcription_
Definition: imagedata.h:198
CachingStrategy cache_strategy_
Definition: imagedata.h:368
void set_imagefilename(const STRING &name)
Definition: imagedata.h:129
int64_t max_memory_
Definition: imagedata.h:298
SVMutex pages_mutex_
Definition: imagedata.h:303
int y() const
Definition: imagedata.h:71
static void Draw(const GenericVector< WordFeature > &features, ScrollView *window)
Definition: imagedata.cpp:69
Definition: imagedata.h:57
WordFeature()
Definition: imagedata.cpp:48
int pages_offset_
Definition: imagedata.h:292
Definition: points.h:189
void set_total_pages(int total)
Definition: imagedata.h:274
int num_pages_per_doc_
Definition: imagedata.h:371
uint8_t y_
Definition: imagedata.h:82