tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
commontraining.h
1 // Copyright 2008 Google Inc. All Rights Reserved.
2 // Author: scharron@google.com (Samuel Charron)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 #ifndef TESSERACT_TRAINING_COMMONTRAINING_H_
15 #define TESSERACT_TRAINING_COMMONTRAINING_H_
16 
17 #ifdef HAVE_CONFIG_H
18 #include "config_auto.h"
19 #include "baseapi.h"
20 #endif
21 
22 #ifdef DISABLED_LEGACY_ENGINE
23 
24 #include "tprintf.h"
25 #include "commandlineflags.h"
26 
27 
28 void ParseArguments(int* argc, char*** argv);
29 
30 
31 namespace tesseract {
32 
33 // Aborts the program (exit status 1) when TessBaseAPI::Version(), as reported
34 // by the shared tesseract library, differs from TESSERACT_VERSION_STR. Must stay
35 // inline: if compiled into the library it could only compare it with itself.
36 static inline void CheckSharedLibraryVersion()
37 {
38 #ifdef HAVE_CONFIG_H  // Without HAVE_CONFIG_H this function does nothing.
39   if (strcmp(TESSERACT_VERSION_STR, TessBaseAPI::Version()) != 0) {
40     tprintf("ERROR: shared library version mismatch (was %s, expected %s)\n"
41             "Did you use a wrong shared tesseract library?\n",
42             TessBaseAPI::Version(), TESSERACT_VERSION_STR);
43     exit(1);
44   }
45 #endif
46 }
47 
48 } // namespace tesseract
49 
50 
51 #else
52 
53 #include "cluster.h"
54 #include "commandlineflags.h"
55 #include "featdefs.h"
56 #include "intproto.h"
57 #include "oldlist.h"
58 
59 namespace tesseract {
60 class Classify;
61 class MasterTrainer;
62 class ShapeTable;
63 }
64 
66 // Globals ///////////////////////////////////////////////////////////////////
68 
69 extern FEATURE_DEFS_STRUCT feature_defs;
70 
71 // Must be defined in the file that "implements" commonTraining facilities.
72 extern CLUSTERCONFIG Config;
73 
75 // Structs ///////////////////////////////////////////////////////////////////
77 typedef struct
78 {
79  char *Label;
83 }
85 
86 typedef struct
87 {
88  char* Label;
89  int NumMerged[MAX_NUM_PROTOS];
93 
94 
96 // Functions /////////////////////////////////////////////////////////////////
98 void ParseArguments(int* argc, char*** argv);
99 
100 namespace tesseract {
101 
102 // Aborts the program (exit status 1) when TessBaseAPI::Version(), as reported
103 // by the shared tesseract library, differs from TESSERACT_VERSION_STR. Must stay
104 // inline: if compiled into the library it could only compare it with itself.
105 static inline void CheckSharedLibraryVersion()
106 {
107 #ifdef HAVE_CONFIG_H  // Without HAVE_CONFIG_H this function does nothing.
108   if (strcmp(TESSERACT_VERSION_STR, TessBaseAPI::Version()) != 0) {
109     tprintf("ERROR: shared library version mismatch (was %s, expected %s)\n"
110             "Did you use a wrong shared tesseract library?\n",
111             TessBaseAPI::Version(), TESSERACT_VERSION_STR);
112     exit(1);
113   }
114 #endif
115 }
116 
117 // Helper loads shape table from the given file.
118 ShapeTable* LoadShapeTable(const STRING& file_prefix);
119 // Helper to write the shape_table.
120 void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table);
121 
122 // Creates a MasterTrainer and loads the training data into it:
123 // Initializes feature_defs and IntegerFX.
124 // Loads the shape_table if shape_table != nullptr.
125 // Loads initial unicharset from -U command-line option.
126 // If FLAGS_input_trainer is set, loads the majority of data from there, else:
127 // Loads font info from -F option.
128 // Loads xheights from -X option.
129 // Loads samples from .tr files in remaining command-line args.
130 // Deletes outliers and computes canonical samples.
131 // If FLAGS_output_trainer is set, saves the trainer for future use.
132 // Computes canonical and cloud features.
133 // If shape_table is not nullptr, but failed to load, make a fake flat one,
134 // as shape clustering was not run.
135 MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
136  bool replication,
137  ShapeTable** shape_table,
138  STRING* file_prefix);
139 } // namespace tesseract.
140 
141 const char *GetNextFilename(int argc, const char* const * argv);
142 
143 LABELEDLIST FindList(
144  LIST List,
145  char *Label);
146 
147 LABELEDLIST NewLabeledList(
148  const char *Label);
149 
150 void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs,
151  const char *feature_name, int max_samples,
152  UNICHARSET* unicharset,
153  FILE* file, LIST* training_samples);
154 
155 void WriteTrainingSamples(
156  const FEATURE_DEFS_STRUCT &FeatureDefs,
157  char *Directory,
158  LIST CharList,
159  const char *program_feature_type);
160 
161 void FreeTrainingSamples(
162  LIST CharList);
163 
164 void FreeLabeledList(
165  LABELEDLIST LabeledList);
166 
167 void FreeLabeledClassList(
168  LIST ClassListList);
169 
170 CLUSTERER *SetUpForClustering(
171  const FEATURE_DEFS_STRUCT &FeatureDefs,
172  LABELEDLIST CharSample,
173  const char *program_feature_type);
174 
175 LIST RemoveInsignificantProtos(
176  LIST ProtoList,
177  bool KeepSigProtos,
178  bool KeepInsigProtos,
179  int N);
180 
181 void CleanUpUnusedData(
182  LIST ProtoList);
183 
184 void MergeInsignificantProtos(
185  LIST ProtoList,
186  const char *label,
187  CLUSTERER *Clusterer,
188  CLUSTERCONFIG *Config);
189 
190 MERGE_CLASS FindClass(
191  LIST List,
192  const char *Label);
193 
194 MERGE_CLASS NewLabeledClass(
195  const char *Label);
196 
197 void FreeTrainingSamples(  // NOTE(review): repeats the declaration at listing lines 161-162; redundant.
198  LIST CharList);
199 
200 CLASS_STRUCT* SetUpForFloat2Int(const UNICHARSET& unicharset,
201  LIST LabeledClassList);
202 
203 void Normalize(
204  float *Values);
205 
206 void FreeNormProtoList(
207  LIST CharList);
208 
209 void AddToNormProtosList(
210  LIST* NormProtoList,
211  LIST ProtoList,
212  char *CharName);
213 
214 int NumberOfProtos(
215  LIST ProtoList,
216  bool CountSigProtos,
217  bool CountInsigProtos);
218 
219 
220 void allocNormProtos();
221 
222 #endif // def DISABLED_LEGACY_ENGINE
223 
224 #endif // TESSERACT_TRAINING_COMMONTRAINING_H_
Definition: mastertrainer.h:69
char * Label
Definition: commontraining.h:88
ShapeTable * LoadShapeTable(const STRING &file_prefix)
Definition: commontraining.cpp:162
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
Definition: commontraining.cpp:219
char * Label
Definition: commontraining.h:79
Definition: commontraining.h:77
static const char * Version()
Definition: baseapi.cpp:223
Definition: unicharset.h:146
Definition: baseapi.cpp:94
Definition: cluster.h:48
Definition: featdefs.h:46
static void CheckSharedLibraryVersion()
Definition: commontraining.h:105
Definition: commontraining.h:86
Definition: strngs.h:45
Definition: shapetable.h:262
int SampleCount
Definition: commontraining.h:80
Definition: protos.h:53
Definition: oldlist.h:124
int font_sample_count
Definition: commontraining.h:81
LIST List
Definition: commontraining.h:82
Definition: cluster.h:86
CLASS_TYPE Class
Definition: commontraining.h:90
void WriteShapeTable(const STRING &file_prefix, const ShapeTable &shape_table)
Definition: commontraining.cpp:187