tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
cluster.h
1 /******************************************************************************
2  ** Filename: cluster.h
3  ** Purpose: Definition of feature space clustering routines
4  ** Author: Dan Johnson
5  **
6  ** (c) Copyright Hewlett-Packard Company, 1988.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *****************************************************************************/
17 
18 #ifndef CLUSTER_H
19 #define CLUSTER_H
20 
21 #include "kdtree.h"
22 #include "oldlist.h"
23 
// Histogram bucket type; only used through pointers in this header, so a
// forward declaration is sufficient.
struct BUCKETS;

// Smallest and largest bucket counts covered by the clusterer's histogram
// cache (the cache has MAXBUCKETS + 1 - MINBUCKETS slots per distribution).
#define MINBUCKETS 5
#define MAXBUCKETS 39
28 
29 /*----------------------------------------------------------------------
30  Types
31 ----------------------------------------------------------------------*/
// Node of the cluster tree. The same layout is used for an individual sample
// and for a cluster built from two sub-clusters, hence the SAMPLE alias.
typedef struct sample {
  unsigned Clustered : 1;     // TRUE if included in a higher cluster
  unsigned Prototype : 1;     // TRUE if cluster represented by a proto
  unsigned SampleCount : 30;  // number of samples in this cluster
  struct sample *Left;        // ptr to left sub-cluster
  struct sample *Right;       // ptr to right sub-cluster
  int32_t CharID;             // identifier of char sample came from
  // Mean of cluster - SampleSize floats. Declared with a single element;
  // presumably the node is allocated with extra trailing space (confirm in
  // cluster.cpp). Keep the declared length: changing it alters sizeof.
  float Mean[1];
} CLUSTER;

typedef CLUSTER SAMPLE;  // can refer to as either sample or cluster
43 
// Kinds of prototypes the clusterer can be asked to make
// (see CLUSTERCONFIG::ProtoStyle). Values are the implicit 0..3.
typedef enum {
  spherical,
  elliptical,
  mixed,
  automatic
} PROTOSTYLE;
47 
48 typedef struct { // parameters to control clustering
49  PROTOSTYLE ProtoStyle; // specifies types of protos to be made
50  float MinSamples; // min # of samples per proto - % of total
51  float MaxIllegal; // max percentage of samples in a cluster which
52  // have more than 1 feature in that cluster
53  float Independence; // desired independence between dimensions
54  double Confidence; // desired confidence in prototypes created
55  int MagicSamples; // Ideal number of samples in a cluster.
57 
// Distribution types that can be assigned to a dimension of a prototype.
// DISTRIBUTION_COUNT is not a distribution: it is the number of real
// enumerators and is used as an array dimension (CLUSTERER::bucket_cache),
// so it must stay last.
typedef enum {
  normal,
  uniform,
  D_random,
  DISTRIBUTION_COUNT
} DISTRIBUTION;
61 
// Holds either one float (Spherical) or a pointer to floats (Elliptical).
// The union carries no tag; presumably PROTOTYPE::Style tells which member
// is active - confirm against cluster.cpp.
typedef union {
  float Spherical;
  float *Elliptical;
} FLOATUNION;
66 
67 typedef struct {
68  unsigned Significant:1; // TRUE if prototype is significant
69  unsigned Merged:1; // Merged after clustering so do not output
70  // but kept for display purposes. If it has no
71  // samples then it was actually merged.
72  // Otherwise it matched an already significant
73  // cluster.
74  unsigned Style:2; // spherical, elliptical, or mixed
75  unsigned NumSamples:28; // number of samples in the cluster
76  CLUSTER *Cluster; // ptr to cluster which made prototype
77  DISTRIBUTION *Distrib; // different distribution for each dimension
78  float *Mean; // prototype mean
79  float TotalMagnitude; // total magnitude over all dimensions
80  float LogMagnitude; // log base e of TotalMagnitude
81  FLOATUNION Variance; // prototype variance
82  FLOATUNION Magnitude; // magnitude of density function
83  FLOATUNION Weight; // weight of density function
84 } PROTOTYPE;
85 
86 typedef struct {
87  int16_t SampleSize; // number of parameters per sample
88  PARAM_DESC *ParamDesc; // description of each parameter
89  int32_t NumberOfSamples; // total number of samples being clustered
90  KDTREE *KDTree; // for optimal nearest neighbor searching
91  CLUSTER *Root; // ptr to root cluster of cluster tree
92  LIST ProtoList; // list of prototypes
93  int32_t NumChar; // # of characters represented by samples
94  // cache of reusable histograms by distribution type and number of buckets.
95  BUCKETS* bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS + 1 - MINBUCKETS];
96 } CLUSTERER;
97 
98 typedef struct {
99  int32_t NumSamples; // number of samples in list
100  int32_t MaxNumSamples; // maximum size of list
101  SAMPLE *Sample[1]; // array of ptrs to sample data structures
102 } SAMPLELIST;
103 
// low level cluster tree analysis routines.
// Initializes the search state S for the cluster C: S is assigned NIL_LIST
// when C is nullptr, otherwise the result of pushing C onto NIL_LIST.
// S must be an assignable LIST expression. C may be evaluated twice (once in
// the null test, once in push), so do not pass an expression with side
// effects.
#define InitSampleSearch(S, C) \
  (((C) == nullptr) ? ((S) = NIL_LIST) : ((S) = push(NIL_LIST, (C))))
106 
107 /*--------------------------------------------------------------------------
108  Public Function Prototypes
109 --------------------------------------------------------------------------*/
110 CLUSTERER *MakeClusterer (int16_t SampleSize, const PARAM_DESC ParamDesc[]);
111 
112 SAMPLE *MakeSample(CLUSTERER* Clusterer, const float* Feature, int32_t CharID);
113 
114 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
115 
116 void FreeClusterer(CLUSTERER *Clusterer);
117 
118 void FreeProtoList(LIST *ProtoList);
119 
120 void FreePrototype(void *arg); // PROTOTYPE *Prototype);
121 
122 CLUSTER *NextSample(LIST *SearchState);
123 
124 float Mean(PROTOTYPE *Proto, uint16_t Dimension);
125 
126 float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension);
127 
128 int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2,
129  float m[], float m1[], float m2[]);
130 
131 #endif
unsigned Style
Definition: cluster.h:74
unsigned Prototype
Definition: cluster.h:34
Definition: cluster.h:67
int32_t NumberOfSamples
Definition: cluster.h:89
PARAM_DESC * ParamDesc
Definition: cluster.h:88
unsigned SampleCount
Definition: cluster.h:35
float TotalMagnitude
Definition: cluster.h:79
int32_t CharID
Definition: cluster.h:38
int32_t NumChar
Definition: cluster.h:93
LIST ProtoList
Definition: cluster.h:92
Definition: cluster.h:32
float Independence
Definition: cluster.h:53
float Spherical
Definition: cluster.h:63
float * Elliptical
Definition: cluster.h:64
Definition: cluster.h:98
FLOATUNION Weight
Definition: cluster.h:83
Definition: cluster.h:48
float Mean[1]
Definition: cluster.h:39
unsigned Merged
Definition: cluster.h:69
FLOATUNION Magnitude
Definition: cluster.h:82
int MagicSamples
Definition: cluster.h:55
float LogMagnitude
Definition: cluster.h:80
struct sample * Right
Definition: cluster.h:37
int32_t MaxNumSamples
Definition: cluster.h:100
Definition: cluster.cpp:179
unsigned Significant
Definition: cluster.h:68
DISTRIBUTION * Distrib
Definition: cluster.h:77
unsigned NumSamples
Definition: cluster.h:75
int16_t SampleSize
Definition: cluster.h:87
Definition: oldlist.h:124
CLUSTER * Cluster
Definition: cluster.h:76
double Confidence
Definition: cluster.h:54
float MinSamples
Definition: cluster.h:50
PROTOSTYLE ProtoStyle
Definition: cluster.h:49
unsigned Clustered
Definition: cluster.h:33
struct sample * Left
Definition: cluster.h:36
Definition: ocrfeatures.h:43
KDTREE * KDTree
Definition: cluster.h:90
float MaxIllegal
Definition: cluster.h:51
Definition: cluster.h:62
FLOATUNION Variance
Definition: cluster.h:81
CLUSTER * Root
Definition: cluster.h:91
float * Mean
Definition: cluster.h:78
Definition: cluster.h:86
int32_t NumSamples
Definition: cluster.h:99
Definition: kdtree.h:47