tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
statistc.h
1 /**********************************************************************
2  * File: statistc.h (Formerly stats.h)
3  * Description: Class description for STATS class.
4  * Author: Ray Smith
5  * Created: Mon Feb 04 16:19:07 GMT 1991
6  *
7  * (C) Copyright 1991, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifndef TESSERACT_CCSTRUCT_STATISTC_H_
21 #define TESSERACT_CCSTRUCT_STATISTC_H_
22 
23 #include <cstdio>
24 #include "host.h"
25 #include "kdpair.h"
26 #include "scrollview.h"
27 
28 template <typename T> class GenericVector;
29 
30 
31 // Simple histogram-based statistics for integer values in a known
32 // range, such that the range is small compared to the number of samples.
33 class STATS {
34  public:
35  // The histogram buckets are in the range
36  // [min_bucket_value, max_bucket_value_plus_1 - 1] i.e.
37  // [min_bucket_value, max_bucket_value].
38  // Any data under min_bucket value is silently mapped to min_bucket_value,
39  // and likewise, any data over max_bucket_value is silently mapped to
40  // max_bucket_value.
41  // In the internal array, min_bucket_value maps to 0 and
42  // max_bucket_value_plus_1 - min_bucket_value to the array size.
43  // TODO(rays) This is ugly. Convert the second argument to
44  // max_bucket_value and all the code that uses it.
45  STATS(int32_t min_bucket_value, int32_t max_bucket_value_plus_1);
46  STATS(); // empty for arrays
47 
48  ~STATS();
49 
50  // (Re)Sets the range and clears the counts.
51  // See the constructor for info on max and min values.
52  bool set_range(int32_t min_bucket_value, int32_t max_bucket_value_plus_1);
53 
54  void clear(); // empty buckets
55 
56  void add(int32_t value, int32_t count);
57 
58  // "Accessors" return various statistics on the data.
59  int32_t mode() const; // get mode of samples
60  double mean() const; // get mean of samples
61  double sd() const; // standard deviation
62  // Returns the fractile value such that frac fraction (in [0,1]) of samples
63  // has a value less than the return value.
64  double ile(double frac) const;
65  // Returns the minimum used entry in the histogram (ie the minimum of the
66  // data, NOT the minimum of the supplied range, nor is it an index.)
67  // Would normally be called min(), but that is a reserved word in VC++.
68  int32_t min_bucket() const; // Find min
69  // Returns the maximum used entry in the histogram (ie the maximum of the
70  // data, NOT the maximum of the supplied range, nor is it an index.)
71  int32_t max_bucket() const; // Find max
72  // Finds a more useful estimate of median than ile(0.5).
73  // Overcomes a problem with ile() - if the samples are, for example,
74  // 6,6,13,14 ile(0.5) returns 7.0 - when a more useful value would be midway
75  // between 6 and 13 = 9.5.
76  double median() const; // get median of samples
77  // Returns the count in the bucket for the given value; out-of-range values read the end buckets.
78  int32_t pile_count(int32_t value) const { // NOTE(review): buckets_ is indexed with no null check - confirm it is always allocated here
79  if (value <= rangemin_) // at or below the range minimum: first bucket
80  return buckets_[0];
81  if (value >= rangemax_ - 1) // at or above the last valid value: last bucket
82  return buckets_[rangemax_ - rangemin_ - 1];
83  return buckets_[value - rangemin_]; // in range: bucket index is the offset from rangemin_
84  }
85  // Returns the total count of all buckets.
86  int32_t get_total() const {
87  return total_count_; // total of all piles
88  }
89  // Returns true if x is a local min.
90  bool local_min(int32_t x) const;
91 
92  // Apply a triangular smoothing filter to the stats.
93  // This makes the modes a bit more useful.
94  // The factor gives the height of the triangle, i.e. the weight of the
95  // centre.
96  void smooth(int32_t factor);
97 
98  // Cluster the samples into max_clusters clusters.
99  // Each call runs one iteration. The array of clusters must be
100  // max_clusters+1 in size as cluster 0 is used to indicate which samples
101  // have been used.
102  // The return value is the current number of clusters.
103  int32_t cluster(float lower, // lower threshold
104  float upper, // upper threshold
105  float multiple, // distance threshold
106  int32_t max_clusters, // max no to make
107  STATS *clusters); // array of clusters (max_clusters+1 entries)
108 
109 // Finds (at most) the top max_modes modes, well actually the whole peak around
110 // each mode, returning them in the given modes vector as a <mean of peak,
111 // total count of peak> pair in order of decreasing total count.
112 // Since the mean is the key and the count the data in the pair, a single call
113 // to sort on the output will re-sort by increasing mean of peak if that is
114 // more useful than decreasing total count.
115 // Returns the actual number of modes found.
116  int top_n_modes(
117  int max_modes,
118  GenericVector<tesseract::KDPairInc<float, int> >* modes) const;
119 
120  // Prints a summary and table of the histogram.
121  void print() const;
122  // Prints summary stats only of the histogram.
123  void print_summary() const;
124 
125  #ifndef GRAPHICS_DISABLED
126  // Draws the histogram as a series of rectangles.
127  void plot(ScrollView* window, // window to draw in
128  float xorigin, // origin of histogram (x)
129  float yorigin, // origin of histogram (y)
130  float xscale, // size of one unit
131  float yscale, // size of one unit
132  ScrollView::Color colour) const; // colour to draw in
133 
134  // Draws a line graph of the histogram.
135  void plotline(ScrollView* window, // window to draw in
136  float xorigin, // origin of histogram (x)
137  float yorigin, // origin of histogram (y)
138  float xscale, // size of one unit
139  float yscale, // size of one unit
140  ScrollView::Color colour) const; // colour to draw in
141  #endif // GRAPHICS_DISABLED
142 
143  private:
144  int32_t rangemin_; // min of range
145  // rangemax_ is not well named as it is really one past the max.
146  int32_t rangemax_; // one past the max of range
147  int32_t total_count_; // no of samples
148  int32_t* buckets_; // array of cells
149 };
150 
151 // Returns the nth ordered item from the array, as if they were
152 // ordered, but without ordering them, in linear time.
153 // The array does get shuffled! NOTE(review): returns int32_t for a float array, so presumably an index - confirm.
154 int32_t choose_nth_item(int32_t index, // index to choose
155  float *array, // array of items
156  int32_t count); // no of items
157 // Generic version uses a defined comparator (with qsort semantics).
158 int32_t choose_nth_item(int32_t index, // index to choose
159  void *array, // array of items
160  int32_t count, // no of items
161  size_t size, // element size
162  int (*compar)(const void*, const void*)); // comparator
163 // Swaps 2 entries in an array in-place.
164 void swap_entries(void *array, // array of entries
165  size_t size, // size of entry
166  int32_t index1, // entries to swap
167  int32_t index2);
168 
169 #endif // TESSERACT_CCSTRUCT_STATISTC_H_
int32_t * buckets_
Definition: statistc.h:148
int32_t cluster(float lower, float upper, float multiple, int32_t max_clusters, STATS *clusters)
Definition: statistc.cpp:319
void print() const
Definition: statistc.cpp:533
void clear()
Definition: statistc.cpp:82
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:584
int32_t total_count_
Definition: statistc.h:147
int32_t min_bucket() const
Definition: statistc.cpp:205
Color
Definition: scrollview.h:105
Definition: kdpair.h:51
int32_t rangemin_
Definition: statistc.h:144
void smooth(int32_t factor)
Definition: statistc.cpp:288
double mean() const
Definition: statistc.cpp:134
bool local_min(int32_t x) const
Definition: statistc.cpp:261
Definition: scrollview.h:102
int32_t rangemax_
Definition: statistc.h:146
double sd() const
Definition: statistc.cpp:150
int32_t get_total() const
Definition: statistc.h:86
void add(int32_t value, int32_t count)
Definition: statistc.cpp:100
Definition: baseapi.h:37
~STATS()
Definition: statistc.cpp:93
Definition: statistc.h:33
double ile(double frac) const
Definition: statistc.cpp:173
int top_n_modes(int max_modes, GenericVector< tesseract::KDPairInc< float, int > > *modes) const
Definition: statistc.cpp:468
STATS()
Definition: statistc.cpp:52
bool set_range(int32_t min_bucket_value, int32_t max_bucket_value_plus_1)
Definition: statistc.cpp:63
int32_t max_bucket() const
Definition: statistc.cpp:220
int32_t mode() const
Definition: statistc.cpp:114
void plotline(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:611
void print_summary() const
Definition: statistc.cpp:559
double median() const
Definition: statistc.cpp:238
int32_t pile_count(int32_t value) const
Definition: statistc.h:78