tesseract  v4.0.0-17-g361f3264
Open Source OCR Engine
intsimdmatrix.h
1 // File: intsimdmatrix.h
3 // Description: Base class for 8-bit int SIMD matrix multipliers.
4 // Author: Ray Smith
5 // Created: Tue Aug 15 07:37:20 PST 2017
6 //
7 // (C) Copyright 2017, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
18 
19 #ifndef TESSERACT_ARCH_INTSIMDMATRIX_H_
20 #define TESSERACT_ARCH_INTSIMDMATRIX_H_
21 
22 #include <cstdint>
23 #include <vector>
24 
25 template <class T> class GENERIC_2D_ARRAY;
26 template <typename T> class GenericVector;
27 
28 namespace tesseract {
29 
30 // Base class for a SIMD function to multiply a matrix by a vector, with sources
31 // of 8-bit signed integer, and result in a double, after appropriate scaling.
32 // Assumes a specific method of multiplication that can be applied to any size
33 // and number of SIMD registers as follows:
34 // int32_t results are computed with num_outputs_per_register_ in each of
35 // max_output_registers_ result registers, repeatedly until it would make too
36 // many results, then the number of registers is halved, and so-on down to a
37 // single result register. The last calculation only outputs the required number
38 // of results instead of writing beyond the bounds. Eg: matrix has 75 outputs,
39 // num_outputs_per_register_ = 4, and max_output_registers_ = 8,
40 // Step 1: 8x4=32 results are computed,
41 // Step 2: 8x4=32 again, total 64,
42 // Step 3: 2x4=8 (since 8x4 is too many, so is 4x4), total 72,
43 // Step 4: 1x3, total 75.
44 // Each step above is computed using a PartialFunc, which runs over the input
45 // vector once. The input is read one registerful of num_inputs_per_register_
46 // at a time (presumably 4x num_outputs_per_register_ since they are int8_t)
47 // so the inputs MUST BE PADDED to a multiple of num_inputs_per_register_.
48 // Since it is slow (on Intel at least) to horizontally add in a register,
49 // provision is made to process num_inputs_per_group_ inputs at a time, with
50 // the group being replicated num_input_groups_ times and multiplied by a
51 // num_inputs_per_group_ by num_input_groups_ rectangle of the weights matrix.
52 // This is most convenient if num_inputs_per_group_ is 4, and the product
53 // sign-extends and sums 8x8=16 bit results to 32 bits, adding 4 adjacent
54 // results in the process, but it doesn't have to be implemented that way.
55 // The weights are re-ordered by Init() to be used sequentially by the above
56 // algorithm, followed by the biases, so they can be added at the end.
57 // The base class computes the base C++ implementation.
58 // NOTE that, although the subclasses execute on different SIMD hardware, no
59 // virtual methods are needed, as the constructor sets up everything that
60 // is required to allow the base class implementation to do all the work.
62  public:
63  // Constructor should set the data members to indicate the sizes.
64  // NOTE: Base constructor public only for test purposes.
70  num_input_groups_(1) {}
71 
72  // Factory makes and returns an IntSimdMatrix (sub)class of the best
73  // available type for the current architecture.
75 
76  // Computes a reshaped copy of the weight matrix w. If there are no
77  // partial_funcs_, it does nothing.
78  void Init(const GENERIC_2D_ARRAY<int8_t>& w);
79 
80  // Rounds the size up to a multiple of the input register size (in int8_t).
81  int RoundInputs(int size) const {
82  return Roundup(size, num_inputs_per_register_);
83  }
84  // Rounds the size up to a multiple of the output register size (in int32_t).
85  int RoundOutputs(int size) const {
86  return Roundup(size, num_outputs_per_register_);
87  }
88 
89  // Computes matrix.vector v = Wu.
90  // u is of size W.dim2() - 1 and the output v is of size W.dim1().
91  // u is imagined to have an extra element at the end with value 1, to
92  // implement the bias, but it doesn't actually have it.
93  // Computes the base C++ implementation, if there are no partial_funcs_.
94  // NOTE: The size of the input vector (u) must be padded using
95  // RoundInputs above.
96  // The input will be over-read to the extent of the padding. There are no
97  // alignment requirements.
99  const GenericVector<double>& scales, const int8_t* u,
100  double* v) const;
101 
102  protected:
103  // Function to compute part of a matrix.vector multiplication. The weights
104  // are in a very specific order (see above) in w, which is multiplied by
105  // u of length num_in, to produce output v after scaling the integer results
106  // by the corresponding member of scales.
107  // The amount of w and scales consumed is fixed and not available to the
108  // caller. The number of outputs written to v will be at most num_out.
109  typedef void (*PartialFunc)(const int8_t* w, const double* scales,
110  const int8_t* u, int num_in, int num_out,
111  double* v);
112 
113  // Rounds the input up to a multiple of the given factor.
114  static int Roundup(int input, int factor) {
115  return (input + factor - 1) / factor * factor;
116  }
117 
118  // Number of 32 bit outputs held in each register.
120  // Maximum number of registers that we will use to hold outputs.
122  // Number of 8 bit inputs in the inputs register.
124  // Number of inputs in each weight group.
126  // Number of groups of inputs to be broadcast.
128  // The weights matrix reorganized in whatever way suits this instance.
129  std::vector<int8_t> shaped_w_;
130  // A series of functions to compute a partial result.
131  std::vector<PartialFunc> partial_funcs_;
132 };
133 
134 } // namespace tesseract
135 
136 #endif // TESSERACT_ARCH_INTSIMDMATRIX_H_
Definition: intsimdmatrix.h:61
void MatrixDotVector(const GENERIC_2D_ARRAY< int8_t > &w, const GenericVector< double > &scales, const int8_t *u, double *v) const
Definition: intsimdmatrix.cpp:96
void(* PartialFunc)(const int8_t *w, const double *scales, const int8_t *u, int num_in, int num_out, double *v)
Definition: intsimdmatrix.h:109
std::vector< int8_t > shaped_w_
Definition: intsimdmatrix.h:129
int num_input_groups_
Definition: intsimdmatrix.h:127
int max_output_registers_
Definition: intsimdmatrix.h:121
Definition: intsimdmatrix.h:25
int num_inputs_per_register_
Definition: intsimdmatrix.h:123
int RoundOutputs(int size) const
Definition: intsimdmatrix.h:85
static int Roundup(int input, int factor)
Definition: intsimdmatrix.h:114
Definition: baseapi.cpp:94
int num_outputs_per_register_
Definition: intsimdmatrix.h:119
std::vector< PartialFunc > partial_funcs_
Definition: intsimdmatrix.h:131
Definition: baseapi.h:37
int num_inputs_per_group_
Definition: intsimdmatrix.h:125
int RoundInputs(int size) const
Definition: intsimdmatrix.h:81
static IntSimdMatrix * GetFastestMultiplier()
Definition: intsimdmatrix.cpp:31
IntSimdMatrix()
Definition: intsimdmatrix.h:65
void Init(const GENERIC_2D_ARRAY< int8_t > &w)
Definition: intsimdmatrix.cpp:46