General matrix multiplication (GEMM) More...

#include "gemm.hpp"
#include "im2col.hpp"
#include "Timing.hpp"

Include dependency graph for gemm.cpp:

Macros
#define	PUT_IN_REGISTER register

#define	TILE_K 16

#define	TILE_M 4

#define	TILE_N 16

Functions
void	activate_array_cpu_custom (float *x, const int n, const ACTIVATION a)

void	convolution_2d (int w, int h, int ksize, int n, int c, int pad, int stride, float *weights, float *input, float *output, float *mean)

void	convolution_repacked (uint32_t *packed_input, uint32_t *packed_weights, float *output, int w, int h, int c, int n, int size, int pad, int new_lda, float *mean_arr)

void	float_to_bit (float *src, unsigned char *dst, size_t size)

void	forward_maxpool_layer_avx (float *src, float *dst, int *indexes, int size, int w, int h, int out_w, int out_h, int c, int pad, int stride, int batch)

void	gemm_gpu (int TA, int TB, int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float BETA, float *C, int ldc)

void	gemm_nn (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)

void	gemm_nn_bin_32bit_packed (int M, int N, int K, float ALPHA, uint32_t *A, int lda, uint32_t *B, int ldb, float *C, int ldc, float *mean_arr)

void	gemm_nn_bin_transposed_32bit_packed (int M, int N, int K, float ALPHA, uint32_t *A, int lda, uint32_t *B, int ldb, float *C, int ldc, float *mean_arr)

void	gemm_nn_custom_bin_mean_transposed (int M, int N, int K, float ALPHA_UNUSED, unsigned char *A, int lda, unsigned char *B, int ldb, float *C, int ldc, float *mean_arr)

void	gemm_nn_fast (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)

void	gemm_nt (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)

void	gemm_ongpu (int TA, int TB, int M, int N, int K, float ALPHA, float *A_gpu, int lda, float *B_gpu, int ldb, float BETA, float *C_gpu, int ldc)

void	gemm_tn (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)

void	gemm_tt (int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float *C, int ldc)

void	im2col_cpu_custom (float *data_im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col)

void	im2col_cpu_custom_bin (float *data_im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col, int bit_align)

void	im2col_cpu_custom_transpose (float *data_im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col, int ldb_align)

void	init_cpu ()

int	is_avx ()

int	is_fma_avx2 ()

float *	random_matrix (int rows, int cols)

void	repack_input (float *input, float *re_packed_input, int w, int h, int c)

void	test_gpu_accuracy (int TA, int TB, int m, int k, int n)

int	test_gpu_blas ()

void	time_gpu_random_matrix (int TA, int TB, int m, int k, int n)

void	time_ongpu (int TA, int TB, int m, int k, int n)

void	transpose_block_SSE4x4 (float *A, float *B, const int n, const int m, const int lda, const int ldb, const int block_size)

static void	transpose_scalar_block (float *A, float *B, const int lda, const int ldb, const int block_size)

void	transpose_uint32 (uint32_t *src, uint32_t *dst, int src_h, int src_w, int src_align, int dst_align)

Detailed Description

General matrix multiplication (GEMM)

Macro Definition Documentation

◆ PUT_IN_REGISTER

#define PUT_IN_REGISTER register

◆ TILE_K

#define TILE_K 16

◆ TILE_M

#define TILE_M 4

Todo: (V3) It would be nice to know where this file came from, and to see whether there are updates available.

◆ TILE_N

#define TILE_N 16

Function Documentation

◆ activate_array_cpu_custom()

void activate_array_cpu_custom	(	float *	x,
		const int	n,
		const ACTIVATION	a
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ convolution_2d()

void convolution_2d	(	int	w,
		int	h,
		int	ksize,
		int	n,
		int	c,
		int	pad,
		int	stride,
		float *	weights,
		float *	input,
		float *	output,
		float *	mean
	)

◆ convolution_repacked()

void convolution_repacked	(	uint32_t *	packed_input,
		uint32_t *	packed_weights,
		float *	output,
		int	w,
		int	h,
		int	c,
		int	n,
		int	size,
		int	pad,
		int	new_lda,
		float *	mean_arr
	)

◆ float_to_bit()

void float_to_bit	(	float *	src,
		unsigned char *	dst,
		size_t	size
	)

Here is the caller graph for this function:

◆ forward_maxpool_layer_avx()

void forward_maxpool_layer_avx	(	float *	src,
		float *	dst,
		int *	indexes,
		int	size,
		int	w,
		int	h,
		int	out_w,
		int	out_h,
		int	c,
		int	pad,
		int	stride,
		int	batch
	)

Here is the caller graph for this function:

◆ gemm_gpu()

void gemm_gpu	(	int	TA,
		int	TB,
		int	M,
		int	N,
		int	K,
		float	ALPHA,
		float *	A,
		int	lda,
		float *	B,
		int	ldb,
		float	BETA,
		float *	C,
		int	ldc
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ gemm_nn()

void gemm_nn	(	int	M,
		int	N,
		int	K,
		float	ALPHA,
		float *	A,
		int	lda,
		float *	B,
		int	ldb,
		float *	C,
		int	ldc
	)

◆ gemm_nn_bin_32bit_packed()

void gemm_nn_bin_32bit_packed	(	int	M,
		int	N,
		int	K,
		float	ALPHA,
		uint32_t *	A,
		int	lda,
		uint32_t *	B,
		int	ldb,
		float *	C,
		int	ldc,
		float *	mean_arr
	)

◆ gemm_nn_bin_transposed_32bit_packed()

void gemm_nn_bin_transposed_32bit_packed	(	int	M,
		int	N,
		int	K,
		float	ALPHA,
		uint32_t *	A,
		int	lda,
		uint32_t *	B,
		int	ldb,
		float *	C,
		int	ldc,
		float *	mean_arr
	)

◆ gemm_nn_custom_bin_mean_transposed()

void gemm_nn_custom_bin_mean_transposed	(	int	M,
		int	N,
		int	K,
		float	ALPHA_UNUSED,
		unsigned char *	A,
		int	lda,
		unsigned char *	B,
		int	ldb,
		float *	C,
		int	ldc,
		float *	mean_arr
	)

Here is the caller graph for this function:

◆ gemm_nn_fast()

void gemm_nn_fast	(	int	M,
		int	N,
		int	K,
		float	ALPHA,
		float *	A,
		int	lda,
		float *	B,
		int	ldb,
		float *	C,
		int	ldc
	)

◆ gemm_nt()

void gemm_nt	(	int	M,
		int	N,
		int	K,
		float	ALPHA,
		float *	A,
		int	lda,
		float *	B,
		int	ldb,
		float *	C,
		int	ldc
	)

◆ gemm_ongpu()

void gemm_ongpu	(	int	TA,
		int	TB,
		int	M,
		int	N,
		int	K,
		float	ALPHA,
		float *	A_gpu,
		int	lda,
		float *	B_gpu,
		int	ldb,
		float	BETA,
		float *	C_gpu,
		int	ldc
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ gemm_tn()

void gemm_tn	(	int	M,
		int	N,
		int	K,
		float	ALPHA,
		float *	A,
		int	lda,
		float *	B,
		int	ldb,
		float *	C,
		int	ldc
	)

◆ gemm_tt()

void gemm_tt	(	int	M,
		int	N,
		int	K,
		float	ALPHA,
		float *	A,
		int	lda,
		float *	B,
		int	ldb,
		float *	C,
		int	ldc
	)

◆ im2col_cpu_custom()

void im2col_cpu_custom	(	float *	data_im,
		int	channels,
		int	height,
		int	width,
		int	ksize,
		int	stride,
		int	pad,
		float *	data_col
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ im2col_cpu_custom_bin()

void im2col_cpu_custom_bin	(	float *	data_im,
		int	channels,
		int	height,
		int	width,
		int	ksize,
		int	stride,
		int	pad,
		float *	data_col,
		int	bit_align
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ im2col_cpu_custom_transpose()

void im2col_cpu_custom_transpose	(	float *	data_im,
		int	channels,
		int	height,
		int	width,
		int	ksize,
		int	stride,
		int	pad,
		float *	data_col,
		int	ldb_align
	)

◆ init_cpu()

void init_cpu ( )

Here is the call graph for this function:

Here is the caller graph for this function:

◆ is_avx()

int is_avx ( )

Here is the caller graph for this function:

◆ is_fma_avx2()

int is_fma_avx2 ( )

Here is the caller graph for this function:

◆ random_matrix()

float * random_matrix	(	int	rows,
		int	cols
	)

Here is the caller graph for this function:

◆ repack_input()

void repack_input	(	float *	input,
		float *	re_packed_input,
		int	w,
		int	h,
		int	c
	)

Here is the caller graph for this function:

◆ test_gpu_accuracy()

void test_gpu_accuracy	(	int	TA,
		int	TB,
		int	m,
		int	k,
		int	n
	)

Here is the call graph for this function:

◆ test_gpu_blas()

int test_gpu_blas ( )

Here is the call graph for this function:

◆ time_gpu_random_matrix()

void time_gpu_random_matrix	(	int	TA,
		int	TB,
		int	m,
		int	k,
		int	n
	)

Here is the call graph for this function:

◆ time_ongpu()

void time_ongpu	(	int	TA,
		int	TB,
		int	m,
		int	k,
		int	n
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ transpose_block_SSE4x4()

void transpose_block_SSE4x4	(	float *	A,
		float *	B,
		const int	n,
		const int	m,
		const int	lda,
		const int	ldb,
		const int	block_size
	)

◆ transpose_scalar_block()

static void transpose_scalar_block	(	float *	A,
		float *	B,
		const int	lda,
		const int	ldb,
		const int	block_size
	)

inline static

◆ transpose_uint32()

void transpose_uint32	(	uint32_t *	src,
		uint32_t *	dst,
		int	src_h,
		int	src_w,
		int	src_align,
		int	dst_align
	)

Here is the caller graph for this function:

Macros

Functions

Detailed Description

Macro Definition Documentation

◆ PUT_IN_REGISTER

◆ TILE_K

◆ TILE_M

◆ TILE_N

Function Documentation

◆ activate_array_cpu_custom()

◆ convolution_2d()

◆ convolution_repacked()

◆ float_to_bit()

◆ forward_maxpool_layer_avx()

◆ gemm_gpu()

◆ gemm_nn()

◆ gemm_nn_bin_32bit_packed()

◆ gemm_nn_bin_transposed_32bit_packed()

◆ gemm_nn_custom_bin_mean_transposed()

◆ gemm_nn_fast()

◆ gemm_nt()

◆ gemm_ongpu()

◆ gemm_tn()

◆ gemm_tt()

◆ im2col_cpu_custom()

◆ im2col_cpu_custom_bin()

◆ im2col_cpu_custom_transpose()

◆ init_cpu()

◆ is_avx()

◆ is_fma_avx2()

◆ random_matrix()

◆ repack_input()

◆ test_gpu_accuracy()

◆ test_gpu_blas()

◆ time_gpu_random_matrix()

◆ time_ongpu()

◆ transpose_block_SSE4x4()

◆ transpose_scalar_block()

◆ transpose_uint32()