#include <cstdint>
#include <cstdio>
#include <cassert>
#include "darknet_gpu.hpp"
#include "im2col.hpp"
#include "dark_cuda.hpp"
#include "Timing.hpp"

Macros
#define	CUDA_KERNEL_LOOP(i, n)

#define	swap(a0, a1, j, m) t = (a0 ^ (a1 >>j)) & m; a0 = a0 ^ t; a1 = a1 ^ (t << j);

Functions
template<typename T >
__device__ uint32_t	__ballot_custom (T val)

template<typename T1 , typename T2 >
__device__ T1	__shfl_custom (T1 val, T2 lane)

int	CAFFE_GET_BLOCKS (const int N)

void	convolve_bin_gpu (float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad, int new_lda, float *mean_arr_gpu)

__global__ void	convolve_bin_gpu_kernel (float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad, int new_lda, float *mean_arr_gpu)

void	convolve_gpu (float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad)

__global__ void	convolve_gpu_kernel (float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad)

void	fill_int8_gpu (unsigned char *src, unsigned char val, size_t size)

__global__ void	fill_int8_gpu_kernel (unsigned char *src, unsigned char val, size_t size)

void	float_to_bit_gpu (float *src, unsigned char *dst, size_t size)

__global__ void	float_to_bit_gpu_kernel (float *src, unsigned char *dst, size_t size)

void	gemm_nn_custom_bin_mean_transposed_gpu (int M, int N, int K, unsigned char *A, int lda, unsigned char *B, int ldb, float *C, int ldc, float *mean_arr, float *bias, int leaky_activation, float *shortcut_in_gpu, float *shortcut_out_gpu)

__global__ void	gemm_nn_custom_bin_mean_transposed_gpu_kernel (int M, int N, int K, unsigned char *A, int lda, unsigned char *B, int ldb, float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation, float *shortcut_in_gpu, float *shortcut_out_gpu)

__device__ static __host__ unsigned char	get_bit (unsigned char const *const src, size_t index)

__global__ void	im2col_align_bin_gpu_kernel (const int n, const float *data_im, const int height, const int width, const int ksize, const int channels, const int pad, const int stride, const int height_col, const int width_col, float *data_col, const int bit_align)

void	im2col_align_bin_ongpu (float *im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col, int bit_align)

__global__ void	im2col_align_gpu_kernel (const int n, const float *data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, float *data_col, const int bit_align)

void	im2col_align_ongpu (float *im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col, int bit_align)

void	im2col_gpu_ext (const float *data_im, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, float *data_col)

__global__ void	im2col_gpu_kernel (const int n, const float *data_im, const int height, const int width, const int ksize, const int pad, const int stride, const int height_col, const int width_col, float *data_col)

__global__ void	im2col_gpu_kernel_ext (const int n, const float *data_im, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int height_col, const int width_col, float *data_col)

void	im2col_ongpu (float im, int channels, int height, int width, int ksize, int stride, int pad, float data_col)

void	repack_input_gpu (float input, float re_packed_input, int w, int h, int c)

void	repack_input_gpu_bin (float input, uint32_t re_packed_input_bin, int w, int h, int c)

__global__ void	repack_input_kernel (float input, float re_packed_input, int w, int h, int c)

__global__ void	repack_input_kernel_2 (float input, float re_packed_input, int w, int h, int c)

__global__ void	repack_input_kernel_bin (float input, uint32_t re_packed_input_bin, int w, int h, int c)

__device__ uint32_t	reverse_32_bit (uint32_t a)

__device__ __host__ uint8_t	reverse_8_bit (uint8_t a)

__device__ __host__ unsigned char	reverse_byte (unsigned char a)

__device__ __host__ unsigned char	reverse_byte_2 (unsigned char a)

__device__ unsigned char	reverse_byte_CUDA (unsigned char a)

__device__ void	transpose32_optimized (uint32_t A[32])

__device__ void	transpose8rS32_reversed_diagonale (unsigned char A, unsigned char B, int m, int n)

__device__ void	transpose_32x32_bits_reversed_diagonale (uint32_t A, uint32_t B, int m, int n)

void	transpose_bin_gpu (unsigned char A, unsigned char B, const int n, const int m, const int lda, const int ldb, const int block_size)

__global__ void	transpose_bin_gpu_kernel (unsigned char A, unsigned char B, const int n, const int m, const int lda, const int ldb, const int block_size)

__global__ void	transpose_bin_gpu_kernel_32 (uint32_t A, uint32_t B, const int n, const int m, const int lda, const int ldb, const int block_size)

void	transpose_uint32_gpu (uint32_t src, uint32_t dst, int src_h, int src_w, int src_align, int dst_align)

__global__ void	transpose_uint32_kernel (uint32_t src, uint32_t dst, int src_h, int src_w, int src_align, int dst_align)

__global__ void	transpose_uint32_kernel_2 (uint32_t src, uint32_t dst, int src_h, int src_w, int src_align, int dst_align)

__inline__ __device__ int	warpAllReduceSum (int val)

__device__ static __host__ uint8_t	xnor_bit1 (uint8_t a, uint8_t b)

__device__ static __host__ ulonglong4	xor_int256 (ulonglong4 a, ulonglong4 b)

__device__ static __host__ uint32_t	xor_int32 (uint32_t a, uint32_t b)

__device__ static __host__ uint64_t	xor_int64 (uint64_t a, uint64_t b)

Variables
const int	CAFFE_CUDA_NUM_THREADS = 512

Macro Definition Documentation

◆ CUDA_KERNEL_LOOP

#define CUDA_KERNEL_LOOP	(	i,
		n
	)

Value:

for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
    i < (n); \
    i += blockDim.x * gridDim.x)

◆ swap

#define swap	(	a0,
		a1,
		j,
		m
	)	t = (a0 ^ (a1 >>j)) & m; a0 = a0 ^ t; a1 = a1 ^ (t << j);

Function Documentation

◆ __ballot_custom()

template<typename T >

__device__ uint32_t __ballot_custom ( T val )

inline

Here is the caller graph for this function:

◆ __shfl_custom()

template<typename T1 , typename T2 >

__device__ T1 __shfl_custom	(	T1	val,
		T2	lane
	)

inline

Here is the caller graph for this function:

◆ CAFFE_GET_BLOCKS()

int CAFFE_GET_BLOCKS ( const int N )

inline

Here is the caller graph for this function:

◆ convolve_bin_gpu()

void convolve_bin_gpu	(	float *	input,
		float *	weights,
		float *	output,
		int	in_w,
		int	in_h,
		int	in_c,
		int	n,
		int	size,
		int	pad,
		int	new_lda,
		float *	mean_arr_gpu
	)

Here is the call graph for this function:

◆ convolve_bin_gpu_kernel()

__global__ void convolve_bin_gpu_kernel	(	float *	input,
		float *	weights,
		float *	output,
		int	in_w,
		int	in_h,
		int	in_c,
		int	n,
		int	size,
		int	pad,
		int	new_lda,
		float *	mean_arr_gpu
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ convolve_gpu()

void convolve_gpu	(	float *	input,
		float *	weights,
		float *	output,
		int	in_w,
		int	in_h,
		int	in_c,
		int	n,
		int	size,
		int	pad
	)

Here is the call graph for this function:

◆ convolve_gpu_kernel()

__global__ void convolve_gpu_kernel	(	float *	input,
		float *	weights,
		float *	output,
		int	in_w,
		int	in_h,
		int	in_c,
		int	n,
		int	size,
		int	pad
	)

Here is the caller graph for this function:

◆ fill_int8_gpu()

void fill_int8_gpu	(	unsigned char *	src,
		unsigned char	val,
		size_t	size
	)

Here is the call graph for this function:

◆ fill_int8_gpu_kernel()

__global__ void fill_int8_gpu_kernel	(	unsigned char *	src,
		unsigned char	val,
		size_t	size
	)

Here is the caller graph for this function:

◆ float_to_bit_gpu()

void float_to_bit_gpu	(	float *	src,
		unsigned char *	dst,
		size_t	size
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ float_to_bit_gpu_kernel()

__global__ void float_to_bit_gpu_kernel	(	float *	src,
		unsigned char *	dst,
		size_t	size
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ gemm_nn_custom_bin_mean_transposed_gpu()

void gemm_nn_custom_bin_mean_transposed_gpu	(	int	M,
		int	N,
		int	K,
		unsigned char *	A,
		int	lda,
		unsigned char *	B,
		int	ldb,
		float *	C,
		int	ldc,
		float *	mean_arr,
		float *	bias,
		int	leaky_activation,
		float *	shortcut_in_gpu,
		float *	shortcut_out_gpu
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ gemm_nn_custom_bin_mean_transposed_gpu_kernel()

__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel	(	int	M,
		int	N,
		int	K,
		unsigned char *	A,
		int	lda,
		unsigned char *	B,
		int	ldb,
		float *	C,
		int	ldc,
		float *	mean_arr,
		float *	bias_arr,
		int	leaky_activation,
		float *	shortcut_in_gpu,
		float *	shortcut_out_gpu
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ get_bit()

__device__ static __host__ unsigned char get_bit	(	unsigned char const *const	src,
		size_t	index
	)

inline static

Here is the caller graph for this function:

◆ im2col_align_bin_gpu_kernel()

__global__ void im2col_align_bin_gpu_kernel	(	const int	n,
		const float *	data_im,
		const int	height,
		const int	width,
		const int	ksize,
		const int	channels,
		const int	pad,
		const int	stride,
		const int	height_col,
		const int	width_col,
		float *	data_col,
		const int	bit_align
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ im2col_align_bin_ongpu()

void im2col_align_bin_ongpu	(	float *	im,
		int	channels,
		int	height,
		int	width,
		int	ksize,
		int	stride,
		int	pad,
		float *	data_col,
		int	bit_align
	)

Here is the call graph for this function:

◆ im2col_align_gpu_kernel()

__global__ void im2col_align_gpu_kernel	(	const int	n,
		const float *	data_im,
		const int	height,
		const int	width,
		const int	ksize,
		const int	pad,
		const int	stride,
		const int	height_col,
		const int	width_col,
		float *	data_col,
		const int	bit_align
	)

Todo: V3: should this be "*data_col_ptr" instead? Otherwise, what exactly are we incrementing here? (Answer: the pointer... but why?)

Here is the caller graph for this function:

◆ im2col_align_ongpu()

void im2col_align_ongpu	(	float *	im,
		int	channels,
		int	height,
		int	width,
		int	ksize,
		int	stride,
		int	pad,
		float *	data_col,
		int	bit_align
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ im2col_gpu_ext()

void im2col_gpu_ext	(	const float *	data_im,
		const int	channels,
		const int	height,
		const int	width,
		const int	kernel_h,
		const int	kernel_w,
		const int	pad_h,
		const int	pad_w,
		const int	stride_h,
		const int	stride_w,
		const int	dilation_h,
		const int	dilation_w,
		float *	data_col
	)

Here is the call graph for this function:

Here is the caller graph for this function:

◆ im2col_gpu_kernel()

__global__ void im2col_gpu_kernel	(	const int	n,
		const float *	data_im,
		const int	height,
		const int	width,
		const int	ksize,
		const int	pad,
		const int	stride,
		const int	height_col,
		const int	width_col,
		float *	data_col
	)

Here is the caller graph for this function:

◆ im2col_gpu_kernel_ext()

__global__ void im2col_gpu_kernel_ext	(	const int	n,
		const float *	data_im,
		const int	height,
		const int	width,
		const int	kernel_h,
		const int	kernel_w,
		const int	pad_h,
		const int	pad_w,
		const int	stride_h,
		const int	stride_w,
		const int	dilation_h,
		const int	dilation_w,
		const int	height_col,
		const int	width_col,
		float *	data_col
	)