How to Accelerate the Process of float32 to dtype with CPU

If PPU is not used to accelerate the vsi_nn_Float32ToDtype() process, the following CPU methods can be used to accelerate the process.

Taking VSI-generated code such as yolov5s_uint8_nbg_unify as an example, the following modifications are needed for the target.

1. For uint8, int8 and int16 formats

If the NN type is VSI_NN_TYPE_UINT8, VSI_NN_TYPE_INT8 or VSI_NN_TYPE_INT16,
use a lookup table to accelerate the float32-to-dtype conversion.

Create a 256-entry lookup table, shared by the three RGB channels, that maps each 8-bit value to the target dtype according to std/mean/normalize:

/* Lookup table: entry i is pixel value i converted to the input tensor dtype. */
static uint8_t *u2d;

/**
 * Build the 256-entry lookup table u2d that maps every 8-bit pixel value to
 * the representation produced by vsi_nn_Float32ToDtype() for graph input
 * tensor 0.
 *
 * Entry i is the conversion of (i / normalize - mean) / std and occupies
 * `stride` bytes at u2d + i * stride, where stride is the byte size of the
 * tensor's element type. A table left over from an earlier call is released
 * before the new one is built.
 *
 * @param graph  Graph whose first input tensor supplies the target dtype.
 * @return VSI_SUCCESS when the table was built; VSI_FAILURE when the graph or
 *         tensor is unavailable, the element size is zero, or allocation
 *         fails; otherwise the failing status of vsi_nn_Float32ToDtype().
 */
vsi_status vnn_PreTableInit(vsi_nn_graph_t *graph)
{
	vsi_nn_tensor_t *tensor;
	vsi_size_t stride, i;
	const vsi_size_t tbl_sz = 256;
	const float normalize = 255.0f;
	const float mean = 0.485f;
	const float std = 0.225f;

	if (graph == NULL)
		return VSI_FAILURE;

	tensor = vsi_nn_GetTensor(graph, graph->input.tensors[0]);
	if (tensor == NULL)
		return VSI_FAILURE;

	stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type);
	if (stride == 0)
		return VSI_FAILURE;

	/* Drop any previous table so repeated initialisation does not leak. */
	free(u2d);
	u2d = (uint8_t *)malloc(tbl_sz * stride);
	if (u2d == NULL)
		return VSI_FAILURE;

	for (i = 0; i < tbl_sz; i++) {
		/*
		 * mean and std are fractions below 1, so the pixel is first
		 * scaled down by `normalize` (255) before they are applied.
		 */
		float val = ((float)i / normalize - mean) / std;
		vsi_status status = vsi_nn_Float32ToDtype(val, u2d + i * stride,
							  &tensor->attr.dtype);

		if (status != VSI_SUCCESS) {
			free(u2d);
			u2d = NULL;
			return status;
		}
	}

	return VSI_SUCCESS;
}

/* Release the u2d lookup table and reset the pointer so it can be rebuilt. */
void vnn_PreTableDeinit()
{
	/* free(NULL) is a no-op, so no guard is required. */
	free(u2d);
	u2d = NULL;
}

Convert RGB data to dtype. Use OpenMP to run the conversion in parallel on multiple CPU cores; this requires including the header file "omp.h" and building with OpenMP enabled (e.g. -fopenmp for GCC/Clang).

/**
 * Convert source samples to the tensor's dtype through the u2d lookup table.
 *
 * The input buffer is read byte-wise: byte k of `fdata` is used as the index
 * into the table for output element k, so the caller is expected to pass
 * 8-bit sample data here despite the float pointer type.
 * NOTE(review): confirm that every caller supplies 8-bit data.
 *
 * @param fdata   Source buffer; at least one byte per tensor element is read.
 * @param tensor  Tensor that supplies the element count and element size.
 * @return Newly allocated buffer of element-count * element-size bytes that
 *         the caller must free(), or NULL when the table has not been built
 *         or allocation fails. Element sizes other than 1 or 2 bytes yield a
 *         zero-filled buffer.
 */
uint8_t *_float32_to_dtype(float *fdata, vsi_nn_tensor_t *tensor)
{
    uint8_t *iData, *oData = NULL;
    vsi_size_t sz, stride;

    /* Without the table there is nothing to look up. */
    if (u2d == NULL)
        return NULL;

    sz = vsi_nn_GetElementNum(tensor);
    stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type);

    if (stride == 0) stride = 1;

    iData = (uint8_t *)fdata;

    /* calloc zero-fills and checks sz * stride for overflow. */
    oData = (uint8_t *)calloc(sz, stride);
    TEST_CHECK_PTR(oData, final);

    if (stride == 1) {
        uint8_t *pData = oData;
        const uint8_t *lut = u2d;
        #pragma omp parallel for
        for (vsi_size_t k = 0; k < sz; ++k)
            pData[k] = lut[iData[k]];
    } else if (stride == 2) {
        uint16_t *pData = (uint16_t *)oData;
        const uint16_t *lut = (const uint16_t *)u2d;
        #pragma omp parallel for
        for (vsi_size_t k = 0; k < sz; ++k)
            pData[k] = lut[iData[k]];
    }
    /* Any other element size is not table-driven; the buffer stays zeroed. */

final:
    return oData;
}

Alternatively, the stride == 2 branch can use NEON instead of OpenMP:

	 else if (stride == 2) {
        /* 16-bit output: translate 8 source bytes per iteration via the table. */
        uint16_t *pData = (uint16_t *)oData;
        const uint16_t *lut = (const uint16_t *)u2d;

        size_t i = 0;
        size_t simd_end = sz / 8 * 8; /* largest multiple of 8 not above sz */

        for (; i < simd_end; i += 8) {
            /* Load 8 indices, gather the 8 table entries lane by lane. */
            uint8x8_t i_u8 = vld1_u8(iData + i);
            uint16x8_t o_u16 = {
                lut[vget_lane_u8(i_u8, 0)],
                lut[vget_lane_u8(i_u8, 1)],
                lut[vget_lane_u8(i_u8, 2)],
                lut[vget_lane_u8(i_u8, 3)],
                lut[vget_lane_u8(i_u8, 4)],
                lut[vget_lane_u8(i_u8, 5)],
                lut[vget_lane_u8(i_u8, 6)],
                lut[vget_lane_u8(i_u8, 7)]
            };

            vst1q_u16(pData + i, o_u16);
        }

        /* Scalar tail for the last sz % 8 elements. */
        for (; i < sz; ++i)
            pData[i] = lut[iData[i]];
    }

Add the following code to the main function

int main(int argc, char** argv)
{
	......
	/* Verify graph */
	status = vnn_VerifyGraph(graph);
	TEST_CHECK_STATUS(status, final);

	/*
	 * Create u2d table from the dtype of graph input tensor 0.
	 * The table has to exist before the table-driven conversion runs;
	 * release it with vnn_PreTableDeinit() once it is no longer needed.
	 */
	status = vnn_PreTableInit(graph);
	TEST_CHECK_STATUS(status, final);

	.....
}

2. For float16 format

If the NN type is VSI_NN_TYPE_FLOAT16, we can use NEON to accelerate the format conversion.

We can call fp32Tofp16() to replace vsi_nn_Float32ToDtype().
We can call fp16Tofp32() to replace vsi_nn_DtypeToFloat32().

The implementations of the above functions are as follows:

/**
 * Convert n half-precision values from src into single precision in dst.
 *
 * Complete groups of four elements are converted with NEON; the remaining
 * 0-3 elements are converted one at a time so that neither buffer is
 * accessed beyond index n - 1.
 *
 * @param src  Input array of at least n float16_t values.
 * @param dst  Output array with room for at least n floats.
 * @param n    Number of elements to convert; nothing happens when n <= 0.
 */
void fp16Tofp32(float16_t* src, float* dst, int n)
{
    int i = 0;
    const int vec_end = n - (n % 4);

    for (; i < vec_end; i += 4) {
        float16x4_t v_half = vld1_f16(src + i);
        float32x4_t v_float = vcvt_f32_f16(v_half);
        vst1q_f32(dst + i, v_float);
    }

    /* Scalar tail: a 4-lane load/store here would run past the buffers. */
    for (; i < n; ++i) {
        dst[i] = (float)src[i];
    }
}

/**
 * Convert n single-precision values from src into half precision in dst.
 *
 * Complete groups of four elements are converted with NEON; the remaining
 * 0-3 elements are converted one at a time so that neither buffer is
 * accessed beyond index n - 1.
 *
 * @param src  Input array of at least n floats.
 * @param dst  Output array with room for at least n float16_t values.
 * @param n    Number of elements to convert; nothing happens when n <= 0.
 */
void fp32Tofp16(float* src, float16_t* dst, int n)
{
    int i = 0;
    const int vec_end = n - (n % 4);

    for (; i < vec_end; i += 4) {
        float32x4_t v_float = vld1q_f32(src + i);
        float16x4_t v_half = vcvt_f16_f32(v_float);
        vst1_f16(dst + i, v_half);
    }

    /* Scalar tail: a 4-lane load/store here would run past the buffers. */
    for (; i < n; ++i) {
        dst[i] = (float16_t)src[i];
    }
}