How to Accelerate the float32-to-dtype Conversion with the CPU

If the PPU is not used to accelerate the vsi_nn_Float32ToDtype() process, the following CPU methods can be used to speed it up.

Taking the VSI-generated code as the starting point (for example, yolov5s_uint8_nbg_unify), some modifications are needed for the target.

1. For uint8, int8, and int16 formats

If the NN type is VSI_NN_TYPE_UINT8, VSI_NN_TYPE_INT8, or VSI_NN_TYPE_INT16, use a lookup table to accelerate the Float32-to-Dtype conversion.

  1. Create a lookup table for the three RGB channels according to the std/mean/normalize values.

static uint8_t *u2d = NULL;

vsi_status vnn_PreTableInit(vsi_nn_graph_t *graph)
{
    vsi_nn_tensor_t *tensor;
    vsi_size_t w, h, c, stride;
    vsi_size_t tbl_sz = 256;       /* one entry per possible uint8 pixel value */
    float normalize = 255.0f;
    float mean = 0.485f;
    float std = 0.225f;

    tensor = vsi_nn_GetTensor(graph, graph->input.tensors[0]);
    stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type);

    /* Input tensor dimensions (not needed for the table itself). */
    w = tensor->attr.size[0];
    h = tensor->attr.size[1];
    c = tensor->attr.size[2];

    u2d = (uint8_t *)malloc(tbl_sz * stride);
    for (vsi_size_t i = 0; i < tbl_sz; i++)
    {
        /* Normalize the raw pixel value, then quantize it once into the table. */
        float val = ((float)i / normalize - mean) / std;
        vsi_nn_Float32ToDtype(val, u2d + i * stride, &tensor->attr.dtype);
    }
    return VSI_SUCCESS;
}

void vnn_PreTableDeinit()
{
    if (u2d)
    {
        free(u2d);
        u2d = NULL;
    }
}
  2. Convert the RGB data to dtype, using OpenMP to run on multiple CPU cores in parallel. This requires including the header file "omp.h" and linking with -fopenmp.

uint8_t *_float32_to_dtype(float *fdata, vsi_nn_tensor_t *tensor)
{
    uint8_t *iData, *oData;
    vsi_size_t sz, stride;

    sz = vsi_nn_GetElementNum(tensor);
    stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type);
    if (stride == 0) stride = 1;

    /* The input buffer holds the raw uint8 pixel values used to index the table. */
    iData = (uint8_t *)fdata;
    oData = (uint8_t *)malloc(stride * sz * sizeof(uint8_t));
    TEST_CHECK_PTR(oData, final);
    memset(oData, 0, stride * sz * sizeof(uint8_t));

    if (stride == 1)
    {
        uint8_t *pData = (uint8_t *)oData;
        uint8_t *lut = (uint8_t *)u2d;
        #pragma omp parallel for
        for (size_t i = 0; i < sz; ++i)
        {
            pData[i] = lut[iData[i]];
        }
    }
    else if (stride == 2)
    {
        uint16_t *pData = (uint16_t *)oData;
        uint16_t *lut = (uint16_t *)u2d;
        #pragma omp parallel for
        for (size_t i = 0; i < sz; ++i)
        {
            pData[i] = lut[iData[i]];
        }
    }
final:
    return oData;
}

Alternatively, NEON can be used in place of OpenMP:

else if (stride == 2)
{
    /* NEON variant; requires #include <arm_neon.h>. */
    uint16_t *pData = (uint16_t *)oData;
    uint16_t *lut = (uint16_t *)u2d;
    size_t i = 0;
    size_t simd_end = sz / 8 * 8;   /* process 8 pixels per iteration */
    for (; i < simd_end; i += 8)
    {
        uint8x8_t i_u8 = vld1_u8(iData + i);
        uint16x8_t o_u16 = {
            lut[vget_lane_u8(i_u8, 0)],
            lut[vget_lane_u8(i_u8, 1)],
            lut[vget_lane_u8(i_u8, 2)],
            lut[vget_lane_u8(i_u8, 3)],
            lut[vget_lane_u8(i_u8, 4)],
            lut[vget_lane_u8(i_u8, 5)],
            lut[vget_lane_u8(i_u8, 6)],
            lut[vget_lane_u8(i_u8, 7)]
        };
        vst1q_u16(pData + i, o_u16);
    }
    /* Scalar tail for the elements that do not fill a full vector. */
    for (; i < sz; ++i)
    {
        pData[i] = lut[iData[i]];
    }
}
  3. Add the following code to the main function.
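
The exact code for this step is not reproduced here; the snippet below is a minimal sketch of how the pieces above could be wired into a typical generated main(). Only vnn_PreTableInit(), vnn_PreTableDeinit(), and _float32_to_dtype() come from the earlier steps; vnn_CreateGraph() and vnn_LoadInputData() are illustrative placeholders for the generated graph-setup and image-loading code.

/* Sketch only: graph creation, input loading, and post-processing are placeholders. */
int main(int argc, char **argv)
{
    vsi_nn_graph_t *graph = NULL;
    vsi_nn_tensor_t *input_tensor = NULL;
    float *input_data = NULL;     /* buffer actually holding raw uint8 pixels */
    uint8_t *dtype_data = NULL;

    graph = vnn_CreateGraph(argv[1]);          /* placeholder: generated graph setup */

    /* Build the lookup table once, after the graph (and its input dtype) exists. */
    vnn_PreTableInit(graph);

    input_tensor = vsi_nn_GetTensor(graph, graph->input.tensors[0]);
    input_data = vnn_LoadInputData(argv[2]);   /* placeholder: image loading */

    /* Use the LUT-based conversion instead of calling vsi_nn_Float32ToDtype() per element. */
    dtype_data = _float32_to_dtype(input_data, input_tensor);
    vsi_nn_CopyDataToTensor(graph, input_tensor, dtype_data);

    vsi_nn_RunGraph(graph);                    /* placeholder: inference and post-processing */

    free(dtype_data);
    vnn_PreTableDeinit();
    return 0;
}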

2. For the float16 format

If the NN type is VSI_NN_TYPE_FLOAT16, NEON can be used to accelerate the format conversion.

  • We can call fp16Tofp32() to replace vsi_nn_DtypeToFloat32().

  • We can call fp32Tofp16() to replace vsi_nn_Float32ToDtype().

The implementations of the above functions are as follows:
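
The following is a minimal NEON sketch of what fp16Tofp32() and fp32Tofp16() could look like; the element-wise signatures and the float16_t buffers are assumptions, the only given being that the conversion is done with NEON intrinsics.

#include <arm_neon.h>
#include <stddef.h>

/* Requires a compiler/target with NEON fp16 support
 * (e.g. -mfpu=neon-fp16 on 32-bit Arm; available by default on AArch64). */

/* Convert sz float16 values to float32, four lanes per iteration. */
void fp16Tofp32(float *dst, const float16_t *src, size_t sz)
{
    size_t i = 0;
    size_t simd_end = sz / 4 * 4;
    for (; i < simd_end; i += 4)
    {
        float16x4_t v16 = vld1_f16(src + i);
        vst1q_f32(dst + i, vcvt_f32_f16(v16));
    }
    for (; i < sz; ++i)           /* scalar tail */
        dst[i] = (float)src[i];
}

/* Convert sz float32 values to float16, four lanes per iteration. */
void fp32Tofp16(float16_t *dst, const float *src, size_t sz)
{
    size_t i = 0;
    size_t simd_end = sz / 4 * 4;
    for (; i < simd_end; i += 4)
    {
        float32x4_t v32 = vld1q_f32(src + i);
        vst1_f16(dst + i, vcvt_f16_f32(v32));
    }
    for (; i < sz; ++i)           /* scalar tail */
        dst[i] = (float16_t)src[i];
}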