How to Accelerate the float32-to-dtype Conversion on the CPU
If the PPU is not used to accelerate vsi_nn_Float32ToDtype(), the following CPU methods can be used to speed up the conversion. Taking VSI-generated code such as yolov5s_uint8_nbg_unify as an example, the target code needs some modification.
1. For uint8, int8, and int16 formats
If the NN type is VSI_NN_TYPE_UINT8, VSI_NN_TYPE_INT8, or VSI_NN_TYPE_INT16, use a lookup table to accelerate the float32-to-dtype conversion.
First create a lookup table that maps each 8-bit RGB value to the target dtype according to the normalize/mean/std parameters (the example uses a single table; if the three channels have different mean/std values, build one table per channel in the same way):
static uint8_t *u2d; /* lookup table: 8-bit input value -> value in the input tensor's dtype */

vsi_status vnn_PreTableInit(vsi_nn_graph_t *graph)
{
    vsi_nn_tensor_t *tensor;
    vsi_size_t stride;
    vsi_size_t tbl_sz = 256;
    float normalize = 255.0f; /* example pre-processing parameters; use the model's own values */
    float mean = 0.485f;
    float std = 0.225f;

    tensor = vsi_nn_GetTensor(graph, graph->input.tensors[0]);
    stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type);

    u2d = (uint8_t *)malloc(tbl_sz * stride);
    if (NULL == u2d) {
        return VSI_FAILURE;
    }
    for (vsi_size_t i = 0; i < tbl_sz; i++) {
        /* normalize the 8-bit value, then quantize it to the input tensor's dtype */
        float val = ((float)i / normalize - mean) / std;
        vsi_nn_Float32ToDtype(val, u2d + i * stride, &tensor->attr.dtype);
    }
    return VSI_SUCCESS;
}

void vnn_PreTableDeinit()
{
    if (u2d) {
        free(u2d);
        u2d = NULL;
    }
}
Then convert the RGB data to dtype with the table. To enable CPU multi-core parallelism with OpenMP, include the header "omp.h" and compile and link with -fopenmp.
uint8_t *_float32_to_dtype(float *fdata, vsi_nn_tensor_t *tensor)
{
    uint8_t *iData, *oData = NULL;
    vsi_size_t sz, stride;

    sz = vsi_nn_GetElementNum(tensor);
    stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type);
    if (stride == 0) stride = 1;

    /* the buffer passed in holds the 8-bit RGB image data, so treat it as bytes */
    iData = (uint8_t *)fdata;
    oData = (uint8_t *)malloc(stride * sz * sizeof(uint8_t));
    TEST_CHECK_PTR(oData, final);
    memset(oData, 0, stride * sz * sizeof(uint8_t));

    if (stride == 1) {          /* uint8 / int8 */
        uint8_t *pData = (uint8_t *)oData;
        uint8_t *lut = (uint8_t *)u2d;
        #pragma omp parallel for
        for (size_t i = 0; i < sz; ++i) {
            pData[i] = lut[iData[i]];
        }
    } else if (stride == 2) {   /* int16 */
        uint16_t *pData = (uint16_t *)oData;
        uint16_t *lut = (uint16_t *)u2d;
        #pragma omp parallel for
        for (size_t i = 0; i < sz; ++i) {
            pData[i] = lut[iData[i]];
        }
    }
final:
    return oData;   /* the caller frees the returned buffer */
}
Alternatively, NEON intrinsics can be used instead of OpenMP. Include the header "arm_neon.h"; the stride == 2 branch then becomes:
else if (stride == 2) {     /* int16 */
    uint16_t *pData = (uint16_t *)oData;
    uint16_t *lut = (uint16_t *)u2d;
    size_t i = 0;
    size_t simd_end = sz / 8 * 8;   /* process 8 elements per iteration */
    for (; i < simd_end; i += 8) {
        /* load 8 input bytes, look each one up, store 8 uint16 results at once */
        uint8x8_t i_u8 = vld1_u8(iData + i);
        uint16x8_t o_u16 = {
            lut[vget_lane_u8(i_u8, 0)],
            lut[vget_lane_u8(i_u8, 1)],
            lut[vget_lane_u8(i_u8, 2)],
            lut[vget_lane_u8(i_u8, 3)],
            lut[vget_lane_u8(i_u8, 4)],
            lut[vget_lane_u8(i_u8, 5)],
            lut[vget_lane_u8(i_u8, 6)],
            lut[vget_lane_u8(i_u8, 7)]
        };
        vst1q_u16(pData + i, o_u16);
    }
    /* handle the remaining elements that do not fill a full vector */
    for (; i < sz; ++i) {
        pData[i] = lut[iData[i]];
    }
}
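For reference, the sketch below shows one way the helper could be plugged into the pre-processing path. It is a minimal sketch only, assuming the raw RGB buffer is available as image_data (a hypothetical name) and that the converted data is copied into the input tensor with vsi_nn_CopyDataToTensor():

/* Sketch only: image_data is a hypothetical pointer to the raw RGB buffer. */
vsi_nn_tensor_t *input_tensor = vsi_nn_GetTensor(graph, graph->input.tensors[0]);
uint8_t *dtype_data = _float32_to_dtype((float *)image_data, input_tensor);
if (dtype_data) {
    /* copy the table-converted data into the network input and release the buffer */
    vsi_nn_CopyDataToTensor(graph, input_tensor, dtype_data);
    free(dtype_data);
}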
Finally, add the table setup and release calls to the main function.
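A minimal sketch of the main-function changes, assuming the usual structure of the VSI-generated main.c; vnn_CreateModel() and vnn_ReleaseModel() are placeholders for the model-specific create/release functions generated for yolov5s_uint8_nbg_unify:

/* Sketch only: vnn_CreateModel()/vnn_ReleaseModel() stand in for the
 * model-specific functions in the generated main.c. */
int main(int argc, char **argv)
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_graph_t *graph = NULL;

    graph = vnn_CreateModel(argv[1]);
    TEST_CHECK_PTR(graph, final);

    /* build the lookup table once, right after the graph is created */
    status = vnn_PreTableInit(graph);
    if (status != VSI_SUCCESS) {
        goto final;
    }

    /* ... pre-process (call _float32_to_dtype instead of the per-element
       vsi_nn_Float32ToDtype() loop), run the graph, post-process ... */

final:
    /* release the lookup table together with the graph */
    vnn_PreTableDeinit();
    if (graph) {
        vnn_ReleaseModel(graph);
    }
    return status;
}

The key point is that vnn_PreTableInit() runs only once after graph creation, so the per-frame pre-processing is reduced to table lookups.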
2. For float16 format
If the NN type is VSI_NN_TYPE_FLOAT16, NEON can be used to accelerate the format conversion:
call fp16Tofp32() to replace vsi_nn_DtypeToFloat32(), and
call fp32Tofp16() to replace vsi_nn_Float32ToDtype().
The implementations of these two functions are as follows: