If the PPU is not used to accelerate the vsi_nn_Float32ToDtype()
process, the following CPU-side methods can be used to accelerate it instead.
The VSI-generated code (for example, yolov5s_uint8_nbg_unify) needs some modifications for the target.
1. For uint8, int8 and int16 formats
If the NN type is VSI_NN_TYPE_UINT8, VSI_NN_TYPE_INT8 or VSI_NN_TYPE_INT16,
use a lookup table to accelerate the Float32-to-Dtype conversion.
Create a lookup table for the RGB channel values according to std/mean/normalize:
/*
 * Lookup table used to convert an 8-bit channel value (0..255) directly to
 * the input tensor's dtype. It holds 256 entries of `stride` bytes each,
 * where `stride` is the byte size of the tensor dtype. Allocated by
 * vnn_PreTableInit() and released by vnn_PreTableDeinit().
 */
static uint8_t *u2d;

/**
 * Build the u2d lookup table for the first input tensor of `graph`.
 *
 * For every possible 8-bit value i, the table stores the result of
 * vsi_nn_Float32ToDtype() applied to (i / normalize - mean) / std, so the
 * per-pixel float conversion can later be replaced by a table lookup.
 *
 * @param graph  Graph whose first input tensor defines the target dtype.
 * @return VSI_SUCCESS on success; VSI_FAILURE if `graph` or its input tensor
 *         is NULL or if the table cannot be allocated; otherwise the failing
 *         status returned by vsi_nn_Float32ToDtype().
 */
vsi_status vnn_PreTableInit(vsi_nn_graph_t *graph)
{
    const vsi_size_t tbl_sz = 256;   /* one entry per possible uint8 value */
    const float normalize = 255.0f;  /* scales 0..255 down to 0..1 */
    const float mean = 0.485f;
    const float std = 0.225f;
    vsi_nn_tensor_t *tensor;
    vsi_size_t stride;
    vsi_size_t i;

    if (graph == NULL) {
        return VSI_FAILURE;
    }

    tensor = vsi_nn_GetTensor(graph, graph->input.tensors[0]);
    if (tensor == NULL) {
        return VSI_FAILURE;
    }

    stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type);
    if (stride == 0) {
        /* Same fallback as _float32_to_dtype(): treat unknown size as 1 byte. */
        stride = 1;
    }

    /* Drop any table left over from a previous call so it is not leaked. */
    free(u2d);
    u2d = (uint8_t *)malloc(tbl_sz * stride);
    if (u2d == NULL) {
        return VSI_FAILURE;
    }

    for (i = 0; i < tbl_sz; i++) {
        /*
         * Normalize the raw value to 0..1 first (divide by `normalize`),
         * then apply mean/std; mean and std are expressed on the 0..1 scale.
         */
        float val = ((float)i / normalize - mean) / std;
        vsi_status status = vsi_nn_Float32ToDtype(val, u2d + i * stride,
                                                  &tensor->attr.dtype);
        if (status != VSI_SUCCESS) {
            free(u2d);
            u2d = NULL;
            return status;
        }
    }

    return VSI_SUCCESS;
}

/**
 * Release the u2d lookup table. Safe to call when no table was created.
 */
void vnn_PreTableDeinit(void)
{
    free(u2d);   /* free(NULL) is a no-op */
    u2d = NULL;
}
Convert the RGB data to the dtype. Enable multi-core CPU parallelism using OpenMP: include the header file "omp.h" and compile and link with OpenMP enabled (e.g. -fopenmp with GCC/Clang).
/**
 * Convert input data to the tensor's dtype using the u2d lookup table.
 *
 * Although the parameter is typed `float *`, the buffer is read as one
 * unsigned byte per element: each byte is used as an index into the table.
 * Only dtypes of 1 or 2 bytes are converted; for any other size the output
 * buffer is returned zero-filled.
 *
 * @param fdata   Input buffer, read as sz bytes (sz = element count of tensor).
 * @param tensor  Tensor describing the element count and target dtype.
 * @return Newly allocated buffer of sz * stride bytes, or NULL if the
 *         allocation failed (assuming TEST_CHECK_PTR jumps to `final` on a
 *         NULL pointer - confirm against its definition). The buffer is
 *         heap-allocated here; presumably the caller releases it with free().
 */
uint8_t *_float32_to_dtype(float *fdata, vsi_nn_tensor_t *tensor)
{
    uint8_t *iData, *oData;
    vsi_size_t sz, stride;

    sz = vsi_nn_GetElementNum(tensor);
    stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type);
    if (stride == 0) {
        stride = 1;
    }

    iData = (uint8_t *)fdata;
    /* calloc zero-fills, so unsupported strides yield an all-zero buffer. */
    oData = (uint8_t *)calloc(sz, stride);
    TEST_CHECK_PTR(oData, final);

    if (stride == 1) {
        uint8_t *pData = (uint8_t *)oData;
        const uint8_t *lut = (const uint8_t *)u2d;

        /* The pragma must sit on its own line directly above the loop. */
        #pragma omp parallel for
        for (size_t k = 0; k < sz; ++k) {
            pData[k] = lut[iData[k]];
        }
    } else if (stride == 2) {
        uint16_t *pData = (uint16_t *)oData;
        const uint16_t *lut = (const uint16_t *)u2d;

        #pragma omp parallel for
        for (size_t k = 0; k < sz; ++k) {
            pData[k] = lut[iData[k]];
        }
    }

final:
    return oData;
}
Alternatively, NEON can be used instead of OpenMP for the stride == 2 branch:
else if (stride == 2) {
    /*
     * NEON variant of the 16-bit branch of _float32_to_dtype(): loads 8
     * input bytes at a time, looks each one up in the u2d table and stores
     * the 8 resulting 16-bit values with a single vector store.
     */
    uint16_t *pData = (uint16_t *)oData;
    const uint16_t *lut = (const uint16_t *)u2d;
    size_t i = 0;
    size_t simd_end = sz / 8 * 8; /* largest multiple of 8 not above sz */

    for (; i < simd_end; i += 8) {
        uint8x8_t i_u8 = vld1_u8(iData + i);
        /* The table lookups themselves are scalar; only load/store are vectorized. */
        uint16x8_t o_u16 = {
            lut[vget_lane_u8(i_u8, 0)],
            lut[vget_lane_u8(i_u8, 1)],
            lut[vget_lane_u8(i_u8, 2)],
            lut[vget_lane_u8(i_u8, 3)],
            lut[vget_lane_u8(i_u8, 4)],
            lut[vget_lane_u8(i_u8, 5)],
            lut[vget_lane_u8(i_u8, 6)],
            lut[vget_lane_u8(i_u8, 7)]
        };
        vst1q_u16(pData + i, o_u16);
    }

    /* Scalar tail for the remaining (sz % 8) elements. */
    for (; i < sz; ++i) {
        pData[i] = lut[iData[i]];
    }
}
Add the following code to the main function, after the graph has been verified:
/*
 * Excerpt of the application's main(): only the lines relevant to the lookup
 * table are shown; the dotted lines stand for the rest of the existing code.
 */
int main(int argc, char** argv)
{
    ......
    /* Verify graph */
    status = vnn_VerifyGraph(graph);
    TEST_CHECK_STATUS(status, final);

    /* Create u2d table: built once from the graph's first input tensor,
     * after the graph has been verified. */
    status = vnn_PreTableInit(graph);
    TEST_CHECK_STATUS(status, final);
    .....
}
2. For float16 format
If the NN type is VSI_NN_TYPE_FLOAT16, we can use NEON to accelerate the format conversion.
We can call
fp32Tofp16()
to replace vsi_nn_Float32ToDtype(), and
we can call
fp16Tofp32()
to replace vsi_nn_DtypeToFloat32().
The implementation of the above function is as follows:
/**
 * Convert n half-precision values to single precision using NEON.
 * Requires <arm_neon.h> and a target with half-precision conversion support.
 *
 * @param src  Input array of n float16 values.
 * @param dst  Output array with room for n float values.
 * @param n    Number of elements; values <= 0 convert nothing.
 */
void fp16Tofp32(float16_t* src, float* dst, int n)
{
    int i = 0;
    const int simd_end = n / 4 * 4; /* largest multiple of 4 not above n */

    /* Vector body: 4 elements per iteration. */
    for (; i < simd_end; i += 4) {
        float16x4_t v_half = vld1_f16(src + i);
        float32x4_t v_float = vcvt_f32_f16(v_half);
        vst1q_f32(dst + i, v_float);
    }

    /* Scalar tail: avoids reading/writing past the end when n % 4 != 0. */
    for (; i < n; ++i) {
        dst[i] = (float)src[i];
    }
}

/**
 * Convert n single-precision values to half precision using NEON.
 * Requires <arm_neon.h> and a target with half-precision conversion support.
 *
 * @param src  Input array of n float values.
 * @param dst  Output array with room for n float16 values.
 * @param n    Number of elements; values <= 0 convert nothing.
 */
void fp32Tofp16(float* src, float16_t* dst, int n)
{
    int i = 0;
    const int simd_end = n / 4 * 4; /* largest multiple of 4 not above n */

    /* Vector body: 4 elements per iteration. */
    for (; i < simd_end; i += 4) {
        float32x4_t v_float = vld1q_f32(src + i);
        float16x4_t v_half = vcvt_f16_f32(v_float);
        vst1_f16(dst + i, v_half);
    }

    /* Scalar tail: avoids reading/writing past the end when n % 4 != 0. */
    for (; i < n; ++i) {
        dst[i] = (float16_t)src[i];
    }
}