Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

The code generated by the VSI tools (for example, yolov5s_uint8_nbg_unify) needs some modifications before it is used on the target.

Panel
panelIconId1f334
panelIcon:palm_tree:
panelIconText🌴
bgColor#E6FCFF

This document applies to both the uint8 and int16 formats.

Steps

  1. Add vnn_PreTableInit in vnn_pre_process.h

...

  1. Add vnn_PreTableInit and the uint8-to-dtype lookup table, named u2d, to vnn_pre_process.c for the pre-processing table lookup.

...

  1. Modify the original _float32_to_dtype() to use a table lookup instead of calling the VSI API directly.

...

  1. Create the u2d table before opening the image in main.c.

...

  1. Add the OpenMP (omp) option to the Makefile.

...

Test result

The following figure shows the measured performance data after we made the modifications as mentioned in the above steps.

uint8

It took 0.05 ms to create the table and 4.70 ms to convert the data using the table lookup.

...

int16

It took 0.05 ms to create the table and 5.09 ms to convert the data using the table lookup.

...

1. For uint8, int8 and int16 formats

If the NN type is VSI_NN_TYPE_UINT8, VSI_NN_TYPE_INT8 or VSI_NN_TYPE_INT16,
use a lookup table to accelerate the Float32-to-Dtype conversion.

  1. Create a lookup table for three channels of RGB values according to std/mean/normalize

Code Block
languagecpp
/* Pre-processing lookup table: allocated and filled by vnn_PreTableInit(),
 * released by vnn_PreTableDeinit(). NULL while no table exists. */
static uint8_t *u2d = NULL;

/**
 * Build the u2d pre-processing lookup table for the graph's first input tensor.
 *
 * For every 8-bit input value i (0..255) the normalized value
 * (i / normalize - mean) / std is computed and converted to the tensor's
 * dtype with vsi_nn_Float32ToDtype(). Entry i is stored at u2d + i * stride,
 * where stride is the byte size of one tensor element.
 *
 * A table left over from a previous call is released once the new one has
 * been built successfully.
 *
 * @param graph  Graph whose first input tensor supplies the target dtype.
 * @return VSI_SUCCESS on success; VSI_FAILURE if graph is NULL, the input
 *         tensor cannot be obtained, the table cannot be allocated, or a
 *         value cannot be converted.
 */
vsi_status vnn_PreTableInit(vsi_nn_graph_t *graph)
{
	const vsi_size_t tbl_sz = 256;	/* one entry per possible 8-bit value */
	const float normalize = 255.0f;
	const float mean = 0.485f;
	const float std = 0.225f;
	vsi_nn_tensor_t *tensor;
	vsi_size_t stride;
	uint8_t *table;

	if (graph == NULL) {
		return VSI_FAILURE;
	}

	tensor = vsi_nn_GetTensor(graph, graph->input.tensors[0]);
	if (tensor == NULL) {
		return VSI_FAILURE;
	}

	stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type);
	if (stride == 0) {
		/* Same fallback as _float32_to_dtype(), so both agree on the layout. */
		stride = 1;
	}

	table = (uint8_t *)malloc(tbl_sz * stride);
	if (table == NULL) {
		return VSI_FAILURE;
	}

	for (vsi_size_t i = 0; i < tbl_sz; i++) {
		/* Scale the pixel to 0..1 first; mean and std are on that scale. */
		float val = ((float)i / normalize - mean) / std;

		if (vsi_nn_Float32ToDtype(val, table + i * stride,
					  &tensor->attr.dtype) != VSI_SUCCESS) {
			free(table);
			return VSI_FAILURE;
		}
	}

	/* Publish the new table only when it is complete; drop any old one. */
	free(u2d);
	u2d = table;

	return VSI_SUCCESS;
}

/**
 * Release the u2d lookup table and reset the pointer to NULL, so calling
 * this function again (or before any table was created) is harmless.
 */
void vnn_PreTableDeinit(void)
{
	free(u2d);	/* free(NULL) is a no-op, so no guard is needed */
	u2d = NULL;
}
  1. Convert the RGB data to dtype. Use OpenMP to run the conversion in parallel on multiple CPU cores. This requires including the header file "omp.h" and building with the OpenMP option (-fopenmp for GCC/Clang).

Code Block
languagecpp
/**
 * Convert an input buffer to the tensor's dtype through the u2d lookup table.
 *
 * Each element is read as a single byte from fdata and used as an index into
 * u2d. The table entry is copied to the output as an 8-bit value when the
 * element size is 1 byte, or as a 16-bit value when it is 2 bytes. For any
 * other element size the output buffer is returned zero-filled.
 *
 * NOTE(review): the buffer is indexed one byte per element, so fdata is
 * presumably raw 8-bit pixel data despite its float* type - confirm with the
 * caller.
 *
 * @param fdata   Input buffer holding one byte per tensor element.
 * @param tensor  Tensor that supplies the element count and the dtype.
 * @return Newly allocated buffer of (element count * element size) bytes, or
 *         NULL if the lookup table has not been created or allocation fails.
 */
uint8_t *_float32_to_dtype(float *fdata, vsi_nn_tensor_t *tensor)
{
    uint8_t *iData, *oData = NULL;
    vsi_size_t sz, stride;

    /* The table is built separately by vnn_PreTableInit(). */
    if (u2d == NULL) {
        return NULL;
    }

    sz = vsi_nn_GetElementNum(tensor);
    stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type);
    if (stride == 0) {
        stride = 1;
    }

    iData = (uint8_t *)fdata;

    /* calloc zero-fills, which is the result for unsupported strides. */
    oData = (uint8_t *)calloc(sz, stride);
    /* Presumably jumps to `final` when the pointer is NULL. */
    TEST_CHECK_PTR(oData, final);

    if (stride == 1) {
        uint8_t *pData = (uint8_t *)oData;
        uint8_t *lut = (uint8_t *)u2d;
        #pragma omp parallel for
        for (size_t i = 0; i < sz; ++i) {
            pData[i] = lut[iData[i]];
        }
    } else if (stride == 2) {
        uint16_t *pData = (uint16_t *)oData;
        uint16_t *lut = (uint16_t *)u2d;
        #pragma omp parallel for
        for (size_t i = 0; i < sz; ++i) {
            pData[i] = lut[iData[i]];
        }
    }

final:
    return oData;
}

Alternatively, we can use NEON instead of OpenMP:

Code Block
languagecpp
	 else if (stride == 2) {
        /*
         * NEON variant of the 16-bit branch of _float32_to_dtype(): eight
         * input bytes are loaded at once, each one is looked up in the u2d
         * table, and the eight 16-bit results are stored with one vector
         * store. Elements left over after the last full batch of eight are
         * handled by the scalar loop at the end.
         */
        uint16_t *pData = (uint16_t*)oData;
        uint16_t *lut = (uint16_t*)u2d;

        size_t i = 0;
        size_t simd_end = sz / 8 * 8; // batch 8

        for(; i < simd_end; i+=8) {
            uint8x8_t i_u8 = vld1_u8(iData + i);
            /* Brace-initialising a NEON vector is a GCC/Clang extension. */
            uint16x8_t o_u16 = {
                lut[vget_lane_u8(i_u8, 0)],
                lut[vget_lane_u8(i_u8, 1)],
                lut[vget_lane_u8(i_u8, 2)],
                lut[vget_lane_u8(i_u8, 3)],
                lut[vget_lane_u8(i_u8, 4)],
                lut[vget_lane_u8(i_u8, 5)],
                lut[vget_lane_u8(i_u8, 6)],
                lut[vget_lane_u8(i_u8, 7)]
            };

            vst1q_u16(pData + i, o_u16);
        }

        /* Scalar tail for the remaining sz % 8 elements. */
        for (; i < sz; ++i)
            pData[i] = lut[iData[i]];
    }
  1. Add the following code to the main function

Code Block
languagecpp
/* Excerpt of main(): the elided parts ("......") are the unchanged generated code. */
int main(int argc, char** argv)
{
	......
	/* Verify graph */
	status = vnn_VerifyGraph(graph);
	TEST_CHECK_STATUS(status, final);

	/* Create u2d table */
	/* The table must exist before the input image is converted with
	 * _float32_to_dtype(), so it is built right after graph verification. */
	status = vnn_PreTableInit(graph);
	TEST_CHECK_STATUS(status, final);
	/* NOTE(review): no call to vnn_PreTableDeinit() is shown here; presumably
	 * it belongs in the cleanup path to release the table - confirm. */

	.....
}

2. For float16 format

If the NN type is VSI_NN_TYPE_FLOAT16, we can use NEON to accelerate the format conversion.

  • we can call fp16Tofp32() to replace vsi_nn_DtypeToFloat32()

  • we can call fp32Tofp16() to replace vsi_nn_Float32ToDtype()

The implementations of the above functions are as follows:

Code Block
languagecpp
/**
 * Convert n half-precision values to single precision using NEON.
 *
 * Full groups of four elements are converted with vector instructions; the
 * remaining 0..3 elements are converted one by one, so no memory beyond
 * src[n-1] or dst[n-1] is accessed when n is not a multiple of four.
 *
 * @param src  Input array of n float16_t values.
 * @param dst  Output array with room for n float values.
 * @param n    Number of elements to convert; nothing is done when n <= 0.
 */
void fp16Tofp32(float16_t* src, float* dst, int n)
{
    int i = 0;

    /* Vector path: only while at least four elements remain. */
    for (; n - i >= 4; i += 4) {
        float16x4_t v_half = vld1_f16(src + i);
        float32x4_t v_float = vcvt_f32_f16(v_half);
        vst1q_f32(dst + i, v_float);
    }

    /* Scalar tail for the last n % 4 elements. */
    for (; i < n; ++i) {
        dst[i] = (float)src[i];
    }
}

/**
 * Convert n single-precision values to half precision using NEON.
 *
 * Full groups of four elements are converted with vector instructions; the
 * remaining 0..3 elements are converted one by one, so no memory beyond
 * src[n-1] or dst[n-1] is accessed when n is not a multiple of four.
 *
 * @param src  Input array of n float values.
 * @param dst  Output array with room for n float16_t values.
 * @param n    Number of elements to convert; nothing is done when n <= 0.
 */
void fp32Tofp16(float* src, float16_t* dst, int n)
{
    int i = 0;

    /* Vector path: only while at least four elements remain. */
    for (; n - i >= 4; i += 4) {
        float32x4_t v_float = vld1q_f32(src + i);
        float16x4_t v_half = vcvt_f16_f32(v_float);
        vst1_f16(dst + i, v_half);
    }

    /* Scalar tail for the last n % 4 elements. */
    for (; i < n; ++i) {
        dst[i] = (float16_t)src[i];
    }
}