...
For the VSI-generated code (for example, yolov5s_uint8_nbg_unify), we need to make some modifications for the target.
Note: This document applies to both the uint8 and int16 formats.
Steps
Add vnn_PreTableInit() in vnn_pre_process.h.
...
Add vnn_PreTableInit() and the uint8-to-dtype table named u2d in vnn_pre_process.c for the pre-processing lookup table.
...
Modify the original _float32_to_dtype() to use table lookup instead of calling the VSI API directly.
...
Create the u2d table before opening the image in main.c.
...
Add the OpenMP option (e.g., the -fopenmp compile and link flag) in the Makefile.
...
Test result
The following figures show the measured performance data after making the modifications described in the steps above.
uint8
It took 0.05 ms to create the table and 4.70 ms to convert the data via table lookup.
...
int16
It took 0.05 ms to create the table and 5.09 ms to convert the data via table lookup.
...
1. For the uint8, int8, and int16 formats
If the NN type is VSI_NN_TYPE_UINT8, VSI_NN_TYPE_INT8, or VSI_NN_TYPE_INT16, use a lookup table to accelerate the float32-to-dtype conversion.
Create a lookup table for the three RGB channels according to the std/mean/normalize values:
```c
static uint8_t *u2d;

vsi_status vnn_PreTableInit(vsi_nn_graph_t *graph)
{
    vsi_nn_tensor_t *tensor;
    vsi_size_t w, h, c, stride;
    vsi_size_t tbl_sz = 256;      /* one entry per possible uint8 pixel value */
    float normalize = 255.0f;
    float mean = 0.485f;
    float std = 0.225f;

    tensor = vsi_nn_GetTensor(graph, graph->input.tensors[0]);
    stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type);
    /* Input tensor dimensions, kept for reference; the table itself only
     * depends on the dtype. A single table serves all three RGB channels
     * because they share the same mean/std here. */
    w = tensor->attr.size[0];
    h = tensor->attr.size[1];
    c = tensor->attr.size[2];

    u2d = (uint8_t *)malloc(tbl_sz * stride);
    if (u2d == NULL) {
        return VSI_FAILURE;
    }

    /* Pre-compute the normalized value of every pixel level and store it
     * in the input tensor's dtype (quantized uint8/int8/int16). */
    for (vsi_size_t i = 0; i < tbl_sz; i++) {
        float val = ((float)i / normalize - mean) / std;
        vsi_nn_Float32ToDtype(val, u2d + i * stride, &tensor->attr.dtype);
    }
    return VSI_SUCCESS;
}

void vnn_PreTableDeinit()
{
    if (u2d) {
        free(u2d);
        u2d = NULL;
    }
}
```
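With this table in place, the per-pixel normalization and quantization are computed only once, for each of the 256 possible pixel values, at initialization time; the per-frame pre-processing then reduces to plain table lookups, as shown in the next step.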
Convert the RGB data to dtype, and enable CPU multi-core parallelism with OpenMP. This requires including the header file "omp.h" and compiling/linking with the OpenMP flag (e.g., -fopenmp).
```c
uint8_t *_float32_to_dtype(float *fdata, vsi_nn_tensor_t *tensor)
{
    uint8_t *iData, *oData;
    vsi_size_t sz, stride;

    sz = vsi_nn_GetElementNum(tensor);
    stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type);
    if (stride == 0) stride = 1;

    /* The buffer passed in carries the raw uint8 RGB pixels; each byte is
     * used as an index into the pre-computed u2d table. */
    iData = (uint8_t *)fdata;
    oData = (uint8_t *)malloc(stride * sz * sizeof(uint8_t));
    TEST_CHECK_PTR(oData, final);
    memset(oData, 0, stride * sz * sizeof(uint8_t));

    if (stride == 1) {
        /* uint8/int8 output: 1-byte table entries */
        uint8_t *pData = (uint8_t *)oData;
        uint8_t *lut = (uint8_t *)u2d;
        #pragma omp parallel for
        for (size_t i = 0; i < sz; ++i)
            pData[i] = lut[iData[i]];
    } else if (stride == 2) {
        /* int16 output: 2-byte table entries */
        uint16_t *pData = (uint16_t *)oData;
        uint16_t *lut = (uint16_t *)u2d;
        #pragma omp parallel for
        for (size_t i = 0; i < sz; ++i)
            pData[i] = lut[iData[i]];
    }
final:
    return oData;
}
```
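Since every pixel is looked up independently, the loop parallelizes trivially: OpenMP simply splits the index range across the available CPU cores, and both the 1-byte (uint8/int8) and 2-byte (int16) branches benefit in the same way.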
Alternatively, we can use NEON instead of OpenMP:
```c
/* Replacement for the stride == 2 branch above (requires <arm_neon.h>). */
else if (stride == 2) {
    uint16_t *pData = (uint16_t *)oData;
    uint16_t *lut = (uint16_t *)u2d;
    size_t i = 0;
    size_t simd_end = sz / 8 * 8;   /* process 8 pixels per iteration */
    for (; i < simd_end; i += 8) {
        /* Load 8 uint8 pixels, look each one up in the table,
         * and store the 8 resulting 16-bit values in one go. */
        uint8x8_t i_u8 = vld1_u8(iData + i);
        uint16x8_t o_u16 = {
            lut[vget_lane_u8(i_u8, 0)],
            lut[vget_lane_u8(i_u8, 1)],
            lut[vget_lane_u8(i_u8, 2)],
            lut[vget_lane_u8(i_u8, 3)],
            lut[vget_lane_u8(i_u8, 4)],
            lut[vget_lane_u8(i_u8, 5)],
            lut[vget_lane_u8(i_u8, 6)],
            lut[vget_lane_u8(i_u8, 7)]
        };
        vst1q_u16(pData + i, o_u16);
    }
    /* Scalar tail for the remaining pixels. */
    for (; i < sz; ++i)
        pData[i] = lut[iData[i]];
}
```
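NEON has no gather instruction suited to this 256-entry, 16-bit table, so only the pixel loads and the result stores are vectorized while the eight lookups themselves stay scalar; which of the NEON and OpenMP variants is faster depends on the target CPU, so it is worth measuring both.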
Add the following code to the main function
```c
int main(int argc, char** argv)
{
......
/* Verify graph */
status = vnn_VerifyGraph(graph);
TEST_CHECK_STATUS(status, final);
/* Create u2d table */
status = vnn_PreTableInit(graph);
TEST_CHECK_STATUS(status, final);
.....
}
```
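The sample above only shows where the table is created. As a minimal sketch (the exact placement is an assumption and depends on where the generated main() releases its other resources), the table can be freed with vnn_PreTableDeinit() in the cleanup path:

```c
/* Cleanup sketch: release the u2d lookup table alongside the other
 * resources at the end of main(); the placement is an assumption. */
vnn_PreTableDeinit();
```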
2. For the float16 format
If the NN type is VSI_NN_TYPE_FLOAT16, we can use NEON to accelerate the format conversion:
call fp32Tofp16() to replace vsi_nn_Float32ToDtype()
call fp16Tofp32() to replace vsi_nn_DtypeToFloat32()
The implementations of the above functions are as follows:
```c
#include <arm_neon.h>

/* Convert n float16 values to float32, 4 lanes at a time.
 * The scalar tail handles counts that are not a multiple of 4. */
void fp16Tofp32(float16_t* src, float* dst, int n)
{
    int i = 0;
    for (; i + 4 <= n; i += 4) {
        float16x4_t v_half = vld1_f16(src + i);
        float32x4_t v_float = vcvt_f32_f16(v_half);
        vst1q_f32(dst + i, v_float);
    }
    for (; i < n; ++i)
        dst[i] = (float)src[i];
}

/* Convert n float32 values to float16, 4 lanes at a time. */
void fp32Tofp16(float* src, float16_t* dst, int n)
{
    int i = 0;
    for (; i + 4 <= n; i += 4) {
        float32x4_t v_float = vld1q_f32(src + i);
        float16x4_t v_half = vcvt_f16_f32(v_float);
        vst1_f16(dst + i, v_half);
    }
    for (; i < n; ++i)
        dst[i] = (float16_t)src[i];
}
```
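For reference, a minimal usage sketch of how the float16 helpers slot into the pre-processing path; the names fdata, oData, and tensor are reused from the _float32_to_dtype() example above and are assumptions about the surrounding code:

```c
/* Sketch: float16 pre-processing path. Assumes fdata holds the normalized
 * float32 pixels and oData is a buffer of sz * sizeof(float16_t) bytes. */
vsi_size_t sz = vsi_nn_GetElementNum(tensor);
fp32Tofp16(fdata, (float16_t *)oData, (int)sz);
```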