...
For the VSI-generated code (for example, yolov5s_uint8_nbg_unify), we need to make some modifications for the target.
Note: This document applies to both the uint8 and int16 formats.
Steps
Add vnn_PreTableInit() in vnn_pre_process.h.
...
Add vnn_PreTableInit() and the uint8-to-dtype table named u2d in vnn_pre_process.c for the pre-processing lookup table.
...
Modify the original _float32_to_dtype() to use table lookup instead of calling the VSI API directly.
...
Create the u2d table before opening the image in main.c.
...
Add the OpenMP option (e.g., the -fopenmp compile and link flag) in the Makefile.
...
Test result
The following figures show the measured performance data after making the modifications described in the steps above.
uint8
It took 0.05 ms to create the table and 4.70 ms to convert the data via table lookup.
...
int16
It took 0.05 ms to create the table and 5.09 ms to convert the data via table lookup.
...
1. For the uint8, int8, and int16 formats
If the NN type is VSI_NN_TYPE_UINT8, VSI_NN_TYPE_INT8, or VSI_NN_TYPE_INT16, use a lookup table to accelerate the float32-to-dtype conversion.
Create a lookup table for the three RGB channels according to the std/mean/normalize values:
```c
static uint8_t *u2d;

vsi_status vnn_PreTableInit(vsi_nn_graph_t *graph)
{
    vsi_nn_tensor_t *tensor;
    vsi_size_t w, h, c, stride;
    vsi_size_t tbl_sz = 256;      /* one entry per possible uint8 pixel value */
    float normalize = 255.0f;
    float mean = 0.485f;
    float std = 0.225f;

    tensor = vsi_nn_GetTensor(graph, graph->input.tensors[0]);
    stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type);
    /* Input tensor dimensions, kept for reference; the table itself only
     * depends on the dtype. A single table serves all three RGB channels
     * because they share the same mean/std here. */
    w = tensor->attr.size[0];
    h = tensor->attr.size[1];
    c = tensor->attr.size[2];

    u2d = (uint8_t *)malloc(tbl_sz * stride);
    if (u2d == NULL) {
        return VSI_FAILURE;
    }

    /* Pre-compute the normalized value of every pixel level and store it
     * in the input tensor's dtype (quantized uint8/int8/int16). */
    for (vsi_size_t i = 0; i < tbl_sz; i++) {
        float val = ((float)i / normalize - mean) / std;
        vsi_nn_Float32ToDtype(val, u2d + i * stride, &tensor->attr.dtype);
    }
    return VSI_SUCCESS;
}

void vnn_PreTableDeinit()
{
    if (u2d) {
        free(u2d);
        u2d = NULL;
    }
}
```
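With this table in place, the per-pixel normalization and quantization are computed only once, for each of the 256 possible pixel values, at initialization time; the per-frame pre-processing then reduces to plain table lookups, as shown in the next step.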
Convert the RGB data to dtype, and enable CPU multi-core parallelism with OpenMP. This requires including the header file "omp.h" and compiling/linking with the OpenMP flag (e.g., -fopenmp).
```c
uint8_t *_float32_to_dtype(float *fdata, vsi_nn_tensor_t *tensor)
{
    uint8_t *iData, *oData;
    vsi_size_t sz, stride;

    sz = vsi_nn_GetElementNum(tensor);
    stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type);
    if (stride == 0) stride = 1;

    /* The buffer passed in carries the raw uint8 RGB pixels; each byte is
     * used as an index into the pre-computed u2d table. */
    iData = (uint8_t *)fdata;
    oData = (uint8_t *)malloc(stride * sz * sizeof(uint8_t));
    TEST_CHECK_PTR(oData, final);
    memset(oData, 0, stride * sz * sizeof(uint8_t));

    if (stride == 1) {
        /* uint8/int8 output: 1-byte table entries */
        uint8_t *pData = (uint8_t *)oData;
        uint8_t *lut = (uint8_t *)u2d;
        #pragma omp parallel for
        for (size_t i = 0; i < sz; ++i)
            pData[i] = lut[iData[i]];
    } else if (stride == 2) {
        /* int16 output: 2-byte table entries */
        uint16_t *pData = (uint16_t *)oData;
        uint16_t *lut = (uint16_t *)u2d;
        #pragma omp parallel for
        for (size_t i = 0; i < sz; ++i)
            pData[i] = lut[iData[i]];
    }
final:
    return oData;
}
```
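Since every pixel is looked up independently, the loop parallelizes trivially: OpenMP simply splits the index range across the available CPU cores, and both the 1-byte (uint8/int8) and 2-byte (int16) branches benefit in the same way.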
Alternatively, we can use NEON instead of OpenMP:
```c
/* Replacement for the stride == 2 branch above (requires <arm_neon.h>). */
else if (stride == 2) {
    uint16_t *pData = (uint16_t *)oData;
    uint16_t *lut = (uint16_t *)u2d;
    size_t i = 0;
    size_t simd_end = sz / 8 * 8;   /* process 8 pixels per iteration */
    for (; i < simd_end; i += 8) {
        /* Load 8 uint8 pixels, look each one up in the table,
         * and store the 8 resulting 16-bit values in one go. */
        uint8x8_t i_u8 = vld1_u8(iData + i);
        uint16x8_t o_u16 = {
            lut[vget_lane_u8(i_u8, 0)],
            lut[vget_lane_u8(i_u8, 1)],
            lut[vget_lane_u8(i_u8, 2)],
            lut[vget_lane_u8(i_u8, 3)],
            lut[vget_lane_u8(i_u8, 4)],
            lut[vget_lane_u8(i_u8, 5)],
            lut[vget_lane_u8(i_u8, 6)],
            lut[vget_lane_u8(i_u8, 7)]
        };
        vst1q_u16(pData + i, o_u16);
    }
    /* Scalar tail for the remaining pixels. */
    for (; i < sz; ++i)
        pData[i] = lut[iData[i]];
}
```
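NEON has no gather instruction suited to this 256-entry, 16-bit table, so only the pixel loads and the result stores are vectorized while the eight lookups themselves stay scalar; which of the NEON and OpenMP variants is faster depends on the target CPU, so it is worth measuring both.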
Add the following code to the main function
```c
int main(int argc, char** argv)
{
......
/* Verify graph */
status = vnn_VerifyGraph(graph);
TEST_CHECK_STATUS(status, final);
/* Create u2d table */
status = vnn_PreTableInit(graph);
TEST_CHECK_STATUS(status, final);
.....
}
```
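The sample above only shows where the table is created. As a minimal sketch (the exact placement is an assumption and depends on where the generated main() releases its other resources), the table can be freed with vnn_PreTableDeinit() in the cleanup path:

```c
/* Cleanup sketch: release the u2d lookup table alongside the other
 * resources at the end of main(); the placement is an assumption. */
vnn_PreTableDeinit();
```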
2. For the float16 format
If the NN type is VSI_NN_TYPE_FLOAT16, we can use NEON to accelerate the format conversion:
call fp32Tofp16() to replace vsi_nn_Float32ToDtype()
call fp16Tofp32() to replace vsi_nn_DtypeToFloat32()
The implementations of the above functions are as follows:
```c
#include <arm_neon.h>

/* Convert n float16 values to float32, 4 lanes at a time.
 * The scalar tail handles counts that are not a multiple of 4. */
void fp16Tofp32(float16_t* src, float* dst, int n)
{
    int i = 0;
    for (; i + 4 <= n; i += 4) {
        float16x4_t v_half = vld1_f16(src + i);
        float32x4_t v_float = vcvt_f32_f16(v_half);
        vst1q_f32(dst + i, v_float);
    }
    for (; i < n; ++i)
        dst[i] = (float)src[i];
}

/* Convert n float32 values to float16, 4 lanes at a time. */
void fp32Tofp16(float* src, float16_t* dst, int n)
{
    int i = 0;
    for (; i + 4 <= n; i += 4) {
        float32x4_t v_float = vld1q_f32(src + i);
        float16x4_t v_half = vcvt_f16_f32(v_float);
        vst1_f16(dst + i, v_half);
    }
    for (; i < n; ++i)
        dst[i] = (float16_t)src[i];
}
```
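For reference, a minimal usage sketch of how the float16 helpers slot into the pre-processing path; the names fdata, oData, and tensor are reused from the _float32_to_dtype() example above and are assumptions about the surrounding code:

```c
/* Sketch: float16 pre-processing path. Assumes fdata holds the normalized
 * float32 pixels and oData is a buffer of sz * sizeof(float16_t) bytes. */
vsi_size_t sz = vsi_nn_GetElementNum(tensor);
fp32Tofp16(fdata, (float16_t *)oData, (int)sz);
```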