Content Comparison

This document describes how to use NEON on the SP7350 to accelerate data processing. It covers which scenarios are suitable for NEON, NEON learning materials, and examples of how to use NEON acceleration.

...

There are many YUV formats, but the example code only handles YUYV, whose bytes are laid out as follows: YUYV YUYV YUYV ... YUYV. It has the following characteristics:

Each pair of Y components shares one pair of U and V components, so every two pixels occupy 4 bytes (2 bytes per pixel on average). A YUYV image with a width of W and a height of H therefore occupies W * H * 2 bytes.

RGB format

Because NEON provides de-interleaving load and interleaving store instructions (for example vld3/vld4 and vst3/vst4), the channel order has essentially no impact on performance, whether the data is stored as RGB, BGR, RGBA, BGRA, etc.

...

bx is divided into b1, b2, b3, b4.
cx is divided into c1, c2, c3, c4.
As a result, the calculation of each column is independent and can be performed separately. In neon, it is possible for multiple instructions to be scheduled for simultaneous execution, such as the following: (Why? We don't know the exact reasons, but according to the official documentation, although there is no concept of kernel threads like in cuda, neon itself may have similar functional computation channels that allow for concurrent execution of memory loads/stores and computations.)
- r0 = vmlaq_lane_f32(r0, a1, vget_low_f32(b0), 1);
- r1 = vmlaq_lane_f32(r1, a1, vget_low_f32(b1), 1);
- r2 = vmlaq_lane_f32(r2, a1, vget_low_f32(b2), 1);
- r3 = vmlaq_lane_f32(r3, a1, vget_low_f32(b3), 1);

The main instructions used in the code

vmulq_lane_f32(a0, a1, lane_idx)
- Vector-scalar multiplication, where the scalar value comes from lane lane_idx of a1.
- Pseudocode:
  - result[0] = a0[0] x a1[lane_idx]
  - result[1] = a0[1] x a1[lane_idx]
  - result[2] = a0[2] x a1[lane_idx]
  - result[3] = a0[3] x a1[lane_idx]
vmlaq_lane_f32(a0, a1, a2, lane_idx)
1. Multiply-Add Instruction: Pseudocode a0 + a1 x a2[lane_idx]
2. Note the data types. Of course, if there is an error, the compiler will prompt an error message.
3. This instruction can be replaced with vaddq_f32(a0, vmulq_lane_f32(a1, a2, lane_idx))
  1. That turns it into two instructions, where the add depends on the result of the multiply.
  2. Therefore, the single multiply-add instruction will generally achieve better performance.
vget_low_f32: Obtain the first two elements from the vector

a1, a2, a3, a4 --> a1, a2

vget_high_f32: Obtain the last two elements from the vector

a1, a2, a3, a4 --> a3, a4

Complete Sample Code

Code Block

language	cpp

// 4x4 single-precision matrix multiply, fully unrolled, reusing one pair of
// temporaries (bx, cx) for every result column.
//
// Each matrix is 16 consecutive floats that are read/written in groups of four;
// following the comments below, each group is treated as one column. Column j
// of C is computed as
//   A[0..3]*B[4j+0] + A[4..7]*B[4j+1] + A[8..11]*B[4j+2] + A[12..15]*B[4j+3]
//
// A, B: input matrices (16 floats each, only read).
// C:    output matrix (16 floats, overwritten).
void matrix_mul_version1(float32_t* A, float32_t* B, float32_t* C)
{
    float32x4_t a0 = vld1q_f32(A + 0);
    float32x4_t a1 = vld1q_f32(A + 4);
    float32x4_t a2 = vld1q_f32(A + 8);
    float32x4_t a3 = vld1q_f32(A + 12);

    // vmlaq_lane_f32(acc, v, s, lane) returns acc + v * s[lane], so the running
    // sum cx must be passed as the accumulator. Passing a0 there and using "+="
    // would add an extra a0 to the result at every step.

    // Compute the first column of matrix C and store it
    float32x4_t bx = vld1q_f32(B + 0);
    float32x4_t cx = vmulq_lane_f32(a0, vget_low_f32(bx), 0);
    cx = vmlaq_lane_f32(cx, a1, vget_low_f32(bx), 1);
    cx = vmlaq_lane_f32(cx, a2, vget_high_f32(bx), 0);
    cx = vmlaq_lane_f32(cx, a3, vget_high_f32(bx), 1);
    vst1q_f32(C + 0, cx);

    // Compute the second column of matrix C and store it
    bx = vld1q_f32(B + 4);
    cx = vmulq_lane_f32(a0, vget_low_f32(bx), 0);
    cx = vmlaq_lane_f32(cx, a1, vget_low_f32(bx), 1);
    cx = vmlaq_lane_f32(cx, a2, vget_high_f32(bx), 0);
    cx = vmlaq_lane_f32(cx, a3, vget_high_f32(bx), 1);
    vst1q_f32(C + 4, cx);

    // Compute the third column of matrix C and store it
    bx = vld1q_f32(B + 8);
    cx = vmulq_lane_f32(a0, vget_low_f32(bx), 0);
    cx = vmlaq_lane_f32(cx, a1, vget_low_f32(bx), 1);
    cx = vmlaq_lane_f32(cx, a2, vget_high_f32(bx), 0);
    cx = vmlaq_lane_f32(cx, a3, vget_high_f32(bx), 1);
    vst1q_f32(C + 8, cx);

    // Compute the fourth column of matrix C and store it
    bx = vld1q_f32(B + 12);
    cx = vmulq_lane_f32(a0, vget_low_f32(bx), 0);
    cx = vmlaq_lane_f32(cx, a1, vget_low_f32(bx), 1);
    cx = vmlaq_lane_f32(cx, a2, vget_high_f32(bx), 0);
    cx = vmlaq_lane_f32(cx, a3, vget_high_f32(bx), 1);
    vst1q_f32(C + 12, cx);
}

// 4x4 single-precision matrix multiply written as a loop over result columns,
// with the temporaries bx/cx declared once outside the loop.
//
// Each matrix is 16 consecutive floats handled in groups of four; each group is
// treated as one column. For the group starting at offset i (0, 4, 8, 12):
//   C[i..i+3] = A[0..3]*B[i+0] + A[4..7]*B[i+1] + A[8..11]*B[i+2] + A[12..15]*B[i+3]
//
// A, B: input matrices (16 floats each, only read).
// C:    output matrix (16 floats, overwritten).
void matrix_mul_version2(float32_t* A, float32_t* B, float32_t* C)
{
    float32x4_t a0 = vld1q_f32(A + 0);
    float32x4_t a1 = vld1q_f32(A + 4);
    float32x4_t a2 = vld1q_f32(A + 8);
    float32x4_t a3 = vld1q_f32(A + 12);

    // Compute the x column of matrix C and store it
    float32x4_t bx, cx;
    for (int i = 0; i < 16; i += 4)
    {
        bx = vld1q_f32(B + i);
        cx = vmulq_lane_f32(a0, vget_low_f32(bx), 0);
        // vmlaq_lane_f32(acc, v, s, lane) returns acc + v * s[lane]: the running
        // sum cx is the accumulator, otherwise a0 is added once per step.
        cx = vmlaq_lane_f32(cx, a1, vget_low_f32(bx), 1);
        cx = vmlaq_lane_f32(cx, a2, vget_high_f32(bx), 0);
        cx = vmlaq_lane_f32(cx, a3, vget_high_f32(bx), 1);
        vst1q_f32(C + i, cx);
    }
}

// 4x4 single-precision matrix multiply written as a loop over result columns,
// with the temporaries bx/cx declared inside the loop body (one fresh pair per
// iteration).
//
// Each matrix is 16 consecutive floats handled in groups of four; each group is
// treated as one column. For the group starting at offset i (0, 4, 8, 12):
//   C[i..i+3] = A[0..3]*B[i+0] + A[4..7]*B[i+1] + A[8..11]*B[i+2] + A[12..15]*B[i+3]
//
// A, B: input matrices (16 floats each, only read).
// C:    output matrix (16 floats, overwritten).
void matrix_mul_version3(float32_t* A, float32_t* B, float32_t* C)
{
    float32x4_t a0 = vld1q_f32(A + 0);
    float32x4_t a1 = vld1q_f32(A + 4);
    float32x4_t a2 = vld1q_f32(A + 8);
    float32x4_t a3 = vld1q_f32(A + 12);

    // Compute the x column of matrix C and store it
    for (int i = 0; i < 16; i += 4)
    {
        float32x4_t bx = vld1q_f32(B + i);
        float32x4_t cx = vmulq_lane_f32(a0, vget_low_f32(bx), 0);
        // vmlaq_lane_f32(acc, v, s, lane) returns acc + v * s[lane]: the running
        // sum cx is the accumulator, otherwise a0 is added once per step.
        cx = vmlaq_lane_f32(cx, a1, vget_low_f32(bx), 1);
        cx = vmlaq_lane_f32(cx, a2, vget_high_f32(bx), 0);
        cx = vmlaq_lane_f32(cx, a3, vget_high_f32(bx), 1);
        vst1q_f32(C + i, cx);
    }
}

// 4x4 single-precision matrix multiply, fully unrolled, with a separate pair of
// temporaries per result column (b1/c1 ... b4/c4) so that the four column
// computations share no variables and are independent of each other.
//
// Each matrix is 16 consecutive floats handled in groups of four; each group is
// treated as one column. Column j of C is computed as
//   A[0..3]*B[4j+0] + A[4..7]*B[4j+1] + A[8..11]*B[4j+2] + A[12..15]*B[4j+3]
//
// A, B: input matrices (16 floats each, only read).
// C:    output matrix (16 floats, overwritten).
void matrix_mul_version4(float32_t* A, float32_t* B, float32_t* C)
{
    float32x4_t a0 = vld1q_f32(A + 0);
    float32x4_t a1 = vld1q_f32(A + 4);
    float32x4_t a2 = vld1q_f32(A + 8);
    float32x4_t a3 = vld1q_f32(A + 12);

    // vmlaq_lane_f32(acc, v, s, lane) returns acc + v * s[lane], so each
    // column's running sum (c1..c4) must be passed as the accumulator. Passing
    // a0 there and using "+=" would add an extra a0 at every step.

    // Compute the first column of matrix C and store it
    float32x4_t b1 = vld1q_f32(B + 0);
    float32x4_t c1 = vmulq_lane_f32(a0, vget_low_f32(b1), 0);
    c1 = vmlaq_lane_f32(c1, a1, vget_low_f32(b1), 1);
    c1 = vmlaq_lane_f32(c1, a2, vget_high_f32(b1), 0);
    c1 = vmlaq_lane_f32(c1, a3, vget_high_f32(b1), 1);
    vst1q_f32(C + 0, c1);

    // Compute the second column of matrix C and store it
    float32x4_t b2 = vld1q_f32(B + 4);
    float32x4_t c2 = vmulq_lane_f32(a0, vget_low_f32(b2), 0);
    c2 = vmlaq_lane_f32(c2, a1, vget_low_f32(b2), 1);
    c2 = vmlaq_lane_f32(c2, a2, vget_high_f32(b2), 0);
    c2 = vmlaq_lane_f32(c2, a3, vget_high_f32(b2), 1);
    vst1q_f32(C + 4, c2);

    // Compute the third column of matrix C and store it
    float32x4_t b3 = vld1q_f32(B + 8);
    float32x4_t c3 = vmulq_lane_f32(a0, vget_low_f32(b3), 0);
    c3 = vmlaq_lane_f32(c3, a1, vget_low_f32(b3), 1);
    c3 = vmlaq_lane_f32(c3, a2, vget_high_f32(b3), 0);
    c3 = vmlaq_lane_f32(c3, a3, vget_high_f32(b3), 1);
    vst1q_f32(C + 8, c3);

    // Compute the fourth column of matrix C and store it
    float32x4_t b4 = vld1q_f32(B + 12);
    float32x4_t c4 = vmulq_lane_f32(a0, vget_low_f32(b4), 0);
    c4 = vmlaq_lane_f32(c4, a1, vget_low_f32(b4), 1);
    c4 = vmlaq_lane_f32(c4, a2, vget_high_f32(b4), 0);
    c4 = vmlaq_lane_f32(c4, a3, vget_high_f32(b4), 1);
    vst1q_f32(C + 12, c4);
}

// Dump a 16-float matrix buffer to stdout in memory order.
//
// Prints a header line containing `tag`, then the 16 values four per line with
// four decimal places, followed by one blank line.
//
// tag:    label inserted into the header line.
// matrix: buffer of at least 16 floats (only read).
static void print_matrix_memory(const char* tag, float32_t* matrix)
{
    printf("output matrix content for %s: \n", tag);

    for (int line = 0; line < 4; line++)
    {
        // Four consecutive floats per output line.
        const float32_t* group = matrix + 4 * line;
        printf("%.4f %.4f %.4f %.4f\n", group[0], group[1], group[2], group[3]);
    }

    // Blank separator line after the matrix.
    printf("\n");
}

int main(int argc, const char * argv[])
{
    float32_t A[16] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
    float32_t B[16] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
    float32_t C[16] = {};
    
    matrix_mul_version1(A, B, C);
    print_matrix_memory("verision1C", C);
    matrix_mul_version2(A, B, C);
    print_matrix_memory("verision2C", C);
    matrix_mul_version3(A, B, C);
    print_matrix_memory("verision3C", C);
    matrix_mul_version4(A, B, C);
    print_matrix_memory("verision4C", C);

    return 0;
}

Version	Old Version 1	New Version 2
Changes made by	zengping	zengping
Saved on	Apr 23, 2024	Apr 23, 2024

Versions Compared

Key

RGB format

The main instructions used in the code

Complete Sample Code