Android neon accelerated optimization

Time:2022-5-5

NEON is a SIMD (single instruction, multiple data) instruction set extension for ARM CPUs. Code written with NEON intrinsics can approach the efficiency of hand-written assembly. It is used to optimize code on ARM platforms and greatly improves performance in audio, video, graphics, and image processing. ARM CPUs have (optionally) supported NEON since ARMv7-A, which makes data-parallel computation possible.

This article records how to use neon acceleration on Android.

First, create an Android project with native C++ support.

Then add support for neon in gradle:

    // Native-build settings that Gradle forwards to CMake.
    externalNativeBuild { 
        cmake { 
            // Flags passed to the C++ compiler: build as C++14.
            cppFlags "-std=c++14" 
            // Arguments passed to CMake itself: sets ANDROID_ARM_NEON to TRUE to request NEON support.
            arguments "-DANDROID_ARM_NEON=TRUE" 
        } 
    } 

Also enable NEON in CMake by adding the compiler flag "-mfpu=neon".

Finally, add #include <arm_neon.h> to the C++ source file.

The project can now use NEON acceleration.

To compare performance, the following pure C and NEON functions each convert a color image to grayscale and log the time taken.

//Pure c function
//
// Converts a 32-bit-per-pixel bitmap to grayscale in place using scalar code,
// and logs the elapsed time reported by cv::TickMeter::getTimeMilli().
//
// Parameters:
//   info   - bitmap description; height, width (in pixels) and stride (bytes
//            per row) are read.
//   pixels - start of the pixel buffer; modified in place. A null pointer is
//            tolerated: nothing is converted and only the timing is logged.
//
// NOTE(review): assumes info.format is ANDROID_BITMAP_FORMAT_RGBA_8888, i.e.
// each pixel is the byte sequence R, G, B, A in memory. The format is not
// checked here - confirm at the call site.
void method_argb2gray_c(AndroidBitmapInfo info, void *pixels) {
    //RGB to gray value formula
    // Gray = (R*38 + G*75 + B*15) >> 7
    cv::TickMeter tm1;
    tm1.start();

    if (pixels != nullptr) {
        uint8_t *row = static_cast<uint8_t *>(pixels);

        for (uint32_t y = 0; y < info.height; ++y) {
            uint8_t *px = row;

            for (uint32_t x = 0; x < info.width; ++x) {
                // Read the channels byte-wise so the result does not depend on
                // how a packed 32-bit word would be laid out.
                const unsigned r = px[0];
                const unsigned g = px[1];
                const unsigned b = px[2];

                // Weights sum to 128, so the shifted result always fits in 8 bits.
                const uint8_t gray =
                        static_cast<uint8_t>((r * 38u + g * 75u + b * 15u) >> 7);

                px[0] = gray;
                px[1] = gray;
                px[2] = gray;
                // px[3] (alpha) is intentionally left untouched.

                px += 4;
            }

            // Advance by the row pitch in bytes, which may exceed width * 4
            // when rows are padded.
            row += info.stride;
        }
    }

    tm1.stop();
    LOGI("method_argb2gray_c      time: %lf", tm1.getTimeMilli());
}
//Neon function
//
// Converts a 32-bit-per-pixel bitmap to grayscale in place, processing eight
// pixels per iteration with NEON intrinsics, and logs the elapsed time
// reported by TickMeter::getTimeMilli().
//
// Parameters:
//   info   - bitmap description; height, width (in pixels) and stride (bytes
//            per row) are read.
//   pixels - start of the pixel buffer; modified in place. A null pointer is
//            tolerated: nothing is converted and only the timing is logged.
//
// Each row is handled as width / 8 vector blocks followed by a scalar loop
// for the remaining width % 8 pixels, so no pixel is skipped. Alpha is
// preserved, matching method_argb2gray_c.
//
// NOTE(review): assumes info.format is ANDROID_BITMAP_FORMAT_RGBA_8888 (bytes
// R, G, B, A in memory) and a little-endian target; neither is checked here -
// confirm at the call site.
void method_argb2gray_neon(AndroidBitmapInfo info, void *pixels) {
    // Gray = (R*38 + G*75 + B*15) >> 7
    TickMeter tm3;
    tm3.start();

    if (pixels != nullptr) {
        const uint8x8_t weight_r = vdup_n_u8(38);
        const uint8x8_t weight_g = vdup_n_u8(75);
        const uint8x8_t weight_b = vdup_n_u8(15);

        const uint32_t blocks = info.width / 8;  // full 8-pixel groups per row
        const uint32_t tail = info.width % 8;    // leftover pixels per row

        unsigned char *row = static_cast<unsigned char *>(pixels);

        for (uint32_t y = 0; y < info.height; ++y) {
            unsigned char *px = row;

            for (uint32_t i = 0; i < blocks; ++i) {
                // De-interleave 8 pixels: val[0]=R, val[1]=G, val[2]=B, val[3]=A.
                const uint8x8x4_t rgba = vld4_u8(px);

                // Widening multiply-accumulate; the maximum sum is 255 * 128,
                // which fits in 16 bits.
                uint16x8_t acc = vmull_u8(rgba.val[0], weight_r);
                acc = vmlal_u8(acc, rgba.val[1], weight_g);
                acc = vmlal_u8(acc, rgba.val[2], weight_b);
                const uint16x8_t gray = vmovl_u8(vshrn_n_u16(acc, 7));

                // Build each pixel as two 16-bit halves (little-endian):
                //   low  = bytes {gray, gray}   -> R, G
                //   high = bytes {gray, alpha}  -> B, A
                const uint16x8_t low = vorrq_u16(vshlq_n_u16(gray, 8), gray);
                const uint16x8_t high =
                        vorrq_u16(vshlq_n_u16(vmovl_u8(rgba.val[3]), 8), gray);

                // Interleave the halves back into pixel order:
                // val[0] holds pixels 0-3, val[1] holds pixels 4-7.
                const uint16x8x2_t out = vzipq_u16(low, high);

                unsigned short *dst = reinterpret_cast<unsigned short *>(px);
                vst1q_u16(dst, out.val[0]);
                vst1q_u16(dst + 8, out.val[1]);

                px += 8 * 4;
            }

            // Scalar fallback for the pixels that do not fill a whole vector.
            for (uint32_t i = 0; i < tail; ++i) {
                const unsigned lum = (px[0] * 38u + px[1] * 75u + px[2] * 15u) >> 7;
                const unsigned char gray = static_cast<unsigned char>(lum);
                px[0] = gray;
                px[1] = gray;
                px[2] = gray;
                // px[3] (alpha) is intentionally left untouched.
                px += 4;
            }

            // Advance by the row pitch in bytes, which may exceed width * 4
            // when rows are padded.
            row += info.stride;
        }
    }

    tm3.stop();
    LOGI("method_argb2gray_neon   time: %lf", tm3.getTimeMilli());
}

The measured timings of the two implementations compare as follows:

Android neon accelerated optimization

file