/**
 * Add a signed 8-bit plane to an unsigned 8-bit plane, clamping to [0, 255].
 *
 * For every row r in [0, height) and column c in [0, width):
 *
 *     dst[r * dst_stride + c] =
 *         clamp(src0[r * src0_stride + c] + src1[r * src1_stride + c], 0, 255)
 *
 * Strides are counted in elements; every element is one byte wide, so they
 * are also byte distances between the starts of consecutive rows.
 * Nothing is read or written when width <= 0 or height <= 0.
 *
 * When NEON is available, eight columns are processed per iteration and the
 * remaining columns (fewer than eight) go through the scalar loop. Without
 * NEON the scalar loop handles the whole row, producing the same values.
 *
 * @param dst          destination plane
 * @param dst_stride   row stride of dst
 * @param src0         unsigned 8-bit source plane
 * @param src0_stride  row stride of src0
 * @param src1         signed 8-bit source plane added to src0
 * @param src1_stride  row stride of src1
 * @param width        number of columns to process in each row
 * @param height       number of rows to process
 */
void add_clip_neon(uint8_t *dst, const int dst_stride,
                   const uint8_t* src0, const int src0_stride,
                   const int8_t* src1, const int src1_stride,
                   int width, int height)
{
    int row;

    for (row = 0; row < height; row++) {
        /* Widen before multiplying: row * stride evaluated in plain int
         * overflows (undefined behaviour) once a plane exceeds 2 GiB. */
        uint8_t *dst_row = dst + (int64_t)row * dst_stride;
        const uint8_t *src0_row = src0 + (int64_t)row * src0_stride;
        const int8_t *src1_row = src1 + (int64_t)row * src1_stride;
        int col = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64)
        /* "width - col >= 8" cannot overflow, unlike "col < width - 7". */
        for (; width - col >= 8; col += 8) {
            const uint8x8_t pixels = vld1_u8(src0_row + col);
            const int8x8_t deltas = vld1_s8(src1_row + col);
            /* Zero-extend the pixels to 16 bits; the values are <= 255, so
             * viewing the lanes as signed does not change them. */
            const int16x8_t wide = vreinterpretq_s16_u16(vmovl_u8(pixels));
            /* Sign-extend the deltas and add; the sum fits in 16 bits. */
            const int16x8_t sums = vaddw_s8(wide, deltas);

            /* Narrow with unsigned saturation, i.e. clamp to [0, 255]. */
            vst1_u8(dst_row + col, vqmovun_s16(sums));
        }
#endif

        /* Scalar path: row tail on NEON builds, whole row otherwise. */
        for (; col < width; col++) {
            int sum = src0_row[col] + src1_row[col];

            if (sum < 0) {
                sum = 0;
            } else if (sum > 255) {
                sum = 255;
            }
            dst_row[col] = (uint8_t)sum;
        }
    }
}
使用 NEON intrinsics 编写的版本。