// Stage this thread's y0y1 / y2y3 / uv values into per-block shared tiles,
// then have thread (0,0) copy the staged tiles out to pDst row by row.
//
// NOTE(review): the tiles are fixed at 16x16, so the indexing below is only
// in bounds when blockDim.x <= 16 and blockDim.y <= 16 — confirm against the
// launch configuration.
int bw = blockDim.x;
int bh = blockDim.y;

// threadIdx.{x,y} is always below blockDim.{x,y}, so the former `% bw` and
// `% bh` never changed the value; use the indices directly.
int tx = threadIdx.x;
int ty = threadIdx.y;

__shared__ uchar2 ys0[16][16];
__shared__ uchar2 ys1[16][16];
__shared__ uchar2 uvs[16][16];

ys0[ty][tx] = y0y1;
ys1[ty][tx] = y2y3;
uvs[ty][tx] = uv;

// All threads' tile stores must be visible before thread (0,0) reads them.
__syncthreads();

// NOTE(review): a single thread performs the whole write-back serially; this
// looks like a debugging path — confirm before relying on it for throughput.
if (threadIdx.x == 0 && threadIdx.y == 0) {
    for (int j = 0; j != bh; ++j) {
        // Tile row j is written to two consecutive destination rows:
        // ys0 -> row 2*(iy+j), ys1 -> row 2*(iy+j)+1, both starting at byte ix*2.
        uchar2* py0 = (uchar2*)(pDst + (iy + j) * 2 * nPitch + ix * 2);
        uchar2* py1 = (uchar2*)(pDst + ((iy + j) * 2 + 1) * nPitch + ix * 2);
        // NOTE(review): this row offset is scaled by nWidth while the two
        // pointers above use nPitch — confirm that is intended.
        uchar2* puv = (uchar2*)(pDstUv + (iy + j) * nWidth + ix * 2);

        for (int i = 0; i != bw; ++i) {
            *py0++ = ys0[j][i];
            *py1++ = ys1[j][i];
            // The uv write-back is currently disabled, so puv is computed but
            // never stored through and uvs is filled but never read.
            //*puv++ = uvs[j][i];
        }
    }
}