Julia集中的元素都是经过简单的迭代计算得到的,很适合用CUDA进行加速。对一个600*600的图像,需要进行360000次迭代计算,所以在CUDA中创建了600*600个线程块(block),每个线程块包含1个线程,并行执行360000次运行,图像的创建和显示通过OpenCV实现:
#include "cuda_runtime.h"
#include <highgui.hpp>
using namespace cv;
#define DIM 600 //图像长宽
struct cuComplex
{
float r;
float i;
__device__ cuComplex(float a, float b) : r(a), i(b) {}
__device__ float magnitude2(void)
{
return r * r + i * i;
}
__device__ cuComplex operator*(const cuComplex& a)
{
return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
}
__device__ cuComplex operator+(const cuComplex& a)
{
return cuComplex(r + a.r, i + a.i);
}
};
__device__ int julia(int x, int y)
{
const float scale = 1.5;
float jx = scale * (float)(DIM / 2 - x) / (DIM / 2);
float jy = scale * (float)(DIM / 2 - y) / (DIM / 2);
cuComplex c(0.25, 0.010);
cuComplex a(jx, jy);
int i = 0;
for (i = 0; i < 200; i++)
{
a = a * a + c;
if (a.magnitude2() > 1000)
return 0;
}
return 1;
}
__global__ void kernel(unsigned char *ptr)
{
// map from blockIdx to pixel position
int x = blockIdx.x;
int y = blockIdx.y;
int offset = x + y * gridDim.x;
// now calculate the value at that position
int juliaValue = julia(x, y);
ptr[offset * 3 + 0] = 0;
ptr[offset * 3 + 1] = 0;
ptr[offset * 3 + 2] = 255 * juliaValue;
}
// globals needed by the update routine
struct DataBlock
{
unsigned char *dev_bitmap;
};
int main(void)
{
DataBlock data;
cudaError_t error;
Mat image = Mat(DIM, DIM, CV_8UC3, Scalar::all(0));
data.dev_bitmap = image.data;
unsigned char *dev_bitmap;
error = cudaMalloc((void**)&dev_bitmap, 3 * image.cols*image.rows);
data.dev_bitmap = dev_bitmap;
dim3 grid(DIM, DIM);
kernel << <grid, 1 >> > (dev_bitmap);
error = cudaMemcpy(image.data, dev_bitmap,
3 * image.cols*image.rows,
cudaMemcpyDeviceToHost);
error = cudaFree(dev_bitmap);
imshow("CUDA For Julia | c(0.25, 0.010)", image);
waitKey();
}
c(-0.8,0.156):
c(-0.85,0.06):
c(-0.305,0.60):
c(0.25,0.010):