测试平台为我的acer 5572ANWXCi笔记本,Core Duo T2250、945、1.5G DDR2、geforce 7300go 64bit 128M。
文件 | GPU | CPU | GPU* | CPU* |
Joky.tif | 1.809444 | 28.306510 | 36.927075 | 12.229953 |
HDTV.tif | 7.248393 | 179.199637 | 465.021794 | 173.764878 |
单位为毫秒millisecond,加“*”表示回读GPU数据到内存。Joky.tif大小为300x400,120000 pixels。HDTV.tif大小为1920x1080,2073600 pixels。
测试结果表明,如果不回读,那么可以放心大胆地使用GPU计算。但是如果回读,那么速度将急剧下降。总线是一个原因,但估计更深层次的原因是GPU的工作机制以及设计。但是可以肯定的是,如果使用RM开发基于多核心CPU(比如CELL BE处理器)的计算程序,那么带来的好处是极其明显的:避免了针对硬件的编码优化工作,节省了人力物力,最重要的是可以获得相当不错的性能。
测试代码如下,其中CPU部分没有优化,但是打开了VC71的SSE2开关。
#include <TCHAR.h>
#include <cstdio>
#include <rapidmind/platform.hpp>
#include <cximage/ximage.h>
#pragma comment(lib,"rmplatform-vc7-md.lib")
#pragma comment(lib,"cximagecrt.lib")
using namespace rapidmind;
int main()
{
float Time;
CxImage Image("C:\\HDTV.tif",CXIMAGE_FORMAT_TIF);
long size = Image.GetWidth()*Image.GetHeight()*sizeof(BYTE)*4;
BYTE* DataPtr = NULL;
if( !Image.Encode2RGBA(DataPtr,size) )
printf("Shit!\n");
rapidmind::init();
/*
const mat3 RGBtoCIEmat = mat3(0.412453, 0.212671, 0.019334,
0.357580, 0.715160, 0.119193,
0.180423, 0.072169, 0.950227);
*/
Value3f C0(0.412453f,0.357580f,0.180423f);
Value3f C1(0.212671f,0.715160f,0.072169f);
Value3f C2(0.019334f,0.119193f,0.950227f);
Program Prog = RM_BEGIN_PROGRAM("stream"){
In<Value4ub> rgb;
Out<Value4ub> cie;
cie(0) = dot(rgb(0,1,2),C0);
cie(1) = dot(rgb(0,1,2),C1);
cie(2) = dot(rgb(0,1,2),C2);
cie(3) = 255;
}RM_END
Array<1,Value4ub> Input(Image.GetWidth()*Image.GetHeight());
DataPtr = Input.write_data();
Array<1,Value4ub> Output;
rapidmind::compile(Output,Prog(Input));
Output = Prog(Input);
DataPtr[0] = 255;
rapidmind::Timer Start = rapidmind::Timer::now();
Output = Prog(Input);
rapidmind::finish();
//const BYTE* RMResultPtr = Output.read_data();
rapidmind::Timer End = rapidmind::Timer::now();
rapidmind::finish();
Time = End.milliseconds() - Start.milliseconds();
printf("Use RM : %f milliseconds\n",Time);
int Width = Image.GetWidth(),Height = Image.GetHeight();
Start = rapidmind::Timer::now();
for(int i=0;i<Width*Height;i++){
float r = DataPtr[i*4+0],g=DataPtr[i*4+1],b=DataPtr[i*4+2];
float x = r*0.412453f + g*0.357580f + b*0.180423f;
float y = r*0.212671f + g*0.715160f + b*0.072169f;
float z = r*0.019334f + g*0.119193f + b*0.950227f;
DataPtr[i*4+0] = x;
DataPtr[i*4+1] = y;
DataPtr[i*4+2] = z;
DataPtr[i*4+2] = 255;
}
End = rapidmind::Timer::now();
Time = End.milliseconds() - Start.milliseconds();
printf("Use CPU: %f milliseconds\n",Time);
system("PAUSE");
return 0;
}
#include <cstdio>
#include <rapidmind/platform.hpp>
#include <cximage/ximage.h>
#pragma comment(lib,"rmplatform-vc7-md.lib")
#pragma comment(lib,"cximagecrt.lib")
using namespace rapidmind;
int main()
{
float Time;
CxImage Image("C:\\HDTV.tif",CXIMAGE_FORMAT_TIF);
long size = Image.GetWidth()*Image.GetHeight()*sizeof(BYTE)*4;
BYTE* DataPtr = NULL;
if( !Image.Encode2RGBA(DataPtr,size) )
printf("Shit!\n");
rapidmind::init();
/*
const mat3 RGBtoCIEmat = mat3(0.412453, 0.212671, 0.019334,
0.357580, 0.715160, 0.119193,
0.180423, 0.072169, 0.950227);
*/
Value3f C0(0.412453f,0.357580f,0.180423f);
Value3f C1(0.212671f,0.715160f,0.072169f);
Value3f C2(0.019334f,0.119193f,0.950227f);
Program Prog = RM_BEGIN_PROGRAM("stream"){
In<Value4ub> rgb;
Out<Value4ub> cie;
cie(0) = dot(rgb(0,1,2),C0);
cie(1) = dot(rgb(0,1,2),C1);
cie(2) = dot(rgb(0,1,2),C2);
cie(3) = 255;
}RM_END
Array<1,Value4ub> Input(Image.GetWidth()*Image.GetHeight());
DataPtr = Input.write_data();
Array<1,Value4ub> Output;
rapidmind::compile(Output,Prog(Input));
Output = Prog(Input);
DataPtr[0] = 255;
rapidmind::Timer Start = rapidmind::Timer::now();
Output = Prog(Input);
rapidmind::finish();
//const BYTE* RMResultPtr = Output.read_data();
rapidmind::Timer End = rapidmind::Timer::now();
rapidmind::finish();
Time = End.milliseconds() - Start.milliseconds();
printf("Use RM : %f milliseconds\n",Time);
int Width = Image.GetWidth(),Height = Image.GetHeight();
Start = rapidmind::Timer::now();
for(int i=0;i<Width*Height;i++){
float r = DataPtr[i*4+0],g=DataPtr[i*4+1],b=DataPtr[i*4+2];
float x = r*0.412453f + g*0.357580f + b*0.180423f;
float y = r*0.212671f + g*0.715160f + b*0.072169f;
float z = r*0.019334f + g*0.119193f + b*0.950227f;
DataPtr[i*4+0] = x;
DataPtr[i*4+1] = y;
DataPtr[i*4+2] = z;
DataPtr[i*4+2] = 255;
}
End = rapidmind::Timer::now();
Time = End.milliseconds() - Start.milliseconds();
printf("Use CPU: %f milliseconds\n",Time);
system("PAUSE");
return 0;
}