• 谈谈Hybird3D中的光栅化优化


    看到空明流转分享了他的SALVIA 0.5.2优化谈,我也来说说Hybird3D中和光栅化相关的一些优化技术。

    Hybird3D的设计目标是打造一款准实时的软件高质量渲染器,采用了光栅化和光线跟踪混合算法,光栅化用于渲染eye ray,光线跟踪则用于阴影、反射、折射、全局光等次级光线的计算。由于渲染器是以准实时(一帧渲染时间在几十毫秒到几秒之间)为设计目标的,因此性能优化是非常重要的,但同时为了能够实现高质量的软件渲染,在渲染器的架构和支持的特性上也不会因为性能而缩水和妥协。

    光栅化算法

    Hybird3D的光栅化算法主要采用的是Michael Abrash写的rasterization on Larrabee这篇paper,这个算法是目前我所知道的同时能够支持多线程和SIMD的最佳算法,这个算法保证了每个tile的光栅化可以由每个线程独立计算,而没有任何数据冲突,从而实现完全的无锁化计算。Larrabee有512bit的SIMD宽度,也就是一次可以计算16个float,在一般的支持SSE的CPU上我们只能使用128bit的SIMD指令一次计算4个float,虽然宽度少了,但不影响算法的应用。

    Hybird3D也是采用的16x16的tile,光栅化的主要任务是计算某个像素上对应的三角图元和三角形重心坐标(另外为了后续的mipmap纹理插值我们还需要保存重心坐标的差分值),有了图元和重心坐标就可以送到后端的LightShader和PixelShader中做进一步的计算了。一个像素上不一定只存在一个图元,半透明物体以及anti-aliasing等都会使得一个像素上存在多个图元,可以将这些图元保存为一个单向链表,同时为每个图元设置一个alpha值作为混合权重就可以了。

    内存优化技巧

    内存的优化主要包括内存的分配和cache的合理利用。在光栅化过程中会产生很多的临时对象,对象的数量不可预估但是每种对象的生命周期可以很容易的知道,所以我们可以采取多次分配一次释放的策略来实现非常高效的对象分配,根据不同的生命周期需要多个内存分配器,同时为了防止多线程冲突,每个线程需要独立的内存分配器。对cache的利用则需要合理的设计我们的渲染流水线以及合理的组织数据结构SoA(Structure of Arrays,即struct的array化),让数据访问尽可能的集中。SoA不但可以让数据访问变得集中而且对SIMD指令非常友好,不过SoA的编程难度很高,会让代码变得非常难写和难读。

    SIMD编程技巧

    SIMD的编程一直是件脏活和累活,15年前我就开始使用MMX指令来加速应用,那个时候只能内嵌汇编,而且指令与普通的运算指令差别很大,写完之后过段时间自己也看不懂了,代码的维护是一个非常让人头疼的问题。后来出了intrinsics指令,可以在C代码中用函数的形式来编写SIMD指令,免去了手工写汇编的痛苦,但是intrinsics指令同普通的C运算符差别还是很大,代码的可读性依然不佳,好在SSE指令集还是比较规整的,大部分运算指令可以用C++运算符重载来包装intrinsics指令,下面给出我的包装函数供大家参考。

     // Arithmetic operators for __m128. Each overload forwards to the matching SSE
     // intrinsic (_mm_add_ps / _mm_sub_ps / _mm_mul_ps / _mm_div_ps), so the
     // operation is applied to all four float lanes at once.
     1 inline __m128 operator + (__m128 v1, __m128 v2)
     2 {
     3     return _mm_add_ps(v1, v2);
     4 }
     5 
     6 inline __m128 operator - (__m128 v1, __m128 v2)
     7 {
     8     return _mm_sub_ps(v1, v2);
     9 }
    10 
    11 inline __m128 operator * (__m128 v1, __m128 v2)
    12 {
    13     return _mm_mul_ps(v1, v2);
    14 }
    15 
    16 inline __m128 operator / (__m128 v1, __m128 v2)
    17 {
    18     return _mm_div_ps(v1, v2);
    19 }
    20 
    // Comparison operators for __m128. Note the return type: these yield the
    // __m128 produced by the _mm_cmp*_ps intrinsics (a per-lane mask), not a bool.
    // The result is meant to be combined with operator & / | and consumed by
    // Select() or MoveMask().
    21 inline __m128 operator == (__m128 v1, __m128 v2)
    22 {
    23     return _mm_cmpeq_ps(v1, v2);
    24 }
    25 
    26 inline __m128 operator != (__m128 v1, __m128 v2)
    27 {
    28     return _mm_cmpneq_ps(v1, v2);
    29 }
    30 
    31 inline __m128 operator > (__m128 v1, __m128 v2)
    32 {
    33     return _mm_cmpgt_ps(v1, v2);
    34 }
    35 
    36 inline __m128 operator >= (__m128 v1, __m128 v2)
    37 {
    38     return _mm_cmpge_ps(v1, v2);
    39 }
    40 
    41 inline __m128 operator < (__m128 v1, __m128 v2)
    42 {
    43     return _mm_cmplt_ps(v1, v2);
    44 }
    45 
    46 inline __m128 operator <= (__m128 v1, __m128 v2)
    47 {
    48     return _mm_cmple_ps(v1, v2);
    49 }
    50 
    // Bitwise AND / OR on the raw bits of two __m128 values (used to combine
    // comparison masks), plus named wrappers for movemask, max and min.
    51 inline __m128 operator & (__m128 v1, __m128 v2)
    52 {
    53     return _mm_and_ps(v1, v2);
    54 }
    55 
    56 inline __m128 operator | (__m128 v1, __m128 v2)
    57 {
    58     return _mm_or_ps(v1, v2);
    59 }
    60 
       // Packs the sign bit of each lane into the low four bits of an int.
    61 inline int MoveMask(__m128 v)
    62 {
    63     return _mm_movemask_ps(v);
    64 }
    65 
       // Per-lane maximum of v1 and v2.
    66 inline __m128 Max(__m128 v1, __m128 v2)
    67 {
    68     return _mm_max_ps(v1, v2);
    69 }
    70 
       // Per-lane minimum of v1 and v2.
    71 inline __m128 Min(__m128 v1, __m128 v2)
    72 {
    73     return _mm_min_ps(v1, v2);
    74 }
    75 
    76 // Bitwise blend: (a & mask) | (~mask & b). Acts as "mask ? a : b" per lane when every lane of mask is all-ones or all-zeros, as the comparison operators produce.
    77 inline __m128 Select(__m128 mask, __m128 a, __m128 b)
    78 {
    79     return _mm_or_ps(_mm_and_ps(a, mask), _mm_andnot_ps(mask, b));
    80 }
    81 
    // Broadcasts lane n (0..3) of m into all four lanes of the result.
    // The switch exists because _mm_shuffle_ps takes its selector as a literal
    // (0x00, 0x55, 0xaa, 0xff pick lane 0, 1, 2, 3 for every output lane).
    // Any other n returns m unchanged.
    82 inline __m128 Extract(__m128 m, int n)
    83 {
    84     switch(n)
    85     {
    86     case 0:
    87         return _mm_shuffle_ps(m, m, 0);
    88     case 1:
    89         return _mm_shuffle_ps(m, m, 0x55);
    90     case 2:
    91         return _mm_shuffle_ps(m, m, 0xaa);
    92     case 3:
    93         return _mm_shuffle_ps(m, m, 0xff);
    94     default:
    95         return m;
    96     }
    97 }

    最后是干货时间,放出Hybird3D中光栅化相关的代码供大家参考。

       1 #include "stdafx.h"
       2 #include "RayTracer.h"
       3 #include "Clipper.h"
       4 #include "PrimitiveTile.h"
       5 
       // MSVC warning C4018 is the signed/unsigned comparison warning; it is silenced for this file.
       6 #pragma warning(disable: 4018)
       7 
         // Tile dimensions in pixels. The per-tile buffers in RasterTile hold
         // TILE_WIDTH * TILE_HEIGHT entries.
       8 #define TILE_WIDTH  16
       9 #define TILE_HEIGHT 16
      10 
         // NOTE(review): NORMAL_THRESHOLD is not referenced in the visible part of the file.
      11 const float NORMAL_THRESHOLD = 0.9f;
         // Defined in another translation unit.
      12 extern int ReflectionDepth;
      13 
      // 16-byte aligned lighting record. PixelContext::light points at one of
      // these, and `next` lets several records be chained together
      // (presumably one per light affecting the pixel -- confirm against the shading code).
      14 _CRT_ALIGN(16) struct Illuminance
      15 {
      16     Float3 direction;
      17     float illuminance;
      18     Float3 color;
      19     float shadowFactor;
      20     Illuminance* next;
      21     LightShader* light;
      22 };
      23 
      // Vertex record handed to the rasterizer. RenderContext::VertConvert reads
      // `pos` and then treats the memory starting at `normal` as an array of
      // __m128 attribute channels.
      24 struct VertexOutput
      25 {
      26     Float4 pos;
      27     Float4 normal;
      28     float attributes[0];    // zero-length trailing array (compiler extension, not standard C++)
      29 };
      30 
      31 struct PolyPrimitive;
      32 
         // One fragment covering a pixel. Fragments for the same pixel are chained
         // through `next`; `alpha` is the fragment's blend weight.
         //
         // Layout matters: the Create*Pixels functions write four floats at `uv`
         // (filling uv and duvdx) and four floats at `duvdy` (filling duvdy and the
         // bytes that follow it), assuming Float2 is two packed floats -- TODO confirm.
         // Do not reorder uv / duvdx / duvdy / light / userData without updating them.
      33 _CRT_ALIGN(16) struct PixelContext
      34 {
      35     PixelContext* next;
      36     PolyPrimitive* prim;
      37     int triIndex;
      38     float alpha;
      39     Float4 pos;      // w is overwritten with the interpolated lane-0 value (named rhw in the code)
      40     Float4 view;
      41     Float4 normal;
      42     Float2 uv;
      43     Float2 duvdx;    // uv difference between this pixel and its neighbour in x
      44     Float2 duvdy;    // uv difference between this pixel and its neighbour in y
      45     Illuminance* light;
      46     void* userData;
      47 };
      48 
      // Six clip planes, four coefficients each. ClippingAndDraw reinterprets each
      // row as a Float4 and passes it to ClipPoly, one plane per iteration.
      49 _CRT_ALIGN(16) const float FrustumClipPlane[6][4] = {
      50     { 0, 0, 1, 0},
      51     { 0, 0,-1, 1},
      52     { 1, 0, 0, 1},
      53     {-1, 0, 0, 1},
      54     { 0, 1, 0, 1},
      55     { 0,-1, 0, 1},
      56 };
      57 
      // File-scope screen transform used by RenderContext::VertConvert:
      //   screen = (pos / w + ScreenOffset) * ScreenScale
      // ScreenOffset is set in the RenderContext constructor; ScreenScale and
      // ScreenScaleInv are set in SetRenderTarget. Being globals, they are shared
      // by every RenderContext instance.
      58 __m128 ScreenOffset;
      59 __m128 ScreenScale;
      60 __m128 ScreenScaleInv;
      61 
      62 struct RenderContext : public IRenderContext
      63 {
      64     Float4x4        ViewProjMatrix;
      65     Float4x4        ViewInvMatrix;
      66     Float4            _eye;
      67     float ScreenWidth, ScreenHeight;
      68     PrimitiveTile*    _primTiles;
      69     Bitmap*            _renderTarget;
      70     int                _tileCol, _tileRow;
      71     BYTE*            _vertexTempBuf;
      72     size_t            _vertexTempSize;
      73     ICamera*        _camera;
      74     Accel            _accelStruct;
      75     int                _aaLevel;
      76     int                _primCount;
      77     DWORD            _bkColor;
      78     Float4            _bkColorF;
      79     vector<LightShader*> _lights;
      80 
      81     RenderContext()
      82     {
      83         _vertexTempSize = 0;
      84         _vertexTempBuf = 0;
      85         _primTiles = 0;
      86         _aaLevel = 0;
      87         _tileCol = 0;
      88         _tileRow = 0;
      89         ScreenOffset = _mm_setr_ps(1, -1, 0, 0);
      90     }
      91 
      92     void AddLight(LightShader* light)
      93     {
      94         _lights.push_back(light);
      95     }
      96 
      97     void ClearLights()
      98     {
      99         _lights.clear();
     100     }
     101 
     102     void SetRenderTarget(Bitmap* target)
     103     {
     104         ScreenWidth = target->width;
     105         ScreenHeight = target->height;
     106         int tileCount = _tileCol * _tileRow;
     107         _renderTarget = target;
     108         _tileCol = Align(target->width, TILE_WIDTH) / TILE_WIDTH;
     109         _tileRow = Align(target->height, TILE_HEIGHT) / TILE_HEIGHT;
     110         if(tileCount < _tileCol * _tileRow)
     111         {
     112             if(_primTiles)
     113                 delete[] _primTiles;
     114 
     115             _primTiles = new PrimitiveTile[_tileCol * _tileRow];
     116         }
     117         for(int i = 0; i < _tileCol * _tileRow; ++i)
     118             _primTiles[i].Clear();
     119 
     120         ScreenScale = _mm_setr_ps(ScreenWidth * 0.5f, -ScreenHeight * 0.5f, 1, 1);
     121         ScreenScaleInv = m128(1) / ScreenScale;
     122     }
     123 
     124     void SetAntiAliasQuality(int level)
     125     {
     126         _aaLevel = min(max(0, level), 4);
     127     }
     128 
     129     void SetCamera(ICamera* camera)
     130     {
     131         _camera = camera;
     132     }
     133 
     134     ICamera* GetCamera()
     135     {
     136         return _camera;
     137     }
     138 
     139     void VertConvert(Float4* dest, VertexOutput* vert, int vertChannels)
     140     {
     141         __m128 pos = _mm_load_ps(vert->pos);
     142         __m128 w = _mm_shuffle_ps(pos, pos, _MM_SHUFFLE(3, 3, 3, 3));
     143 
     144         __m128 rhw = _mm_div_ss(_mm_set_ss(1), w);
     145         rhw = _mm_shuffle_ps(rhw, rhw, 0);
     146 
     147         _mm_store_ps(dest[0], _mm_mul_ps(_mm_add_ps(_mm_mul_ps(pos, rhw), ScreenOffset), ScreenScale));
     148 
     149         __m128* attr = (__m128*)&vert->normal;
     150         for(int k = 0; k < vertChannels; k++)
     151             _mm_store_ps(dest[k + 1], _mm_mul_ps(attr[k], rhw));
     152 
     153         _mm_store_ss(&dest[0].w, rhw);
     154     }
     155 
     156     virtual void BeginScene()
     157     {
     158         _accelStruct.BeginBuild();
     159     }
     160 
     161     virtual void AddPolyons(VertexOutput* verts, int vertSize,
     162         int vertCount, DWORD* triangles, int count, Shader* shader)
     163     {
     164 
     165         _accelStruct.AddPolygons((BYTE*)verts, triangles, vertSize, vertCount, count, shader);
     166     }
     167 
     168     virtual void EndScene()
     169     {
     170         _accelStruct.Build();
     171     }
     172 
     173     virtual void SetBackground(DWORD color)
     174     {
     175         _bkColor = color;
     176         _bkColorF = Float4((float)(color & 0xff),
     177             (float)((color >> 8) & 0xff),
     178             (float)((color >> 16) & 0xff), 1);
     179 
     180         _bkColorF /= 255.f;
     181 
     182         _bkColorF = _bkColorF * _bkColorF;
     183     }
     184 
     185     void RasterTile(PrimitiveTile* tile, int x, int y,
     186         DWORD* target, int pitch, struct FGSampleTable* FGSamples = 0);
     187 
     188     void RasterFGSample(PrimitiveTile* tile, int x, int y, struct FGSampleMap& dest);
     189     
     190     void RasterFragmentSample(PrimitiveTile* tile, int x, int y, struct FragmentSampleMap& dest);
     191     
     192     void FGShader(struct FGSampleRef* samples, int count);
     193 
     194     void DrawPrimitive(TriVertex** vert, TrianglePrim& tri);
     195 
     196     void ClippingAndDraw(TriVertex** verts, TrianglePrim& tri);
     197 
     198     void DrawTriangle(TrianglePrim& tri);
     199 
     200     void Render();
     201 
     202     static void* operator new (size_t size)
     203     {
     204         return _aligned_malloc(sizeof(RenderContext), 16);
     205     }
     206 };
     207 
     // Creates fragments of one transparent primitive for four horizontally
     // adjacent pixels starting at (x, y), and inserts each into that pixel's list.
     //
     //   pixels : four list heads, one per pixel (updated in place)
     //   rhw    : four values; pixel i is processed only when rhw[i] > 0
     //            (presumably the coverage/depth result of the raster pass -- confirm)
     //   alloc  : allocator the new PixelContext objects are taken from
     //
     // The interpolant a = prim->a * dx + prim->b * dy + prim->c carries the depth
     // term in lane 0 and the two uv numerators in lanes 1 and 2; dividing by lane 0
     // gives the uv actually stored.
     208 void Create4TransPixels(PixelContext** pixels, TriPrimitive* prim, const Float4& eye,
     209                         float* rhw, float x, float y, Allocator& alloc)
     210 {
     211     __m128 ma = _mm_loadu_ps(prim->a);
     212     __m128 mb = _mm_loadu_ps(prim->b);
     213     __m128 a0 =  ma * (m128(x - prim->p0.x)) + mb * (m128(y - prim->p0.y)) + _mm_loadu_ps(prim->c);
     214     for(int i = 0; i < 4; ++i)
     215     {
     216         __m128 a = a0;
     217         if(rhw[i] > 0)
     218         {
                     // Scratch fragment handed to the transparency shader; only pos.w,
                     // prim, triIndex and the uv block are filled in before the call.
     219             PixelContext pixel;
     220             __m128 adx = a + ma;
     221             __m128 ady = a + mb;
     222             __m128 r = _mm_div_ss(m128(1), a);
     223             a = a * Extract(r, 0);
                     // uv here minus uv one pixel over in x / y (reciprocal via the approximate _mm_rcp_ss).
     224             adx = a - adx * Extract(_mm_rcp_ss(adx), 0);
     225             ady = a - ady * Extract(_mm_rcp_ss(ady), 0);
     226             _mm_store_ss(&pixel.pos.w, a0);
     227             pixel.prim = prim->prim;
     228             pixel.triIndex = prim->triIndex;
                     // Four-float stores: the first fills uv + duvdx, the second fills duvdy
                     // and the 8 bytes after it (relies on the PixelContext member order).
     229             _mm_storeu_ps(pixel.uv, _mm_shuffle_ps(a, adx, _MM_SHUFFLE(2, 1, 2, 1)));
     230             _mm_storeu_ps(pixel.duvdy, _mm_shuffle_ps(ady, ady, _MM_SHUFFLE(2, 1, 2, 1)));
     231             float alpha = prim->prim->shader->TransprentShader(&pixel);
                     // Fragments with alpha of 0.01 or less are dropped.
     232             if(alpha > 0.01f)
     233             {
     234                 //insert pixel
     235                 PixelContext* p = (PixelContext*)alloc.Alloc(sizeof(PixelContext), 16);
     236                 p->alpha = alpha;
     237                 p->prim = prim->prim;
     238                 p->triIndex = prim->triIndex;
     239                 p->uv = pixel.uv;
     240                 p->duvdx = pixel.duvdx;
     241                 p->duvdy = pixel.duvdy;
     242                 prim->prim->GetPosNormal(prim->triIndex, pixel.uv, &p->pos, &p->normal);
     243                 p->view = NormalizeFast(eye - p->pos);
     244                 p->light = 0;
     245                 p->next = 0;
     246                 p->pos.w = pixel.pos.w;
     247 
                         // alpha2 = weight left over after the fragments that precede p in the list.
     248                 float alpha2 = 1;
     249                 if(pixels[i] == 0)
     250                     pixels[i] = p;
     251                 else
     252                 {
                             // The list is kept in descending pos.w order: walk until p's
                             // pos.w is larger than the current node's.
     253                     PixelContext* prev = 0;
     254                     PixelContext* pp = pixels[i];
     255                     while(pp)
     256                     {
     257                         if(p->pos.w > pp->pos.w)
     258                             break;
     259                         alpha2 -= pp->alpha;
     260                         prev = pp;
     261                         pp = pp->next;
     262                     }
     263                     p->alpha = alpha * alpha2;
     264                     if(prev)
     265                     {
     266                         p->next = prev->next;
     267                         prev->next = p;
     268                     }
     269                     else
     270                     {
     271                         p->next = pixels[i];
     272                         pixels[i] = p;
     273                     }
     274 
                             // A (nearly) opaque fragment cuts off everything after it;
                             // otherwise the fragments after it are attenuated by (1 - alpha).
     275                     if(alpha > 0.99f)
     276                     {
     277                         p->next = 0;
     278                     }
     279                     else
     280                     {
     281                         alpha = 1 - alpha;
     282                         pp = p->next;
     283                         while(pp)
     284                         {
     285                             pp->alpha *= alpha;
     286                             pp = pp->next;
     287                         }
     288                     }
     289                 }
     290             }
     291         }
                 // Step the interpolant one pixel in x.
     292         a0 = a0 + ma;
     293     }
     294 }
     295 
     // Builds the initial fragment for every pixel of a tile.
     //
     //   pixels  : output, one entry per tile pixel; set to 0 where primBuf[i] is null
     //   primBuf : per-pixel primitive chosen by the raster pass (null = nothing)
     //   startX, startY : position of the first pixel
     //   tileSize : number of pixels to process
     //   alpha   : weight stored in every created fragment
     //
     // Pixels are visited row by row; a new row starts every 16 entries. The 16 is
     // hard-coded here rather than taken from TILE_WIDTH.
     296 void CreateMainPixels(PixelContext** pixels, TriPrimitive** primBuf, const Float4& eye,
     297                       float startX, float startY, int tileSize, float alpha, Allocator& alloc)
     298 {
     299     __m128 px = m128(startX);
     300     __m128 py = m128(startY);
     301     for(int i = 0; i < tileSize; ++i)
     302     {
     303         if(i % 16 == 0 && i > 0)
     304         {
     305             py = py + m128(1);
     306             px = m128(startX);
     307         }
     308         TriPrimitive* prim = primBuf[i];
     309         if(prim)
     310         {
     311             PixelContext* p = (PixelContext*)alloc.Alloc(sizeof(PixelContext), 16);
     312             __m128 ma = _mm_loadu_ps(prim->a);
     313             __m128 mb = _mm_loadu_ps(prim->b);
     314             __m128 a =  ma * (px - m128(prim->p0.x)) +
     315                         mb * (py - m128(prim->p0.y)) + _mm_loadu_ps(prim->c);
                     // Keep the undivided interpolant; its lane 0 is written to pos.w at the end.
     316             __m128 rhw = a;
     317 
     318             __m128 r = _mm_div_ss(m128(1), a);
     319             __m128 w = _mm_shuffle_ps(r, r, 0);
     320             __m128 adx = a + ma;
     321             __m128 ady = a + mb;
     322             a = a * w;
     323             r = _mm_rcp_ss(adx);
     324             adx = a - adx * _mm_shuffle_ps(r, r, 0);
     325             r = _mm_rcp_ss(ady);
     326             ady = a - ady * _mm_shuffle_ps(r, r, 0);
                     // Four-float stores: uv + duvdx, then duvdy plus the 8 bytes after it
                     // (p->light is reassigned below).
     327             _mm_storeu_ps(p->uv, _mm_shuffle_ps(a, adx,  _MM_SHUFFLE(2, 1, 2, 1)));
     328             _mm_storeu_ps(p->duvdy, _mm_shuffle_ps(ady, ady,  _MM_SHUFFLE(2, 1, 2, 1)));
     329             p->prim = prim->prim;
     330             p->triIndex = prim->triIndex;
     331             p->prim->GetPosNormal(p->triIndex, p->uv, &p->pos, &p->normal);
     332             p->view = NormalizeFast(eye - p->pos);
     333             p->light = 0;
     334             p->alpha = alpha;
     335             p->next = 0;
     336             pixels[i] = p;
     337             _mm_store_ss(&p->pos.w, rhw);
     338         }
     339         else
     340             pixels[i] = 0;
     341         px = px + m128(1);
     342     }
     343 }
     344 
     345 void InsertPixel(PixelContext** pixel, PixelContext* p)
     346 {
     347     if(*pixel == 0)
     348         *pixel = p;
     349     else
     350     {
     351         PixelContext* prev = 0;
     352         PixelContext* pp = *pixel;
     353         while(pp)
     354         {
     355             if(p->pos.w > pp->pos.w)
     356                 break;
     357             prev = pp;
     358             pp = pp->next;
     359         }
     360         if(prev)
     361         {
     362             p->next = prev->next;
     363             prev->next = p;
     364         }
     365         else
     366         {
     367             p->next = *pixel;
     368             *pixel = p;
     369         }
     370     }
     371 }
     372 
     // Adds the fragments of one additional sample pass to the per-pixel lists.
     //
     // For every tile pixel with a non-null primBuf entry: if the pixel's list
     // already has a fragment from the same PolyPrimitive, only that fragment's
     // alpha is increased by `alpha`; otherwise a new fragment is built (same
     // interpolation as CreateMainPixels) and inserted with InsertPixel.
     // A new row starts every 16 entries (hard-coded, not TILE_WIDTH).
     373 void CreatePixels(PixelContext** pixels, TriPrimitive** primBuf, const Float4& eye,
     374                   float alpha, float startX, float startY, int tileSize, Allocator& alloc)
     375 {
     376     __m128 px = m128(startX);
     377     __m128 py = m128(startY);
     378     for(int i = 0; i < tileSize; ++i)
     379     {
     380         if(i % 16 == 0 && i > 0)
     381         {
     382             py = py + m128(1);
     383             px = m128(startX);
     384         }
     385         TriPrimitive* prim = primBuf[i];
     386         if(prim)
     387         {
                     // Merge with an existing fragment of the same primitive if there is one.
     388             PixelContext* pixel = pixels[i];
     389             while(pixel)
     390             {
     391                 if(pixel->prim == prim->prim)
     392                 {
     393                     pixel->alpha += alpha;
     394                     goto _SkipPixel;
     395                 }
     396                 pixel = pixel->next;
     397             }
     398 
     399             PixelContext* p = (PixelContext*)alloc.Alloc(sizeof(PixelContext), 16);
     400             __m128 ma = _mm_loadu_ps(prim->a);
     401             __m128 mb = _mm_loadu_ps(prim->b);
     402 
     403             __m128 a =  ma * (px - m128(prim->p0.x)) +
     404                         mb * (py - m128(prim->p0.y)) + _mm_loadu_ps(prim->c);
     405             __m128 rhw = a;
     406 
     407             __m128 r = _mm_div_ss(m128(1), a);
     408             __m128 w = Extract(r, 0);
     409             __m128 adx = a + ma;
     410             __m128 ady = a + mb;
     411             a = a * w;
     412             r = _mm_rcp_ss(adx);
     413             adx = a - adx * Extract(r, 0);
     414             r = _mm_rcp_ss(ady);
     415             ady = a - ady * Extract(r, 0);
                     // Four-float stores: uv + duvdx, then duvdy plus the 8 bytes after it.
     416             _mm_storeu_ps(p->uv, _mm_shuffle_ps(a, adx,  _MM_SHUFFLE(2, 1, 2, 1)));
     417             _mm_storeu_ps(p->duvdy, _mm_shuffle_ps(ady, ady,  _MM_SHUFFLE(2, 1, 2, 1)));
     418             p->prim = prim->prim;
     419             p->triIndex = prim->triIndex;
     420             p->prim->GetPosNormal(p->triIndex, p->uv, &p->pos, &p->normal);
     421             p->view = NormalizeFast(eye - p->pos);
     422             p->light = 0;
     423             p->alpha = alpha;
     424             p->next = 0;
                     // pos.w must be final before InsertPixel, which orders the list by it.
     425             _mm_store_ss(&p->pos.w, rhw);
     426             InsertPixel(&pixels[i], p);
     427         }
     428     _SkipPixel:
     429         px = px + m128(1);
     430     }
     431 }
     432 
     // Depth-tests a primitive against every pixel of a tile without any edge
     // tests (the name suggests the caller has established full coverage).
     //
     //   primBuf / wBuf : TILE_WIDTH * TILE_HEIGHT entries each, 16-byte aligned
     //                    (aligned loads/stores are used)
     //
     // A pixel is overwritten when the primitive's interpolated value is larger
     // than the one already in wBuf.
     433 void RasterFullCoverPrim(TriPrimitive* prim, float startX,
     434                          float startY, float* primBuf, float* wBuf)
     435 {
     436     __m128 startW = m128((startX - prim->p0.x) * prim->a[0]
     437                     + (startY - prim->p0.y) * prim->b[0] + prim->c[0]);
     438     __m128 rhwDx = m128(prim->a[0] * 4);
             // The first 4 bytes of the pointer value are reinterpreted as a float so the
             // pointer can be blended with Select(). NOTE(review): this carries the whole
             // pointer only when pointers are 4 bytes wide.
     439     __m128 primData = m128(*(float*)&prim);
             // Lanes hold the values for x, x+1, x+2, x+3.
     440     startW = startW + m128(prim->a[0]) * _mm_set_ps(3, 2, 1, 0);
     441 
     442     for(int i = 0; i < TILE_HEIGHT; ++i)
     443     {
     444         __m128 rhw = startW;
     445         for(int j = 0; j < TILE_WIDTH; j += 4)
     446         {
     447             __m128 oldW = _mm_load_ps(wBuf + j);
     448             __m128 mask = rhw > oldW;
     449             _mm_store_ps(wBuf + j, Select(mask, rhw, oldW));
     450             _mm_store_ps(primBuf + j, Select(mask, primData, _mm_load_ps(primBuf + j)));
     451             rhw = rhw + rhwDx;
     452         }
     453         wBuf += TILE_WIDTH;
     454         primBuf += TILE_WIDTH;
     455         startW = startW + m128(prim->b[0]);
     456     }
     457 }
     458 
     // Rasterizes one primitive into a 16x16 tile using a two-level test:
     // first the sixteen 4x4 blocks, then the pixels of each block that passed.
     //
     //   x, y   : tile position
     //   xs, ys : offset added to the pixel positions in the per-pixel test and the
     //            depth interpolation (presumably the sub-pixel sample offset -- confirm);
     //            it is not applied in the block-level test
     //   primBuf / wBuf : 256-entry tile buffers, accessed as __m128 (16-byte aligned)
     //
     // Strides are hard-coded: 4 __m128 per tile row, 16 __m128 per block row.
     459 void RasterPrim(TriPrimitive* prim, float x, float y,
     460                 float xs, float ys, TriPrimitive** primBuf, float* wBuf)
     461 {
     462     __m128 ex[3];
     463     __m128 ey[3];
     464     __m128 xOff[3];
     465     __m128 yOff[3];
     466     __m128 mask0[3];
             // Pointer bits reinterpreted as a float so they can go through Select().
             // NOTE(review): only the first 4 bytes of the pointer are used.
     467     __m128 primData = m128(*(float*)&prim);
     468 
     469     for(int i = 0; i < 3; ++i)
     470     {
     471         ex[i] = m128(prim->ea[i]);
     472         ey[i] = m128(prim->eb[i]);
                 // 4.0 where the edge coefficient is positive, else 0.0: selects the block
                 // corner at which this edge function is largest.
     473         xOff[i] = (ex[i] > m128(0)) & m128(4);
     474         yOff[i] = (ey[i] > m128(0)) & m128(4);
     475     }
     476     __m128 p0x = m128(prim->p0.x);
     477     __m128 p0y = m128(prim->p0.y);
     478     __m128 p1x = p0x - ey[0];
     479     __m128 p1y = p0y + ex[0];
     480 
             // Edge function values at the tile position (x, y).
     481     mask0[0] = (m128(x) - p0x) * ex[0] + (m128(y) - p0y) * ey[0];
     482     mask0[1] = (m128(x) - p1x) * ex[1] + (m128(y) - p1y) * ey[1];
     483     mask0[2] = (m128(x) - p0x) * ex[2] + (m128(y) - p0y) * ey[2];
     484 
             // Interpolated depth term for the first four pixels of the tile.
     485     __m128 rhw0 = (_mm_set_ps(3, 2, 1, 0) + m128(x + xs) - p0x) * m128(prim->a[0]) +
     486                   (m128(y + ys) - p0y) * m128(prim->b[0]) + m128(prim->c[0]);
     487     __m128* mprimBuf = (__m128*)primBuf;
     488     __m128* mwBuf = (__m128*)wBuf;
     489 
     490     __m128 yStep = m128(0);
     491     for(int iy = 0; iy < 4; ++iy)
     492     {
                 // Block-level test: the four lanes are the four 4x4 blocks of this block
                 // row (x offsets 0, 4, 8, 12). A block passes when all three edge
                 // functions are >= 0 at its selected corner.
     493         __m128 mask;
     494         __m128 xStep = _mm_set_ps(12, 8, 4, 0);
     495         mask =  ((mask0[0] + (xStep + xOff[0]) * ex[0] + (yStep + yOff[0]) * ey[0]) >= m128(0)) &
     496                 ((mask0[1] + (xStep + xOff[1]) * ex[1] + (yStep + yOff[1]) * ey[1]) >= m128(0)) &
     497                 ((mask0[2] + (xStep + xOff[2]) * ex[2] + (yStep + yOff[2]) * ey[2]) >= m128(0));
     498 
     499         int* imask = (int*)&mask;
     500         if(MoveMask(mask))
     501         {
     502             __m128 rhw1 = rhw0;
     503             for(int ix = 0; ix < 4; ++ix)
     504             {
     505                 if(imask[ix])
     506                 {
                             // Pixel-level test for block ix: four pixels across per lane
                             // set, four rows via the j loop.
     507                     __m128 mask1[3];
     508                     __m128 xpos = _mm_set_ps(3, 2, 1, 0) + m128((float)(ix * 4) + xs);
     509                     __m128 ypos = yStep + m128(ys);
     510                     mask1[0] = mask0[0] + xpos * ex[0] + ypos * ey[0];
     511                     mask1[1] = mask0[1] + xpos * ex[1] + ypos * ey[1];
     512                     mask1[2] = mask0[2] + xpos * ex[2] + ypos * ey[2];
     513 
     514                     __m128* mprimBuf0 = mprimBuf + ix;
     515                     __m128* mwBuf0 = mwBuf + ix;
     516                     __m128 rhw = rhw1;
     517                     for(int j = 0; j < 4; ++j)
     518                     {
                                 // Write where the pixel is inside all three edges and its
                                 // depth term is larger than the stored one.
     519                         __m128 pmask =  (rhw > *mwBuf0) &
     520                                         (mask1[0] >= m128(0)) &
     521                                         (mask1[1] >= m128(0)) &
     522                                         (mask1[2] >= m128(0));
     523 
     524                         *mwBuf0 = Select(pmask, rhw, *mwBuf0);
     525                         *mprimBuf0 = Select(pmask, primData, *mprimBuf0);
     526                         mask1[0] = mask1[0] + ey[0];
     527                         mask1[1] = mask1[1] + ey[1];
     528                         mask1[2] = mask1[2] + ey[2];
                                 // Advance one tile row (4 __m128 = 16 floats).
     529                         mprimBuf0 += 4;
     530                         mwBuf0 += 4;
     531                         rhw = rhw + m128(prim->b[0]);
     532                     }
     533                 }
     534                 rhw1 = rhw1 + m128(prim->a[0]) * m128(4);
     535             }
     536         }
                 // Advance to the next row of blocks (4 pixel rows = 16 __m128).
     537         rhw0 = rhw0 + m128(4) * m128(prim->b[0]);
     538         mprimBuf += 16;
     539         mwBuf += 16;
     540         yStep = yStep + m128(4);
     541     }
     542 }
     543 
     // Fills rays[0..count) with reflection rays for one fragment.
     //
     // count == 1: a single ray from the fragment's stored position, normal and view.
     // count  > 1: one ray per entry of the offset table; position and normal are
     //             shifted by the surface differentials for duvdx * offset.x and
     //             duvdy * offset.y, and the view vector is recomputed from `eye`.
     //
     // Every ray starts 0.02 units along the reflection direction from the surface
     // point, has tmin = 0, tmax = 1e10, triIndex = -1 and carries refInfo in userData.
     //
     // NOTE(review): the offset table has four entries, so count > 4 would index past it.
     544 void CreateReflectRay(Ray* rays, int count, PixelContext* pixel, ReflectInfo* refInfo, const Float4& eye)
     545 {
     546     if(count == 1)
     547     {
     548         Float4 pos = pixel->pos;
     549         Float4 normal = pixel->normal;
     550         Ray& ray = rays[0];
     551 
     552         Float4 refVec = -Normalize(Reflect(pixel->view, normal));
     553         pos = pos + refVec * 0.02f;
     554         _mm_store_ps(ray.pos, pos.m);
     555         _mm_store_ps(ray.dir, refVec.m);
     556         ray.triIndex = -1;
     557         ray.tmin = 0;
     558         ray.tmax = 1e10;
     559         ray.userData = refInfo;
     560         return;
     561     }
     562 
             // Per-ray offsets applied to the uv differentials.
     563     static const Float2 offset[] = {
     564         Float2(0, 0),
     565         Float2(-0.4f, -0.4f),
     566         Float2(0.4f, -0.4f),
     567         Float2(0, 0.4f)
     568     };
     569     for(int i = 0; i < count; ++i)
     570     {
     571         Float4 pos = pixel->pos;
     572         Float4 normal = pixel->normal;
     573         Ray& ray = rays[i];
     574 
     575         Float4 dpos, dnormal;
     576         pixel->prim->GetPosNormalDifferential(pixel->triIndex, pixel->duvdx * offset[i].x, &dpos, &dnormal);
     577         pos = pos + dpos;
     578         normal = normal + dnormal;
     579 
     580         pixel->prim->GetPosNormalDifferential(pixel->triIndex, pixel->duvdy * offset[i].y, &dpos, &dnormal);
     581         pos = pos + dpos;
     582         normal = NormalizeFast(normal + dnormal);
     583 
     584         Float4 Vn;
     585         //Vn = pixel->view;
     586         Vn = NormalizeFast(eye - pos);
     587 
     588         Float4 refVec = -Normalize(Reflect(Vn, normal));
     589         pos = pos + refVec * 0.02f;
     590         _mm_store_ps(ray.pos, pos.m);
     591         _mm_store_ps(ray.dir, refVec.m);
     592         ray.triIndex = -1;
     593         ray.tmin = 0;
     594         ray.tmax = 1e10;
     595         ray.userData = refInfo;
     596     }
     597 }
     598 
     // Creates a fragment for the surface point a reflection ray hit and pushes it
     // onto the front of pixels[refinfo->index].
     //
     // The ray's userData is expected to be the ReflectInfo set by CreateReflectRay;
     // refinfo->context is the fragment that spawned the ray and refinfo->strength
     // becomes the new fragment's alpha. The new fragment's userData is not set here.
     599 void CreateReflectPixel(PixelContext** pixels, Ray& ray, Allocator* alloc)
     600 {
     601     ReflectInfo* refinfo = (ReflectInfo*)ray.userData;
             // Disabled: merging with an existing fragment of the same primitive.
     602     /*PixelContext* pixel = pixels[refinfo->index];
     603     while(pixel)
     604     {
     605         if(pixel->prim == ray.prim)
     606         {
     607             pixel->alpha += refinfo->strength;
     608             return;
     609         }
     610         pixel = pixel->next;
     611     }*/
     612     
     613     PixelContext* p = (PixelContext*)alloc->Alloc(sizeof(PixelContext), 16);
     614     p->prim = ray.prim;
     615     p->triIndex = ray.triIndex;
     616     p->uv.x = ray.u;
     617     p->uv.y = ray.v;
     618 
             // Position/normal differentials of the reflecting surface for one pixel step in x and y.
     619     Float4 posddx, normalddx;
     620     Float4 posddy, normalddy;
     621     refinfo->context->prim->GetPosNormalDifferential(refinfo->context->triIndex,
     622                                     refinfo->context->duvdx, &posddx, &normalddx);
     623     refinfo->context->prim->GetPosNormalDifferential(refinfo->context->triIndex,
     624                                     refinfo->context->duvdy, &posddy, &normalddy);
     625 
     626     Float2 uvdx, uvdy;
     627     p->prim->GetRayDifferential(ray.triIndex, *(Float4*)&ray.pos, *(Float4*)&ray.dir,
     628                                 posddx, posddy, normalddx, normalddy, &uvdx, &uvdy);
     629 
             // GetRayDifferential's outputs are turned into differences relative to the hit uv.
     630     p->duvdx = uvdx - p->uv;
     631     p->duvdy = uvdy - p->uv;
     632     p->alpha = refinfo->strength;
     633     ray.prim->GetPosition(ray.triIndex, p->uv, &p->pos);
     634     ray.prim->GetNormal(ray.triIndex, p->uv, &p->normal);
     635     p->view = NormalizeFast(Float4(ray.pos, 1) - p->pos);
     636     p->light = 0;
             // Push front; unlike InsertPixel the list is not re-sorted by pos.w.
     637     p->next = pixels[refinfo->index];
     638     pixels[refinfo->index] = p;
     639 }
     640 
     // Copies `size` floats from wBuf to wBuf2 and from primBuf to primBuf2, four
     // at a time. All four buffers must be 16-byte aligned (aligned SSE loads and
     // stores are used); the copy proceeds in whole groups of four.
     void CopyBuf(float* wBuf, float* wBuf2, float* primBuf, float* primBuf2, int size)
     {
         int offset = 0;
         while(offset < size)
         {
             const __m128 depth = _mm_load_ps(wBuf + offset);
             _mm_store_ps(wBuf2 + offset, depth);

             const __m128 prims = _mm_load_ps(primBuf + offset);
             _mm_store_ps(primBuf2 + offset, prims);

             offset += 4;
         }
     }
     649 
     650 float CopyBuf2(float* wBuf, float* wBuf2, float* primBuf, float* primBuf2, int size)
     651 {
     652     __m128 minRHW = m128(FLT_MAX);
     653     for(int i = 0; i < size; i += 4)
     654     {
     655         __m128 rhw = _mm_load_ps(wBuf + i);
     656         minRHW = Min(minRHW, rhw);
     657         _mm_store_ps((float*)primBuf2 + i, _mm_load_ps((float*)primBuf + i));
     658         _mm_store_ps(wBuf2 + i, rhw);
     659     }
     660 
     661     minRHW = Min(Min(Extract(minRHW, 0), Extract(minRHW, 1)),
     662         Min(Extract(minRHW, 2), Extract(minRHW, 3)));
     663     float m;
     664     _mm_store_ss(&m, minRHW);
     665     return m;
     666 }
     667 
     // Anti-aliasing tables with one entry per quality level 0..4
     // (RenderContext::SetAntiAliasQuality clamps the level to that range).
     //   AACount[level]    : number of entries listed in AASampler[level] for levels 1..4;
     //                       0 for level 0, which lists the single offset (0, 0)
     //   AACfgAlpha[level] : 1 / (AACount[level] + 1)
     //   AASampler[level]  : sample offsets, all within [0, 1); unused trailing
     //                       entries of a row are default-constructed
     668 int AACount[] = {0, 2, 4, 8, 16};
     669 float AACfgAlpha[] = {1, 1 / 3.f, 1 / 5.f, 1 / 9.f, 1 / 17.f};
     670 Float2 AASampler[5][16] = {
     671     { Float2(0, 0) },
     672 
     673     { Float2(0.25f, 0.25f), Float2(0.75f, 0.75f) },
     674 
     675     { Float2(0.25f, 0.25f), Float2(0.75f, 0.75f), Float2(0.25f, 0.75f), Float2(0.75f, 0.25f) },
     676 
     677     { Float2(0.2f, 0.5f), Float2(0.8f, 0.5f), Float2(0.5f, 0.2f), Float2(0.5f, 0.8f),
     678       Float2(0.25f, 0.25f), Float2(0.75f, 0.25f), Float2(0.25f, 0.75f), Float2(0.75f, 0.75f)},
     679 
     680     { Float2(0.2f, 0.2f), Float2(0.4f, 0.2f), Float2(0.6f, 0.2f), Float2(0.8f, 0.2f),
     681       Float2(0.2f, 0.4f), Float2(0.4f, 0.4f), Float2(0.6f, 0.4f), Float2(0.8f, 0.4f),
     682       Float2(0.2f, 0.6f), Float2(0.4f, 0.6f), Float2(0.6f, 0.6f), Float2(0.8f, 0.6f),
     683       Float2(0.2f, 0.8f), Float2(0.4f, 0.8f), Float2(0.6f, 0.8f), Float2(0.8f, 0.8f) }
     684 };
     685 
     // Culls, clips and draws one triangle.
     //
     //   verts : pointers to the triangle's vertices; ClipPoly is given this array
     //           and a scratch buffer, and returns the resulting vertex count
     //           (presumably it rewrites `verts` to point at the clipped polygon -- confirm)
     //   tri   : the source primitive, passed through to DrawPrimitive
     //
     // Steps: back-face test, clip against the six FrustumClipPlane rows (stop as
     // soon as fewer than 3 vertices remain), convert each vertex to screen space,
     // then draw the polygon as a triangle fan around vertex 0.
     686 void RenderContext::ClippingAndDraw(TriVertex** verts, TrianglePrim& tri)
     687 {
     688     if(!BackCullTest((VertexOutput**)verts))
     689         return;
     690 
     691     Float4 vertTmpBuf[256];
     692     Float4* vertBuf = vertTmpBuf;
     693 
     694     int vertCount = 3;
     695     for(int i = 0; i < 6; i++)
     696     {
     697         vertCount = ClipPoly(*(const Float4*)FrustumClipPlane[i],
     698                             (VertexOutput**)verts, vertCount, 2, vertBuf);
     699         if(vertCount < 3)
     700             return;
     701     }
     702 
             // Two Float4 per converted vertex: screen position + one attribute channel.
     703     Float4 vertsTmp[256];
     704     for(int i = 0; i < vertCount; i++)
     705         VertConvert(vertsTmp + i * 2, (VertexOutput*)verts[i], 1);
     706 
     707     TriVertex* triangle[3];
     708     for(int i = 0; i < vertCount - 2; i++)
     709     {
     710         triangle[0] = (TriVertex*)vertsTmp;
     711         triangle[1] = (TriVertex*)(vertsTmp + (i + 1) * 2);
     712         triangle[2] = (TriVertex*)(vertsTmp + (i + 2) * 2);
     713         DrawPrimitive(triangle, tri);
     714     }
     715 }
     716 
     717 void RenderContext::DrawTriangle(TrianglePrim& tri)
     718 {
             // Transforms the three corners of `tri` by ViewProjMatrix and passes the
             // result to ClippingAndDraw.
     719     TriVertex verts[3];
     720     verts[0].pos = Mul(tri.p0, ViewProjMatrix);
     721     verts[1].pos = Mul(tri.p1, ViewProjMatrix);
     722     verts[2].pos = Mul(tri.p2, ViewProjMatrix);
             // Fixed per-corner uv values (0,0), (1,0), (0,1); presumably these act as
             // barycentric-style coordinates once interpolated - TODO confirm.
     723     verts[0].uv = Float4(0, 0, 0, 0);
     724     verts[1].uv = Float4(1, 0, 0, 0);
     725     verts[2].uv = Float4(0, 1, 0, 0);
     726 
             // Only the first three slots are filled here; the array is larger than 3
             // presumably so that clipping can append vertices - TODO confirm.
     727     TriVertex* verts2[36];
     728     verts2[0] = verts;
     729     verts2[1] = verts + 1;
     730     verts2[2] = verts + 2;
     731     ClippingAndDraw(verts2, tri);
     732 }
     733 
     734 void RenderContext::RasterTile(PrimitiveTile* tile, int x, int y, DWORD* target,
     735                                 int pitch, FGSampleTable* FGSamples)
     736 {
             // Rasterizes, shades and writes out one screen tile.
             //   tile      - binned primitives for this tile; consumed through the
             //               NextFullPrimitive / NextOpaquePrimitive / NextTransPrimitive iterators.
             //   x, y      - integer pixel coordinate of the tile origin; pixel centres are at +0.5.
             //   target    - address of the tile's first output pixel.
             //   pitch     - byte distance between consecutive output rows.
             //   FGSamples - optional; when non-null, lights whose Interpolate() returns true are
             //               skipped and colours from FGSamples->Lookup are added as extra
             //               Illuminance entries at depth 0.
             // NOTE(review): indexing below hard-codes 16 (i / 16, i % 16, 16 rows of four
             // 4-pixel groups) while the buffers are sized TILE_WIDTH * TILE_HEIGHT; presumably
             // both tile dimensions are 16 - confirm.
     737     const int tileSize = TILE_WIDTH * TILE_HEIGHT;
             // primBuf/wBuf receive the full-cover pass; primBuf2/wBuf2 are the working pair
             // that every RasterPrim call below writes into.
     738     _CRT_ALIGN(16) TriPrimitive* primBuf[tileSize];
     739     _CRT_ALIGN(16) float wBuf[tileSize];
     740     _CRT_ALIGN(16) PixelContext* pixels[tileSize];
     741     //_CRT_ALIGN(16) PixelContext* mainPixels[tileSize];
     742     //_CRT_ALIGN(16) PixelContext* transPixels[tileSize];
     743     _CRT_ALIGN(16) TriPrimitive* primBuf2[tileSize];
     744     _CRT_ALIGN(16) float wBuf2[tileSize];
     745     _CRT_ALIGN(16) Float4 colorBuf[tileSize];
     746 
             // Two 1 MB arenas taken from the stack with _alloca and aligned to 16 bytes.
             // `alloc` starts on allocA and is flipped once per reflection depth further down.
     747     Allocator allocA(Align((BYTE*)_alloca(1024 * 1024), 16), 1024 * 1024 - 15);
     748     Allocator allocB(Align((BYTE*)_alloca(1024 * 1024), 16), 1024 * 1024 - 15);
     749     Allocator* alloc = &allocA;
     750 
     751     tile->MergePrimitives();
     752     
             // Empty tile: fill the output with the background colour and stop.
     753     if(!tile->HasPrimitive())
     754     {
     755         for(int i = 0; i < tileSize; ++i)
     756             *((DWORD*)((BYTE*)target + pitch * (i / 16)) + (i % 16)) = _bkColor;
     757         return;
     758     }
     759 
     760     float startX = (float)x + 0.5f;
     761     float startY = (float)y + 0.5f;
     762 
             // Zero wBuf and primBuf four floats at a time.
             // NOTE(review): primBuf is an array of pointers addressed here as floats, so the
             // loop clears tileSize * 4 bytes; that covers the whole array only when pointers
             // are 4 bytes wide. The same applies to primBuf2 below.
     763     for(int i = 0; i < tileSize; i += 4)
     764     {
     765         _mm_store_ps(wBuf + i, m128(0));
     766         _mm_store_ps((float*)primBuf + i, m128(0));
     767         //_mm_store_ps((float*)pixels + i, m128(0));
     768         //_mm_store_ps((float*)transPixels + i, m128(0));
     769     }
             // Pass 1: primitives flagged as covering the whole tile go into primBuf/wBuf.
     770     float farRhw = 0;
     771     bool hasFullPrim = false;
     772     while(true)
     773     {
     774         TriPrimitive* prim = tile->NextFullPrimitive();
     775         if(!prim)
     776             break;
     777         hasFullPrim = true;
     778         RasterFullCoverPrim(prim, startX, startY, (float*)primBuf, wBuf);
     779     }
             // With full-cover primitives present, CopyBuf2 relates the two buffer pairs and its
             // return value becomes the cull threshold farRhw (CopyBuf2 is defined outside this
             // block). Otherwise the working pair is simply zeroed and farRhw stays 0.
     780     if(hasFullPrim)
     781         farRhw = CopyBuf2(wBuf, wBuf2, (float*)primBuf, (float*)primBuf2, tileSize);
     782     else
     783     {
     784         for(int i = 0; i < tileSize; i += 4)
     785         {
     786             _mm_store_ps(wBuf2 + i, m128(0));
     787             _mm_store_ps((float*)primBuf2 + i, m128(0));
     788         }
     789     }
     790 
     791     int aaCount = AACount[_aaLevel];
     792     float alpha = AACfgAlpha[_aaLevel];
     793     Float2* sampler = AASampler[_aaLevel];
     794 
             // Pass 2: opaque primitives sampled at the pixel centre (offset 0.5, 0.5).
             // Primitives whose maxRhw is below farRhw are skipped.
     795     while(true)
     796     {
     797         TriPrimitive* prim = tile->NextOpaquePrimitive();
     798         if(!prim)
     799             break;
     800         if(prim->maxRhw < farRhw)
     801             continue;
     802 
     803         RasterPrim(prim, x, y, 0.5f, 0.5f, primBuf2, wBuf2);
     804     }
     805     tile->Reset();
     806     CreateMainPixels(pixels, primBuf2, _eye, startX, startY, tileSize, alpha, *alloc);
     807     farRhw = CopyBuf2(wBuf, wBuf2, (float*)primBuf, (float*)primBuf2, tileSize);
             // Loosen the threshold by 1% before it is reused for the AA and translucent passes.
     808     farRhw *= 0.99f;
     809 
             // Pass 3: one extra opaque pass per anti-aliasing sample, each at its own
             // sub-pixel offset (xs, ys); skipped entirely when aaCount is 0.
     810     for(int aa = 0; aa < aaCount; ++aa)
     811     {
     812         float xs = sampler[aa].x;
     813         float ys = sampler[aa].y;
     814         while(true)
     815         {
     816             TriPrimitive* prim = tile->NextOpaquePrimitive();
     817             if(!prim)
     818                 break;
     819             if(prim->maxRhw < farRhw)
     820                 continue;
     821 
     822             RasterPrim(prim, x, y, xs, ys, primBuf2, wBuf2);
     823         }
     824         tile->Reset();
     825         CreatePixels(pixels, primBuf2, _eye, alpha, startX - 0.5f + xs, startY - 0.5f + ys, tileSize, *alloc);
     826         CopyBuf(wBuf, wBuf2, (float*)primBuf, (float*)primBuf2, tileSize);
     827     }
     828 
             // Pass 4: translucent primitives. The do { } while(0) wrapper runs once; the inner
             // while loop visits every translucent primitive. `fullScreen` is written by
             // NextTransPrimitive but not read in this function.
     829     do
     830     {
     831         bool fullScreen;
     832         TriPrimitive* prim = tile->NextTransPrimitive(fullScreen);
     833         if(!prim)
     834             break;
     835         while(prim)
     836         {
     837             if(prim->maxRhw < farRhw)
     838             {
     839                 prim = tile->NextTransPrimitive(fullScreen);
     840                 continue;
     841             }
     842             PixelContext** tpixels = pixels;
     843             __m128 ex[3];
     844             __m128 ey[3];
     845             __m128 mask0[3];
     846             __m128 xOff[3];
                     // Broadcast the three edge coefficients. xOff is 4 where ex > 0 and 0
                     // otherwise, i.e. the end of a 4-pixel span at which that edge's value
                     // is largest (used by the coarse span test below).
     847             for(int i = 0; i < 3; ++i)
     848             {
     849                 ex[i] = m128(prim->ea[i]);
     850                 ey[i] = m128(prim->eb[i]);
     851                 xOff[i] = (ex[i] > m128(0)) & m128(4);
     852             }
     853 
                     // p1 is derived from p0 and the coefficients of edge 0.
     854             __m128 p0x = m128(prim->p0.x);
     855             __m128 p0y = m128(prim->p0.y);
     856             __m128 p1x = p0x - ey[0];
     857             __m128 p1y = p0y + ex[0];
     858 
                     // Edge-function values at (x, y + 0.5) for the first tile row.
     859             mask0[0] = (m128(x) - p0x) * ex[0] + (m128(y + 0.5f) - p0y) * ey[0];
     860             mask0[1] = (m128(x) - p1x) * ex[1] + (m128(y + 0.5f) - p1y) * ey[1];
     861             mask0[2] = (m128(x) - p0x) * ex[2] + (m128(y + 0.5f) - p0y) * ey[2];
     862 
                     // rhw at the centres of the first four pixels of the row, from the
                     // plane coefficients a[0], b[0], c[0]. _mm_set_ps lists lanes from
                     // high to low, so lane 0 is 0.
     863             __m128 rhw0 = (_mm_set_ps(3, 2, 1, 0) + m128(startX) - p0x) * m128(prim->a[0]) +
     864                           (m128(startY) - p0y) * m128(prim->b[0]) + m128(prim->c[0]);
     865             __m128* mwBuf = (__m128*)wBuf2;
     866             for(int iy = 0; iy < 16; ++iy)
     867             {
                         // Coarse test: one lane per 4-pixel span (starting at 0, 4, 8, 12),
                         // evaluated at each edge's most favourable span end.
     868                 __m128 xStep = _mm_set_ps(12, 8, 4, 0);
     869                 __m128 mask =    ((mask0[0] + (xStep + xOff[0]) * ex[0]) >= m128(0)) &
     870                                 ((mask0[1] + (xStep + xOff[1]) * ex[1]) >= m128(0)) &
     871                                 ((mask0[2] + (xStep + xOff[2]) * ex[2]) >= m128(0));
     872                 if(MoveMask(mask))
     873                 {
                             // Fine test: step through the row four pixel centres at a time.
     874                     __m128 mask1[3];
     875                     xStep = _mm_set_ps(3.5f, 2.5f, 1.5f, 0.5f);
     876                     mask1[0] = mask0[0] + xStep * ex[0];
     877                     mask1[1] = mask0[1] + xStep * ex[1];
     878                     mask1[2] = mask0[2] + xStep * ex[2];
     879                     __m128 rhw = rhw0;
     880 
     881                     for(int ix = 0; ix < 4; ++ix)
     882                     {
                                 // A lane passes when it lies inside all three edges and its
                                 // rhw exceeds the value stored in wBuf2.
     883                         __m128 pmask = ((rhw > *mwBuf) &
     884                                         (mask1[0] >= m128(0)) &
     885                                         (mask1[1] >= m128(0)) &
     886                                         (mask1[2] >= m128(0)));
     887                         if(MoveMask(pmask))
     888                         {
                                     // Failing lanes are passed on with rhw forced to 0.
     889                             __m128 rhw1 = rhw & pmask;
     890                             Create4TransPixels(tpixels, prim, _eye, (float*)&rhw1,
     891                                                x + ix * 4 + 0.5f, y + iy + 0.5f, *alloc);
     892                         }
     893                         rhw = rhw + m128(4) * m128(prim->a[0]);
     894                         mask1[0] = mask1[0] + m128(4) * ex[0];
     895                         mask1[1] = mask1[1]    + m128(4) * ex[1];
     896                         mask1[2] = mask1[2]    + m128(4) * ex[2];
     897                         mwBuf++;
     898                         tpixels += 4;
     899                     }
     900                 }
     901                 else
     902                 {
                             // No span can be covered: skip the whole 16-pixel row.
     903                     mwBuf += 4;
     904                     tpixels += 16;
     905                 }
                         // Advance the row-start values by one pixel in y.
     906                 rhw0 = rhw0 + m128(prim->b[0]);
     907                 mask0[0] = mask0[0] + ey[0];
     908                 mask0[1] = mask0[1] + ey[1];
     909                 mask0[2] = mask0[2] + ey[2];
     910             }
     911             prim = tile->NextTransPrimitive(fullScreen);
     912         }
     913     }while(0);
     914 
     915     for(int i = 0; i < tileSize; ++i)
     916         colorBuf[i].m = m128(0);
     917 
             // Reflection rays are batched: each reflecting fragment adds 4 rays and one
             // ReflectInfo, and the batch is traced once 64 rays have been collected.
     918     Ray reflectRays[64];
     919     ReflectInfo refInfos[64];
     920     int refRayIndex = 0;
     921     int refInfoIndex = 0;
     922 
             // Shading loop: depth 0 shades the rasterized fragments; every further depth
             // shades the fragments created from reflection hits of the previous depth.
     923     for(int depth = 0; depth <= ReflectionDepth; ++depth)
     924     {
     925         bool hasReflection = false;
                 // NOTE(review): j is a signed int compared against _lights.size().
     926         for(int j = 0; j < _lights.size(); ++j)
     927         {
     928             int from, to;
     929             if(_lights[j]->Interpolate(&from, &to) && FGSamples)
     930                 continue;
     931             _lights[j]->DirectIlluminate(pixels, tileSize, &_accelStruct, alloc);
     932         }
                 // At depth 0 only: prepend one Illuminance per fragment carrying the colour
                 // looked up in FGSamples, whenever that colour is not black.
     933         if(FGSamples && depth == 0)
     934         {
     935             for(int i = 0; i < tileSize; ++i)
     936             {
     937                 int sx = x + (i % 16);
     938                 int sy = y + i / 16;
     939                 PixelContext* pixel = pixels[i];
     940                 while(pixel)
     941                 {
     942                     Float4 norm;
     943                     pixel->prim->GetFaceNormal(pixel->triIndex, &norm);
     944                     
     945                     Float4 color = FGSamples->Lookup(pixel->prim, *(Float3*)&norm, sx, sy);
     946                     if(color.x + color.y + color.z > 0)
     947                     {
     948                         Illuminance* illum = (Illuminance*)alloc->Alloc(sizeof(Illuminance));
     949                         illum->color.x = color.x;
     950                         illum->color.y = color.y;
     951                         illum->color.z = color.z;
     952                         illum->direction.x = 0;
     953                         illum->direction.y = 0;
     954                         illum->direction.z = 1;
     955                         illum->illuminance = 1;
     956                         illum->light = 0;
     957                         illum->next = pixel->light;
     958                         illum->shadowFactor = 0;
     959                         pixel->light = illum;
     960                     }
     961                     pixel = pixel->next;
     962                 }
     963             }
     964         }
     965 
                 // Switch to the other arena and clear it. The fragments shaded below were
                 // allocated from the arena that is left untouched; fragments created for the
                 // next depth go into the freshly cleared one.
     966         if(alloc == &allocA)
     967             alloc = &allocB;
     968         else
     969             alloc = &allocA;
     970         alloc->Clear();
     971 
     972         for(int i = 0; i < tileSize; ++i)
     973         {
                     // Detach this pixel's fragment list; CreateReflectPixel may refill pixels[].
     974             PixelContext* pixel = pixels[i];
     975             pixels[i] = 0;
     976             Float4 color;
     977             color.m = m128(0);
                     // NOTE(review): this local shadows the function-level `alpha` declared above.
     978             float alpha = 0;
     979 
     980             while(pixel)
     981             {
     982                 ReflectInfo reflect;
     983                 reflect.strength = 0;
     984                 alpha += pixel->alpha;
     985                 color += pixel->prim->shader->PixelShader(pixel, &reflect) * pixel->alpha;
     986                 reflect.strength *= pixel->alpha;
     987 
     988                 if(reflect.strength > 0.01f)
     989                 {
     990                     ReflectInfo& refInfo = refInfos[refInfoIndex++];
     991                     refInfo = reflect;
     992                     refInfo.context = pixel;
     993                     refInfo.index = i;
                             // Strength is quartered; four rays are created per reflection.
     994                     refInfo.strength *= 0.25f;
     995 
                             // NOTE(review): both branches below are identical.
     996                     if(depth == 0)
     997                     {
     998                         CreateReflectRay(reflectRays + refRayIndex, 4, pixel, &refInfo, _eye);
     999                         refRayIndex += 4;
    1000                     }
    1001                     else
    1002                     {
    1003                         CreateReflectRay(reflectRays + refRayIndex, 4, pixel, &refInfo, _eye);
    1004                         refRayIndex += 4;
    1005                     }
    1006                     hasReflection = true;
    1007 
                             // Batch full: trace it, create a fragment for every hit, reset.
    1008                     if(refRayIndex >= 64)
    1009                     {
    1010                         _accelStruct.TraceIntersect(reflectRays, refRayIndex);
    1011                         for(int r = 0; r < refRayIndex; ++r)
    1012                         {
    1013                             Ray& ray = reflectRays[r];
    1014                             if(ray.prim)
    1015                                 CreateReflectPixel(pixels, ray, alloc);
    1016                         }
    1017                         refInfoIndex = 0;
    1018                         refRayIndex = 0;
    1019                     }
    1020                 }
    1021                 pixel = pixel->next;
    1022             }
                     // Flush whatever is left in the batch before moving to the next pixel.
    1023             if(refRayIndex > 0)
    1024             {
    1025                 _accelStruct.TraceIntersect(reflectRays, refRayIndex);
    1026                 for(int r = 0; r < refRayIndex; ++r)
    1027                 {
    1028                     Ray& ray = reflectRays[r];
    1029                     if(ray.prim)
    1030                         CreateReflectPixel(pixels, ray, alloc);
    1031                 }
    1032                 refInfoIndex = 0;
    1033                 refRayIndex = 0;
    1034             }
                     // Depth 0 sets the pixel colour, blending in the background where the
                     // accumulated alpha is below 0.99; deeper levels add on top.
    1035             if(depth == 0)
    1036             {
    1037                 if(alpha < 0.99f)
    1038                     color = color + _bkColorF * (1 - alpha);
    1039                 colorBuf[i] = color;
    1040             }
    1041             else
    1042                 colorBuf[i] += color;
    1043         }
    1044 
    1045         if(!hasReflection)
    1046             break;
    1047     }
    1048 
             // Output: rsqrt(c) * c approximates sqrt(c); the result is scaled by 255,
             // converted with truncation, narrowed to 8 bits per channel with saturation
             // and the low 32 bits are stored as the pixel.
    1049     for(int i = 0; i < tileSize; ++i)
    1050     {
    1051         __m128i icolor = _mm_cvttps_epi32(_mm_rsqrt_ps(colorBuf[i].m) * colorBuf[i].m * m128(255));
    1052         icolor = _mm_packs_epi32(icolor, icolor);
    1053         icolor = _mm_packus_epi16(icolor, icolor);
    1054         
    1055         *((DWORD*)((BYTE*)target + pitch * (i / 16)) + (i % 16)) = _mm_cvtsi128_si32(icolor);
    1056     }
    1057 }
    1058 
    1059 void RenderContext::DrawPrimitive(TriVertex** p, TrianglePrim& tri)
    1060 {
             // Sets up one screen-space triangle and bins it into every tile it may touch.
             //   p   - the three vertices; pos.x / pos.y are used as screen coordinates and
             //         pos.w feeds maxRhw (presumably 1/w after VertConvert - TODO confirm).
             //   tri - source primitive; supplies prim, triIndex and the shader.
             // NOTE(review): Render calls this concurrently from worker threads (via
             // DrawTriangle / ClippingAndDraw); the thread-safety of the PrimitiveTile
             // insert calls is not visible here - confirm.

             // Reject triangles whose signed area (2D cross product) is zero or negative.
    1061     if((p[2]->pos.x - p[0]->pos.x) * (p[1]->pos.y - p[0]->pos.y)
    1062         - (p[1]->pos.x - p[0]->pos.x) * (p[2]->pos.y - p[0]->pos.y) <= 0)
    1063         return;
    1064 
    1065     Float3 edge[3];
    1066     edge[0] = CalcEdge(p[0]->pos, p[1]->pos);
    1067     edge[1] = CalcEdge(p[1]->pos, p[2]->pos);
    1068     edge[2] = CalcEdge(p[2]->pos, p[0]->pos);
    1069 
             // The record is allocated from MemoryHeapMT and is not released in this function.
    1070     TriPrimitive* prim = (TriPrimitive*)MemoryHeapMT::Alloc(sizeof(TriPrimitive));
    1071     prim->prim = tri.prim;
    1072     prim->maxRhw = max(max(p[0]->pos.w, p[1]->pos.w), p[2]->pos.w);
    1073     prim->triIndex = tri.triIndex;
    1074     prim->p0.x = p[0]->pos.x;
    1075     prim->p0.y = p[0]->pos.y;
             // The primitive stores the negated x / y edge coefficients.
    1076     for(int i = 0; i < 3; ++i)
    1077     {
    1078         prim->ea[i] = -edge[i].x;
    1079         prim->eb[i] = -edge[i].y;
    1080         //prim->edge[i].x = -edge[i].x;
    1081         //prim->edge[i].y = -edge[i].y;
    1082     }
             // A is the reciprocal of the 2x2 determinant formed by (p0 - p1) and (p0 - p2).
    1083     __m128 A = m128(1 / ((p[0]->pos.x - p[1]->pos.x) * (p[0]->pos.y - p[2]->pos.y)
    1084                     - (p[0]->pos.y - p[1]->pos.y) * (p[0]->pos.x - p[2]->pos.x)));
             // Unaligned load of four floats starting at pos.w. prim->c below stores pos.w,
             // uv.x and uv.y, so this assumes uv directly follows pos.w in memory
             // (TODO confirm against the TriVertex layout).
    1085     __m128 attr[3];
    1086     for(int i = 0; i < 3; ++i)
    1087         attr[i] = _mm_loadu_ps(&p[i]->pos.w);
    1088 
             // a / b: per-attribute coefficients built from the edge x / y terms (four floats
             // are written to each); c: the attribute values at vertex 0.
    1089     _mm_storeu_ps(prim->a, A * (m128(edge[0].x) * attr[2] + m128(edge[1].x) * attr[0] + m128(edge[2].x) * attr[1]));
    1090     _mm_storeu_ps(prim->b, A * (m128(edge[0].y) * attr[2] + m128(edge[1].y) * attr[0] + m128(edge[2].y) * attr[1]));
    1091     prim->c[0] = p[0]->pos.w;
    1092     prim->c[1] = p[0]->uv.x;
    1093     prim->c[2] = p[0]->uv.y;
    1094 
             // Bounding box, padded by +1.5 / -0.5 and clamped to [0, ScreenWidth/Height].
             // _mm_set_ps lists lanes high to low, so lane 0 is ScreenWidth, lane 1 ScreenHeight.
    1095     __m128 maxP = Min(Max(Max(p[0]->pos.m, p[1]->pos.m), p[2]->pos.m) + m128(1.5f),
    1096                       _mm_set_ps(0, 0, ScreenHeight, ScreenWidth));
    1097     __m128 minP = Max(Min(Min(p[0]->pos.m, p[1]->pos.m), p[2]->pos.m) - m128(0.5f), m128(0));
    1098 
             // Interleave to (minX, maxX, minY, maxY) and convert to integers.
    1099     __m128i bound = _mm_cvtps_epi32(_mm_unpacklo_ps(minP, maxP));
    1100 
             // Snap to tile boundaries: minima are rounded down, maxima (after adding
             // TILE_* - 1) are rounded up. The mask trick assumes power-of-two tile sizes.
    1101     bound = _mm_add_epi32(bound, _mm_set_epi32(TILE_HEIGHT - 1, 0, TILE_WIDTH - 1, 0));
    1102     bound = _mm_and_si128(bound, _mm_set_epi32(~(TILE_HEIGHT - 1),
    1103                           ~(TILE_HEIGHT - 1), ~(TILE_WIDTH - 1), ~(TILE_WIDTH - 1)));
    1104 
    1105     edge[0] = -edge[0];
    1106     edge[1] = -edge[1];
    1107     edge[2] = -edge[2];
    1108 
             // Read the four bounds back by viewing the __m128i as an int array.
    1109     const int& minX = ((int*)&bound)[0];
    1110     const int& maxX = ((int*)&bound)[1];
    1111     const int& minY = ((int*)&bound)[2];
    1112     const int& maxY = ((int*)&bound)[3];
    1113 
             // One edge per lane (lane 3 is zero). offX / offY hold the tile extent where the
             // coefficient is positive, selecting for each edge the tile corner at which its
             // value is largest.
    1114     __m128 offX, offY;
    1115     __m128 ex = _mm_set_ps(0, edge[2].x, edge[1].x, edge[0].x);
    1116     __m128 ey = _mm_set_ps(0, edge[2].y, edge[1].y, edge[0].y);
    1117     __m128 ez = _mm_set_ps(0, edge[2].z, edge[1].z, edge[0].z);
    1118     offX = (ex > m128(0)) & m128(TILE_WIDTH);
    1119     offY = (ey > m128(0)) & m128(TILE_HEIGHT);
    1120 
    1121     PrimitiveTile* tile = _primTiles + (minY / TILE_HEIGHT) * _tileCol + (minX / TILE_WIDTH);
    1122 
    1123     bool trans = tri.prim->shader->IsTransprency();
    1124 
    1125     for(int y = minY; y < maxY; y += TILE_HEIGHT)
    1126     {
    1127         PrimitiveTile* tile2 = tile;
    1128         for(int x = minX; x < maxX; x += TILE_WIDTH)
    1129         {
                     // MoveMask == 0 means no lane has its sign bit set: at each edge's most
                     // favourable corner the value is non-negative, so the tile is kept.
    1130             if(MoveMask((m128(x) + offX) * ex + (m128(y) + offY) * ey + ez) == 0)
    1131             {
                         // Same test at the opposite corners: if those pass as well, the tile
                         // is treated as fully covered.
    1132                 bool fullCovered = MoveMask((m128(x + TILE_WIDTH) - offX) * ex
    1133                                     + (m128(y + TILE_HEIGHT) - offY) * ey + ez) == 0;
    1134                 if(trans)
    1135                 {
    1136                     tile2->AddPrimitive(prim, Tanslusent, fullCovered);
    1137                 }
    1138                 else
    1139                 {
    1140                     if(fullCovered)
    1141                         tile2->InsertFullPrimitive(prim, prim->maxRhw);
    1142                     else
    1143                         tile2->AddPrimitive(prim, Opaque, false);
    1144                 }
    1145             }
    1146             tile2++;
    1147         }
    1148         tile += _tileCol;
    1149     }
    1150 }
    1151 
    1152 void RenderContext::Render()
    1153 {
             // Renders one frame in two parallel phases and logs the time of each:
             //   1. vertex phase - every triangle goes through DrawTriangle;
             //   2. pixel phase  - every tile goes through RasterTile.
             // Both phases hand a stack-local context to Parallel::Run, so Parallel::Run
             // presumably returns only after all workers have finished - TODO confirm.
    1154     ViewProjMatrix = _camera->GetViewProjMatrix();
    1155     _eye = Float4(_camera->GetEyePos(), 1);
    1156     DWORD startTime = ::timeGetTime();
    1157 
    1158     struct VertexProcess
    1159     {
    1160         LONG index;
    1161         RenderContext* rc;
    1162         TrianglePrim* prims;
    1163         int triCount;
    1164 
                 // Worker entry point; `id` is not used. Each iteration atomically claims the
                 // next batch of 64 triangles through the shared `index` counter.
    1165         static void Run(int id, void* _context)
    1166         {
    1167             VertexProcess* context = (VertexProcess*)_context;
    1168             RenderContext* rc = context->rc;
    1169                 
    1170             while(true)
    1171             {
    1172                 LONG index = ::InterlockedIncrement(&context->index) - 1;
    1173                 int c = index * 64;
    1174                 if(c >= context->triCount)
    1175                     break;
    1176                 int e = min(c + 64, context->triCount);
    1177                 for(int i = c; i < e; ++i)
    1178                     rc->DrawTriangle(context->prims[i]);
    1179             }
    1180         }
    1181 
    1182         VertexProcess()
    1183         {
    1184             index = 0;
    1185         }
    1186     };
    1187 
    1188     VertexProcess p;
    1189     p.rc = this;
    1190     p.prims = _accelStruct.GetPrims(&p.triCount);
    1191     Parallel::Run(VertexProcess::Run, &p);
    1192 
             // NOTE(review): the timeGetTime() difference is a DWORD but is logged with %d.
    1193     LogInfo("Vertex Process time: %d\n", ::timeGetTime() - startTime);
    1194     startTime = ::timeGetTime();
    1195 
    1196 
    1197     struct PixelProcess
    1198     {
    1199         LONG index;
    1200         RenderContext* rc;
    1201 
                 // Worker entry point; `id` is not used. Each iteration atomically claims
                 // one tile index until all _tileCol * _tileRow tiles have been handed out.
    1202         static void Run(int id, void* _context)
    1203         {
    1204             PixelProcess* context = (PixelProcess*)_context;
    1205             RenderContext* rc = context->rc;
    1206             while(true)
    1207             {
    1208                 LONG index = ::InterlockedIncrement(&context->index) - 1;
    1209                 if(index >= (rc->_tileCol * rc->_tileRow))
    1210                     break;
    1211 
    1212                 int col = index % (rc->_tileCol);
    1213                 int row = index / (rc->_tileCol);
    1214 
    1215                 int x = col * TILE_WIDTH;
    1216                 int y = row * TILE_HEIGHT;
                         // NOTE(review): this recomputes the value index already holds.
    1217                 index = row * rc->_tileCol + col;
    1218 
                         // Five arguments are passed although the definition of RasterTile takes
                         // six; FGSamples presumably has a default in the declaration - confirm.
    1219                 rc->RasterTile(rc->_primTiles + index, x, y,
    1220                     (*rc->_renderTarget)[y] + x, rc->_renderTarget->pitch);
    1221             }
    1222         }
    1223 
    1224         PixelProcess()
    1225         {
    1226             index = 0;
    1227         }
    1228     };
    1229 
    1230     PixelProcess pp;
    1231     pp.rc = this;
    1232     Parallel::Run(PixelProcess::Run, &pp);
    1233 
    1234     LogInfo("Pixel Process time: %d\n", ::timeGetTime() - startTime);
    1235 }
  • 相关阅读:
    The password has to have a minimum of 6 characters, including at least 1 small letter, 1 uppercase letter and 1 number
    Angular i18n的技术分享、踩过的坑
    转: .Net 4.0 ExpandoObject 使用
    min_square
    KALMAN PYTHON
    双系统安装 win + ubuntu
    docker
    drl
    shell
    导航定位方案
  • 原文地址:https://www.cnblogs.com/Hybird3D/p/Rasterization_in_Hybird3D.html
Copyright © 2020-2023  润新知