diff --git a/vsfilter/subtitles/Rasterizer.cpp b/vsfilter/subtitles/Rasterizer.cpp index 515643629..6212d645b 100644 --- a/vsfilter/subtitles/Rasterizer.cpp +++ b/vsfilter/subtitles/Rasterizer.cpp @@ -773,8 +773,21 @@ bool Rasterizer::Rasterize(int xsub, int ysub, bool fBlur) static __forceinline void pixmix(DWORD *dst, DWORD color, DWORD alpha) { - int a = (((alpha)*(color>>24))>>12)&0xff; + int a = (((alpha)*(color>>24))>>6)&0xff; + // Make sure both a and ia are in range 1..256 for the >>8 operations below to be correct int ia = 256-a; + a+=1; + + *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8) + | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8) + | ((((*dst>>8)&0x00ff0000)*ia)&0xff000000); +} + +static __forceinline void pixmix2(DWORD *dst, DWORD color, DWORD shapealpha, DWORD clipalpha) +{ + int a = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff; + int ia = 256-a; + a+=1; *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8) | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8) @@ -786,11 +799,30 @@ static __forceinline void pixmix(DWORD *dst, DWORD color, DWORD alpha) static __forceinline void pixmix_sse2(DWORD* dst, DWORD color, DWORD alpha) { - alpha = ((alpha * (color>>24)) >> 12) & 0xff; + alpha = (((alpha) * (color>>24)) >> 6) & 0xff; color &= 0xffffff; __m128i zero = _mm_setzero_si128(); - __m128i a = _mm_set1_epi32((alpha << 16) | (0x100 - alpha)); + __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha)); + __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero); + __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero); + __m128i r = _mm_unpacklo_epi16(d, s); + + r = _mm_madd_epi16(r, a); + r = _mm_srli_epi32(r, 8); + r = _mm_packs_epi32(r, r); + r = _mm_packus_epi16(r, r); + + *dst = (DWORD)_mm_cvtsi128_si32(r); +} + +static __forceinline void pixmix2_sse2(DWORD* dst, DWORD color, DWORD shapealpha, DWORD clipalpha) +{ + int alpha = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff; + color &= 0xffffff; + + __m128i zero = _mm_setzero_si128(); + __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha)); __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero); __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero); __m128i r = _mm_unpacklo_epi16(d, s); @@ -813,7 +845,8 @@ static const __int64 _00ff00ff00ff00ff = 0x00ff00ff00ff00ffi64; // clipRect is a rectangular clip region to render inside. // pAlphaMask is an alpha clipping mask. // xsub and ysub ??? -// switchpts seems to be an array of interlaced colour switching coordinates/colours to switch to. +// switchpts seems to be an array of fill colours interlaced with coordinates. +// switchpts[i*2] contains a colour and switchpts[i*2+1] contains the coordinate to use that colour from // fBody tells whether to render the body of the subs. // fBorder tells whether to render the border of the subs. CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int xsub, int ysub, const long* switchpts, bool fBody, bool fBorder) @@ -853,13 +886,16 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x // The alpha bitmap of the subtitles? const byte* src = mpOverlayBuffer + 2*(mOverlayWidth * yo + xo); + // s points to what the "body" to use is + // If we're rendering body fill and border, src+1 points to the array of + // widened regions which contain both border and fill in one. const byte* s = fBorder ? (src+1) : src; // The complex "vector clip mask" I think. const byte* am = pAlphaMask + spd.w * y + x; // How would this differ from src? unsigned long* dst = (unsigned long *)((char *)spd.bits + spd.pitch * y) + x; - // ??? What is switchpts ? + // Grab the first colour unsigned long color = switchpts[0]; // CPUID from VDub @@ -871,23 +907,24 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x // Basic case of no complex clipping mask if(!pAlphaMask) { - // Again, what is switchpts? + // If the first colour switching coordinate is at "infinite" we're + // never switching and can use some simpler code. + // ??? Is this optimisation really worth the extra readability issues it adds? if(switchpts[1] == 0xffffffff) { - // Are we rendering the fill or a border/shadow? I think... + // fBody is true if we're rendering a fill or a shadow. if(fBody) { // Run over every pixel, overlaying the subtitles with the fill colour if(fSSE2) for(int wt=0; wt= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];} - pixmix_sse2(&dst[wt], color, s[wt*2]<<6); + pixmix_sse2(&dst[wt], color, s[wt*2]); } else for(int wt=0; wt= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];} - pixmix(&dst[wt], color, s[wt*2]<<6); + pixmix(&dst[wt], color, s[wt*2]); } } // Not body @@ -939,13 +976,13 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x for(int wt=0; wt= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];} - pixmix_sse2(&dst[wt], color, (src[wt*2+1] - src[wt*2])<<6); + pixmix_sse2(&dst[wt], color, src[wt*2+1] - src[wt*2]); } else for(int wt=0; wt= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];} - pixmix(&dst[wt], color, (src[wt*2+1] - src[wt*2])<<6); + pixmix(&dst[wt], color, src[wt*2+1] - src[wt*2]); } } } @@ -957,12 +994,6 @@ CRect Rasterizer::Draw(SubPicDesc& spd, CRect& clipRect, byte* pAlphaMask, int x { if(fBody) { - /*const byte* s = fBorder?(src+1):src; - - for(int wt=0; wt= sw[1]) - { - while(wt+xo >= sw[1]) sw += 2; - color = sw[-2]; - } - - pixmix2(s[wt*2]); - }*/ if(fSSE2) for(int wt=0; wt= sw[1]) sw += 2; color = sw[-2]; } - pixmix_sse2(&dst[wt], color, s[wt*2] * am[wt]); + pixmix2_sse2(&dst[wt], color, s[wt*2], am[wt]); } else for(int wt=0; wt= sw[1]) sw += 2; color = sw[-2]; } - pixmix(&dst[wt], color, s[wt*2] * am[wt]); + pixmix2(&dst[wt], color, s[wt*2], am[wt]); } } else { - /*for(int wt=0; wt= sw[1]) - { - while(wt+xo >= sw[1]) sw += 2; - color = sw[-2]; - } - - pixmix2(src[wt*2+1]-src[wt*2]); - }*/ if(fSSE2) for(int wt=0; wt= sw[1]) sw += 2; color = sw[-2]; } - pixmix_sse2(&dst[wt], color, (src[wt*2+1] - src[wt*2]) * am[wt]); + pixmix2_sse2(&dst[wt], color, src[wt*2+1] - src[wt*2], am[wt]); } else for(int wt=0; wt= sw[1]) sw += 2; color = sw[-2]; } - pixmix(&dst[wt], color, (src[wt*2+1] - src[wt*2]) * am[wt]); + pixmix2(&dst[wt], color, src[wt*2+1] - src[wt*2], am[wt]); } } }