// VirtualDub - Video processing and capture application // Copyright (C) 1998-2001 Avery Lee // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. // // Notes: // - BitBltFromI420ToRGB is from VirtualDub // - The core assembly function of CCpuID is from DVD2AVI // - sse2 yv12 to yuy2 conversion by Haali // (- vd.cpp/h should be renamed to something more sensible already :) #include "stdafx.h" #include "vd.h" #pragma warning(disable : 4799) // no emms... blahblahblah CCpuID::CCpuID() { DWORD flags = 0; __asm { mov eax, 1 cpuid test edx, 0x00800000 // STD MMX jz TEST_SSE or [flags], 1 TEST_SSE: test edx, 0x02000000 // STD SSE jz TEST_SSE2 or [flags], 2 or [flags], 4 TEST_SSE2: test edx, 0x04000000 // SSE2 jz TEST_3DNOW or [flags], 8 TEST_3DNOW: mov eax, 0x80000001 cpuid test edx, 0x80000000 // 3D NOW jz TEST_SSEMMX or [flags], 16 TEST_SSEMMX: test edx, 0x00400000 // SSE MMX jz TEST_END or [flags], 2 TEST_END: } m_flags = (flag_t)flags; } CCpuID g_cpuid; void memcpy_accel(void* dst, const void* src, size_t len) { if((g_cpuid.m_flags & CCpuID::flag_t::ssefpu) && len >= 128 && !((DWORD)src&15) && !((DWORD)dst&15)) { __asm { mov esi, dword ptr [src] mov edi, dword ptr [dst] mov ecx, len shr ecx, 7 memcpy_accel_sse_loop: prefetchnta [esi+16*8] movaps xmm0, [esi] movaps xmm1, [esi+16*1] movaps xmm2, [esi+16*2] movaps xmm3, [esi+16*3] movaps xmm4, [esi+16*4] movaps xmm5, [esi+16*5] movaps xmm6, [esi+16*6] movaps xmm7, [esi+16*7] movntps [edi], xmm0 movntps [edi+16*1], xmm1 movntps [edi+16*2], xmm2 movntps [edi+16*3], xmm3 movntps [edi+16*4], xmm4 movntps [edi+16*5], xmm5 movntps [edi+16*6], xmm6 movntps [edi+16*7], xmm7 add esi, 128 add edi, 128 dec ecx jne memcpy_accel_sse_loop mov ecx, len and ecx, 127 cmp ecx, 0 je memcpy_accel_sse_end memcpy_accel_sse_loop2: mov dl, byte ptr[esi] mov byte ptr[edi], dl inc esi inc edi dec ecx jne memcpy_accel_sse_loop2 memcpy_accel_sse_end: emms sfence } } else if((g_cpuid.m_flags & CCpuID::flag_t::mmx) && len >= 64 && !((DWORD)src&7) && !((DWORD)dst&7)) { __asm { mov esi, dword ptr [src] mov edi, dword ptr [dst] mov ecx, len shr ecx, 6 memcpy_accel_mmx_loop: movq mm0, qword ptr [esi] movq mm1, qword ptr [esi+8*1] movq mm2, qword ptr [esi+8*2] movq mm3, qword ptr [esi+8*3] movq mm4, qword ptr [esi+8*4] movq mm5, qword ptr [esi+8*5] movq mm6, qword ptr [esi+8*6] movq mm7, qword ptr [esi+8*7] movq qword ptr [edi], mm0 movq qword ptr [edi+8*1], mm1 movq qword ptr [edi+8*2], mm2 movq qword ptr [edi+8*3], mm3 movq qword ptr [edi+8*4], mm4 movq qword ptr [edi+8*5], mm5 movq qword ptr [edi+8*6], mm6 movq qword ptr [edi+8*7], mm7 add esi, 64 add edi, 64 loop memcpy_accel_mmx_loop mov ecx, len and ecx, 63 cmp ecx, 0 je memcpy_accel_mmx_end memcpy_accel_mmx_loop2: mov dl, byte ptr [esi] mov byte ptr [edi], dl inc esi inc edi dec ecx jne memcpy_accel_mmx_loop2 memcpy_accel_mmx_end: emms } } else { memcpy(dst, src, len); } } bool BitBltFromI420ToI420(int w, int h, BYTE* dsty, BYTE* dstu, BYTE* dstv, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch) { if((w&1)) return(false); if(w > 0 && w == srcpitch && w == dstpitch) { memcpy_accel(dsty, srcy, h*srcpitch); memcpy_accel(dstu, srcu, h/2*srcpitch/2); memcpy_accel(dstv, srcv, h/2*srcpitch/2); } else { int pitch = min(abs(srcpitch), abs(dstpitch)); for(int y = 0; y < h; y++, srcy += srcpitch, dsty += dstpitch) memcpy_accel(dsty, srcy, pitch); srcpitch >>= 1; dstpitch >>= 1; pitch = min(abs(srcpitch), abs(dstpitch)); for(int y = 0; y < h; y+=2, srcu += srcpitch, dstu += dstpitch) memcpy_accel(dstu, srcu, pitch); for(int y = 0; y < h; y+=2, srcv += srcpitch, dstv += dstpitch) memcpy_accel(dstv, srcv, pitch); } return true; } bool BitBltFromYUY2ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* src, int srcpitch) { if(w > 0 && w == srcpitch && w == dstpitch) { memcpy_accel(dst, src, h*srcpitch); } else { int pitch = min(abs(srcpitch), abs(dstpitch)); for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch) memcpy_accel(dst, src, pitch); } return(true); } extern "C" void asm_YUVtoRGB32_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); extern "C" void asm_YUVtoRGB24_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); extern "C" void asm_YUVtoRGB16_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); extern "C" void asm_YUVtoRGB32_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); extern "C" void asm_YUVtoRGB24_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); extern "C" void asm_YUVtoRGB16_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); extern "C" void asm_YUVtoRGB32_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); extern "C" void asm_YUVtoRGB24_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); extern "C" void asm_YUVtoRGB16_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width); bool BitBltFromI420ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch) { if(w<=0 || h<=0 || (w&1) || (h&1)) return(false); void (*asm_YUVtoRGB_row)(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width) = NULL;; if((g_cpuid.m_flags & CCpuID::ssefpu) && !(w&7)) { switch(dbpp) { case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row/*_ISSE*/; break; // TODO: fix _ISSE (555->565) case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row_ISSE; break; case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row_ISSE; break; } } else if((g_cpuid.m_flags & CCpuID::mmx) && !(w&7)) { switch(dbpp) { case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row/*_MMX*/; break; // TODO: fix _MMX (555->565) case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row_MMX; break; case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row_MMX; break; } } else { switch(dbpp) { case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row; break; case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row; break; case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row; break; } } if(!asm_YUVtoRGB_row) return(false); do { asm_YUVtoRGB_row(dst + dstpitch, dst, srcy + srcpitch, srcy, srcu, srcv, w/2); dst += 2*dstpitch; srcy += srcpitch*2; srcu += srcpitch/2; srcv += srcpitch/2; } while(h -= 2); if(g_cpuid.m_flags & CCpuID::mmx) __asm emms if(g_cpuid.m_flags & CCpuID::ssefpu) __asm sfence return true; } static void yuvtoyuy2row_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width) { WORD* dstw = (WORD*)dst; for(; width > 1; width -= 2) { *dstw++ = (*srcu++<<8)|*srcy++; *dstw++ = (*srcv++<<8)|*srcy++; } } static void __declspec(naked) yuvtoyuy2row_MMX(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width) { __asm { push ebp push edi push esi push ebx mov edi, [esp+20] // dst mov ebp, [esp+24] // srcy mov ebx, [esp+28] // srcu mov esi, [esp+32] // srcv mov ecx, [esp+36] // width shr ecx, 3 yuvtoyuy2row_loop: movd mm0, [ebx] punpcklbw mm0, [esi] movq mm1, [ebp] movq mm2, mm1 punpcklbw mm1, mm0 punpckhbw mm2, mm0 movq [edi], mm1 movq [edi+8], mm2 add ebp, 8 add ebx, 4 add esi, 4 add edi, 16 dec ecx jnz yuvtoyuy2row_loop pop ebx pop esi pop edi pop ebp ret }; } static void yuvtoyuy2row_avg_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv) { WORD* dstw = (WORD*)dst; for(; width > 1; width -= 2, srcu++, srcv++) { *dstw++ = (((srcu[0]+srcu[pitchuv])>>1)<<8)|*srcy++; *dstw++ = (((srcv[0]+srcv[pitchuv])>>1)<<8)|*srcy++; } } static void __declspec(naked) yuvtoyuy2row_avg_MMX(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv) { static const __int64 mask = 0x7f7f7f7f7f7f7f7fi64; __asm { push ebp push edi push esi push ebx movq mm7, mask mov edi, [esp+20] // dst mov ebp, [esp+24] // srcy mov ebx, [esp+28] // srcu mov esi, [esp+32] // srcv mov ecx, [esp+36] // width mov eax, [esp+40] // pitchuv shr ecx, 3 yuvtoyuy2row_avg_loop: movd mm0, [ebx] punpcklbw mm0, [esi] movq mm1, mm0 movd mm2, [ebx + eax] punpcklbw mm2, [esi + eax] movq mm3, mm2 // (x+y)>>1 == (x&y)+((x^y)>>1) pand mm0, mm2 pxor mm1, mm3 psrlq mm1, 1 pand mm1, mm7 paddb mm0, mm1 movq mm1, [ebp] movq mm2, mm1 punpcklbw mm1, mm0 punpckhbw mm2, mm0 movq [edi], mm1 movq [edi+8], mm2 add ebp, 8 add ebx, 4 add esi, 4 add edi, 16 dec ecx jnz yuvtoyuy2row_avg_loop pop ebx pop esi pop edi pop ebp ret }; } static void __declspec(naked) yv12_yuy2_row_sse2() { __asm { // ebx - Y // edx - U // esi - V // edi - dest // ecx - halfwidth xor eax, eax one: movdqa xmm0, [ebx + eax*2] // YYYYYYYY movdqa xmm1, [ebx + eax*2 + 16] // YYYYYYYY movdqa xmm2, [edx + eax] // UUUUUUUU movdqa xmm3, [esi + eax] // VVVVVVVV movdqa xmm4, xmm2 movdqa xmm5, xmm0 movdqa xmm6, xmm1 punpcklbw xmm2, xmm3 // VUVUVUVU punpckhbw xmm4, xmm3 // VUVUVUVU punpcklbw xmm0, xmm2 // VYUYVYUY punpcklbw xmm1, xmm4 punpckhbw xmm5, xmm2 punpckhbw xmm6, xmm4 movntdq [edi + eax*4], xmm0 movntdq [edi + eax*4 + 16], xmm5 movntdq [edi + eax*4 + 32], xmm1 movntdq [edi + eax*4 + 48], xmm6 add eax, 16 cmp eax, ecx jb one ret }; } static void __declspec(naked) yv12_yuy2_row_sse2_linear() { __asm { // ebx - Y // edx - U // esi - V // edi - dest // ecx - width // ebp - uv_stride xor eax, eax one: movdqa xmm0, [ebx + eax*2] // YYYYYYYY movdqa xmm1, [ebx + eax*2 + 16] // YYYYYYYY movdqa xmm2, [edx] movdqa xmm3, [esi] pavgb xmm2, [edx + ebp] // UUUUUUUU pavgb xmm3, [esi + ebp] // VVVVVVVV movdqa xmm4, xmm2 movdqa xmm5, xmm0 movdqa xmm6, xmm1 punpcklbw xmm2, xmm3 // VUVUVUVU punpckhbw xmm4, xmm3 // VUVUVUVU punpcklbw xmm0, xmm2 // VYUYVYUY punpcklbw xmm1, xmm4 punpckhbw xmm5, xmm2 punpckhbw xmm6, xmm4 movntdq [edi + eax*4], xmm0 movntdq [edi + eax*4 + 16], xmm5 movntdq [edi + eax*4 + 32], xmm1 movntdq [edi + eax*4 + 48], xmm6 add eax, 16 add edx, 16 add esi, 16 cmp eax, ecx jb one ret }; } static void __declspec(naked) yv12_yuy2_row_sse2_linear_interlaced() { __asm { // ebx - Y // edx - U // esi - V // edi - dest // ecx - width // ebp - uv_stride xor eax, eax one: movdqa xmm0, [ebx + eax*2] // YYYYYYYY movdqa xmm1, [ebx + eax*2 + 16] // YYYYYYYY movdqa xmm2, [edx] movdqa xmm3, [esi] pavgb xmm2, [edx + ebp*2] // UUUUUUUU pavgb xmm3, [esi + ebp*2] // VVVVVVVV movdqa xmm4, xmm2 movdqa xmm5, xmm0 movdqa xmm6, xmm1 punpcklbw xmm2, xmm3 // VUVUVUVU punpckhbw xmm4, xmm3 // VUVUVUVU punpcklbw xmm0, xmm2 // VYUYVYUY punpcklbw xmm1, xmm4 punpckhbw xmm5, xmm2 punpckhbw xmm6, xmm4 movntdq [edi + eax*4], xmm0 movntdq [edi + eax*4 + 16], xmm5 movntdq [edi + eax*4 + 32], xmm1 movntdq [edi + eax*4 + 48], xmm6 add eax, 16 add edx, 16 add esi, 16 cmp eax, ecx jb one ret }; } void __declspec(naked) yv12_yuy2_sse2(const BYTE *Y, const BYTE *U, const BYTE *V, int halfstride, unsigned halfwidth, unsigned height, BYTE *YUY2, int d_stride) { __asm { push ebx push esi push edi push ebp mov ebx, [esp + 20] // Y mov edx, [esp + 24] // U mov esi, [esp + 28] // V mov edi, [esp + 44] // D mov ebp, [esp + 32] // uv_stride mov ecx, [esp + 36] // uv_width mov eax, ecx add eax, 15 and eax, 0xfffffff0 sub [esp + 32], eax cmp dword ptr [esp + 40], 2 jbe last2 row: sub dword ptr [esp + 40], 2 call yv12_yuy2_row_sse2 lea ebx, [ebx + ebp*2] add edi, [esp + 48] call yv12_yuy2_row_sse2_linear add edx, [esp + 32] add esi, [esp + 32] lea ebx, [ebx + ebp*2] add edi, [esp + 48] cmp dword ptr [esp + 40], 2 ja row last2: call yv12_yuy2_row_sse2 dec dword ptr [esp + 40] jz done lea ebx, [ebx + ebp*2] add edi, [esp + 48] call yv12_yuy2_row_sse2 done: pop ebp pop edi pop esi pop ebx ret }; } void __declspec(naked) yv12_yuy2_sse2_interlaced(const BYTE *Y, const BYTE *U, const BYTE *V, int halfstride, unsigned halfwidth, unsigned height, BYTE *YUY2, int d_stride) { __asm { push ebx push esi push edi push ebp mov ebx, [esp + 20] // Y mov edx, [esp + 24] // U mov esi, [esp + 28] // V mov edi, [esp + 44] // D mov ebp, [esp + 32] // uv_stride mov ecx, [esp + 36] // uv_width mov eax, ecx add eax, 15 and eax, 0xfffffff0 sub [esp + 32], eax cmp dword ptr [esp + 40], 4 jbe last4 row: sub dword ptr [esp + 40], 4 call yv12_yuy2_row_sse2 // first row, first field lea ebx, [ebx + ebp*2] add edi, [esp + 48] add edx, ebp add esi, ebp call yv12_yuy2_row_sse2 // first row, second field lea ebx, [ebx + ebp*2] add edi, [esp + 48] sub edx, ebp sub esi, ebp call yv12_yuy2_row_sse2_linear_interlaced // second row, first field add edx, [esp + 32] add esi, [esp + 32] lea ebx, [ebx + ebp*2] add edi, [esp + 48] call yv12_yuy2_row_sse2_linear_interlaced // second row, second field add edx, [esp + 32] add esi, [esp + 32] lea ebx, [ebx + ebp*2] add edi, [esp + 48] cmp dword ptr [esp + 40], 4 ja row last4: call yv12_yuy2_row_sse2 lea ebx, [ebx + ebp*2] add edi, [esp + 48] add edx, ebp add esi, ebp call yv12_yuy2_row_sse2 lea ebx, [ebx + ebp*2] add edi, [esp + 48] sub edx, ebp sub esi, ebp call yv12_yuy2_row_sse2 lea ebx, [ebx + ebp*2] add edi, [esp + 48] add edx, ebp add esi, ebp call yv12_yuy2_row_sse2 pop ebp pop edi pop esi pop ebx ret }; } bool BitBltFromI420ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch, bool fInterlaced) { if(w<=0 || h<=0 || (w&1) || (h&1)) return(false); if(srcpitch == 0) srcpitch = w; if((g_cpuid.m_flags & CCpuID::sse2) && ((DWORD_PTR)srcy&15) && ((DWORD_PTR)srcu&15) && ((DWORD_PTR)srcv&15) && !(srcpitch&15) && ((DWORD_PTR)dst&15) && !(dstpitch&15)) { if(!fInterlaced) yv12_yuy2_sse2(srcy, srcu, srcv, srcpitch/2, w/2, h, dst, dstpitch); else yv12_yuy2_sse2_interlaced(srcy, srcu, srcv, srcpitch/2, w/2, h, dst, dstpitch); return true; } else { ASSERT(!fInterlaced); } void (*yuvtoyuy2row)(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width) = NULL; void (*yuvtoyuy2row_avg)(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv) = NULL; if((g_cpuid.m_flags & CCpuID::mmx) && !(w&7)) { yuvtoyuy2row = yuvtoyuy2row_MMX; yuvtoyuy2row_avg = yuvtoyuy2row_avg_MMX; } else { yuvtoyuy2row = yuvtoyuy2row_c; yuvtoyuy2row_avg = yuvtoyuy2row_avg_c; } if(!yuvtoyuy2row) return(false); do { yuvtoyuy2row(dst, srcy, srcu, srcv, w); yuvtoyuy2row_avg(dst + dstpitch, srcy + srcpitch, srcu, srcv, w, srcpitch/2); dst += 2*dstpitch; srcy += srcpitch*2; srcu += srcpitch/2; srcv += srcpitch/2; } while((h -= 2) > 2); yuvtoyuy2row(dst, srcy, srcu, srcv, w); yuvtoyuy2row(dst + dstpitch, srcy + srcpitch, srcu, srcv, w); if(g_cpuid.m_flags & CCpuID::mmx) __asm emms return(true); } bool BitBltFromRGBToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch, int sbpp) { if(dbpp == sbpp) { int rowbytes = w*dbpp>>3; if(rowbytes > 0 && rowbytes == srcpitch && rowbytes == dstpitch) { memcpy_accel(dst, src, h*rowbytes); } else { for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch) memcpy_accel(dst, src, rowbytes); } return(true); } if(sbpp != 16 && sbpp != 24 && sbpp != 32 || dbpp != 16 && dbpp != 24 && dbpp != 32) return(false); if(dbpp == 16) { for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch) { if(sbpp == 24) { BYTE* s = (BYTE*)src; WORD* d = (WORD*)dst; for(int x = 0; x < w; x++, s+=3, d++) *d = (WORD)(((*((DWORD*)s)>>8)&0xf800)|((*((DWORD*)s)>>5)&0x07e0)|((*((DWORD*)s)>>3)&0x1f)); } else if(sbpp == 32) { DWORD* s = (DWORD*)src; WORD* d = (WORD*)dst; for(int x = 0; x < w; x++, s++, d++) *d = (WORD)(((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x1f)); } } } else if(dbpp == 24) { for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch) { if(sbpp == 16) { WORD* s = (WORD*)src; BYTE* d = (BYTE*)dst; for(int x = 0; x < w; x++, s++, d+=3) { // not tested, r-g-b might be in reverse d[0] = (*s&0x001f)<<3; d[1] = (*s&0x07e0)<<5; d[2] = (*s&0xf800)<<8; } } else if(sbpp == 32) { BYTE* s = (BYTE*)src; BYTE* d = (BYTE*)dst; for(int x = 0; x < w; x++, s+=4, d+=3) {d[0] = s[0]; d[1] = s[1]; d[2] = s[2];} } } } else if(dbpp == 32) { for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch) { if(sbpp == 16) { WORD* s = (WORD*)src; DWORD* d = (DWORD*)dst; for(int x = 0; x < w; x++, s++, d++) *d = ((*s&0xf800)<<8)|((*s&0x07e0)<<5)|((*s&0x001f)<<3); } else if(sbpp == 24) { BYTE* s = (BYTE*)src; DWORD* d = (DWORD*)dst; for(int x = 0; x < w; x++, s+=3, d++) *d = *((DWORD*)s)&0xffffff; } } } return(true); } static void asm_blend_row_clipped_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) { BYTE* src2 = src + srcpitch; do {*dst++ = (*src++ + *src2++ + 1) >> 1;} while(w--); } static void asm_blend_row_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) { BYTE* src2 = src + srcpitch; BYTE* src3 = src2 + srcpitch; do {*dst++ = (*src++ + (*src2++ << 1) + *src3++ + 2) >> 2;} while(w--); } static void __declspec(naked) asm_blend_row_clipped_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) { static const __int64 _x0001000100010001 = 0x0001000100010001; __asm { push ebp push edi push esi push ebx mov edi,[esp+20] mov esi,[esp+24] sub edi,esi mov ebp,[esp+28] mov edx,[esp+32] shr ebp, 3 movq mm6, _x0001000100010001 pxor mm7, mm7 xloop: movq mm0, [esi] movq mm3, mm0 punpcklbw mm0, mm7 punpckhbw mm3, mm7 movq mm1, [esi+edx] movq mm4, mm1 punpcklbw mm1, mm7 punpckhbw mm4, mm7 paddw mm1, mm0 paddw mm1, mm6 psrlw mm1, 1 paddw mm4, mm3 paddw mm4, mm6 psrlw mm4, 1 add esi, 8 packuswb mm1, mm4 movq [edi+esi-8], mm1 dec ebp jne xloop pop ebx pop esi pop edi pop ebp ret }; } static void __declspec(naked) asm_blend_row_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) { static const __int64 mask0 = 0xfcfcfcfcfcfcfcfci64; static const __int64 mask1 = 0x7f7f7f7f7f7f7f7fi64; static const __int64 mask2 = 0x3f3f3f3f3f3f3f3fi64; static const __int64 _x0002000200020002 = 0x0002000200020002; __asm { push ebp push edi push esi push ebx mov edi, [esp+20] mov esi, [esp+24] sub edi, esi mov ebp, [esp+28] mov edx, [esp+32] shr ebp, 3 movq mm6, _x0002000200020002 pxor mm7, mm7 xloop: movq mm0, [esi] movq mm3, mm0 punpcklbw mm0, mm7 punpckhbw mm3, mm7 movq mm1, [esi+edx] movq mm4, mm1 punpcklbw mm1, mm7 punpckhbw mm4, mm7 movq mm2, [esi+edx*2] movq mm5, mm2 punpcklbw mm2, mm7 punpckhbw mm5, mm7 psllw mm1, 1 paddw mm1, mm0 paddw mm1, mm2 paddw mm1, mm6 psrlw mm1, 2 psllw mm4, 1 paddw mm4, mm3 paddw mm4, mm5 paddw mm4, mm6 psrlw mm4, 2 add esi, 8 packuswb mm1, mm4 movq [edi+esi-8], mm1 dec ebp jne xloop // sadly the original code makes a lot of visible banding artifacts on yuv // (it seems those shiftings without rounding introduce too much error) /* mov edi,[esp+20] mov esi,[esp+24] sub edi,esi mov ebp,[esp+28] mov edx,[esp+32] movq mm5,mask0 movq mm6,mask1 movq mm7,mask2 shr ebp,1 jz oddpart xloop: movq mm2,[esi] movq mm0,mm5 movq mm1,[esi+edx] pand mm0,mm2 psrlq mm1,1 movq mm2,[esi+edx*2] psrlq mm2,2 pand mm1,mm6 psrlq mm0,2 pand mm2,mm7 paddb mm0,mm1 add esi,8 paddb mm0,mm2 dec ebp movq [edi+esi-8],mm0 jne xloop oddpart: test byte ptr [esp+28],1 jz nooddpart mov ecx,[esi] mov eax,0fcfcfcfch mov ebx,[esi+edx] and eax,ecx shr ebx,1 mov ecx,[esi+edx*2] shr ecx,2 and ebx,07f7f7f7fh shr eax,2 and ecx,03f3f3f3fh add eax,ebx add eax,ecx mov [edi+esi],eax nooddpart: */ pop ebx pop esi pop edi pop ebp ret }; } __declspec(align(16)) static BYTE const_1_16_bytes[] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}; static void asm_blend_row_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) { __asm { mov edx, srcpitch mov esi, src mov edi, dst sub edi, esi mov ecx, w mov ebx, ecx shr ecx, 4 and ebx, 15 movdqa xmm7, [const_1_16_bytes] asm_blend_row_SSE2_loop: movdqa xmm0, [esi] movdqa xmm1, [esi+edx] movdqa xmm2, [esi+edx*2] pavgb xmm0, xmm1 pavgb xmm2, xmm1 psubusb xmm0, xmm7 pavgb xmm0, xmm2 movdqa [esi+edi], xmm0 add esi, 16 dec ecx jnz asm_blend_row_SSE2_loop test ebx,15 jz asm_blend_row_SSE2_end mov ecx, ebx xor ax, ax xor bx, bx xor dx, dx asm_blend_row_SSE2_loop2: mov al, [esi] mov bl, [esi+edx] mov dl, [esi+edx*2] add ax, bx inc ax shr ax, 1 add dx, bx inc dx shr dx, 1 add ax, dx shr ax, 1 mov [esi+edi], al inc esi dec ecx jnz asm_blend_row_SSE2_loop2 asm_blend_row_SSE2_end: } } static void asm_blend_row_clipped_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) { __asm { mov edx, srcpitch mov esi, src mov edi, dst sub edi, esi mov ecx, w mov ebx, ecx shr ecx, 4 and ebx, 15 movdqa xmm7, [const_1_16_bytes] asm_blend_row_clipped_SSE2_loop: movdqa xmm0, [esi] movdqa xmm1, [esi+edx] pavgb xmm0, xmm1 movdqa [esi+edi], xmm0 add esi, 16 dec ecx jnz asm_blend_row_clipped_SSE2_loop test ebx,15 jz asm_blend_row_clipped_SSE2_end mov ecx, ebx xor ax, ax xor bx, bx asm_blend_row_clipped_SSE2_loop2: mov al, [esi] mov bl, [esi+edx] add ax, bx inc ax shr ax, 1 mov [esi+edi], al inc esi dec ecx jnz asm_blend_row_clipped_SSE2_loop2 asm_blend_row_clipped_SSE2_end: } } void DeinterlaceBlend(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch) { void (*asm_blend_row_clipped)(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) = NULL; void (*asm_blend_row)(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) = NULL; if((g_cpuid.m_flags & CCpuID::sse2) && !((DWORD)src&0xf) && !((DWORD)dst&0xf) && !(srcpitch&0xf)) { asm_blend_row_clipped = asm_blend_row_clipped_SSE2; asm_blend_row = asm_blend_row_SSE2; } else if(g_cpuid.m_flags & CCpuID::mmx) { asm_blend_row_clipped = asm_blend_row_clipped_MMX; asm_blend_row = asm_blend_row_MMX; } else { asm_blend_row_clipped = asm_blend_row_clipped_c; asm_blend_row = asm_blend_row_c; } if(!asm_blend_row_clipped) return; asm_blend_row_clipped(dst, src, rowbytes, srcpitch); if((h -= 2) > 0) do { dst += dstpitch; asm_blend_row(dst, src, rowbytes, srcpitch); src += srcpitch; } while(--h); asm_blend_row_clipped(dst + dstpitch, src, rowbytes, srcpitch); if(g_cpuid.m_flags & CCpuID::mmx) __asm emms } void DeinterlaceBob(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield) { if(topfield) { BitBltFromRGBToRGB(rowbytes, h/2, dst, dstpitch*2, 8, src, srcpitch*2, 8); AvgLines8(dst, h, dstpitch); } else { BitBltFromRGBToRGB(rowbytes, h/2, dst + dstpitch, dstpitch*2, 8, src + srcpitch, srcpitch*2, 8); AvgLines8(dst + dstpitch, h-1, dstpitch); } } void AvgLines8(BYTE* dst, DWORD h, DWORD pitch) { if(h <= 1) return; BYTE* s = dst; BYTE* d = dst + (h-2)*pitch; for(; s < d; s += pitch*2) { BYTE* tmp = s; if((g_cpuid.m_flags & CCpuID::sse2) && !((DWORD)tmp&0xf) && !((DWORD)pitch&0xf)) { __asm { mov esi, tmp mov ebx, pitch mov ecx, ebx shr ecx, 4 AvgLines8_sse2_loop: movdqa xmm0, [esi] pavgb xmm0, [esi+ebx*2] movdqa [esi+ebx], xmm0 add esi, 16 dec ecx jnz AvgLines8_sse2_loop mov tmp, esi } for(int i = pitch&7; i--; tmp++) { tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1; } } else if(g_cpuid.m_flags & CCpuID::mmx) { __asm { mov esi, tmp mov ebx, pitch mov ecx, ebx shr ecx, 3 pxor mm7, mm7 AvgLines8_mmx_loop: movq mm0, [esi] movq mm1, mm0 punpcklbw mm0, mm7 punpckhbw mm1, mm7 movq mm2, [esi+ebx*2] movq mm3, mm2 punpcklbw mm2, mm7 punpckhbw mm3, mm7 paddw mm0, mm2 psrlw mm0, 1 paddw mm1, mm3 psrlw mm1, 1 packuswb mm0, mm1 movq [esi+ebx], mm0 lea esi, [esi+8] dec ecx jnz AvgLines8_mmx_loop mov tmp, esi } for(int i = pitch&7; i--; tmp++) { tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1; } } else { for(int i = pitch; i--; tmp++) { tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1; } } } if(!(h&1) && h >= 2) { dst += (h-2)*pitch; memcpy_accel(dst + pitch, dst, pitch); } __asm emms; } void AvgLines555(BYTE* dst, DWORD h, DWORD pitch) { if(h <= 1) return; unsigned __int64 __0x7c007c007c007c00 = 0x7c007c007c007c00; unsigned __int64 __0x03e003e003e003e0 = 0x03e003e003e003e0; unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f; BYTE* s = dst; BYTE* d = dst + (h-2)*pitch; for(; s < d; s += pitch*2) { BYTE* tmp = s; __asm { mov esi, tmp mov ebx, pitch mov ecx, ebx shr ecx, 3 movq mm6, __0x03e003e003e003e0 movq mm7, __0x001f001f001f001f AvgLines555_loop: movq mm0, [esi] movq mm1, mm0 movq mm2, mm0 psrlw mm0, 10 // red1 bits: mm0 = 001f001f001f001f pand mm1, mm6 // green1 bits: mm1 = 03e003e003e003e0 pand mm2, mm7 // blue1 bits: mm2 = 001f001f001f001f movq mm3, [esi+ebx*2] movq mm4, mm3 movq mm5, mm3 psrlw mm3, 10 // red2 bits: mm3 = 001f001f001f001f pand mm4, mm6 // green2 bits: mm4 = 03e003e003e003e0 pand mm5, mm7 // blue2 bits: mm5 = 001f001f001f001f paddw mm0, mm3 psrlw mm0, 1 // (red1+red2)/2 psllw mm0, 10 // red bits at 7c007c007c007c00 paddw mm1, mm4 psrlw mm1, 1 // (green1+green2)/2 pand mm1, mm6 // green bits at 03e003e003e003e0 paddw mm2, mm5 psrlw mm2, 1 // (blue1+blue2)/2 // blue bits at 001f001f001f001f (no need to pand, lower bits were discareded) por mm0, mm1 por mm0, mm2 movq [esi+ebx], mm0 lea esi, [esi+8] dec ecx jnz AvgLines555_loop mov tmp, esi } for(int i = (pitch&7)>>1; i--; tmp++) { tmp[pitch] = ((((*tmp&0x7c00) + (tmp[pitch<<1]&0x7c00)) >> 1)&0x7c00)| ((((*tmp&0x03e0) + (tmp[pitch<<1]&0x03e0)) >> 1)&0x03e0)| ((((*tmp&0x001f) + (tmp[pitch<<1]&0x001f)) >> 1)&0x001f); } } if(!(h&1) && h >= 2) { dst += (h-2)*pitch; memcpy_accel(dst + pitch, dst, pitch); } __asm emms; } void AvgLines565(BYTE* dst, DWORD h, DWORD pitch) { if(h <= 1) return; unsigned __int64 __0xf800f800f800f800 = 0xf800f800f800f800; unsigned __int64 __0x07e007e007e007e0 = 0x07e007e007e007e0; unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f; BYTE* s = dst; BYTE* d = dst + (h-2)*pitch; for(; s < d; s += pitch*2) { WORD* tmp = (WORD*)s; __asm { mov esi, tmp mov ebx, pitch mov ecx, ebx shr ecx, 3 movq mm6, __0x07e007e007e007e0 movq mm7, __0x001f001f001f001f AvgLines565_loop: movq mm0, [esi] movq mm1, mm0 movq mm2, mm0 psrlw mm0, 11 // red1 bits: mm0 = 001f001f001f001f pand mm1, mm6 // green1 bits: mm1 = 07e007e007e007e0 pand mm2, mm7 // blue1 bits: mm2 = 001f001f001f001f movq mm3, [esi+ebx*2] movq mm4, mm3 movq mm5, mm3 psrlw mm3, 11 // red2 bits: mm3 = 001f001f001f001f pand mm4, mm6 // green2 bits: mm4 = 07e007e007e007e0 pand mm5, mm7 // blue2 bits: mm5 = 001f001f001f001f paddw mm0, mm3 psrlw mm0, 1 // (red1+red2)/2 psllw mm0, 11 // red bits at f800f800f800f800 paddw mm1, mm4 psrlw mm1, 1 // (green1+green2)/2 pand mm1, mm6 // green bits at 03e003e003e003e0 paddw mm2, mm5 psrlw mm2, 1 // (blue1+blue2)/2 // blue bits at 001f001f001f001f (no need to pand, lower bits were discareded) por mm0, mm1 por mm0, mm2 movq [esi+ebx], mm0 lea esi, [esi+8] dec ecx jnz AvgLines565_loop mov tmp, esi } for(int i = (pitch&7)>>1; i--; tmp++) { tmp[pitch] = ((((*tmp&0xf800) + (tmp[pitch<<1]&0xf800)) >> 1)&0xf800)| ((((*tmp&0x07e0) + (tmp[pitch<<1]&0x07e0)) >> 1)&0x07e0)| ((((*tmp&0x001f) + (tmp[pitch<<1]&0x001f)) >> 1)&0x001f); } } if(!(h&1) && h >= 2) { dst += (h-2)*pitch; memcpy_accel(dst + pitch, dst, pitch); } __asm emms; } extern "C" void mmx_YUY2toRGB24(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709); extern "C" void mmx_YUY2toRGB32(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709); bool BitBltFromYUY2ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch) { void (* YUY2toRGB)(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709) = NULL; if(g_cpuid.m_flags & CCpuID::mmx) { YUY2toRGB = dbpp == 32 ? mmx_YUY2toRGB32 : dbpp == 24 ? mmx_YUY2toRGB24 : // dbpp == 16 ? mmx_YUY2toRGB16 : // TODO NULL; } else { // TODO } if(!YUY2toRGB) return(false); YUY2toRGB(src, dst, src + h*srcpitch, srcpitch, w, false); return(true); }