Aegisub/vsfilter/dsutil/convert_a.asm

; Avisynth v2.5.  Copyright 2002 Ben Rudiak-Gould et al.
; http://www.avisynth.org
;
; This program is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program; if not, write to the Free Software
; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
; http://www.gnu.org/copyleft/gpl.html .
;
; Linking Avisynth statically or dynamically with other modules is making a
; combined work based on Avisynth.  Thus, the terms and conditions of the GNU
; General Public License cover the whole combination.
;
; As a special exception, the copyright holders of Avisynth give you
; permission to link Avisynth with independent modules that communicate with
; Avisynth solely through the interfaces defined in avisynth.h, regardless of the license
; terms of these independent modules, and to copy and distribute the
; resulting combined work under terms of your choice, provided that
; every copy of the combined work is accompanied by a complete copy of
; the source code of Avisynth (the version of Avisynth used to produce the
; combined work), being distributed under the terms of the GNU General
; Public License plus this exception.  An independent module is a module
; which is not derived from or based on Avisynth, such as 3rd-party filters,
; import and export plugins, or graphical user interfaces.

	.586
	.mmx
	.model	flat

; alignment has to be 'page' so that I can use 'align 32' below

_TEXT64	segment	page public use32 'CODE'

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	align	8

yuv2rgb_constants:

x0000_0000_0010_0010	dq	00000000000100010h
x0080_0080_0080_0080	dq	00080008000800080h
x00FF_00FF_00FF_00FF	dq	000FF00FF00FF00FFh
x00002000_00002000	dq	00000200000002000h
xFF000000_FF000000	dq	0FF000000FF000000h
cy			dq	000004A8500004A85h
crv			dq	03313000033130000h
cgu_cgv			dq	0E5FCF377E5FCF377h
cbu			dq	00000408D0000408Dh

yuv2rgb_constants_rec709:

	dq	00000000000100010h
	dq	00080008000800080h
	dq	000FF00FF00FF00FFh
	dq	00000200000002000h
	dq	0FF000000FF000000h
	dq	000004A8500004A85h
	dq	03960000039600000h
	dq	0EEF5F930EEF5F930h
	dq	00000439B0000439Bh

ofs_x0000_0000_0010_0010 = 0
ofs_x0080_0080_0080_0080 = 8
ofs_x00FF_00FF_00FF_00FF = 16
ofs_x00002000_00002000 = 24
ofs_xFF000000_FF000000 = 32
ofs_cy = 40
ofs_crv = 48
ofs_cgu_cgv = 56
ofs_cbu = 64

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

GET_Y	MACRO	mma,uyvy
IF &uyvy
	psrlw		mma,8
ELSE
	pand		mma,[edx+ofs_x00FF_00FF_00FF_00FF]
ENDIF
	ENDM

GET_UV	MACRO	mma,uyvy
	GET_Y		mma,1-uyvy
	ENDM

YUV2RGB_INNER_LOOP	MACRO	uyvy,rgb32,no_next_pixel

;; This YUV422->RGB conversion code uses only four MMX registers per
;; source dword, so I convert two dwords in parallel.  Lines corresponding
;; to the "second pipe" are indented an extra space.  There's almost no
;; overlap, except at the end and in the three lines marked ***.
;; revised 4july,2002 to properly set alpha in rgb32 to default "on" & other small memory optimizations

	movd		mm0, dword ptr [esi]
	 movd		 mm5, dword ptr [esi+4]
	movq		mm1,mm0
	GET_Y		mm0,&uyvy	; mm0 = __________Y1__Y0
	 movq		 mm4,mm5
	GET_UV		mm1,&uyvy	; mm1 = __________V0__U0
	 GET_Y		 mm4,&uyvy
	movq		mm2,mm5		; *** avoid reload from [esi+4]
	 GET_UV		 mm5,&uyvy
	psubw		mm0, qword ptr [edx+ofs_x0000_0000_0010_0010]
	 movd		 mm6, dword ptr [esi+8-4*(no_next_pixel)]
	GET_UV		mm2,&uyvy	; mm2 = __________V2__U2
	 psubw		 mm4, qword ptr [edx+ofs_x0000_0000_0010_0010]
	paddw		mm2,mm1
	 GET_UV		 mm6,&uyvy
	psubw		mm1, qword ptr [edx+ofs_x0080_0080_0080_0080]
	 paddw		 mm6,mm5
	psllq		mm2,32
	 psubw		 mm5, qword ptr [edx+ofs_x0080_0080_0080_0080]
	punpcklwd	mm0,mm2		; mm0 = ______Y1______Y0
	 psllq		 mm6,32
	pmaddwd		mm0, qword ptr [edx+ofs_cy]
	 punpcklwd	 mm4,mm6
	paddw		mm1,mm1
	 pmaddwd	 mm4, qword ptr [edx+ofs_cy]
	 paddw		 mm5,mm5
	paddw		mm1,mm2		; mm1 = __V1__U1__V0__U0 * 2
	paddd		mm0,[edx+ofs_x00002000_00002000]
	 paddw		 mm5,mm6
	movq		mm2,mm1
	 paddd		 mm4,[edx+ofs_x00002000_00002000]
	movq		mm3,mm1
	 movq		 mm6,mm5
	pmaddwd		mm1,[edx+ofs_crv]
	 movq		 mm7,mm5
	paddd		mm1,mm0
	 pmaddwd	 mm5,[edx+ofs_crv]
	psrad		mm1,14		; mm1 = RRRRRRRRrrrrrrrr
	 paddd		 mm5,mm4
	pmaddwd		mm2,[edx+ofs_cgu_cgv]
	 psrad		 mm5,14
	paddd		mm2,mm0
	 pmaddwd	 mm6,[edx+ofs_cgu_cgv]
	psrad		mm2,14		; mm2 = GGGGGGGGgggggggg
	 paddd		 mm6,mm4
	pmaddwd		mm3,[edx+ofs_cbu]
	 psrad		 mm6,14
	paddd		mm3,mm0
	 pmaddwd	 mm7,[edx+ofs_cbu]
       add	       esi,8
       add	       edi,12+4*rgb32
IFE &no_next_pixel
       cmp	       esi,ecx
ENDIF
	psrad		mm3,14		; mm3 = BBBBBBBBbbbbbbbb
	 paddd		 mm7,mm4
	pxor		mm0,mm0
	 psrad		 mm7,14
	packssdw	mm3,mm2	; mm3 = GGGGggggBBBBbbbb
	 packssdw	 mm7,mm6
	packssdw	mm1,mm0	; mm1 = ________RRRRrrrr
	 packssdw	 mm5,mm0	; *** avoid pxor mm4,mm4
	movq		mm2,mm3
	 movq		 mm6,mm7
	punpcklwd	mm2,mm1	; mm2 = RRRRBBBBrrrrbbbb
	 punpcklwd	 mm6,mm5
	punpckhwd	mm3,mm1	; mm3 = ____GGGG____gggg
	 punpckhwd	 mm7,mm5
	movq		mm0,mm2
	 movq		 mm4,mm6
	punpcklwd	mm0,mm3	; mm0 = ____rrrrggggbbbb
	 punpcklwd	 mm4,mm7
IFE &rgb32
	psllq		mm0,16
	 psllq		 mm4,16
ENDIF
	punpckhwd	mm2,mm3	; mm2 = ____RRRRGGGGBBBB
	 punpckhwd	 mm6,mm7
	packuswb	mm0,mm2	; mm0 = __RRGGBB__rrggbb <- ta dah!
	 packuswb	 mm4,mm6

IF &rgb32
	por mm0, [edx+ofs_xFF000000_FF000000]	 ; set alpha channels "on"
	 por mm4, [edx+ofs_xFF000000_FF000000]
	movq	[edi-16],mm0	; store the quadwords independently
	 movq	 [edi-8],mm4
ELSE
	psrlq	mm0,8		; pack the two quadwords into 12 bytes
	psllq	mm4,8		; (note: the two shifts above leave
	movd	dword ptr [edi-12],mm0	; mm0,4 = __RRGGBBrrggbb__)
	psrlq	mm0,32
	por	mm4,mm0
	movd	dword ptr [edi-8],mm4
	psrlq	mm4,32
	movd	dword ptr [edi-4],mm4
ENDIF

	ENDM

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

YUV2RGB_PROC	MACRO	procname,uyvy,rgb32

	PUBLIC	C _&procname

;;void __cdecl procname(
;;	[esp+ 4] const BYTE* src,
;;	[esp+ 8] BYTE* dst,
;;	[esp+12] const BYTE* src_end,
;;	[esp+16] int src_pitch,
;;	[esp+20] int row_size,
;;	[esp+24] bool rec709);

_&procname	PROC

	push	esi
	push	edi
	push	ebx

	mov	eax,[esp+16+12]
	mov	esi,[esp+12+12]		; read source bottom-up
	mov	edi,[esp+8+12]
	mov	ebx,[esp+20+12]
	mov	edx,offset yuv2rgb_constants
	test	byte ptr [esp+24+12],1
	jz	loop0
	mov	edx,offset yuv2rgb_constants_rec709

loop0:
	sub	esi,eax
	lea	ecx,[esi+ebx-8]

	align 32
loop1:
	YUV2RGB_INNER_LOOP	uyvy,rgb32,0
	jb	loop1

	YUV2RGB_INNER_LOOP	uyvy,rgb32,1

	sub	esi,ebx
	cmp	esi,[esp+4+12]
	ja	loop0

	emms
	pop	ebx
	pop	edi
	pop	esi
	retn

_&procname	ENDP

	ENDM

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

YUV2RGB_PROC	mmx_YUY2toRGB24,0,0
YUV2RGB_PROC	mmx_YUY2toRGB32,0,1

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	END