;***************************************************************************
; unit:    raster      release 0.37                                        *
; purpose: general manipulation of n-dimensional matrices, n = 1, 2 and 3. *
;          Use this file or rasterc.c. You cannot link both files together *
; licence:     GPL or LGPL                                                 *
; Copyright: (c) 2021-2025 Jaroslav Fojtik                                 *
;***************************************************************************

.CODE             ;Indicates the start of a code segment.

	extern	swap_bits_xlat:BYTE
	extern	swap_bits2_xlat:BYTE


;-----------------------------------------------------------------------
; void Conv1_4(BYTE *Dest, const BYTE *Src, unsigned Size1D)
; Expands 1-bit pixels (most significant bit first) into 4-bit pixels:
; a set bit becomes nibble 0Fh, a clear bit becomes nibble 0.  The first
; pixel of every pair is placed in the high nibble of the output byte.
; For an odd Size1D the last byte is written with a zero low nibble.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
        public  Conv1_4
Conv1_4 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        test    rcx,rcx
        jz      ToEnd           ; nothing to convert
        mov     rsi,rdx         ; rsi = source pointer

octet:  mov     al,[rsi]        ; next 8 source pixels

        cbw                     ; AH = 0FFh/00h from bit 7 (pixel 1)
        mov     dl,ah
        and     dl,0F0h         ; keep it as the high nibble

        rol     al,1            ; bit 6 -> bit 7
        cbw                     ; AH = 0FFh/00h from pixel 2
        and     ah,0Fh          ; keep it as the low nibble

        sub     rcx,2
        jbe     ToEnd2          ; only 2 (ZF) or 1 (CF) pixels were left

        or      ah,dl
        mov     [rdi],ah        ; store pixels 1 and 2
        inc     rdi

        rol     al,1            ; pixel 3
        cbw
        mov     dl,ah
        and     dl,0F0h

        rol     al,1            ; pixel 4
        cbw
        and     ah,0Fh

        sub     rcx,2
        jbe     ToEnd2

        or      ah,dl
        mov     [rdi],ah        ; store pixels 3 and 4
        inc     rdi

        rol     al,1            ; pixel 5
        cbw
        mov     dl,ah
        and     dl,0F0h

        rol     al,1            ; pixel 6
        cbw
        and     ah,0Fh

        sub     rcx,2
        jbe     ToEnd2

        or      ah,dl
        mov     [rdi],ah        ; store pixels 5 and 6
        inc     rdi

        rol     al,1            ; pixel 7
        cbw
        mov     dl,ah
        and     dl,0F0h

        rol     al,1            ; pixel 8
        cbw
        and     ah,0Fh

        sub     rcx,2
        jbe     ToEnd2

        or      ah,dl
        mov     [rdi],ah        ; store pixels 7 and 8

        inc     rsi             ; source octet consumed
        inc     rdi
        jmp     octet

                                ; flags here still come from "sub rcx,2"
ToEnd2: jnz     ToEnd3          ; NZ: one pixel was left, store high nibble only
        or      dl,ah           ; ZF: two pixels were left, merge the low nibble
ToEnd3: mov     [rdi],dl

ToEnd:  ret

Conv1_4 endp


;*************************************************************************************


;-----------------------------------------------------------------------
; void Conv1_8(BYTE *Dest, const BYTE *Src, unsigned Size1D)
; Expands 1-bit pixels (most significant bit first) into bytes:
; a set bit is written as 0FFh, a clear bit as 00h.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RSI is preserved by USES)
; Note:     LODSB is used without CLD, so DF = 0 is expected on entry.
;-----------------------------------------------------------------------
        public  Conv1_8
Conv1_8 proc \
        uses rsi

        mov     rsi,rdx         ; rsi = source pointer
        mov     rdx,rcx         ; rdx = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        test    rcx,rcx
        jz      toend           ; nothing to convert

octet:  lodsb                   ; next 8 source pixels

        cbw                     ; AH = 0FFh/00h from bit 7 (pixel 1)
        mov     [rdx],ah
        dec     rcx
        jz      toend

        rol     al,1            ; pixel 2
        cbw
        mov     [rdx+1],ah
        dec     rcx
        jz      toend

        rol     al,1            ; pixel 3
        cbw
        mov     [rdx+2],ah
        dec     rcx
        jz      toend

        rol     al,1            ; pixel 4
        cbw
        mov     [rdx+3],ah
        dec     rcx
        jz      toend

        rol     al,1            ; pixel 5
        cbw
        mov     [rdx+4],ah
        dec     rcx
        jz      toend

        rol     al,1            ; pixel 6
        cbw
        mov     [rdx+5],ah
        dec     rcx
        jz      toend

        rol     al,1            ; pixel 7
        cbw
        mov     [rdx+6],ah
        dec     rcx
        jz      toend

        rol     al,1            ; pixel 8
        cbw
        mov     [rdx+7],ah
        add     rdx,8           ; 8 output bytes written

        dec     rcx
        jne     octet

toend:
        ret

Conv1_8 endp


;*************************************************************************************

        public  Conv1_16
;-----------------------------------------------------------------------
; Conv1_16(Dest, Src, Size1D)
; Expands 1-bit pixels (most significant bit first) into 16-bit pixels:
; a set bit is written as 0FFFFh, a clear bit as 0000h.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
Conv1_16 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        mov     rsi,rdx         ; rsi = source pointer
        test    rcx,rcx
        jz      toend           ; array has zero size

Octet:  mov     ah,[rsi]        ; next 8 source pixels, pixel 1 in bit 15 of AX

        cwd                     ; DX = 0FFFFh/0 from bit 15 (pixel 1)
        mov     [rdi],dx
        dec     rcx
        jz      toend

        rol     ax,1            ; pixel 2
        cwd
        mov     [rdi+2],dx
        dec     rcx
        jz      toend

        rol     ax,1            ; pixel 3
        cwd
        mov     [rdi+4],dx
        dec     rcx
        jz      toend

        rol     ax,1            ; pixel 4
        cwd
        mov     [rdi+6],dx
        dec     rcx
        jz      toend

        rol     ax,1            ; pixel 5
        cwd
        mov     [rdi+8],dx
        dec     rcx
        jz      toend

        rol     ax,1            ; pixel 6
        cwd
        mov     [rdi+10],dx
        dec     rcx
        jz      toend

        rol     ax,1            ; pixel 7
        cwd
        mov     [rdi+12],dx
        dec     rcx
        jz      toend

        rol     ax,1            ; pixel 8
        cwd
        mov     [rdi+14],dx

        inc     rsi             ; source octet consumed
        add     rdi,16          ; 8 output words written

        dec     rcx
        jnz     Octet

toend:
        ret

Conv1_16 endp



;-----------------------------------------------------------------------
; void Conv1_24(BYTE *Dest, const BYTE *Src, unsigned Size1D)
; Expands 1-bit pixels (most significant bit first) into 3-byte pixels:
; a set bit is written as FF FF FF, a clear bit as 00 00 00.
; Each pixel is written as one word store plus one byte store.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
        public  Conv1_24
Conv1_24 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        mov     rsi,rdx         ; rsi = source pointer
        test    rcx,rcx
        jz      toend           ; array has zero size

octet:  mov     ah,[rsi]        ; next 8 source pixels, pixel 1 in bit 15 of AX

        cwd                     ; DX = 0FFFFh/0 from bit 15 (pixel 1)
        mov     [rdi],dx
        dec     rcx
        mov     [rdi+2],dl      ; MOV keeps the flags of DEC for the JZ below
        jz      toend

        rol     ax,1            ; pixel 2
        cwd
        mov     [rdi+3],dl
        dec     rcx
        mov     [rdi+4],dx
        jz      toend

        rol     ax,1            ; pixel 3
        cwd
        mov     [rdi+6],dx
        dec     rcx
        mov     [rdi+8],dl
        jz      toend

        rol     ax,1            ; pixel 4
        cwd
        mov     [rdi+9],dl
        dec     rcx
        mov     [rdi+10],dx
        jz      toend

        rol     ax,1            ; pixel 5
        cwd
        mov     [rdi+12],dx
        dec     rcx
        mov     [rdi+14],dl
        jz      toend

        rol     ax,1            ; pixel 6
        cwd
        mov     [rdi+15],dl
        dec     rcx
        mov     [rdi+16],dx
        jz      toend

        rol     ax,1            ; pixel 7
        cwd
        mov     [rdi+18],dx
        dec     rcx
        mov     [rdi+20],dl
        jz      toend

        rol     ax,1            ; pixel 8
        cwd
        mov     [rdi+21],dl
        mov     [rdi+22],dx
        add     rdi,24          ; 8 pixels * 3 bytes written

        inc     rsi             ; source octet consumed

        dec     rcx
        jne     octet

toend:
        ret

Conv1_24 endp


;*************************************************************************************

        public  Conv1_32
;-----------------------------------------------------------------------
; Conv1_32(Dest, Src, Size1D)
; Expands 1-bit pixels (most significant bit first) into 32-bit pixels:
; a set bit is written as 0FFFFFFFFh, a clear bit as 0.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
Conv1_32 proc \
        uses rdi rsi

        mov     rsi,rdx         ; rsi = source pointer
        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        test    rcx,rcx
        jz      toend           ; array has zero size

        cld
Octet:  lodsb                   ; next 8 source pixels
        shl     eax,24          ; move them to the top of EAX, pixel 1 in bit 31

        cdq                     ; EDX = 0FFFFFFFFh/0 from bit 31 (pixel 1)
        mov     [rdi],edx
        dec     rcx
        jz      toend

        rol     eax,1           ; pixel 2
        cdq
        mov     [rdi+4],edx
        dec     rcx
        jz      toend

        rol     eax,1           ; pixel 3
        cdq
        mov     [rdi+8],edx
        dec     rcx
        jz      toend

        rol     eax,1           ; pixel 4
        cdq
        mov     [rdi+12],edx
        dec     rcx
        jz      toend

        rol     eax,1           ; pixel 5
        cdq
        mov     [rdi+16],edx
        dec     rcx
        jz      toend

        rol     eax,1           ; pixel 6
        cdq
        mov     [rdi+20],edx
        dec     rcx
        jz      toend

        rol     eax,1           ; pixel 7
        cdq
        mov     [rdi+24],edx
        dec     rcx
        jz      toend

        rol     eax,1           ; pixel 8
        cdq
        mov     [rdi+28],edx

        add     rdi,32          ; 8 output dwords written

        dec     rcx
        jnz     Octet

toend:
        ret

Conv1_32 endp


;*************************************************************************************


        public  Conv1_64
;-----------------------------------------------------------------------
; Conv1_64(Dest, Src, Size1D)
; Expands 1-bit pixels (most significant bit first) into 64-bit pixels:
; a set bit is written as all ones, a clear bit as 0.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
Conv1_64 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        test    rcx,rcx
        jz      toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

Octet:  mov     ah,[rsi]        ; next 8 source pixels into bits 15..8

        shl     rax,48          ; move them to bits 63..56, pixel 1 in bit 63
        cqo                     ; RDX = all ones/0 from bit 63 (pixel 1)
        mov     [rdi],rdx
        dec     rcx
        jz      toend

        rol     rax,1           ; pixel 2
        cqo
        mov     [rdi+8],rdx
        dec     rcx
        jz      toend

        rol     rax,1           ; pixel 3
        cqo
        mov     [rdi+16],rdx
        dec     rcx
        jz      toend

        rol     rax,1           ; pixel 4
        cqo
        mov     [rdi+24],rdx
        dec     rcx
        jz      toend

        rol     rax,1           ; pixel 5
        cqo
        mov     [rdi+32],rdx
        dec     rcx
        jz      toend

        rol     rax,1           ; pixel 6
        cqo
        mov     [rdi+40],rdx
        dec     rcx
        jz      toend

        rol     rax,1           ; pixel 7
        cqo
        mov     [rdi+48],rdx
        dec     rcx
        jz      toend

        rol     rax,1           ; pixel 8
        cqo
        mov     [rdi+56],rdx

        inc     rsi             ; source octet consumed
        add     rdi,64          ; 8 output qwords written

        dec     rcx
        jnz     Octet

toend:
        ret

Conv1_64 endp


;*************************************************************************************
;*************************************************************************************


;-----------------------------------------------------------------------
; void Conv4_1(BYTE *Dest, const BYTE *Src, unsigned Size1D)
; Reduces 4-bit pixels to 1-bit pixels by keeping the top bit of every
; nibble (high nibble first).  Output bits are packed MSB first; a final
; incomplete byte is left-aligned and padded with zero bits.
; Returns immediately when Src or Dest is NULL.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RSI is preserved by USES)
;-----------------------------------------------------------------------
        public  Conv4_1
Conv4_1 proc \
        uses rsi

        mov     rsi,rdx         ; rsi = source pointer
        or      rsi,rsi
        jz      ToEnd           ; NULL source
        mov     rdx,rcx         ; rdx = destination pointer
        or      rdx,rdx
        jz      ToEnd           ; NULL destination

        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined

; ---- full output bytes: 8 pixels = 4 source bytes at a time ----
NextOct:sub     rcx,8
        jb      LastOct         ; fewer than 8 pixels left

        mov     al,[rsi]
        shl     ax,1            ; top bit of the high nibble -> AH
        shl     al,3            ; bring the top bit of the low nibble to bit 7
        shl     ax,1            ; ... and shift it into AH too
        mov     al,[rsi+1]
        shl     ax,1
        shl     al,3
        shl     ax,1
        mov     al,[rsi+2]
        shl     ax,1
        shl     al,3
        shl     ax,1
        mov     al,[rsi+3]
        shl     ax,1
        shl     al,3
        shl     ax,1

        mov     [rdx],ah        ; AH now holds 8 collected bits
        inc     rdx
        add     rsi,4
        jmp     NextOct

; ---- tail: 1..7 remaining pixels ----
LastOct:add     rcx,8           ; undo the last subtraction
        jz      ToEnd           ; nothing left

        cld
        mov     ah,1            ; marker bit: tells when AH is completely filled
PIXEL:  lodsb
        rol     al,1            ; top bit of the high nibble -> CF
        rcl     ah,1            ; CF -> AH

        dec     rcx
        jz      First1          ; odd count: the low nibble is not a pixel
        rol     al,4            ; top bit of the low nibble -> CF
        rcl     ah,1            ; CF -> AH, marker may drop out into CF

        jnc     NoOctet
        mov     [rdx],ah        ; marker left AH: 8 bits are complete
        inc     rdx
        mov     ah,1            ; restart with a fresh marker
        loop    PIXEL
        jmp     ToEnd           ; all done here

NoOctet:loop    PIXEL

First1: sal     ah,1            ; left-align: shift until the marker drops out
        jnc     First1
        mov     [rdx],ah        ; store the last incomplete byte

ToEnd:
        ret

Conv4_1 endp



;-----------------------------------------------------------------------
; void Conv4_8(BYTE *Dest, const BYTE *Src, unsigned Size1D)
; Expands 4-bit pixels to 8-bit pixels by duplicating each nibble
; (nibble N becomes byte NNh).  The high nibble of a source byte is the
; first pixel.  Returns immediately when Dest is NULL or Size1D is 0.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
        public  Conv4_8
Conv4_8 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        jrcxz   toend           ; NULL destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer
        cld

PIXEL:  lodsb                   ; AL = HL (two pixels)
        mov     ah,al           ; AX = HLHL
        mov     dx,ax
        rol     ax,4            ; AX = LHLH
        and     dx,00FF0h       ; DX = 0LH0
        and     ax,0F00Fh       ; AX = L00H
        or      ax,dx           ; AX = LLHH -> AL = HH, AH = LL
        sub     rcx,2
        jb      ToEndStor1      ; only one pixel was left
        stosw                   ; STOSW keeps the flags of SUB
        jnz     PIXEL

toend:
        ret

ToEndStor1:                     ; one remaining pixel: store HH only
        stosb
        ret

Conv4_8 endp



;-----------------------------------------------------------------------
; void Conv4_16(WORD *Dest, const BYTE *Src, unsigned Size1D)
; Expands 4-bit pixels to 16-bit pixels by repeating each nibble four
; times (nibble N becomes NNNNh).  The high nibble of a source byte is
; the first pixel.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
        public  Conv4_16
Conv4_16 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer

        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

        cld

PIXEL:  lodsb                   ; AL = HL (two pixels)
        mov     ah,al
        mov     dx,ax
        sal     eax,16
        mov     ax,dx
        mov     edx,eax         ; EDX = EAX = HLHLHLHL

        rol     eax,4           ; EAX = LHLHLHLH
        and     edx,00F0FF0F0h  ; EDX = 0L0LH0H0
        and     eax,0F0F00F0Fh  ; EAX = L0L00H0H
        or      eax,edx         ; EAX = LLLLHHHH
        sub     rcx,2
        jb      ToEndStor1      ; only one pixel was left
        stosd                   ; STOSD keeps the flags of SUB
        jnz     PIXEL

toend:
        ret

ToEndStor1:                     ; one remaining pixel: store HHHH only
        stosw
        ret

Conv4_16 endp


;*************************************************************************************


;-----------------------------------------------------------------------
; void Conv4_24(DWORD *Dest, const BYTE *Src, unsigned Size1D)
; Expands 4-bit pixels to 3-byte pixels by repeating each nibble six
; times (nibble N becomes bytes NN NN NN).  The high nibble of a source
; byte is the first pixel.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
        public  Conv4_24
Conv4_24 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer

        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

        cld

PIXEL:  lodsb                   ; AL = HL (two pixels)
        mov     ah,al
        mov     dx,ax
        sal     eax,16
        mov     ax,dx
        mov     edx,eax         ; EDX = EAX = HLHLHLHL

        rol     eax,4           ; EAX = LHLHLHLH
        and     edx,00F0FF0F0h  ; EDX = 0L0LH0H0
        and     eax,0F0F00F0Fh  ; EAX = L0L00H0H
        or      eax,edx         ; EAX = LL LL HH HH
        stosw                   ; first pixel, bytes 1 and 2 (HH HH)
        ror     eax,8           ; EAX = HH LL LL HH
        sub     rcx,2
        jb      ToEndStor1      ; only one pixel was left: one byte to go
        stosw                   ; first pixel byte 3 (HH), second pixel byte 1 (LL)
        ror     eax,8           ; EAX = HH HH LL LL; ROR leaves ZF untouched
        stosw                   ; second pixel, bytes 2 and 3 (LL LL)
        jnz     PIXEL           ; ZF still comes from "sub rcx,2"

toend:
        ret

ToEndStor1:
        stosb                   ; third byte of the last pixel
        ret

Conv4_24 endp


;*************************************************************************************


;-----------------------------------------------------------------------
; void Conv4_32(DWORD *Dest, const BYTE *Src, unsigned Size1D)
; Expands 4-bit pixels to 32-bit pixels by repeating each nibble eight
; times (nibble N becomes NNNNNNNNh, computed as N * 11111111h).
; The high nibble of a source byte is the first pixel.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, R8, R9, flags (RDI and RSI preserved by USES)
;-----------------------------------------------------------------------
        public  Conv4_32
Conv4_32 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer

        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

        mov     R8d,11111111h   ; multiplier that replicates a nibble
PIXEL:  mov     R9b,byte ptr [rsi]      ; keep both pixels in R9B
        movzx   eax,R9b
        inc     rsi
        shr     al,4            ; first pixel = high nibble
        mul     R8d             ; EDX is destroyed
        mov     dword ptr [rdi],eax
        dec     rcx
        jz      toend           ; odd count: the low nibble is not a pixel
        movzx   eax,R9b
        and     al,0Fh          ; second pixel = low nibble
        mul     R8d             ; EDX is destroyed
        mov     dword ptr [rdi+4],eax
        add     rdi,8
        loop    PIXEL

toend:
        ret

Conv4_32 endp


;*************************************************************************************


;-----------------------------------------------------------------------
; void Conv4_64(DWORD *Dest, const BYTE *Src, unsigned Size1D)
; Expands 4-bit pixels to 64-bit pixels by repeating each nibble sixteen
; times (computed as N * 1111111111111111h).  The high nibble of a source
; byte is the first pixel.  Every pixel is written with one qword store.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, R8, R9, flags (RDI and RSI preserved by USES)
;-----------------------------------------------------------------------
        public  Conv4_64
Conv4_64 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer

        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

        cld

        mov     R9, 1111111111111111h   ; multiplier that replicates a nibble
PIXEL:  movzx   rax,byte ptr [rsi]
        mov     R8,rax          ; keep both pixels in R8
        inc     rsi
        shr     rax,4           ; first pixel = high nibble
        mul     R9              ; RDX receives the (zero) high half
        stosq
        dec     rcx
        jz      toend           ; odd count: the low nibble is not a pixel
        mov     rax,R8
        and     al,0Fh          ; second pixel = low nibble
        mul     R9
        stosq
        loop    PIXEL

toend:
        ret

Conv4_64 endp



;*************************************************************************************
;*************************************************************************************

        public  Conv8_1
;-----------------------------------------------------------------------
; Conv8_1(Dest, Src, Size1D)
; Reduces 8-bit pixels to 1-bit pixels by keeping bit 7 of every byte.
; Output bits are packed MSB first; a final incomplete byte is
; left-aligned and padded with zero bits.
; Returns immediately when Src or Dest is NULL.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RSI is preserved by USES)
;-----------------------------------------------------------------------
Conv8_1 proc \
        uses rsi

        mov     rsi,rdx         ; rsi = source pointer
        or      rsi,rsi
        jz      ToEnd           ; NULL source
        mov     rdx,rcx         ; rdx = destination pointer
        or      rdx,rdx
        jz      ToEnd           ; NULL destination
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined

; ---- full output bytes: 8 pixels at a time ----
NextOct:sub     rcx,8
        jb      LastOct         ; fewer than 8 pixels left

        mov     al,[rsi]
        shl     ax,1            ; bit 7 of AL moves into AH
        mov     al,[rsi+1]
        shl     ax,1
        mov     al,[rsi+2]
        shl     ax,1
        mov     al,[rsi+3]
        shl     ax,1
        mov     al,[rsi+4]
        shl     ax,1
        mov     al,[rsi+5]
        shl     ax,1
        mov     al,[rsi+6]
        shl     ax,1
        mov     al,[rsi+7]
        shl     ax,1

        mov     [rdx],ah        ; AH now holds 8 collected bits
        inc     rdx
        add     rsi,8
        jmp     NextOct

; ---- tail: 1..7 remaining pixels ----
LastOct:add     rcx,8           ; zero count or all full octets exhausted
        jz      ToEnd

        cld
PIXEL1: mov     ah,1            ; marker bit: tells when AH is completely filled
PIXEL:  lodsb
        rcl     ax,1            ; bit 7 of AL -> AH; the incoming CF lands in
                                ; AL bit 0, which the next LODSB overwrites
        jc      Octet           ; marker dropped out: 8 bits are complete
        loop    PIXEL

First1: sal     ah,1            ; left-align: shift until the marker drops out
        jnc     First1
        mov     [rdx],ah        ; store the last incomplete byte

ToEnd:  ret

Octet:  mov     [rdx],ah
        inc     rdx
        loop    PIXEL1
        jmp     ToEnd           ; all done here

Conv8_1 endp


;*************************************************************************************


        public  Conv8_4
;-----------------------------------------------------------------------
; Conv8_4(Dest, Src, Size1D)
; Reduces 8-bit pixels to 4-bit pixels by keeping the high nibble of
; every byte.  Two pixels are packed per output byte, first pixel in the
; high nibble; for an odd count the last low nibble is zero.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
Conv8_4 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

        cld
PIXEL:  lodsb                   ; first pixel of the pair
        and     al,0F0h         ; keep its high nibble in place

        dec     rcx
        jnz     NIBBLE2
        stosb                   ; odd count: store the incomplete byte
        jmp     toend

NIBBLE2:mov     ah,al
        lodsb                   ; second pixel of the pair
        and     al,0F0h
        ror     al,4            ; move its high nibble down
        or      al,ah
        stosb                   ; store both nibbles
        loop    PIXEL

toend:
        ret

Conv8_4 endp



;*************************************************************************************


        public  Conv8_16
;-----------------------------------------------------------------------
; Conv8_16(Dest, Src, Size1D)
; Expands 8-bit pixels to 16-bit pixels by duplicating each byte
; (byte P becomes word PPh:PPh).  Groups of 8 pixels are read with one
; qword load and written with two qword stores; the rest goes one by one.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
Conv8_16 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined

        mov     rsi,rdx         ; rsi = source pointer

        cld
        sub     rcx,8
        jb      PIXEL1          ; fewer than 8 pixels (unsigned count)

PIXEL8: lodsq                   ; RAX = pixels 8 7 6 5 4 3 2 1 (1 = lowest byte)
        mov     rdx,rax

        mov     al,dh           ; AH already holds pixel 2 -> AX = 2 2
        sal     eax,16          ; EAX = 2 2 - -
        mov     ah,dl
        mov     al,dl           ; EAX = 2 2 1 1
        sal     rax,16          ; RAX = - - 2 2 1 1 - -

        shr     rdx,16          ; RDX = - - 8 7 6 5 4 3
        mov     al,dh
        mov     ah,dh           ; RAX = - - 2 2 1 1 4 4
        rol     rax,16          ; RAX = 2 2 1 1 4 4 - -
        mov     ah,dl
        mov     al,dl           ; RAX = 2 2 1 1 4 4 3 3
        ror     rax,32          ; RAX = 4 4 3 3 2 2 1 1
        stosq                   ; store converted pixels 1..4

        ror     rdx,32          ; RDX = 6 5 4 3 - - 8 7
        mov     al,dh
        mov     ah,dh           ; AX = 8 8
        sal     eax,16
        mov     ah,dl
        mov     al,dl           ; EAX = 8 8 7 7
        sal     rax,16          ; RAX = - - 8 8 7 7 - -

        rol     rdx,16          ; RDX = 4 3 - - 8 7 6 5
        mov     al,dh
        mov     ah,dh           ; RAX = - - 8 8 7 7 6 6
        sal     rax,16          ; RAX = 8 8 7 7 6 6 - -
        mov     ah,dl
        mov     al,dl           ; RAX = 8 8 7 7 6 6 5 5
        stosq                   ; store converted pixels 5..8

        sub     rcx,8
        jae     PIXEL8

PIXEL1: add     rcx,8           ; undo the last subtraction
        jz      ToEnd           ; nothing left
PIXEL:  lodsb
        mov     ah,al           ; duplicate the byte
        stosw
        loop    PIXEL

ToEnd:  ret

Conv8_16 endp


;*************************************************************************************

        public  Conv8_24
;-----------------------------------------------------------------------
; Conv8_24(Dest, Src, Size1D)
; Expands 8-bit pixels to 3-byte pixels by writing each source byte
; three times.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
Conv8_24 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

        cld
PIXEL:  lodsb
        stosb                   ; same byte three times
        stosb
        stosb
        loop    PIXEL

toend:
        ret

Conv8_24 endp


;*************************************************************************************

        public  Conv8_32
;-----------------------------------------------------------------------
; Conv8_32(Dest, Src, Size1D)
; Expands 8-bit pixels to 32-bit pixels by repeating each byte four
; times (byte P becomes PPPPPPPPh).  Groups of 8 pixels are read with one
; qword load and replicated with P * 01010101h; the rest goes one by one.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, R8, R9, flags (RDI and RSI preserved by USES)
;-----------------------------------------------------------------------
Conv8_32 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined

        mov     rsi,rdx         ; rsi = source pointer

        sub     rcx,8
        jb      PIXEL1          ; fewer than 8 pixels (unsigned count)

        mov     R8,01010101h    ; multiplier that replicates a byte
PIXEL8: mov     R9,qword ptr [rsi]      ; 8 source pixels, pixel 1 in the low byte
        add     rsi,8

        movzx   eax,R9b
        mul     R8d             ; EDX is destroyed
        shr     R9,8            ; next pixel into R9B
        mov     dword ptr [rdi],eax

        movzx   eax,R9b
        mul     R8d
        shr     R9,8
        mov     dword ptr [rdi+4],eax

        movzx   eax,R9b
        mul     R8d
        shr     R9,8
        mov     dword ptr [rdi+8],eax

        movzx   eax,R9b
        mul     R8d
        shr     R9,8
        mov     dword ptr [rdi+12],eax

        movzx   eax,R9b
        mul     R8d
        shr     R9,8
        mov     dword ptr [rdi+16],eax

        movzx   eax,R9b
        mul     R8d
        shr     R9,8
        mov     dword ptr [rdi+20],eax

        movzx   eax,R9b
        mul     R8d
        shr     R9,8
        mov     dword ptr [rdi+24],eax

        movzx   eax,R9b
        mul     R8d
        mov     dword ptr [rdi+28],eax          ; store pixel 8

        add     rdi,32

        sub     rcx,8
        jae     PIXEL8

PIXEL1: add     rcx,8           ; undo the last subtraction
        jz      toend           ; nothing left
        cld
PIXEL:  lodsb
        mov     ah,al           ; AX = P P
        mov     dx,ax
        rol     eax,16          ; upper word = P P
        mov     ax,dx           ; EAX = P P P P
        stosd
        loop    PIXEL

toend:
        ret

Conv8_32 endp


;*************************************************************************************

        public  Conv8_64
;-----------------------------------------------------------------------
; Conv8_64(Dest, Src, Size1D)
; Expands 8-bit pixels to 64-bit pixels by repeating each byte eight
; times (computed as P * 0101010101010101h).
; Returns immediately when Dest or Src is NULL or Size1D is 0.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, R8, R9, flags (RDI is preserved by USES)
;-----------------------------------------------------------------------
Conv8_64 proc \
        uses rdi

        mov     rdi,rcx         ; rdi = destination pointer
        jrcxz   toend           ; NULL destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     R9,rdx          ; R9 = source pointer
        or      rdx,rdx
        jz      toend           ; NULL source pointer

        cld
        mov     R8, 101010101010101h    ; multiplier that replicates a byte
PIXEL:  movzx   rax,byte ptr [R9]
        inc     R9
        mul     R8              ; RDX receives the (zero) high half
        stosq
        loop    PIXEL

toend:
        ret

Conv8_64 endp


;*************************************************************************************
;*************************************************************************************

        public  Conv16_1
;-----------------------------------------------------------------------
; Conv16_1(Dest, Src, Size1D)
; Reduces 16-bit pixels to 1-bit pixels by keeping bit 7 of the byte at
; offset 1 of every pixel (the top bit of a little-endian word).
; Output bits are packed MSB first; a final incomplete byte is
; left-aligned and padded with zero bits.
; Returns immediately when Src or Dest is NULL.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RSI is preserved by USES)
;-----------------------------------------------------------------------
Conv16_1 proc \
        uses rsi

        mov     rsi,rdx         ; rsi = source pointer
        or      rdx,rdx
        jz      toend           ; NULL source
        inc     rsi             ; address the upper byte of the first pixel

        mov     rdx,rcx         ; rdx = destination pointer
        jrcxz   toend           ; NULL destination
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined

; ---- full output bytes: 8 pixels at a time ----
; RCL shifts bit 7 of AL into AH; the incoming CF only reaches AL bit 0,
; which the following "mov al,..." overwrites.
NextOct:sub     rcx,8
        jb      LastOct         ; fewer than 8 pixels left
        mov     al,[rsi]
        rcl     ax,1
        mov     al,[rsi+2]
        rcl     ax,1
        mov     al,[rsi+4]
        rcl     ax,1
        mov     al,[rsi+6]
        rcl     ax,1
        mov     al,[rsi+8]
        rcl     ax,1
        mov     al,[rsi+10]
        rcl     ax,1
        mov     al,[rsi+12]
        rcl     ax,1
        mov     al,[rsi+14]
        rcl     ax,1
        mov     [rdx],ah        ; AH now holds 8 collected bits
        inc     rdx
        add     rsi,16
        jmp     NextOct

; ---- tail: 1..7 remaining pixels ----
LastOct:add     rcx,8           ; undo the last subtraction
        jz      toend           ; nothing left

PIXEL1: mov     ah,1            ; marker bit: tells when AH is completely filled
PIXEL:  mov     al,[rsi]
        add     rsi,2
        rcl     ax,1            ; bit 7 of AL -> AH, marker may drop out into CF
        jnc     NoOctet
        mov     [rdx],ah        ; marker dropped out: 8 bits are complete
        inc     rdx
        loop    PIXEL1
        jmp     toend           ; all done here

NoOctet:loop    PIXEL

First1: sal     ah,1            ; left-align: shift until the marker drops out
        jnc     First1
        mov     [rdx],ah        ; store the last incomplete byte

toend:
        ret

Conv16_1 endp



;*************************************************************************************


        public  Conv16_4
;-----------------------------------------------------------------------
; Conv16_4(Dest, Src, Size1D)
; Reduces 16-bit pixels to 4-bit pixels by keeping the high nibble of
; the byte at offset 1 of every pixel.  Two pixels are packed per output
; byte, first pixel in the high nibble; for an odd count the last low
; nibble is zero.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
Conv16_4 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

        cld
PIXEL:  inc     rsi             ; skip the lower byte
        lodsb                   ; upper byte of the first pixel of the pair
        and     al,0F0h         ; keep its high nibble in place

        dec     rcx
        jnz     NIBBLE2
        stosb                   ; odd count: store the incomplete byte
        jmp     toend

NIBBLE2:mov     ah,al
        inc     rsi             ; skip the lower byte
        lodsb                   ; upper byte of the second pixel of the pair
        and     al,0F0h
        ror     al,4            ; move its high nibble down
        or      al,ah
        stosb                   ; store both nibbles
        loop    PIXEL

toend:
        ret

Conv16_4 endp



;*************************************************************************************


        public  Conv16_8
;-----------------------------------------------------------------------
; Conv16_8(Dest, Src, Size1D)
; Reduces 16-bit pixels to 8-bit pixels by keeping the byte at offset 1
; of every pixel (the upper byte of a little-endian word).  Groups of 4
; pixels are handled with two dword loads and one dword store.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
Conv16_8 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined

        mov     rsi,rdx         ; rsi = source pointer

        cld

        sub     rcx,4
        jb      PIXEL1          ; fewer than 4 pixels (unsigned count)

                                ; digits below = upper byte of pixel 1..4
PIXEL4: mov     eax,[rsi]       ; EAX = [2-1-]
        mov     edx,[rsi+4]     ; EDX = [4-3-]
        add     rsi,8
        mov     al,dh           ; EAX = [2-13]
        mov     dl,ah           ; EDX = [4-31]

        rol     edx,8           ; EDX = [-314]
        mov     ah,dl           ; EAX = [2-43]

        ror     eax,16          ; EAX = [432-]
        mov     al,dh           ; EAX = [4321]
        stosd

        sub     rcx,4
        jae     PIXEL4

PIXEL1: add     rcx,4           ; undo the last subtraction
        jz      ToEnd           ; nothing left
PIXEL:  lodsw
        mov     al,ah           ; keep the upper byte
        stosb
        loop    PIXEL

ToEnd:
        ret

Conv16_8 endp


;*************************************************************************************


        public  Conv16_24
;-----------------------------------------------------------------------
; Conv16_24(Dest, Src, Size1D)
; Expands 16-bit pixels to 3-byte pixels.  For a source word with upper
; byte H and lower byte L the output bytes are H, L, H in memory order.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
Conv16_24 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

        cld
PIXEL:  lodsw                   ; AL = L, AH = H
        mov     [rdi],ah        ; output byte 0 = H
        inc     rdi
        stosb                   ; output byte 1 = L
        mov     [rdi],ah        ; output byte 2 = H
        inc     rdi
        loop    PIXEL

toend:
        ret

Conv16_24 endp


;*************************************************************************************


        public  Conv16_32
;-----------------------------------------------------------------------
; Conv16_32(Dest, Src, Size1D)
; Expands 16-bit pixels to 32-bit pixels by duplicating each word
; (word W becomes dword W:W).
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
Conv16_32 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

        cld
PIXEL:  lodsw
        mov     dx,ax
        rol     eax,16          ; upper word = W
        mov     ax,dx           ; EAX = W:W
        stosd
        loop    PIXEL

toend:
        ret

Conv16_32 endp


;*************************************************************************************

        public  Conv16_64
;-----------------------------------------------------------------------
; Conv16_64(Dest, Src, Size1D)
; Expands 16-bit pixels to 64-bit pixels by repeating each word four
; times (computed as W * 0001000100010001h).
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, R8, flags (RDI and RSI preserved by USES)
;-----------------------------------------------------------------------
Conv16_64 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

        cld
        mov     R8, 001000100010001h    ; multiplier that replicates a word
PIXEL:  xor     rax,rax         ; LODSW only writes AX: clear the rest first
        lodsw
        mul     R8              ; RDX receives the (zero) high half
        stosq
        loop    PIXEL

toend:
        ret

Conv16_64 endp


;*************************************************************************************
;*************************************************************************************



        public  Conv24_1
;-----------------------------------------------------------------------
; Conv24_1(Dest, Src, Size1D)
; Reduces 3-byte pixels to 1-bit pixels by keeping bit 7 of the byte at
; offset 2 of every pixel.  Output bits are packed MSB first; a final
; incomplete byte is left-aligned and padded with zero bits.
; Returns immediately when Src or Dest is NULL.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RSI is preserved by USES)
;-----------------------------------------------------------------------
Conv24_1 proc \
        uses rsi

        mov     rsi,rdx         ; rsi = source pointer
        or      rsi,rsi
        jz      ToEnd           ; NULL source
        mov     rdx,rcx         ; rdx = destination pointer
        or      rdx,rdx
        jz      ToEnd           ; NULL destination
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        inc     rsi
        inc     rsi             ; address byte 2 of the first pixel

; ---- full output bytes: 8 pixels at a time ----
; RCL shifts bit 7 of AL into AH; the incoming CF only reaches AL bit 0,
; which the following "mov al,..." overwrites.
NextOct:sub     rcx,8
        jb      LastOct         ; fewer than 8 pixels left
        mov     al,[rsi]
        rcl     ax,1
        mov     al,[rsi+3]
        rcl     ax,1
        mov     al,[rsi+6]
        rcl     ax,1
        mov     al,[rsi+9]
        rcl     ax,1
        mov     al,[rsi+12]
        rcl     ax,1
        mov     al,[rsi+15]
        rcl     ax,1
        mov     al,[rsi+18]
        rcl     ax,1
        mov     al,[rsi+21]
        rcl     ax,1
        mov     [rdx],ah        ; AH now holds 8 collected bits
        inc     rdx
        add     rsi,24
        jmp     NextOct

; ---- tail: 1..7 remaining pixels ----
LastOct:add     rcx,8           ; undo the last subtraction
        jz      ToEnd           ; nothing left

PIXEL1: mov     ah,1            ; marker bit: tells when AH is completely filled
PIXEL:  mov     al,[rsi]
        add     rsi,3

        rol     al,1            ; bit 7 -> CF
        rcl     ah,1            ; CF -> AH, marker may drop out into CF
        jnc     NoOctet
        mov     [rdx],ah        ; marker dropped out: 8 bits are complete
        inc     rdx
        loop    PIXEL1
        jmp     ToEnd           ; all done here

NoOctet:loop    PIXEL

First1: sal     ah,1            ; left-align: shift until the marker drops out
        jnc     First1
        mov     [rdx],ah        ; store the last incomplete byte

ToEnd:
        ret

Conv24_1 endp



;*************************************************************************************


        public  Conv24_4
;-----------------------------------------------------------------------
; Conv24_4(Dest, Src, Size1D)
; Reduces 3-byte pixels to 4-bit pixels by keeping the high nibble of
; the byte at offset 2 of every pixel.  Two pixels are packed per output
; byte, first pixel in the high nibble; for an odd count the last low
; nibble is zero.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
Conv24_4 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

        add     rsi,2           ; address byte 2 of the first pixel
        cld
PIXEL:  mov     al,[rsi]        ; first pixel of the pair
        and     al,0F0h         ; keep its high nibble in place

        dec     rcx
        jnz     NIBBLE2
        stosb                   ; odd count: store the incomplete byte
        jmp     toend

NIBBLE2:mov     ah,al
        mov     al,[rsi+3]      ; second pixel of the pair
        and     al,0F0h
        ror     al,4            ; move its high nibble down
        or      al,ah
        stosb                   ; store both nibbles
        add     rsi,6           ; two source pixels consumed
        loop    PIXEL

toend:
        ret

Conv24_4 endp


;*************************************************************************************


        public  Conv24_8
;-----------------------------------------------------------------------
; Conv24_8(Dest, Src, Size1D)
; Reduces 3-byte pixels to 8-bit pixels by keeping the byte at offset 2
; of every pixel.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
Conv24_8 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

        add     rsi,2           ; address byte 2 of the first pixel
        cld
PIXEL:  mov     al,[rsi]
        add     rsi,3           ; next source pixel
        stosb
        loop    PIXEL

toend:
        ret

Conv24_8 endp


;*************************************************************************************

        public  Conv24_16
;-----------------------------------------------------------------------
; Conv24_16(Dest, Src, Size1D)
; Reduces 3-byte pixels to 16-bit pixels by keeping the bytes at offsets
; 1 and 2 of every pixel (offset 1 becomes the low byte of the word).
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
Conv24_16 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

        inc     rsi             ; address byte 1 of the first pixel
        cld
PIXEL:  mov     al,[rsi]        ; byte 1 -> low byte
        inc     rsi
        mov     ah,[rsi]        ; byte 2 -> high byte
        add     rsi,2           ; byte 1 of the next pixel
        stosw
        loop    PIXEL

toend:
        ret

Conv24_16 endp


;*************************************************************************************

        public  Conv24_32
;-----------------------------------------------------------------------
; Conv24_32(Dest, Src, Size1D)
; Expands 3-byte pixels to 32-bit pixels.  With source bytes B1, B2, B3
; (memory order) the stored dword is B3 B2 B1 B3 (most significant byte
; first), i.e. the pixel followed by a copy of its byte B3.
; Pixels are processed in pairs (LODSW+LODSB, then LODSB+LODSW).
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
Conv24_32 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

        cld
PIXEL:  lodsw                   ; EAX = x  x  B2 B1
        shl     eax,16          ; EAX = B2 B1 x  x
        lodsb                   ; EAX = B2 B1 x  B3
        mov     ah,al           ; EAX = B2 B1 B3 B3
        ror     eax,8           ; EAX = B3 B2 B1 B3

        stosd

        dec     rcx
        jz      toend           ; odd count: no second pixel

        lodsb                   ; EAX = x  x  x  B1
        ror     eax,8           ; EAX = B1 x  x  x
        lodsw                   ; EAX = B1 x  B3 B2
        ror     eax,8           ; EAX = B2 B1 x  B3
        mov     ah,al           ; EAX = B2 B1 B3 B3
        ror     eax,8           ; EAX = B3 B2 B1 B3
        stosd

        loop    PIXEL

toend:
        ret

Conv24_32 endp



;*************************************************************************************

        public  Conv24_64
;-----------------------------------------------------------------------
; Conv24_64(Dest, Src, Size1D)
; Expands 3-byte pixels to 64-bit pixels.  With source bytes B1, B2, B3
; (memory order) the stored qword is B3 B2 B1 B3 B2 B1 B3 B2 (most
; significant byte first).  Each pixel is written as two dword stores,
; low dword first.  Pixels are processed in pairs.
; In:       RCX = Dest, RDX = Src, R8D = Size1D (number of pixels)
; Modifies: RAX, RCX, RDX, flags (RDI and RSI are preserved by USES)
;-----------------------------------------------------------------------
Conv24_64 proc \
        uses rdi rsi

        mov     rdi,rcx         ; rdi = destination pointer
        mov     ecx,R8d         ; Size1D is 32-bit: use R8D only (zero-extends
                                ; into RCX); upper half of R8 is undefined
        jrcxz   toend           ; array has zero size

        mov     rsi,rdx         ; rsi = source pointer

        cld
PIXEL:  lodsw                   ; EAX = x  x  B2 B1
        mov     dh,ah           ; DH = B2
        shl     eax,16          ; EAX = B2 B1 x  x
        mov     ah,[rsi]        ; EAX = B2 B1 B3 x
        mov     al,dh           ; EAX = B2 B1 B3 B2   low dword
        mov     dl,ah           ; DL = B3
        inc     rsi

        stosd
        rol     eax,16          ; EAX = B3 B2 B2 B1
        mov     ah,al           ; EAX = B3 B2 B1 B1
        mov     al,dl           ; EAX = B3 B2 B1 B3   high dword
        stosd

        dec     rcx
        jz      toend           ; odd count: no second pixel

        lodsb                   ; EAX = x  x  x  B1
        ror     eax,8           ; EAX = B1 x  x  x
        lodsw                   ; EAX = B1 x  B3 B2
        mov     dl,al           ; DL = B2
        ror     eax,8           ; EAX = B2 B1 x  B3
        mov     ah,al           ; EAX = B2 B1 B3 B3
        mov     al,dl           ; EAX = B2 B1 B3 B2   low dword
        mov     dh,ah           ; DH = B3
        stosd

        rol     eax,16          ; EAX = B3 B2 B2 B1
        mov     ah,al           ; EAX = B3 B2 B1 B1
        mov     al,dh           ; EAX = B3 B2 B1 B3   high dword
        stosd

        loop    PIXEL

toend:
        ret

Conv24_64 endp


;*************************************************************************************
;*************************************************************************************


        public  Conv32_1
;-----------------------------------------------------------------------
; Conv32_1 - pack the top bit of 4-byte elements into a bit stream.
; Bit 7 of the byte at offset 3 of every source element becomes one
; output bit; the first element lands in bit 7 of the first output byte.
; A trailing group of fewer than 8 elements is left-aligned in the last
; output byte and padded with zero bits.
; ABI:   Microsoft x64
; In:    RCX = destination (returns at once when NULL)
;        RDX = source      (returns at once when NULL)
;        R8D = number of elements
; Clobb: RAX, RCX, RDX, flags (RSI is saved by the USES clause)
;-----------------------------------------------------------------------
Conv32_1 proc \
        uses rsi
;       Dest:ptr byte,	RCX
;       Src:ptr dword,	RDX
;       count:DWORD	R8D

	mov	rsi,rdx		; rsi = source pointer
	test	rsi,rsi
	jz	toend		; source pointer is NULL
	add	rsi,3		; address the highest byte of the first element
	mov	rdx,rcx		; rdx = destination pointer
	jrcxz	toend		; destination pointer is NULL
	mov	ecx,r8d		; count is 32 bits wide; bits 63..32 of R8 are not defined by the ABI

				; Full groups of 8 elements: each ROL moves bit 7 of AL into bit 0 of AH.
NextOct:sub	rcx,8
	jb	LastOct
	mov	al,[rsi]
	rol	ax,1
	mov	al,[rsi+4]
	rol	ax,1
	mov	al,[rsi+8]
	rol	ax,1
	mov	al,[rsi+12]
	rol	ax,1
	mov	al,[rsi+16]
	rol	ax,1
	mov	al,[rsi+20]
	rol	ax,1
	mov	al,[rsi+24]
	rol	ax,1
	mov	al,[rsi+28]
	rol	ax,1
	mov	[rdx],ah	; AH now holds only the 8 collected bits
	inc	rdx
	add	rsi,32
	jmp	NextOct

LastOct:add	rcx,8		; rcx = 0..7 leftover elements
	jz	toend

	mov	ah,1		; marker bit: tells how far the byte still has to be shifted
PIXEL:	mov	al,[rsi]
	add	rsi,4
	rol	ax,1		; bit 7 of AL goes to bit 0 of AH
	dec	rcx
	jnz	PIXEL		; at most 7 rounds, so the marker never leaves AH here

First1:	shl	ah,1		; left-align: shift until the marker drops into CY
	jnc	First1
	mov	[rdx],ah	; store last incomplete byte

toend:
	ret

Conv32_1 endp


;*************************************************************************************


        public  Conv32_4
;-----------------------------------------------------------------------
; Conv32_4 - pack the top nibble of 4-byte elements, two per byte.
; The upper 4 bits of the byte at offset 3 of every source element are
; kept. Even-numbered elements (0,2,...) fill the high nibble of an
; output byte, odd-numbered ones the low nibble. For an odd count the
; low nibble of the last output byte is written as zero.
; ABI:   Microsoft x64
; In:    RCX = destination, RDX = source, R8D = number of elements
; Clobb: RAX, RCX, flags (RDI and RSI are saved by the USES clause)
;-----------------------------------------------------------------------
Conv32_4 proc \
        uses rdi rsi
;       Dest:ptr byte	RCX
;       Src:ptr dword	RDX
;       count:DWORD	R8D

	mov	rdi,rcx		; rdi = destination pointer
	mov	ecx,r8d		; count is 32 bits wide; bits 63..32 of R8 are not defined by the ABI
	jrcxz	toend		; array has zero size

	mov	rsi,rdx		; rsi = source pointer

PIXEL:	mov	al,[rsi+3]	; highest byte of the first element of the pair
	and	al,0F0h		; keep its upper nibble in place
	dec	ecx
	jz	StoreNib	; odd count: no partner, low nibble stays zero

	mov	ah,[rsi+7]	; highest byte of the second element
	shr	ah,4		; its upper nibble becomes the low nibble
	or	al,ah
	add	rsi,8
	dec	ecx

StoreNib:
	mov	[rdi],al
	inc	rdi
	test	ecx,ecx
	jnz	PIXEL

toend:
	ret

Conv32_4 endp



;*************************************************************************************


        public  Conv32_8
;-----------------------------------------------------------------------
; Conv32_8 - reduce 4-byte elements to 1-byte elements.
; Copies the byte at offset 3 of every source element. Groups of eight
; elements are gathered in RAX and written with one 8-byte store; the
; remaining 0..7 elements are copied one by one.
; ABI:   Microsoft x64
; In:    RCX = destination (returns at once when NULL)
;        RDX = source, R8D = number of elements
; Clobb: RAX, RCX, flags (RDI and RSI are saved by the USES clause)
;-----------------------------------------------------------------------
Conv32_8 proc \
        uses rdi rsi
;       Dest:ptr byte,	RCX
;       Src:ptr dword,	RDX
;       count:DWORD	R8D

	mov	rdi,rcx		; rdi = destination pointer
	jrcxz	toend		; NULL destination pointer
	mov	ecx,r8d		; count is 32 bits wide; bits 63..32 of R8 are not defined by the ABI
	mov	rsi,rdx		; rsi = source pointer

	sub	rcx,8
	jb	PIXEL1		; fewer than 8 elements in total

PIXEL8:	mov	ah,byte ptr[rsi+31]	; element 7 ends up in the most significant byte
	mov	al,byte ptr[rsi+27]
	shl	eax,16			; also clears bits 63..32 of RAX
	mov	ah,byte ptr[rsi+23]
	mov	al,byte ptr[rsi+19]
	shl	rax,16
	mov	ah,byte ptr[rsi+15]
	mov	al,byte ptr[rsi+11]
	shl	rax,16
	mov	ah,byte ptr[rsi+7]
	mov	al,byte ptr[rsi+3]	; element 0 is the least significant byte, stored first
	mov	[rdi],rax
	add	rdi,8
	add	rsi,32
	sub	rcx,8
	jae	PIXEL8

PIXEL1:	add	rcx,8		; rcx = 0..7 leftover elements
	jz	toend

PIXEL:	mov	al,[rsi+3]
	add	rsi,4
	mov	[rdi],al
	inc	rdi
	dec	rcx
	jnz	PIXEL

toend:
	ret

Conv32_8 endp


;*************************************************************************************


        public  Conv32_16
;-----------------------------------------------------------------------
; Conv32_16 - reduce 4-byte elements to 2-byte elements.
; Copies the word at offset 2 of every source element. Groups of four
; elements are gathered in RAX and written with one 8-byte store; the
; remaining 0..3 elements are copied one by one.
; ABI:   Microsoft x64
; In:    RCX = destination, RDX = source, R8D = number of elements
; Clobb: RAX, RCX, flags (RDI and RSI are saved by the USES clause)
;-----------------------------------------------------------------------
Conv32_16 proc \
        uses rdi rsi
;       Dest:ptr word,	RCX
;       Src:ptr dword,	RDX
;       count:DWORD	R8D

	mov	rdi,rcx		; rdi = destination pointer
	mov	ecx,r8d		; count is 32 bits wide; bits 63..32 of R8 are not defined by the ABI
	mov	rsi,rdx		; rsi = source pointer

	sub	rcx,4
	jb	PIXEL1		; fewer than 4 elements in total (covers count = 0 as well)

PIXEL4:	mov	ax,word ptr[rsi+14]	; element 3 ends up in the most significant word
	shl	eax,16			; also clears bits 63..32 of RAX
	mov	ax,word ptr[rsi+10]
	shl	rax,16
	mov	ax,word ptr[rsi+6]
	shl	rax,16
	mov	ax,word ptr[rsi+2]	; element 0 is the least significant word, stored first
	mov	[rdi],rax
	add	rdi,8
	add	rsi,16
	sub	rcx,4
	jae	PIXEL4

PIXEL1:	add	rcx,4		; rcx = 0..3 leftover elements
	jz	toend		; array has zero size or everything is done

PIXEL:	mov	ax,[rsi+2]
	add	rsi,4
	mov	[rdi],ax
	add	rdi,2
	dec	rcx
	jnz	PIXEL

toend:	ret

Conv32_16 endp


;*************************************************************************************


        public  Conv32_24
;-----------------------------------------------------------------------
; Conv32_24 - reduce 4-byte elements to 3-byte elements.
; Copies the bytes at offsets 1, 2 and 3 of every source element (in
; that order); the byte at offset 0 is dropped.
; ABI:   Microsoft x64
; In:    RCX = destination, RDX = source, R8D = number of elements
; Clobb: RAX, RCX, flags (RDI and RSI are saved by the USES clause)
;-----------------------------------------------------------------------
Conv32_24 proc \
        uses rdi rsi
;       Dest:ptr byte	RCX
;       Src:ptr dword	RDX
;       count:DWORD	R8D

	mov	rdi,rcx		; rdi = destination pointer
	mov	ecx,r8d		; count is 32 bits wide; bits 63..32 of R8 are not defined by the ABI
	jrcxz	toend		; array has zero size

	mov	rsi,rdx		; rsi = source pointer

PIXEL:	mov	eax,[rsi]	; B4 B3 B2 B1
	add	rsi,4
	shr	eax,8		; 0  B4 B3 B2
	mov	[rdi],ax	; writes B2, B3
	shr	eax,16		; 0  0  0  B4
	mov	[rdi+2],al	; writes B4
	add	rdi,3
	dec	ecx
	jnz	PIXEL

toend:
	ret

Conv32_24 endp


;*************************************************************************************


        public  Conv32_64
;-----------------------------------------------------------------------
; Conv32_64 - expand 4-byte elements to 8-byte elements.
; Every source dword is written twice, filling the low and the high
; half of the destination qword with the same value.
; ABI:   Microsoft x64
; In:    RCX = destination, RDX = source, R8D = number of elements
; Clobb: RAX, RCX, flags (RDI and RSI are saved by the USES clause)
;-----------------------------------------------------------------------
Conv32_64 proc \
        uses rdi rsi
;       Dest:ptr qword,	RCX
;       Src:ptr dword,	RDX
;       count:DWORD	R8D

	mov	rdi,rcx		; rdi = destination pointer
	mov	ecx,r8d		; count is 32 bits wide; bits 63..32 of R8 are not defined by the ABI
	jrcxz	toend		; array has zero size

	mov	rsi,rdx		; rsi = source pointer

PIXEL:	mov	eax,[rsi]
	add	rsi,4
	mov	[rdi],eax	; low half
	mov	[rdi+4],eax	; high half
	add	rdi,8
	dec	ecx
	jnz	PIXEL

toend:
	ret

Conv32_64 endp


;*************************************************************************************


        public  Conv64_32
;-----------------------------------------------------------------------
; Conv64_32 - reduce 8-byte elements to 4-byte elements.
; Copies the dword at offset 4 (the upper half) of every source element.
; ABI:   Microsoft x64
; In:    RCX = destination, RDX = source, R8D = number of elements
; Clobb: RAX, RCX, flags (RDI and RSI are saved by the USES clause)
;-----------------------------------------------------------------------
Conv64_32 proc \
        uses rdi rsi
;       Dest:ptr dword,	RCX
;       Src:ptr qword,	RDX
;       count:DWORD	R8D

	mov	rdi,rcx		; rdi = destination pointer
	mov	ecx,r8d		; count is 32 bits wide; bits 63..32 of R8 are not defined by the ABI
	jrcxz	toend		; array has zero size

	mov	rsi,rdx		; rsi = source pointer

PIXEL:	mov	eax,[rsi+4]	; upper half of the source element
	add	rsi,8
	mov	[rdi],eax
	add	rdi,4
	dec	ecx
	jnz	PIXEL

toend:
	ret

Conv64_32 endp



;########################################################################################
;########################################################################################
;########################################################################################

	public  Flip1
;-----------------------------------------------------------------------
; Flip1 - reverse the order of 1-bit pixels in a row, in place.
; Pass 1 reverses the order of the bytes that hold the row and replaces
; every byte through the external table swap_bits_xlat (assumed to map
; a byte to its bit-reversed value - the table is defined elsewhere).
; Pass 2 runs only when count is not a multiple of 8: the unused bits of
; the former last byte are now at the front, so the whole row is
; shifted left by 8-(count mod 8) bits.
; ABI:   Microsoft x64
; In:    RCX = row data, EDX = number of pixels (0 and 1 are ignored)
; Clobb: RAX, RCX, R8, R9, flags (RBX, RSI, RDI are saved by USES)
;-----------------------------------------------------------------------
Flip1	proc \
        uses rbx rsi rdi
;       Data:ptr byte,	RCX
;       count:DWORD	EDX

	mov	rdi, offset swap_bits_xlat

	mov	R8,RCX		; R8 = row pointer, kept for pass 2
	mov	R9d,EDX		; R9 = pixel count, zero-extended: bits 63..32 of RDX are not defined by the ABI

	mov	ecx,edx		; rcx = pixel count (zero-extended)
	mov	rsi,R8		; rsi = front pointer

	cmp	rcx,1
	jbe	ToEnd		; ignore values 0 and 1

	xor	eax,eax		; rax is used as table index, only AL is loaded below
	dec	rcx
	shr	rcx,3		; index of the last byte of the row
	jz	LastByte	; the row occupies a single byte

	add	rcx,rsi		; rcx = back pointer

	xor	ebx,ebx		; rbx is the second table index
PIXEL:	mov	al,[rcx]
	mov	bl,[rsi]
	mov	al,[rdi+rax]
	mov	bl,[rdi+rbx]
	mov	[rcx],bl
	mov	[rsi],al
	dec	rcx
	inc	rsi
	cmp	rsi,rcx
	jb	PIXEL		; pointers are compared unsigned
	jne	ToEnd8		; pointers crossed: no middle byte
LastByte:mov	al,[rsi]	; middle (or only) byte is translated in place
	mov	al,[rdi+rax]
	mov	[rsi],al

ToEnd8:	mov	rcx,R9
	mov	rbx,rcx
	mov	rsi,R8
	and	cl,7		; used bits in the last byte
	jz	ToEnd		; no shift needed
	xor	cl,7
	inc	cl		; cl = 8 - used bits = shift distance

	mov	ch,[rsi]	; prepare first byte
	shr	rbx,3		; number of bytes that are followed by another byte
	jz	LastShift

LoopShift:
	mov	al,ch		; current byte
	mov	ah,[rsi+1]	; next byte
	mov	ch,ah		; remember the unshifted next byte for the following round
	rol	ax,cl		; AL = current shifted left, refilled from the top of next
	mov	[rsi],al
	inc	rsi
	dec	rbx
	jnz	LoopShift

LastShift: shl	ch,cl		; last byte is refilled with zero bits
	mov	[rsi],ch

ToEnd:
	ret
Flip1	endp


;########################################################################################

	public  Flip2
;-----------------------------------------------------------------------
; Flip2 - reverse the order of 2-bit pixels in a row, in place.
; Pass 1 reverses the order of the bytes that hold the row and replaces
; every byte through the external table swap_bits2_xlat (assumed to
; reverse the four 2-bit groups of a byte - the table is defined
; elsewhere). Pass 2 runs only when count is not a multiple of 4: the
; whole row is shifted left by 8-2*(count mod 4) bits to remove the
; unused bits that moved to the front.
; ABI:   Microsoft x64
; In:    RCX = row data, EDX = number of pixels (0 and 1 are ignored)
; Clobb: RAX, RCX, R8, R9, flags (RBX, RSI, RDI are saved by USES)
;-----------------------------------------------------------------------
Flip2	proc \
        uses rbx rsi rdi
;       Data:ptr byte,	RCX
;       count:DWORD	EDX

	mov	rdi, offset swap_bits2_xlat

	mov	R8,RCX		; R8 = row pointer, kept for pass 2
	mov	R9d,EDX		; R9 = pixel count, zero-extended: bits 63..32 of RDX are not defined by the ABI

	mov	ecx,edx		; rcx = pixel count (zero-extended)
	mov	rsi,R8		; rsi = front pointer

	cmp	rcx,1
	jbe	ToEnd		; ignore values 0 and 1

	xor	eax,eax		; rax is used as table index, only AL is loaded below
	dec	rcx
	shr	rcx,2		; index of the last byte of the row
	jz	LastByte	; the row occupies a single byte

	add	rcx,rsi		; rcx = back pointer

	xor	ebx,ebx		; rbx is the second table index
PIXEL:	mov	al,[rcx]
	mov	bl,[rsi]
	mov	al,[rdi+rax]
	mov	bl,[rdi+rbx]
	mov	[rcx],bl
	mov	[rsi],al
	dec	rcx
	inc	rsi
	cmp	rsi,rcx
	jb	PIXEL		; pointers are compared unsigned
	jne	ToEnd8		; pointers crossed: no middle byte
LastByte:mov	al,[rsi]	; middle (or only) byte is translated in place
	mov	al,[rdi+rax]
	mov	[rsi],al

ToEnd8:	mov	rcx,R9
	mov	rbx,rcx
	mov	rsi,R8
	sal	cl,1		; two bits per pixel
	and	cl,7		; used bits in the last byte
	jz	ToEnd		; no shift needed
	xor	cl,7
	inc	cl		; cl = 8 - used bits = shift distance

	mov	ch,[rsi]	; prepare first byte
	shr	rbx,2		; number of bytes that are followed by another byte
	jz	LastShift

LoopShift:
	mov	al,ch		; current byte
	mov	ah,[rsi+1]	; next byte
	mov	ch,ah		; remember the unshifted next byte for the following round
	rol	ax,cl		; AL = current shifted left, refilled from the top of next
	mov	[rsi],al
	inc	rsi
	dec	rbx
	jnz	LoopShift

LastShift: shl	ch,cl		; last byte is refilled with zero bits
	mov	[rsi],ch

ToEnd:
	ret
Flip2	endp



;*************************************************************************************


	public  Flip4
;-----------------------------------------------------------------------
; Flip4 - reverse the order of 4-bit pixels (nibbles) in a row, in place.
; Even count: bytes are swapped from both ends and the two nibbles of
;             every byte are exchanged (ROL by 4).
; Odd count:  the last byte carries one used nibble (the high one), so
;             high nibbles are reversed over all bytes while low nibbles
;             are reversed over all bytes but the last; the low nibble
;             of the last byte is left as it was.
; ABI:   Microsoft x64
; In:    RCX = row data, EDX = number of pixels (0 and 1 are ignored)
; Clobb: RAX, RCX, RDX, flags (RDI and RSI are saved by USES)
;-----------------------------------------------------------------------
Flip4	proc \
        uses rdi rsi
;       Data:ptr byte,	RCX
;       count:DWORD	EDX

	mov	rdi,rcx		; [data] rdi = front pointer
	mov	ecx,edx		; [count] zero-extended: bits 63..32 of RDX are not defined by the ABI
	cmp	rcx,1		; 1 or less pixels makes no sense to flip
	jbe	ToEnd		; ignore values 0 and 1

	mov	rsi,rdi

	shr	rcx,1		; number of full bytes; CY = count is odd
	jc	PixOdd

	add	rsi,rcx		; pixel count is even
	dec	rsi		; rsi = last byte

	cmp	rdi,rsi
	je	LastNibble	; This can occur for size=2.
LoopEven:mov	al,[rdi]	; Process first byte with nibbles
	rol	al,4		; this shift flips nibbles

	mov	dl,[rsi]	; Process second byte with nibbles
	rol	dl,4

	mov	[rsi],al
	mov	[rdi],dl

	dec	rsi
	inc	rdi
	cmp	rdi,rsi
	jb	LoopEven	; continue while rdi is below rsi
	jne	ToEnd		; pointers crossed: there is no middle byte
LastNibble:
	mov	al,[rdi]
	rol	al,4
	mov	[rdi],al	; Middle byte only needs its nibbles exchanged.
	jmp	ToEnd


PixOdd:	add	rsi,rcx		; pixel count is odd i.e. >=3; rsi = last byte
	mov	dl,[rsi]	; DL always holds the current (possibly not yet stored) value of [rsi]
LoopOddD:mov	dh,dl

	mov	al,[rdi]
	mov	ah,al

	and	ax,0F00Fh	; AH = high nibble of [rdi], AL = low nibble of [rdi]
	and	dx,0FF0h	; DH = low nibble of [rsi],  DL = high nibble of [rsi]

	or	ax,dx		; AH = new [rsi], AL = pending new [rdi]
	;mov	[rdi],al	; high nibble exchanged - no need to store here
	mov	[rsi],ah	; back byte: high nibble from the front, own low nibble kept

	dec	rsi
	cmp	rsi,rdi
	je	ToEndStore	; pointers met: only the pending front byte is left

	mov	ah,al		; pending value of [rdi]
	and	ax,0F00Fh	; AH = its high nibble, AL = its low nibble

	mov	dl,[rsi]
	mov	dh,dl
	and	dx,0FF0h	; DH = low nibble of [rsi], DL = high nibble of [rsi]

	or	dx,ax		; DH = new [rdi], DL = pending new [rsi]
	mov	[rdi],dh	; front byte: low nibble taken from the back byte
	;mov	[rsi],dl	; no need to store here, DL is carried into the next round

	inc	rdi
	cmp	rdi,rsi
	jb	LoopOddD
	mov	[rsi],dl	; after loop exit realise lazy store.

ToEnd:
	ret

ToEndStore:
	mov	[rdi],al	; realise the pending store of the front byte
	ret

Flip4 endp


;*************************************************************************************


	public  Flip8
;-----------------------------------------------------------------------
; Flip8 - reverse the order of bytes in a row, in place.
; Three loops are used depending on the count:
;   odd count             - byte by byte exchange
;   count mod 4 = 2       - word exchange with the bytes of each word
;                           swapped; the middle word is swapped in place
;   count mod 4 = 0       - dword exchange with BSWAP; a middle dword,
;                           when present, is byte-swapped in place
; ABI:   Microsoft x64
; In:    RCX = row data, EDX = number of bytes (0 and 1 are ignored)
; Clobb: RAX, RCX, RDX, flags (RSI is saved by the USES clause)
;-----------------------------------------------------------------------
Flip8	proc \
        uses rsi
;       Data:ptr byte,	RCX
;       count:DWORD	EDX

	mov	edx,edx		; zero-extend the count: bits 63..32 of RDX are not defined by the ABI
				; rcx = front pointer
	cmp	rdx,2		; rdx = amount of pixels
	jbe	PxSize2		; flags travel to PxSize2: ZF set (count 2) swaps one word, otherwise it returns

	mov	rsi,rcx
	add	rsi,rdx		; rsi = one past the last byte
	test	rdx,1
	jz	PxOp16s		; Test for even value.

	dec	rsi		; rsi = last byte
PIXEL:	mov	al,[rcx]
	mov	ah,[rsi]
	mov	[rcx],ah
	mov	[rsi],al
	inc	rcx
	dec	rsi
	cmp	rcx,rsi
	jb	PIXEL		; pointers are compared unsigned
ToEnd:
	ret

PxOp16s:test	rdx,2		; test for division by 4
	jz	PxOp32s		; count is a multiple of 4: use the dword loop
	sub	rsi,2		; rsi = last word
PxOp16L:mov	dx,[rsi]	; the count in RDX is not needed any more
	mov	ax,[rcx]
	xchg	dl,dh
	xchg	al,ah
	mov	[rcx],dx
	mov	[rsi],ax
	add	rcx,2
	sub	rsi,2
	cmp	rcx,rsi
	jb	PxOp16L
PxSize2:jnz	ToEnd2		; No middle WORD, bail out.

	mov	ax,[rcx]	; Middle WORD must be also flipped.
	xchg	al,ah
	mov	[rcx],ax
ToEnd2:	ret


PxOp32L:mov	edx,[rsi]
	mov	eax,[rcx]
	bswap	edx
	bswap	eax
	mov	[rcx],edx
	mov	[rsi],eax
	add	rcx,4
PxOp32s:sub	rsi,4		; Loop is entered here: the first pass turns the end pointer into the address of the last DWORD.
	cmp	rcx,rsi
	jb	PxOp32L
	jnz	ToEnd3		; No middle DWORD, bail out.

	mov	eax,[rcx]	; Middle DWORD must be also flipped.
	bswap	eax
	mov	[rcx],eax
ToEnd3:	ret


Flip8 endp


;*************************************************************************************


	public  Flip16
;-----------------------------------------------------------------------
; Flip16 - reverse the order of 16-bit elements in a row, in place.
; Odd count:  word by word exchange from both ends.
; Even count: dword exchange with the two words of every dword swapped
;             (ROL by 16); a middle dword, when present, is rotated in
;             place.
; ABI:   Microsoft x64, leaf procedure using volatile registers only
; In:    RCX = row data, EDX = number of elements (0 and 1 are ignored)
; Clobb: RAX, RCX, RDX, R8, flags
;-----------------------------------------------------------------------
Flip16	proc
;       Data:ptr word,	RCX
;       count:DWORD	EDX

	mov	eax,edx		; rax = count, zero-extended: bits 63..32 of RDX are not defined by the ABI
	cmp	rax,1
	jbe	ToEnd		; ignore values 0 and 1

	mov	rdx,rcx		; rdx = front pointer
	lea	r8,[rcx+rax*2]	; r8 = one past the last element

	test	al,1
	jz	PxOp32s		; Test for even value.

	sub	r8,2		; r8 = last element
PIXEL:	mov	ax,[rdx]
	mov	cx,[r8]
	mov	[r8],ax
	mov	[rdx],cx
	add	rdx,2
	sub	r8,2
	cmp	rdx,r8
	jb	PIXEL		; pointers are compared unsigned
ToEnd:
	ret

			; Optimised dword loop for even counts only.
PxOp32L:mov	ecx,[r8]
	mov	eax,[rdx]
	rol	ecx,16
	rol	eax,16
	mov	[rdx],ecx
	mov	[r8],eax
	add	rdx,4
PxOp32s:sub	r8,4		; Loop is entered here: the first pass turns the end pointer into the address of the last DWORD; this also covers count=2.
	cmp	rdx,r8
	jb	PxOp32L
	jnz	ToEnd2		; No middle DWORD, bail out.

	mov	eax,[rdx]	; Middle DWORD must be also flipped.
	rol	eax,16
	mov	[rdx],eax
ToEnd2:	ret


Flip16 endp


;*************************************************************************************


	public  Flip24
;-----------------------------------------------------------------------
; Flip24 - reverse the order of 3-byte elements in a row, in place.
; Elements are exchanged from both ends, one byte at a time, so the byte
; order inside each element is preserved.
; ABI:   Microsoft x64
; In:    RCX = row data, EDX = number of elements (0 and 1 are ignored)
; Clobb: RAX, RCX, flags (RDI and RSI are saved by the USES clause)
;-----------------------------------------------------------------------
Flip24	proc \
        uses rdi rsi
;       Data:ptr byte,	RCX
;       count:DWORD	EDX

	mov	rdi,rcx		; rdi = front element
	mov	ecx,edx		; count, zero-extended: bits 63..32 of RDX are not defined by the ABI
	sub	rcx,1
	jbe	ToEnd		; ignore values 0 and 1

	lea	rsi,[rcx+rcx*2]	; 3*(count-1)
	add	rsi,rdi		; rsi = last element

PIXEL:	mov	al,[rdi]	; byte 1
	mov	cl,[rsi]	; the count in RCX is not needed any more
	mov	[rdi],cl
	mov	[rsi],al

	mov	al,[rdi+1]	; byte 2
	mov	cl,[rsi+1]
	mov	[rdi+1],cl
	mov	[rsi+1],al

	mov	al,[rdi+2]	; byte 3
	mov	cl,[rsi+2]
	mov	[rdi+2],cl
	mov	[rsi+2],al

	add	rdi,3		; next element from the front
	sub	rsi,3		; previous element from the back

	cmp	rdi,rsi
	jb	PIXEL		; unsigned comparison; a middle element stays where it is

ToEnd:
	ret

Flip24 endp


;*************************************************************************************


	public  Flip32
;-----------------------------------------------------------------------
; Flip32 - reverse the order of 32-bit elements in a row, in place.
; ABI:   Microsoft x64
; In:    RCX = row data, EDX = number of elements (0 and 1 are ignored)
; Clobb: RAX, RCX, RDX, flags (RDI and RSI are saved by the USES clause)
;-----------------------------------------------------------------------
Flip32	proc \
        uses rdi rsi
;       Data:ptr dword,	RCX
;       count:DWORD	EDX

	mov	rdi,rcx		; rdi = front element
	mov	edx,edx		; zero-extend the count: bits 63..32 of RDX are not defined by the ABI
	sub	rdx,1		; rdx = count - 1, set flags
	jbe	ToEnd		; ignore values 0 and 1

	lea	rsi,[rdi+rdx*4]	; rsi = last element

PIXEL:	mov	eax,[rdi]
	mov	ecx,[rsi]
	mov	[rdi],ecx
	mov	[rsi],eax
	add	rdi,4
	sub	rsi,4
	cmp	rdi,rsi
	jb	PIXEL		; pointers are compared unsigned

ToEnd:
	ret

Flip32	endp


;*************************************************************************************


	public  Flip64
;-----------------------------------------------------------------------
; Flip64 - reverse the order of 64-bit elements in a row, in place.
; ABI:   Microsoft x64, leaf procedure using volatile registers only
; In:    RCX = row data (returns at once when NULL)
;        EDX = number of elements (0 and 1 are ignored)
; Clobb: RAX, RCX, RDX, R8, flags
;-----------------------------------------------------------------------
Flip64	proc
;       Data:ptr qword,	RCX
;       count:DWORD	EDX

	mov	r8,rcx		; r8 = front element
	jrcxz	ToEnd		; NULL data pointer
	mov	edx,edx		; zero-extend the count: bits 63..32 of RDX are not defined by the ABI
	sub	rdx,1		; rdx = count - 1, set flags
	jbe	ToEnd		; ignore values 0 and 1

	lea	rdx,[r8+rdx*8]	; rdx = last element

PIXEL:
	mov	rax,[r8]
	mov	rcx,[rdx]
	mov	[rdx],rax
	mov	[r8],rcx	; stosq is slower than this :(
	add	r8,8
	sub	rdx,8

	cmp	r8,rdx
	jb	PIXEL		; pointers are compared unsigned
ToEnd:
	ret

Flip64	endp



;*************************************************************************************

;void Peel1BitNStep(uint8_t *Buffer1Bit, const uint8_t *BufferSrc, unsigned count, uint16_t PlaneStep)
	public  Peel1BitNStep
;-----------------------------------------------------------------------
; Extracts one bit from each of 'count' source bytes and packs the bits
; into Buffer1Bit, first pixel in bit 7 of the first output byte.
; PlaneStep: low byte  = number of the bit to extract (0..7),
;            high byte = distance in bytes between consecutive source
;                        bytes.
; A trailing group of fewer than 8 pixels is left-aligned in the last
; output byte and padded with zero bits.
; ABI:   Microsoft x64
; In:    RCX = Buffer1Bit, RDX = BufferSrc (both return at once when
;        NULL), R8D = count, R9W = PlaneStep
; Clobb: RAX, RCX, RDX, R8, R9, R10, flags (RDI is saved by USES)
;-----------------------------------------------------------------------
Peel1BitNStep proc \
        uses rdi
;       Buffer1Bit	RCX
;       BufferSrc	RDX
;	count		R8D
;	PlaneStep	R9W

	test	rdx,rdx
	jz	ToEnd
	jrcxz	ToEnd		; bad pointer

	mov	rdi,rcx		; rdi = output pointer
	mov	r8d,r8d		; zero-extend the count: bits 63..32 of R8 are not defined by the ABI
	test	r8,r8
	jz	ToEnd		; zero pixels

	movzx	ecx,r9b		; ecx = number of the bit to extract
	shr	r9d,8
	and	r9d,0FFh	; r9 = byte increment

	mov	al,1		; marker bit: leaves AL through CY once 8 data bits are collected
BitLoop:movzx	r10d,byte ptr [rdx]
	add	rdx,r9

	bt	r10d,ecx	; needed bit goes to CY (well defined for every bit number, unlike SHR by 8 on a byte)
	rcl	al,1
	jnc	GoLoop

	mov	[rdi],al	; store 8 bits
	inc	rdi
	mov	al,1

GoLoop:	dec	r8
	jnz	BitLoop

	cmp	al,1
	jbe	ToEnd		; only the marker is left: no pending bits
ShiftAll:sal	al,1		; left-align the pending bits until the marker drops out
	jnc	ShiftAll

	mov	[rdi],al

ToEnd:
	ret

Peel1BitNStep	endp




;void Join1BitNStep(const uint8_t *Buffer1Bit, uint8_t *Buffer, unsigned count, uint16_t PlaneStep)
	public  Join1BitNStep
;-----------------------------------------------------------------------
; Distributes 'count' bits from the packed stream Buffer1Bit (bit 7 of
; the first byte is the first pixel) into one selected bit of the bytes
; of Buffer; all other bits of those bytes are left unchanged.
; PlaneStep: low byte  = number of the bit to write,
;            high byte = distance in bytes between consecutive target
;                        bytes.
; ABI:   Microsoft x64
; In:    RCX = Buffer1Bit, RDX = Buffer (both return at once when NULL),
;        R8D = count, R9W = PlaneStep
; Clobb: RAX, RCX, RDX, R8, R9, flags (RDI is saved by USES)
;-----------------------------------------------------------------------
Join1BitNStep proc \
        uses rdi
;       Buffer1Bit	RCX
;       Buffer		RDX
;	count		R8D
;	PlaneStep	R9W

	test	rdx,rdx
	jz	ToEnd
	jrcxz	ToEnd		; bad pointer

	mov	rdi,rcx		; rdi = bit stream pointer
	mov	r8d,r8d		; zero-extend the count: bits 63..32 of R8 are not defined by the ABI
	test	r8,r8
	jz	ToEnd		; zero pixels
	mov	rcx,r9		; cl = number of the bit to write

	mov	ch,1
	shl	ch,cl		; OR mask
	mov	cl,ch
	not	cl		; AND mask

	shr	r9,8
	and	r9,0FFh		; byte increment

	mov	al,[rdi]	; 1 bit datastream
	stc
	rcl	al,1		; CY receives bit 7 (first pixel); the 1 fed in at bit 0 marks the end of the byte
	jmp	BitLoop2

BitLoop:shl	al,1		; next bit to CY
BitLoop2:mov	ah,[rdx]	; MOV leaves CY untouched
	jc	SetBit
	and	ah,cl
	jmp	StorByte

SetBit:	or	ah,ch
StorByte:mov	[rdx],ah
	add	rdx,r9

	cmp	al,80h		; only the marker is left: all 8 bits of this byte are used up
	je	Inc1Bit
	dec	r8
	jnz	BitLoop
ToEnd:
	ret

Inc1Bit:dec	r8
	jz	ToEnd

	inc	rdi
	mov	al,[rdi]	; Get a new byte from 1 bit datastream
	stc
	rcl	al,1
	jmp	BitLoop2

Join1BitNStep	endp


;void Join8BitNStep(const uint8_t *Buffer8Bit, uint8_t *Buffer, unsigned count, uint8_t ByteStep)
	public  Join8BitNStep
;-----------------------------------------------------------------------
; Copies 'count' consecutive bytes from Buffer8Bit into Buffer, placing
; them ByteStep bytes apart.
; ABI:   Microsoft x64
; In:    RCX = Buffer8Bit, RDX = Buffer (both return at once when NULL),
;        R8D = count, R9B = ByteStep
; Clobb: RAX, RCX, RDX, R9, flags (RSI is saved by USES)
;-----------------------------------------------------------------------
Join8BitNStep proc \
        uses rsi
;	Buffer8Bit	RCX
;	Buffer		RDX
;	count		R8D
;	ByteStep	R9B

	jrcxz	ToEnd		; bad source pointer
	mov	rsi,rcx		; rsi = packed source
	test	rdx,rdx
	jz	ToEnd		; bad target pointer
	mov	ecx,r8d		; count is 32 bits wide; bits 63..32 of R8 are not defined by the ABI
	jrcxz	ToEnd		; zero pixels
	movzx	r9d,r9b		; r9 = byte increment

ByteLop:mov	al,[rsi]
	inc	rsi
	mov	[rdx],al
	add	rdx,r9
	dec	ecx
	jnz	ByteLop
ToEnd:
	ret

Join8BitNStep endp


;void Peel8BitNStep(uint8_t *Buffer8Bit, const uint8_t *BufferSrc, unsigned count, uint8_t ByteStep)
	public  Peel8BitNStep
;-----------------------------------------------------------------------
; Collects 'count' bytes that lie ByteStep bytes apart in BufferSrc and
; stores them consecutively into Buffer8Bit.
; ABI:   Microsoft x64
; In:    RCX = Buffer8Bit, RDX = BufferSrc (both return at once when
;        NULL), R8D = count, R9B = ByteStep
; Clobb: RAX, RCX, RDX, R9, flags (RDI is saved by USES)
;-----------------------------------------------------------------------
Peel8BitNStep proc \
        uses rdi
;	Buffer8Bit	RCX
;	BufferSrc	RDX
;	count		R8D
;	ByteStep	R9B

	test	rdx,rdx
	jz	ToEnd		; bad source pointer
	jrcxz	ToEnd		; bad target pointer
	mov	rdi,rcx		; rdi = packed target
	mov	ecx,r8d		; count is 32 bits wide; bits 63..32 of R8 are not defined by the ABI
	jrcxz	ToEnd		; zero pixels
	movzx	r9d,r9b		; r9 = byte increment

ByteLop:mov	al,[rdx]
	add	rdx,r9
	mov	[rdi],al
	inc	rdi
	dec	ecx
	jnz	ByteLop
ToEnd:
	ret

Peel8BitNStep endp


        end
