.globl fb_write
.data
.balign 4
@ C prototype : extern void fb_write(uint64_t *screen, uint32_t *fb);

@ FASTBLIT v2 by Henrik Erlandsson aka Photon of Scoopex	
@ - Converts 32-bpp RGB buffer to 16-bpp (RGB565) buffer. NOTE: alpha channel must be 0 !
@ - Assemble with: arm-linux-as.exe -mcpu=arm920 -o fb_write.o fb_write.s
@ - Link to your main.c with : arm-linux-ld -e main fb_write.o main.o -static -s -o <your gpe>
@ ! Keep this whole routine in a writable section (.data): the caller's r13 (sp) is stored in .SaveSP, right next to the code, while r13 is reused as the destination pointer.

@ Credits to A_SN and SenorQuack from gp32x.com for the previous asm versions
@ This version blits 30x16 pixels per loop, 8 instructions per pixel-pair.
@ Derived from Senor Quack's (Dan Silsby's) version:
@ http://wiki.gp2x.org/wiki/Fast_32-bit_to_16-bit_framebuffer_blit
@ Benchmarks: 5684 ms / call, 77.8% of SenorQuack's version, 74.1% of A_SN's version.
@ (if you blit to screen buffer, use it as noncached, buffered for performance.)
@ (Manuals suggest aligning buffers to 8 word boundaries would remove the occasional
@  split burst, but if so, it's smaller than the small fluctuations in timing.)

@ Use freely for anything you want, credit me if you feel like it... if you use it in 
@ commercial software / engines / libs, ask me (photon.AT.coppershade.org) first.


@ PIXELPAIR pxa, pxb, out
@ Packs two 32-bit pixels into one word holding two RGB565 pixels:
@ pxa ends up in the low halfword of out, pxb in the high halfword.
@ Expects : r11 = 0x07e007e0 (the 6-bit green field of both halfwords)
@           r12 with bits 21-25 set (5-bit field mask). Its low bits may hold a
@           counter: they are ANDed against a value shifted left by 18, so they
@           never reach the result.
@ Clobbers: r10 and pxa. out is written last and may be the same register as pxa.
@ Requires: bits 24-31 of both input pixels are zero, otherwise they leak into
@           neighbouring colour fields.
.macro PIXELPAIR pxa, pxb, out
	and	r10, r12, \pxa, lsl #18		@ r10[25:21] = pxa[7:3]   (blue, 5 bits)
	orr	r10, r10, \pxa, lsr #19		@ r10[4:0]   = pxa[23:19] (red, 5 bits)

	eor	\pxa, \pxa, \pxb, lsl #16	@ lift pxb green into bits 31:24, pxa green stays in 15:8
	and	\pxa, r11, \pxa, lsr #5		@ keep only the greens: pxa in bits 10:5, pxb in bits 26:21

	orr	\pxa, \pxa, r10, ror #21	@ blue to bits 4:0, red to bits 15:11 - low halfword complete

	and	r10, r12, \pxb, lsl #18		@ r10[25:21] = pxb[7:3]   (blue, 5 bits)
	orr	r10, r10, \pxb, lsr #19		@ r10[4:0]   = pxb[23:19] (red, 5 bits)

	orr	\out, \pxa, r10, ror #5		@ blue to bits 20:16, red to bits 31:27 - high halfword complete
.endm
	
@ Epilogue of fb_write. It sits in front of the routine, together with the two
@ literal words below, so that both the prologue and this epilogue can address
@ .SaveSP pc-relatively (the unrolled loop body in between is several KB long).
fb_writeDone:
	ldr	sp, .SaveSP			@ r13 was used as the write pointer - get the real stack back
	ldmia	sp!, {r0-r1, r4-r12, pc}	@ pop everything fb_write pushed and return through pc

.Gmask:						@ 6-bit green field of both RGB565 halfwords
	.word	0x07e007e0
.SaveSP:					@ holds the caller's sp while fb_write runs (written at run time)
	.word	0

@ fb_write - convert a 32-bpp pixel buffer to a 16-bpp (RGB565) buffer.
@ In   : r0 = start of the 32-bpp source, r1 = start of the 16-bpp destination
@ Work : FBW_PASSES passes x 30 steps x 16 pixels = 76800 pixels per call
@ Roles inside the loop:
@   lr  = read pointer          r13 = write pointer (the real sp is parked in .SaveSP)
@   r12 = 5-bit field mask in bits 21-25 plus the pass counter in bits 0-7
@   r11 = green mask            r10 = PIXELPAIR scratch        r0-r9 = pixel data
@ Exit : branches to fb_writeDone, which reloads sp and pops all registers pushed here.
@ NOTE(review): r13 is not a valid stack pointer while the loop runs, so anything that
@   pushes on the current sp during the blit would write into the destination - confirm
@   this is acceptable on the target.
@ NOTE(review): the saved sp lives in one static word (.SaveSP), so overlapping calls
@   would overwrite each other's saved sp.
.equ FBW_PASSES, 160				@ outer loop count, has to fit in r12 bits 0-7
.equ FBW_FIELD5, 0x1F<<21			@ 5-bit colour field mask used by PIXELPAIR

@ One 16-pixel step: reads 16 source words via lr, writes 8 packed words via r13.
@ With COUNTDOWN=1 the step also decrements the pass counter and sets Z when it
@ reaches zero. Both extra instructions are placed directly behind a load.
.macro FBW_STEP16 COUNTDOWN=0
	ldmia	lr!, {r0-r7}			@ pixels 0-7
	PIXELPAIR r0,r1,r0
	PIXELPAIR r2,r3,r1
	PIXELPAIR r4,r5,r2
	PIXELPAIR r6,r7,r3
	ldmia	lr!, {r4-r9}			@ pixels 8-13
  .if \COUNTDOWN
	sub	r12, r12, #1			@ one pass fewer to go
  .endif
	PIXELPAIR r4,r5,r4
	PIXELPAIR r6,r7,r5
	PIXELPAIR r8,r9,r6
	ldmia	lr!, {r7-r8}			@ pixels 14-15
  .if \COUNTDOWN
	tst	r12, #0xFF			@ Z = 1 once the counter byte is zero
  .endif
	PIXELPAIR r7,r8,r7
	stmia	r13!, {r0-r7}			@ 8 words = 16 RGB565 pixels out
.endm

fb_write:
	stmdb	sp!, {r0-r1, r4-r12, lr}	@ save the arguments, r4-r12 and the return address
	str	sp, .SaveSP			@ park the stack pointer, r13 becomes a data pointer below
	ldr	r11, .Gmask			@ green mask for PIXELPAIR
	mov	r12, #FBW_PASSES		@ pass counter in the low byte ...
	orr	r12, r12, #FBW_FIELD5		@ ... sharing the register with the 5-bit field mask
	mov	lr, r0				@ read pointer
	mov	r13, r1				@ write pointer
.loop:
	FBW_STEP16 1				@ first step of the pass also does the countdown
.rept 29					@ 29 plain steps complete the 30-step pass
	FBW_STEP16
.endr
	bne	.loop				@ Z still comes from the tst above - no later instruction sets flags
	b	fb_writeDone
