;
;		Flick FLI-format Animation Viewer v1.2		  19 Feb 1994
;		--------------------------------------
;
;
;This program plays FLI/FLC-format bitmapped animation files on any ECS
;or AGA Amiga running OS2.04 or higher.  FLI/FLC-format files are
;produced by Autodesk Animator and Autodesk 3D Studio on a PC, as well
;as by other programs.
;
;The files in this archive may be distributed anywhere provided they are
;unmodified and are not sold for profit.
;
;Ownership and copyright of all files remains with the author:
;
;	Peter McGavin, 86 Totara Crescent, Lower Hutt, New Zealand.
;	e-mail: peterm@maths.grace.cri.nz
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;		xdef	_chunky2planar

; peterm/adaptive7.s
; Combines peterm/chunky4.s and jmccoull/blitter4pass.s
; The blitter works on the top portion of the display at the same time as
; the CPU converts the bottom portion.
; The blitter has completely finished before the routine returns.
; Both parts of every call are timed using the EClock.
; The partition point is recalculated at the end of the call in an attempt
; to keep the two routines taking about the same amount of time.
;
; The following formula is used:
;
;	n_blit = n * t_cpu * n_blit / (t_blit * n_cpu + t_cpu * n_blit)
;
; where:
;	n	is the total number of 32-byte units (i.e., width*height/32)
;	n_blit	is the number of 32-byte units above the partition
;	n_cpu	is the number of 32-byte units below the partition (=n-n_blit)
;	t_blit	is the time taken by the blitter in EClock units
;	t_cpu	is the time taken by the cpu in EClock units
;
; ECS Agnus required (for long blits)
; KS2.0 required (for utility.library and EClock)

;-----------------------------------------------------------------------------
; Set Macro68 defaults

; Assembler-wide defaults for operands written without an explicit size.
; NOTE(review): the precise effect of each keyword is defined by the Macro68
; manual -- presumably word-sized branches/displacements and full absolute
; addresses; confirm before changing.
		default	_branch,_word
		default	_adrbasedisp,_word
		default	_pcbasedisp,_word
		default	_outerdisp,_word
		default	_absolute,_abs

; Blitter register offsets.  The blit functions apply them to a0.
bltcpt	 	equ	$048		; channel C pointer
bltbpt	 	equ	$04c		; channel B pointer
bltapt	 	equ	$050		; channel A pointer
bltdpt	 	equ	$054		; channel D (destination) pointer
bltsizv  	equ	$05c		; blit height in rows (long-blit register)
bltsizh  	equ	$05e		; blit width in words; writing it is marked "do blit" below
; Value stored in the "stat" byte of mybltnode.
; NOTE(review): presumably the graphics CLEANUP flag that makes QBlit call
; qblitcleanup when the node is finished -- confirm against the bltnode docs.
cleanup		equ	$40
; Library vector offsets used with jsr (offset,a6)
_LVOReadEClock	equ	-60		; called with a6 = timerbase
_LVOCacheClearU	equ	-636		; called with a6 = (4).w
_LVOUMult32	equ	-144		; called with a6 = utilitybase
_LVOUDivMod32	equ	-156		; called with a6 = utilitybase

;-----------------------------------------------------------------------------
; chunky2planar:	(new Motorola syntax)
;  a0 -> chunky pixels (in FAST RAM)
;  a1 -> plane0 (assume other 7 planes are allocated contiguously)
;  a3 -> tmp chip buffer0, size=width*height
;  a4 -> tmp chip buffer1, size=width*height
;  a5 = TimerBase
;  a6 = GfxBase
;  d0 = UtilityBase
;  d1 = width*height/8, i.e. size of one bitplane in bytes (if generic is defined)

; Fixed-size build only: sizes are assembly-time constants derived from the
; externally supplied symbols width and height.
	ifnd generic
pixels		equ	width*height	; number of chunky pixels (one byte each)
plsiz		equ	width*height/8	; size of one bitplane in bytes
	endc


		section	code,code

; Entry point.  At most one label is emitted, selected at assembly time:
;   generic build     -> _c2p_8, _c2p_6 or _c2p_4 according to depth
;   fixed-size build  -> _c2p320x200x8 or _c2p320x200x6, and only when
;                        width = 320; any depth other than 8 or 6 aborts
;                        the assembly with "Unrecognised resolution".
	ifd generic
	ifeq depth-8
_c2p_8::
	else
	ifeq depth-6
_c2p_6::
	else
	ifeq depth-4
_c2p_4::
	endc
	endc
	endc
	else
	ifeq depth-8
		ifeq	width-320
_c2p320x200x8::
		endc
	else
	ifeq depth-6
		ifeq	width-320
_c2p320x200x6::
		endc
	else
		die	"Unrecognised resolution"
	endc
	endc
	endc

; preserve d2-d7/a2-a6; they are restored at alldone
		movem.l	d2-d7/a2-a6,-(sp)

; save parameters
; a2 -> mybltnode for the rest of the set-up; every variable is addressed
; as an offset from it.

		movea.l	#mybltnode,a2
		move.l	a0,(chunky-mybltnode,a2)
		move.l	a1,(plane0-mybltnode,a2)
		move.l	a3,(buff0-mybltnode,a2)
		move.l	a4,(buff1-mybltnode,a2)
		move.l	a5,(timerbase-mybltnode,a2)
		move.l	a6,(gfxbase-mybltnode,a2)
		move.l	d0,(utilitybase-mybltnode,a2)

; if different size then initialise v_plsiz, v_plsiz_depth and split location
; (generic build: d1 is compared with the size remembered from the last call)
	ifd generic
		cmp.l	(v_plsiz-mybltnode,a2),d1
		beq.b	skip_relocate	; same size as last call: keep everything
		move.l	d1,(v_plsiz-mybltnode,a2)

; initial partition: n_blit = d1/8, plsiz_blit = 4*n_blit, pixels_blit = 32*n_blit
		move.l	d1,d0
		lsr.l	#3,d0
		move.l	d0,(n_blit-mybltnode,a2)
		lsl.l	#2,d0
		move.l	d0,(plsiz_blit-mybltnode,a2)
		lsl.l	#3,d0
		move.l	d0,(pixels_blit-mybltnode,a2)

; v_plsiz_depth = (depth-1) * d1
		moveq	#depth-1,d0
		movea.l	(utilitybase-mybltnode,a2),a6
		jsr	(_LVOUMult32,a6)
		move.l	d0,(v_plsiz_depth-mybltnode,a2)

; clear the flag so the end-of-call code uses the formula again for the new
; size; if it was already set the main loop has been relocated before
		bclr	#0,(firsttimeflag-mybltnode,a2)
		bne.b	skip_relocate

; see if this is the first time thru
	else
		tst.b	(firsttimeflag-mybltnode,a2)
		bne.b	skip_relocate	; branch if not being called 1st time
	endc

; relocate the mainloop to a quad-longword boundary (for 030/040 cache line)
; The code from mainloop up to endcode is copied downwards, word by word, to
; the first 16-byte boundary at or after begincode; the nops at begincode
; provide the slack.  This must happen only once, because the copy overwrites
; its own source.

		lea	(begincode,pc),a4
		adda.w	#15,a4
		move.l	a4,d0
		and.w	#~15,d0		; round up to a multiple of 16
		movea.l	d0,a4		; a4 = destination
		lea	(mainloop,pc),a3	; a3 = source
		move.w	#(endcode-mainloop)/2-1,d0	; word count - 1 for dbra
1$:		move.w	(a3)+,(a4)+
		dbra	d0,1$

; flush the caches
; (the instructions were just written as data)

		movea.l	(4).w,a6
		jsr	(_LVOCacheClearU,a6)

; Choose the blitter's source buffer: the chunky data itself when it already
; lives in CHIP ram, otherwise a copy of the blitter's share placed in buff0.
skip_relocate:

; see if chunky data are in CHIP ram

		movea.l	(chunky-mybltnode,a2),a1
		move.l	a1,(source-mybltnode,a2)	; assume the blitter can read it directly
		movea.l	(4).w,a6
		jsr	(_LVOTypeOfMem,a6)
		and.w	#MEMF_CHIP,d0
		bne.b	readstart	; branch if already in CHIP

; copy pixels_blit from chunky to buff0 (from FAST to CHIP) for the blitter

		movea.l	(chunky-mybltnode,a2),a0	; from
		movea.l	(buff0-mybltnode,a2),a1	; to
		move.l	a1,(source-mybltnode,a2)	; blitter reads from buff0 instead
		move.l	(pixels_blit-mybltnode,a2),d0	; byte count
		jsr	(_LVOCopyMemQuick,a6)

; read the start time
; (both the blitter and the CPU times are measured from this value)

readstart:	lea	(starttime-mybltnode,a2),a0
		movea.l	(timerbase-mybltnode,a2),a6
		jsr	(_LVOReadEClock,a6)

; start the blitter in the background
; waitflag is set here and cleared by qblitcleanup; the CPU side polls it
; after finishing its own share.

		st	(waitflag-mybltnode,a2)
		movea.l	a2,a1		; a1 -> mybltnode
		movea.l	(gfxbase-mybltnode,a2),a6
		jsr	(_LVOQBlit,a6)

; compute starting parameters for the CPU routine
;
; The CPU converts everything below the partition.  Stack layout on entry to
; the main loop:
;   (sp).w    outer loop counter = (plsiz - plsiz_blit)/4, one count per
;             longword written to each plane
;   (2,sp).l  v_plsiz_depth      (generic build only)
;   (6,sp).l  v_plsiz            (generic build only)

	ifd generic
		move.l	(v_plsiz-mybltnode,a2),d0
		move.l	d0,-(sp)
		move.l	(v_plsiz_depth-mybltnode,a2),-(sp)
	else
		move.l	#plsiz,d0
	endc
		sub.l	(plsiz_blit-mybltnode,a2),d0
		lsr.l	#2,d0
		move.w	d0,-(sp)	; outer loop counter on stack

		move.l	(chunky-mybltnode,a2),a0
		adda.l	(pixels_blit-mybltnode,a2),a0	; offset into chunky

		move.l	(plane0-mybltnode,a2),a1
		adda.l	(plsiz_blit-mybltnode,a2),a1	; offset into plane

		lea	(buffers-mybltnode,a2),a3	; a3 -> buffers

; Fixed-size build: bias a1 so that every plane can be reached with a 16-bit
; displacement.  These conditions must match the ones that select the b0..b8
; macro sets further down, which test 4*plsiz-4-32768 and 2*plsiz-32768;
; otherwise the macros assume a different a1 than the one set up here.

	ifnd generic
	iflt 4*plsiz-4-32768
		adda.w	#3*plsiz,a1	; a1 -> plane 3
	else
	iflt 2*plsiz-32768
		adda.w	#1*plsiz,a1	; a1 -> plane 1
	endc
	endc
	endc

; set up register constants

		move.l	#$0f0f0f0f,d5	; d5 = constant $0f0f0f0f
		move.l	#$55555555,d6	; d6 = constant $55555555
		move.l	#$3333cccc,d7	; d7 = constant $3333cccc
		lea	(4,a3),a2	; used for inner loop end test

; load up address registers with buffer ptrs
; (buffers holds one longword per plane, two planes per pointer register)

		lea	(2*4,a3),a4	; a4 -> plane2buf
		lea	(2*4,a4),a5	; a5 -> plane4buf
		lea	(2*4,a5),a6	; a6 -> plane6buf

begincode:	rept	8		; space for mainloop code relocation
		nop
		endr

; main loop (starts here) processes 8 chunky pixels at a time
;
; Registers:
;   a0        -> next chunky pixels (post-incremented)
;   a3..a6    -> plane 0/1, 2/3, 4/5, 6/7 byte buffers (in "buffers")
;   a2        =  initial a3 + 4, the inner-loop end value
;   d5,d6,d7  =  masks $0f0f0f0f, $55555555, $3333cccc
;   d0-d4     =  scratch
; Each pass stores one byte per plane; the loop ends when a3 reaches a2, i.e.
; after four passes (32 pixels, one longword per plane).
; restart and restart2 are re-entry points: the code following the loop starts
; converting the next 8 pixels between its plane stores and then branches back
; into the middle of this loop.
; Everything from mainloop up to endcode is copied to a 16-byte boundary on
; the first call; the numbers in the comments are cycle counts.

mainloop:	move.l	(a0)+,d2	; 12 get next 4 chunky pixels in d2
		move.l	(a0)+,d3	; 12 get next 4 chunky pixels in d3
	ifgt depth-4
		move.l	d2,d0		;  4
		and.l	d5,d2		;  8 d5=$0f0f0f0f
		move.l	d3,d1		;  4
		and.l	d5,d3		;  8 d5=$0f0f0f0f
		eor.l	d2,d0		;  8
		eor.l	d3,d1		;  8
		lsr.l	#4,d1		; 16
		or.l	d1,d0		;  8
	endc
		lsl.l	#4,d2		; 16
		or.l	d3,d2		;  8
		move.l	d2,d3		;  4
		and.l	d7,d3		;  8 d7=$3333cccc
		eor.l	d3,d2		;  8
		lsr.w	#2,d3		; 10
		swap	d3		;  4
		lsl.w	#2,d3		; 10
		or.l	d2,d3		;  8
	ifgt depth-4
		move.l	d0,d1		;  4
		and.l	d7,d1		;  8 d7=$3333cccc
		eor.l	d1,d0		;  8
		lsr.w	#2,d1		; 10
		swap	d1		;  4
		lsl.w	#2,d1		; 10
		or.l	d0,d1		;  8
		move.l	d1,d2		;  4
		lsr.l	#7,d2		; 22
		move.l	d1,d0		;  4
		and.l	d6,d0		;  8 d6=$55555555
	endc
	ifgt depth-4
		eor.l	d0,d1		;  8
		move.l	d2,d4		;  4
		and.l	d6,d4		;  8 d6=$55555555
		eor.l	d4,d2		;  8
restart:	or.l	d4,d1		;  8
		lsr.l	#1,d1		; 10
		move.b	d1,(4,a5)	; 12 plane 5
	ifgt depth-6
		swap	d1		;  4
		move.b	d1,(4,a6)	; 12 plane 7
	endc
		or.l	d0,d2		;  8
		move.b	d2,(a5)+	;  8 plane 4
	ifgt depth-6
		swap	d2		;  4
		move.b	d2,(a6)+	;  8 plane 6
	endc
	endc
		move.l	d3,d2		;  4
		lsr.l	#7,d2		; 22
		move.l	d3,d0		;  4
		and.l	d6,d0		;  8 d6=$55555555
		eor.l	d0,d3		;  8
		move.l	d2,d4		;  4
		and.l	d6,d4		;  8 d6=$55555555
		eor.l	d4,d2		;  8
		or.l	d4,d3		;  8
		lsr.l	#1,d3		; 10
restart2:	move.b	d3,(4,a3)	; 12 plane 1
		swap	d3		;  4
		move.b	d3,(4,a4)	; 12 plane 3
		or.l	d0,d2		;  8
		move.b	d2,(a3)+	;  8 plane 0
		swap	d2		;  4
		move.b	d2,(a4)+	;  8 plane 2

		cmpa.l	a3,a2		;128  6
		bne.w	mainloop	;130 10	total=512 (64.0 cycles/pixel)

; move stack buffers to bitplanes (longword writes) and restore ptrs
; Intersperse some instructions for the next loop between writes to do something
; useful while waiting for the Chip bus.
; Do this by defining macros b0..b8 which are then interleaved with some
; instructions for next loop.
; Check if finished, go back for more.

; Generic build: plane size is only known at run time, so the stride is read
; from the stack.  b0 points a1 at the highest plane and loads the stride
; into d4; b1..b7 each store one plane longword and step a1 down one plane;
; b8 stores plane 0 and advances a1 to the next longword.
; The pre-decrementing loads also wind a3..a6 back to the start of their
; buffers for the next group of 32 pixels.
	ifd generic
b0		macro
		move.l	(6,sp),d4		; (v_plsiz) a1 points at plane 0
		adda.l	(2,sp),a1		; (v_plsiz_depth)
		endm
b1		macro
		move.l	(a6),(a1)		; plane 7
		suba.l	d4,a1			;
		endm
b2		macro
		move.l	-(a6),(a1)		; plane 6
		suba.l	d4,a1			;
		endm
b3		macro
		move.l	(a5),(a1)		; plane 5
		suba.l	d4,a1			;
		endm
b4		macro
		move.l	-(a5),(a1)		; plane 4
		suba.l	d4,a1			;
		endm
b5		macro
		move.l	(a4),(a1)		; plane 3
		suba.l	d4,a1			;
		endm
b6		macro
		move.l	-(a4),(a1)		; plane 2
		suba.l	d4,a1			;
		endm
b7		macro
		move.l	(a3),(a1)		; plane 1
		suba.l	d4,a1			;
		endm
b8		macro
		move.l	-(a3),(a1)+		; plane 0
		endm
	endc

	ifnd generic
	iflt 4*plsiz-4-32768		;    a1 points into plane 3
b0		macro
		move.l	(a4),(a1)+		; plane 3
		endm
b1		macro
		endm
b2		macro
		move.l	(a6),(4*plsiz-4,a1)	; plane 7
		endm
b3		macro
		move.l	-(a6),(3*plsiz-4,a1)	; plane 6
		endm
b4		macro
		move.l	(a5),(2*plsiz-4,a1)	; plane 5
		endm
b5		macro
		move.l	-(a5),(1*plsiz-4,a1)	; plane 4
		endm
b6		macro
		move.l	-(a4),(-1*plsiz-4,a1)	; plane 2
		endm
b7		macro
		move.l	(a3),(-2*plsiz-4,a1)	; plane 1
		endm
b8		macro
		move.l	-(a3),(-3*plsiz-4,a1)	; plane 0
		endm
	endc
	endc

; Fixed-size build, medium planes (2*plsiz still fits a 16-bit displacement):
; a1 normally points into plane 1.  For more than 4 planes b0 moves it up by
; four planes so planes 7..4 are reachable, and b4 moves it back down again.
; b7 stores plane 1 and advances a1, hence the -4 in b8.
	ifnd generic
	ifge 4*plsiz-4-32768
	iflt 2*plsiz-32768			; a1 points into plane 1
b0		macro
	ifgt depth-4
		adda.l	#4*plsiz,a1
	endc
		endm
b1		macro
		move.l	(a6),(2*plsiz,a1)	; plane 7
		endm
b2		macro
		move.l	-(a6),(1*plsiz,a1)	; plane 6
		endm
b3		macro
		move.l	(a5),(0*plsiz,a1)	; plane 5
		endm
b4		macro
		move.l	-(a5),(-1*plsiz,a1)	; plane 4
		suba.l	#4*plsiz,a1
		endm
b5		macro
		move.l	(a4),(2*plsiz,a1)	; plane 3
		endm
b6		macro
		move.l	-(a4),(1*plsiz,a1)	; plane 2
		endm
b7		macro
		move.l	(a3),(a1)+		; plane 1
		endm
b8		macro
		move.l	-(a3),(-1*plsiz-4,a1)	; plane 0
		endm
	endc
	endc
	endc

	ifnd generic
	ifge 2*plsiz-32768			; a1 points into plane 1
	iflt plsiz-32768			; a1 points into plane 0
b0		macro
	ifgt plsiz-5
		adda.l	#6*plsiz,a1
	else
		adda.l	#3*plsiz,a1
	endc
		endm
b1		macro
		move.l	(a6),(plsiz,a1)		; plane 7
		endm
b2		macro
		move.l	-(a6),(a1)		; plane 6
		endm
b3		macro
		move.l	(a5),(-plsiz,a1)	; plane 5
		suba.l	#3*plsiz,a1
		endm
b4		macro
		move.l	-(a5),(plsiz,a1)	; plane 4
		endm
b5		macro
		move.l	(a4),(a1)		; plane 3
		endm
b6		macro
		move.l	-(a4),(-plsiz,a1)	; plane 2
		suba.l	#3*plsiz,a1
		endm
b7		macro
		move.l	(a3),(plsiz,a1)		; plane 1
		endm
b8		macro
		move.l	-(a3),(a1)+		; plane 0
		endm
	endc
	endc
	endc

; Fixed-size build, planes of 32K or more: no plane offset fits a 16-bit
; displacement, so a1 is stepped explicitly, as in the generic set but with
; assembly-time constants.  b0 loads the stride into d4 and points a1 at the
; highest plane; b1..b7 store and step down; b8 stores plane 0 and advances.
	ifnd generic
	ifge plsiz-32768			; a1 points into plane 0
b0		macro
		move.l	#plsiz,d4
		adda.l	#(depth-1)*plsiz,a1
		endm
b1		macro
		move.l	(a6),(a1)		; plane 7
		suba.l	d4,a1
		endm
b2		macro
		move.l	-(a6),(a1)		; plane 6
		suba.l	d4,a1
		endm
b3		macro
		move.l	(a5),(a1)		; plane 5
		suba.l	d4,a1
		endm
b4		macro
		move.l	-(a5),(a1)		; plane 4
		suba.l	d4,a1
		endm
b5		macro
		move.l	(a4),(a1)		; plane 3
		suba.l	d4,a1
		endm
b6		macro
		move.l	-(a4),(a1)		; plane 2
		suba.l	d4,a1
		endm
b7		macro
		move.l	(a3),(a1)		; plane 1
		suba.l	d4,a1
		endm
b8		macro
		move.l	-(a3),(a1)+		; plane 0
		endm
	endc
	endc

; Now use the macros b0..b8 interleaved in instructions for next loop
;
; One of the three sequences below is assembled, according to depth.  Each
; one stores the buffered longwords to the bitplanes and, between the stores,
; repeats the first part of the main loop for the next 8 pixels.  It then
; decrements the outer loop counter at (sp) and re-enters the main loop at
; restart (restart2 for 4 planes), where the repeated part leaves off.
; The "plane N" comments on the macro calls assume b1..b8 map to planes 7..0.
; NOTE(review): d4 is loaded by b0 in some macro sets and is overwritten by
; the interleaved code while later b-macros are still to run; check the
; instruction order carefully before moving anything here.

; 8 planes
	ifeq depth-8
		b0
		b1				; plane 7
		move.l	(a0)+,d2	; 12 get next 4 chunky pixels in d2
		move.l	(a0)+,d3	; 12 get next 4 chunky pixels in d3
		move.l	d2,d0		;  4
		and.l	d5,d2		;  8 d5=$0f0f0f0f
		b2				; plane 6
		move.l	d3,d1		;  4
		and.l	d5,d3		;  8 d5=$0f0f0f0f
		eor.l	d2,d0		;  8
		eor.l	d3,d1		;  8
		lsr.l	#4,d1		; 16
		b3				; plane 5
		or.l	d1,d0		;  8
		lsl.l	#4,d2		; 16
		or.l	d3,d2		;  8
		move.l	d2,d3		;  4
		and.l	d7,d3		;  8 d7=$3333cccc
		b4				; plane 4
		eor.l	d3,d2		;  8
		lsr.w	#2,d3		; 10
		swap	d3		;  4
		lsl.w	#2,d3		; 10
		or.l	d2,d3		;  8
		b5				; plane 3
		move.l	d0,d1		;  4
		and.l	d7,d1		;  8 d7=$3333cccc
		eor.l	d1,d0		;  8
		lsr.w	#2,d1		; 10
		swap	d1		;  4
		b6				; plane 2
		lsl.w	#2,d1		; 10
		or.l	d0,d1		;  8
		move.l	d1,d2		;  4
		lsr.l	#7,d2		; 22
		move.l	d1,d0		;  4
		b7				; plane 1
		and.l	d6,d0		;  8 d6=$55555555
		eor.l	d0,d1		;  8
		move.l	d2,d4		;  4
		and.l	d6,d4		;  8 d6=$55555555
		eor.l	d4,d2		;  8
		b8				; plane 0
		sub.w	#1,(sp)		; outer loop counter
		bne.w	restart
	else
	ifeq depth-6
		b0
		b3				; plane 5
		move.l	(a0)+,d2	; 12 get next 4 chunky pixels in d2
		move.l	(a0)+,d3	; 12 get next 4 chunky pixels in d3
		move.l	d2,d0		;  4
		and.l	d5,d2		;  8 d5=$0f0f0f0f
		move.l	d3,d1		;  4
		and.l	d5,d3		;  8 d5=$0f0f0f0f
		b4				; plane 4
		eor.l	d2,d0		;  8
		eor.l	d3,d1		;  8
		lsr.l	#4,d1		; 16
		or.l	d1,d0		;  8
		lsl.l	#4,d2		; 16
		or.l	d3,d2		;  8
		b5				; plane 3
		move.l	d2,d3		;  4
		and.l	d7,d3		;  8 d7=$3333cccc
		eor.l	d3,d2		;  8
		lsr.w	#2,d3		; 10
		swap	d3		;  4
		lsl.w	#2,d3		; 10
		b6				; plane 2
		or.l	d2,d3		;  8
		move.l	d0,d1		;  4
		and.l	d7,d1		;  8 d7=$3333cccc
		eor.l	d1,d0		;  8
		lsr.w	#2,d1		; 10
		swap	d1		;  4
		lsl.w	#2,d1		; 10
		or.l	d0,d1		;  8
		move.l	d1,d2		;  4
		b7				; plane 1
		lsr.l	#7,d2		; 22
		move.l	d1,d0		;  4
		and.l	d6,d0		;  8 d6=$55555555
		eor.l	d0,d1		;  8
		move.l	d2,d4		;  4
		and.l	d6,d4		;  8 d6=$55555555
		eor.l	d4,d2		;  8
		b8				; plane 0
		sub.w	#1,(sp)		; outer loop counter
		bne.w	restart
	else
	ifeq depth-4
		b0
		b5				; plane 3
		move.l	(a0)+,d2	; 12 get next 4 chunky pixels in d2
		move.l	(a0)+,d3	; 12 get next 4 chunky pixels in d3
		lsl.l	#4,d2		; 16
		or.l	d3,d2		;  8
		move.l	d2,d3		;  4
		and.l	d7,d3		;  8 d7=$3333cccc
		eor.l	d3,d2		;  8
		b6				; plane 2
		lsr.w	#2,d3		; 10
		swap	d3		;  4
		lsl.w	#2,d3		; 10
		or.l	d2,d3		;  8
		move.l	d3,d2		;  4
		lsr.l	#7,d2		; 22
		move.l	d3,d0		;  4
		b7				; plane 1
		and.l	d6,d0		;  8 d6=$55555555
		eor.l	d0,d3		;  8
		move.l	d2,d1		;  4
		and.l	d6,d1		;  8 d6=$55555555
		eor.l	d1,d2		;  8
		or.l	d1,d3		;  8
		lsr.l	#1,d3		; 10
		b8				; plane 0
		sub.w	#1,(sp)		; outer loop counter
		bne.w	restart2
	else
		die	"Unsupported depth"
	endc
	endc
	endc

; The jump target is the original (un-relocated) endcode; this instruction is
; the last one inside the region that gets copied.
		jmp	(endcode)	; break out of relocated code
endcode:

; The CPU share is finished.  From here on a3 -> buffers and every variable
; is addressed relative to it.

; drop what the set-up code pushed: the word counter, plus two longwords in
; the generic build
	ifd generic
		lea	(10,sp),sp		; counter + v_plsiz_depth + v_plsiz
	else
		lea	(2,sp),sp		; counter only
	endc

; timestamp the end of the CPU part
		movea.l	(timerbase-buffers,a3),a6
		lea	(endcputime-buffers,a3),a0
		jsr	(_LVOReadEClock,a6)

; Now let the blitter catch up.  waitflag stays set until qblitcleanup has
; run, so keep calling WaitBlit for as long as it is non-zero.
; busy-wait (for a very short time) on FAST bus, even on a CHIP-only machine
		movea.l	(gfxbase-buffers,a3),a6
waitloop:	tst.b	(waitflag-buffers,a3)
		beq.b	endwaitloop		; blitter side has finished
		jsr	(_LVOWaitBlit,a6)
		bra.b	waitloop
endwaitloop:

; Work out the partition point for the next call, then return.
;
; The first call after a size change uses the formula from the file header;
; later calls nudge n_blit by 8 units towards the slower side.  Either way a
; new value is stored only if 0 < n_blit < n, because:
;   n_blit = 0  would give the blit functions a zero row count, and
;   n_blit = n  would give the CPU loop a zero outer counter, which it
;               decrements before testing.
; If no valid value results, the current partition is kept.

; get blittime,cputime,n_blit in d2,d3,d0
; (only the second longword of each EClock value is used)

		move.l	(endblittime+4-buffers,a3),d2
		sub.l	(starttime+4-buffers,a3),d2

		move.l	(endcputime+4-buffers,a3),d3
		sub.l	(starttime+4-buffers,a3),d3

		move.l	(n_blit-buffers,a3),d0

; branch if this is not the first time through

		bset	#0,(firsttimeflag-buffers,a3)
		bne.b	simple

; calculate new partition point for next call using formula
; n_blit = n * (t_cpu * n_blit / (t_blit * n_cpu + t_cpu * n_blit))
; d0     = plsiz/4 * d3 * d0 / (d2 * (plsiz/4 - d0) + d3 * d0)

		movea.l	(utilitybase-buffers,a3),a6

		moveq	#10,d4
		lsr.l	d4,d2			; scale t_blit (avoid overflow)
		lsr.l	d4,d3			; scale t_cpu

		move.l	d0,d4			; d4 = n_blit
		move.l	d3,d1
		jsr	(_LVOUMult32,a6)
		move.l	d0,d3			; d3 = t_cpu * n_blit

	ifd generic
		move.l	(v_plsiz-buffers,a3),d1
		lsr.l	#2,d1
	else
		move.l	#plsiz/4,d1
	endc
		jsr	(_LVOUMult32,a6)
		move.l	d0,d5			; d5 = n * t_cpu * n_blit (numerator)

	ifd generic
		move.l	(v_plsiz-buffers,a3),d0
		lsr.l	#2,d0
	else
		move.l	#plsiz/4,d0
	endc
		sub.l	d4,d0			; d0 = n_cpu
		move.l	d2,d1
		jsr	(_LVOUMult32,a6)
		add.l	d0,d3			; d3 = t_blit*n_cpu + t_cpu*n_blit
		beq.b	alldone			; zero divisor (e.g. both scaled times
						; are 0 for a very small frame)

		move.l	d5,d0
		move.l	d3,d1
		jsr	(_LVOUDivMod32,a6)	; d0 = quotient

; range-check the result exactly as the simple path does
		tst.l	d0
		beq.b	alldone			; blitter would get nothing
	ifd generic
		move.l	(v_plsiz-buffers,a3),d1
		lsr.l	#2,d1
		cmp.l	d1,d0
	else
		cmp.l	#plsiz/4,d0
	endc
		bcs.b	done			; 0 < n_blit < n: accept
		bra.b	alldone			; CPU would get nothing

; simple-minded adjustment

simple:		sub.l	d3,d2			; blittime-cputime
		beq.b	alldone			; can't do better than this
		bgt.b	1$
; blittime < cputime, increase n_blit
		addq.l	#8,d0
	ifd generic
		move.l	(v_plsiz-buffers,a3),d1
		lsr.l	#2,d1
		cmp.l	d1,d0
	else
		cmp.l	#plsiz/4,d0
	endc
		bcs.b	done
		bra.b	alldone			; don't go out of range
; blittime > cputime, decrease n_blit
1$:		subq.l	#8,d0
		bhi.b	done			; no borrow and not zero
		bra.b	alldone			; don't go out of range

; save the new partition point
; plsiz_blit = 4*n_blit bytes per plane, pixels_blit = 32*n_blit pixels

done:		move.l	d0,(n_blit-buffers,a3)
		lsl.l	#2,d0
		move.l	d0,(plsiz_blit-buffers,a3)
		lsl.l	#3,d0
		move.l	d0,(pixels_blit-buffers,a3)

; all done!

alldone:	movem.l	(sp)+,d2-d7/a2-a6
		rts

;-----------------------------------------------------------------------------
; QBlit functions (called asynchronously)

; blit11 -- pass 1, first half (ascending): source -> buff1.
; In:  a0 = base that the blitter register offsets are applied to
;      a1 -> mybltnode
; Rows are 4 words wide; A and B skip 8 bytes after every row and D is
; written contiguously.  Also sets the masks and bltdmod, which later stages
; leave untouched.
; Out: qblitfunc = blit12.
; NOTE(review): presumably QBlit calls the next stage because the routine
; returns with Z clear (the last move stores a non-zero address) -- confirm
; against the QBlit documentation.
blit11:		moveq	#-1,d0
		move.l	d0,(bltafwm,a0)		; long write starting at bltafwm
		move.l	#(8<<16)+8,(bltbmod,a0)	; also loads bltamod
		move.w	#0,(bltdmod,a0)
		move.l	(source-mybltnode,a1),d0
		move.l	d0,(bltapt,a0)		; source
		addq.l	#8,d0
		move.l	d0,(bltbpt,a0)		; source+8
		move.w	#%1111111100000000,(bltcdat,a0)
		move.l	(buff1-mybltnode,a1),(bltdpt,a0) ; buff1
		move.l	#$0DE48000,(bltcon0,a0)	; D=AC+(B>>8)~C
		move.l	(pixels_blit-mybltnode,a1),d0
		lsr.l	#4,d0
		move.w	d0,(bltsizv,a0)		; pixels_blit/16
		move.w	#4,(bltsizh,a0)		; do blit
		lea	(blit12,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

; blit12 -- pass 1, second half (descending): source -> buff1.
; In:  a0 = blitter register base, a1 -> mybltnode
; Pointers start at the end of the areas.  Modulos, C data and bltsizv are
; not rewritten here; the code relies on the values blit11 loaded.
; Out: qblitfunc = blit21.
blit12:		move.l	(source-mybltnode,a1),d0
		add.l	(pixels_blit-mybltnode,a1),d0
		sub.l	#8+2,d0
		move.l	d0,(bltapt,a0)		; source+pixels_blit-8-2
		addq.l	#8,d0
		move.l	d0,(bltbpt,a0)		; source+pixels_blit-2
		sub.l	(source-mybltnode,a1),d0
		add.l	(buff1-mybltnode,a1),d0
		move.l	d0,(bltdpt,a0)		; buff1+pixels_blit-2
		move.l	#$8DE40002,(bltcon0,a0)	; D=(A<<8)C+B~C, desc.
		move.w	#4,(bltsizh,a0)		; do blit
		lea	(blit21,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

; blit21 -- pass 2, first half (ascending): buff1 -> buff0.
; In:  a0 = blitter register base, a1 -> mybltnode
; Rows are 2 words wide; A and B skip 4 bytes after every row.  The row count
; is pixels_blit/8.  If that exceeds 32768, the blit is issued in chunks of
; 32768 rows: blit21b is installed as the next stage, subtracts the rows
; already done and loops back to blit21a.  The pointers are not reloaded
; between chunks, so every chunk must use the same 2-word row width as the
; final one; a 1-word chunk would advance the pointers by the wrong amount.
; Out: qblitfunc = blit21b (more chunks to do) or blit22.
blit21:		move.l	#(4<<16)+4,(bltbmod,a0)	; also load bltamod
		move.l	(buff1-mybltnode,a1),d1
		move.l	d1,(bltapt,a0)		; buff1
		addq.l	#4,d1
		move.l	d1,(bltbpt,a0)		; buff1+4
		move.w	#%1111000011110000,(bltcdat,a0)
		move.l	(buff0-mybltnode,a1),(bltdpt,a0) ; buff0
		move.l	#$0DE44000,(bltcon0,a0)	; D=AC+(B>>4)~C
		move.l	(pixels_blit-mybltnode,a1),d0
		lsr.l	#3,d0			; bltsizv = pixels_blit/8
blit21a:	cmp.l	#32768,d0		; check for overflow blitter
		bls.b	blit21c			; branch if ok
		move.l	d0,(tmp_ptr-mybltnode,a1) ; else save (too big) bltsizv
		move.w	#32768,(bltsizv,a0)	; max possible bltsizv
		move.w	#2,(bltsizh,a0)		; do blit (same width as blit21c)
		lea	(blit21b,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

blit21b:	move.l	(tmp_ptr-mybltnode,a1),d0 ; restore (too big) bltsizv
		sub.l	#32768,d0		; subtract number already done
		bra.b	blit21a			; loop back

blit21c:	move.w	d0,(bltsizv,a0)		; pixels_blit/8
		move.w	#2,(bltsizh,a0)		; do blit
		lea	(blit22,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

; blit22 -- pass 2, second half (descending): buff1 -> buff0.
; In:  a0 = blitter register base, a1 -> mybltnode
; Pointers start at the end of the areas.  Rows are 2 words wide and the row
; count is pixels_blit/8, split into chunks of 32768 rows when it is larger
; (blit22b/blit22a).  The pointers carry on from chunk to chunk, so the
; chunks use the same 2-word width as the final blit.
; Out: qblitfunc = blit22b (more chunks to do) or blit31.
blit22:		move.l	(buff1-mybltnode,a1),d0
		add.l	(pixels_blit-mybltnode,a1),d0
		subq.l	#2+4,d0
		move.l	d0,(bltapt,a0)		; buff1+pixels_blit-2-4
		addq.l	#4,d0
		move.l	d0,(bltbpt,a0)		; buff1+pixels_blit-2
		sub.l	(buff1-mybltnode,a1),d0
		add.l	(buff0-mybltnode,a1),d0
		move.l	d0,(bltdpt,a0)		; buff0+pixels_blit-2
		move.l	#$4DE40002,(bltcon0,a0)	; D=(A<<4)C+B~C, desc.
		move.l	(pixels_blit-mybltnode,a1),d0
		lsr.l	#3,d0			; bltsizv = pixels_blit/8
blit22a:	cmp.l	#32768,d0		; check for overflow blitter
		bls.b	blit22c			; branch if ok
		move.l	d0,(tmp_ptr-mybltnode,a1) ; else save (too big) bltsizv
		move.w	#32768,(bltsizv,a0)	; max possible bltsizv
		move.w	#2,(bltsizh,a0)		; do blit (same width as blit22c)
		lea	(blit22b,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

blit22b:	move.l	(tmp_ptr-mybltnode,a1),d0 ; restore (too big) bltsizv
		sub.l	#32768,d0		; subtract number already done
		bra.b	blit22a			; loop back

blit22c:	move.w	d0,(bltsizv,a0)		; pixels_blit/8
		move.w	#2,(bltsizh,a0)		; do blit
		lea	(blit31,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

; blit31 -- pass 3, first half (ascending): buff0 -> buff1.
; In:  a0 = blitter register base, a1 -> mybltnode
; Rows are 1 word wide; A and B skip 2 bytes after every row.  The row count
; is pixels_blit/4, issued in chunks of 32768 rows when it is larger
; (blit31b subtracts the rows already done and loops back to blit31a).
; Out: qblitfunc = blit31b (more chunks to do) or blit32.
blit31:		move.l	#(2<<16)+2,(bltbmod,a0)	; also load bltamod
		move.l	(buff0-mybltnode,a1),d0
		move.l	d0,(bltapt,a0)		; buff0
		addq.l	#2,d0
		move.l	d0,(bltbpt,a0)		; buff0+2
		move.w	#%1100110011001100,(bltcdat,a0)
		move.l	(buff1-mybltnode,a1),(bltdpt,a0) ; buff1
		move.l	#$0DE42000,(bltcon0,a0)	; D=AC+(B>>2)~C
		move.l	(pixels_blit-mybltnode,a1),d0
		lsr.l	#2,d0			; bltsizv = pixels_blit/4
blit31a:	cmp.l	#32768,d0		; check for overflow blitter
		bls.b	blit31c			; branch if ok
		move.l	d0,(tmp_ptr-mybltnode,a1) ; else save (too big) bltsizv
		move.w	#32768,(bltsizv,a0)	; max possible bltsizv
		move.w	#1,(bltsizh,a0)		; do blit
		lea	(blit31b,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

blit31b:	move.l	(tmp_ptr-mybltnode,a1),d0 ; restore (too big) bltsizv
		sub.l	#32768,d0		; subtract number already done
		bra.b	blit31a			; loop back

blit31c:	move.w	d0,(bltsizv,a0)		; pixels_blit/4
		move.w	#1,(bltsizh,a0)		; do final blit
		lea	(blit32,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

; blit32 -- pass 3, second half (descending): buff0 -> buff1.
; In:  a0 = blitter register base, a1 -> mybltnode
; Pointers start at the end of the areas.  Rows are 1 word wide and the row
; count is pixels_blit/4, issued in chunks of 32768 rows when it is larger
; (blit32b/blit32a).  Modulos and C data are the ones blit31 loaded.
; Out: qblitfunc = blit32b (more chunks to do) or blit41.
blit32:		move.l	(buff0-mybltnode,a1),d0
		add.l	(pixels_blit-mybltnode,a1),d0
		subq.l	#2+2,d0
		move.l	d0,(bltapt,a0)		; buff0+pixels_blit-2-2
		addq.l	#2,d0
		move.l	d0,(bltbpt,a0)		; buff0+pixels_blit-2
		sub.l	(buff0-mybltnode,a1),d0
		add.l	(buff1-mybltnode,a1),d0
		move.l	d0,(bltdpt,a0)		; buff1+pixels_blit-2
		move.l	#$2DE40002,(bltcon0,a0)	; D=(A<<2)C+B~C, desc.
		move.l	(pixels_blit-mybltnode,a1),d0
		lsr.l	#2,d0			; bltsizv = pixels_blit/4
blit32a:	cmp.l	#32768,d0		; check for overflow blitter
		bls.b	blit32c			; branch if ok
		move.l	d0,(tmp_ptr-mybltnode,a1) ; else save (too big) bltsizv
		move.w	#32768,(bltsizv,a0)	; max possible bltsizv
		move.w	#1,(bltsizh,a0)		; do blit
		lea	(blit32b,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

blit32b:	move.l	(tmp_ptr-mybltnode,a1),d0 ; restore (too big) bltsizv
		sub.l	#32768,d0		; subtract number already done
		bra.b	blit32a			; loop back

blit32c:	move.w	d0,(bltsizv,a0)		; pixels_blit/4
		move.w	#1,(bltsizh,a0)		; do blit
		lea	(blit41,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

; blit41 -- pass 4, ascending, first of four: buff1 -> plane 7.
; In:  a0 = blitter register base, a1 -> mybltnode
; Pass 4 writes the finished bitplanes.  buff1 is read as eight consecutive
; sections of plsiz_blit bytes; each stage combines two neighbouring sections
; into one plane.  The destination is plane0 plus a multiple of the full
; plane size (v_plsiz in the generic build, plsiz otherwise).
; Sets modulos to 0 and bltsizv to pixels_blit/16 for all eight stages, and
; leaves the B pointer value in tmp_ptr for blit42.
; Out: qblitfunc = blit42.
blit41:		moveq	#0,d0
		move.l	d0,(bltbmod,a0)		; also load bltamod
		move.l	(buff1-mybltnode,a1),d0
		move.l	d0,(bltapt,a0)		; buff1+0*plsiz_blit
		add.l	(plsiz_blit-mybltnode,a1),d0
		move.l	d0,(bltbpt,a0)		; buff1+1*plsiz_blit
		move.l	d0,(tmp_ptr-mybltnode,a1)
		move.w	#%1010101010101010,(bltcdat,a0)
		move.l	(plane0-mybltnode,a1),d0
	ifd generic
		move.l	(v_plsiz-mybltnode,a1),d1
		lsl.l	#3,d1
		sub.l	(v_plsiz-mybltnode,a1),d1	; d1 = 7*v_plsiz
		add.l	d1,d0
	else
		add.l	#7*plsiz,d0
	endc
		move.l	d0,(bltdpt,a0)		; Plane7
		move.l	(pixels_blit-mybltnode,a1),d0
		lsr.l	#4,d0
		move.w	d0,(bltsizv,a0)		; pixels_blit/16
		move.l	#$0DE41000,(bltcon0,a0)	; D=AC+(B>>1)~C
		move.w	#1,(bltsizh,a0)		; do blit
		lea	(blit42,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

; blit42 -- pass 4, ascending, second of four: buff1 -> plane 3.
; In:  a0 = blitter register base, a1 -> mybltnode
;      tmp_ptr = B pointer used by blit41 (buff1+1*plsiz_blit)
; Out: tmp_ptr = buff1+3*plsiz_blit, qblitfunc = blit43.
blit42:		move.l	(plsiz_blit-mybltnode,a1),d1
		move.l	(tmp_ptr-mybltnode,a1),d0
		add.l	d1,d0
		move.l	d0,(bltapt,a0)		; buff1+2*plsiz_blit
		add.l	d1,d0
		move.l	d0,(bltbpt,a0)		; buff1+3*plsiz_blit
		move.l	d0,(tmp_ptr-mybltnode,a1)
		move.l	(plane0-mybltnode,a1),d0
	ifd generic
		move.l	(v_plsiz-mybltnode,a1),d1
		add.l	d1,d0
		add.l	d1,d0
		add.l	d1,d0			; plane0 + 3*v_plsiz
	else
		add.l	#3*plsiz,d0
	endc
		move.l	d0,(bltdpt,a0)		; Plane3
		move.w	#1,(bltsizh,a0)		; do blit
		lea	(blit43,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

; blit43 -- pass 4, ascending, third of four: buff1 -> plane 5.
; In:  a0 = blitter register base, a1 -> mybltnode
;      tmp_ptr = buff1+3*plsiz_blit (from blit42)
; Out: tmp_ptr = buff1+5*plsiz_blit, qblitfunc = blit44.
blit43:		move.l	(plsiz_blit-mybltnode,a1),d1
		move.l	(tmp_ptr-mybltnode,a1),d0
		add.l	d1,d0
		move.l	d0,(bltapt,a0)		; buff1+4*plsiz_blit
		add.l	d1,d0
		move.l	d0,(bltbpt,a0)		; buff1+5*plsiz_blit
		move.l	d0,(tmp_ptr-mybltnode,a1)
		move.l	(plane0-mybltnode,a1),d0
	ifd generic
		move.l	(v_plsiz-mybltnode,a1),d1
		add.l	d1,d0
		lsl.l	#2,d1
		add.l	d1,d0			; plane0 + (1+4)*v_plsiz
	else
		add.l	#5*plsiz,d0
	endc
		move.l	d0,(bltdpt,a0)		; Plane5
		move.w	#1,(bltsizh,a0)		; do blit
		lea	(blit44,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

; blit44 -- pass 4, ascending, last of four: buff1 -> plane 1.
; In:  a0 = blitter register base, a1 -> mybltnode
;      tmp_ptr = buff1+5*plsiz_blit (from blit43)
; Out: tmp_ptr = buff1+7*plsiz_blit, qblitfunc = blit45.
blit44:		move.l	(plsiz_blit-mybltnode,a1),d1
		move.l	(tmp_ptr-mybltnode,a1),d0
		add.l	d1,d0
		move.l	d0,(bltapt,a0)		; buff1+6*plsiz_blit
		add.l	d1,d0
		move.l	d0,(bltbpt,a0)		; buff1+7*plsiz_blit
		move.l	d0,(tmp_ptr-mybltnode,a1)
		move.l	(plane0-mybltnode,a1),d0
	ifd generic
		add.l	(v_plsiz-mybltnode,a1),d0
	else
		add.l	#1*plsiz,d0
	endc
		move.l	d0,(bltdpt,a0)		; Plane1
		move.w	#1,(bltsizh,a0)		; do blit
		lea	(blit45,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

; blit45 -- pass 4, descending, first of four: buff1 -> plane 0.
; In:  a0 = blitter register base, a1 -> mybltnode
;      tmp_ptr = buff1+7*plsiz_blit (from blit44)
; Switches to descending mode, so all pointers address the last word of
; their area (hence the -2); the destination is the end of the blitter's
; share of the plane, plane0+plsiz_blit-2.
; Out: tmp_ptr = A pointer (buff1+7*plsiz_blit-2), qblitfunc = blit46.
blit45:		move.l	(plsiz_blit-mybltnode,a1),d1
		move.l	(tmp_ptr-mybltnode,a1),d0
		add.l	d1,d0
		subq.l	#2,d0
		move.l	d0,(bltbpt,a0)		; buff1+8*plsiz_blit-2
		sub.l	d1,d0
		move.l	d0,(bltapt,a0)		; buff1+7*plsiz_blit-2
		move.l	d0,(tmp_ptr-mybltnode,a1)
		move.l	(plane0-mybltnode,a1),d0
		add.l	d1,d0
		subq.l	#2,d0
		move.l	d0,(bltdpt,a0)		; Plane0
		move.l	#$1DE40002,(bltcon0,a0)	; D=(A<<1)C+B~C, desc.
		move.w	#1,(bltsizh,a0)		; do blit
		lea	(blit46,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

; blit46 -- pass 4, descending, second of four: buff1 -> plane 4.
; In:  a0 = blitter register base, a1 -> mybltnode
;      tmp_ptr = buff1+7*plsiz_blit-2 (from blit45)
; Destination = plane0 + plsiz_blit + 4*(plane size) - 2.
; Out: tmp_ptr = buff1+5*plsiz_blit-2, qblitfunc = blit47.
blit46:		move.l	(plsiz_blit-mybltnode,a1),d1
		move.l	(tmp_ptr-mybltnode,a1),d0
		sub.l	d1,d0
		move.l	d0,(bltbpt,a0)		; buff1+6*plsiz_blit-2
		sub.l	d1,d0
		move.l	d0,(bltapt,a0)		; buff1+5*plsiz_blit-2
		move.l	d0,(tmp_ptr-mybltnode,a1)
		move.l	(plane0-mybltnode,a1),d0
		add.l	d1,d0
	ifd generic
		move.l	(v_plsiz-mybltnode,a1),d1
		lsl.l	#2,d1			; 4*v_plsiz
		add.l	d1,d0
		subq.l	#2,d0
	else
		add.l	#4*plsiz-2,d0
	endc
		move.l	d0,(bltdpt,a0)		; Plane4
		move.w	#1,(bltsizh,a0)		; do blit
		lea	(blit47,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

; blit47 -- pass 4, descending, third of four: buff1 -> plane 2.
; In:  a0 = blitter register base, a1 -> mybltnode
;      tmp_ptr = buff1+5*plsiz_blit-2 (from blit46)
; Destination = plane0 + plsiz_blit + 2*(plane size) - 2.
; Out: tmp_ptr = buff1+3*plsiz_blit-2, qblitfunc = blit48.
blit47:		move.l	(plsiz_blit-mybltnode,a1),d1
		move.l	(tmp_ptr-mybltnode,a1),d0
		sub.l	d1,d0
		move.l	d0,(bltbpt,a0)		; buff1+4*plsiz_blit-2
		sub.l	d1,d0
		move.l	d0,(bltapt,a0)		; buff1+3*plsiz_blit-2
		move.l	d0,(tmp_ptr-mybltnode,a1)
		move.l	(plane0-mybltnode,a1),d0
		add.l	d1,d0
	ifd generic
		move.l	(v_plsiz-mybltnode,a1),d1
		add.l	d1,d0
		add.l	d1,d0			; + 2*v_plsiz
		subq.l	#2,d0
	else
		add.l	#2*plsiz-2,d0
	endc
		move.l	d0,(bltdpt,a0)		; Plane2
		move.w	#1,(bltsizh,a0)		; do blit
		lea	(blit48,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		rts

; blit48 -- pass 4, descending, last stage: buff1 -> plane 6.
; In:  a0 = blitter register base, a1 -> mybltnode
;      tmp_ptr = buff1+3*plsiz_blit-2 (from blit47)
; Destination = plane0 + plsiz_blit + 6*(plane size) - 2.
; Out: qblitfunc = blit11, so the node is ready for the next call;
;      returns with d0 = 0 / Z set, unlike every earlier stage.
; NOTE(review): presumably Z set is what tells QBlit that the node has no
; more blits to issue -- confirm against the QBlit documentation.
blit48:		move.l	(plsiz_blit-mybltnode,a1),d1
		move.l	(tmp_ptr-mybltnode,a1),d0
		sub.l	d1,d0
		move.l	d0,(bltbpt,a0)		; buff1+2*plsiz_blit-2
		sub.l	d1,d0
		move.l	d0,(bltapt,a0)		; buff1+1*plsiz_blit-2
		move.l	(plane0-mybltnode,a1),d0
		add.l	d1,d0
	ifd generic
		move.l	(v_plsiz-mybltnode,a1),d1
		sub.l	d1,d0
		sub.l	d1,d0
		lsl.l	#3,d1
		add.l	d1,d0			; net effect: + (8-2)*v_plsiz
		subq.l	#2,d0
	else
		add.l	#6*plsiz-2,d0
	endc
		move.l	d0,(bltdpt,a0)		; Plane6
		move.w	#1,(bltsizh,a0)		; do blit
		lea	(blit11,pc),a0
		move.l	a0,(qblitfunc-mybltnode,a1)
		moveq	#0,d0			; set Z flag
		rts

; qblitcleanup -- cleanup routine referenced from mybltnode.
; Records the time the blitter side finished in endblittime, then clears
; waitflag, which the main routine polls.  The flag is cleared last so that
; the timestamp is in place before the main routine reads it.
; Preserves a2/a6 and finds its data through the absolute address of
; mybltnode rather than through an argument register.
qblitcleanup:	movem.l	a2/a6,-(sp)
		move.l	#mybltnode,a2
		lea	(endblittime-mybltnode,a2),a0
		move.l	(timerbase-mybltnode,a2),a6
		jsr	(_LVOReadEClock,a6)	; may be called from interrupts
		sf	(waitflag-mybltnode,a2)
		movem.l	(sp)+,a2/a6
		rts

;-----------------------------------------------------------------------------

		section	data,data

; All variables are reached as fixed offsets from mybltnode (set-up code and
; blit functions) or from buffers (code after the main loop), so the relative
; layout of this section matters to the code.

		quad
; eight longwords, one per bitplane: the main loop collects four bytes per
; plane here before they are written out as longwords
buffers:	dc.l	0,0,0,0,0,0,0,0
; blit node passed to QBlit
mybltnode:	dc.l	0		; next bltnode
qblitfunc:	dc.l	blit11		; ptr to qblitfunc()
		dc.b	cleanup		; stat
		dc.b	0		; filler
		dc.w	0		; blitsize
		dc.w	0		; beamsync
		dc.l	qblitcleanup	; ptr to qblitcleanup()

		quad
chunky:		dc.l	0		; ptr to original chunky data
plane0:		dc.l	0		; ptr to output planes
buff0:		dc.l	0		; ptr to chip buffer0, size = pixels
buff1:		dc.l	0		; ptr to chip buffer1, size = pixels
source:		dc.l	0		; copy of chunky (if chip) else buff0
; partition state: the generic build fills it in at run time, the fixed-size
; build starts with the blitter handling half of the image
	ifd generic
v_plsiz:	dc.l	0		; width*height/8
v_plsiz_depth:	dc.l	0		; (depth-1)*width*height/8
pixels_blit:	dc.l	0		; number of pixels handled by blitter
plsiz_blit:	dc.l	0		; & corresponding (partial) planesize
n_blit:		dc.l	0		; number of 32-byte units for blitter
	else
pixels_blit:	dc.l	pixels/2	; number of pixels handled by blitter
plsiz_blit:	dc.l	plsiz/2		; & corresponding (partial) planesize
n_blit:		dc.l	plsiz/4/2	; number of 32-byte units for blitter
	endc
tmp_ptr:	dc.l	0		; scratch carried between blit stages
gfxbase:	dc.l	0
timerbase:	dc.l	0
utilitybase:	dc.l	0
starttime:	dc.l	0,0		; EClock value read before QBlit
endblittime:	dc.l	0,0		; EClock value read in qblitcleanup
endcputime:	dc.l	0,0		; EClock value read after the CPU part
waitflag:	dc.b	0		; set before QBlit, cleared by qblitcleanup
firsttimeflag:	dc.b	0		; bit 0 set once a call has completed

;-----------------------------------------------------------------------------
;
;		section	bss,bss,chip	; MUST BE IN CHIP !!!!!
;
;		quad
;buff0:		ds.b	pixels		;Intermediate buffer 1
;buff1:		ds.b	pixels		;Intermediate buffer 1
;
;-----------------------------------------------------------------------------

		end
