From aff789af91b14462ae9e6df79720b1f71e4947ca Mon Sep 17 00:00:00 2001
From: Duncan Smith
Date: Sat, 12 Jun 2010 08:32:53 -0700
Subject: Patch received in email from Lionel Debroux

---
 main.asm | 53 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

(limited to 'main.asm')

diff --git a/main.asm b/main.asm
index abc7b89..685d74e 100644
--- a/main.asm
+++ b/main.asm
@@ -62,8 +62,8 @@ PUTB	MACRO			; 14 cycles, 4 bytes
 FETCHW	MACRO
 	;; XXX call deref
 	
-	move.b	1(a6,\1.w),\2	; 14/4
-	ror.w	#8,\2		;  4/2
+	move.b	1(a6,\1.w),-(sp); 18/4
+	move.w	(sp)+,\2	;  8/2
 	move.b	0(a6,\1.w),\2	; 14/4
 	ENDM
 
@@ -128,6 +128,7 @@ FETCHWI	MACRO			; 36 cycles, 12 bytes
 	;; XXX use deref
 	addq.w	#2,d2		;  4/2
 	move.b	-1(a6,d2.w),\1	; 14/4
+;; XXX why not rol #8,\1 ?? (and then you would be able to use the same trick as in FETCHW).
 	rol.w	#8,d2		;  4/2
 	move.b	-2(a6,d2.w),\1	; 14/4
 	ENDM
@@ -143,13 +144,13 @@ _align	SET	_align+$20
 	ENDM
 
 	;; When you want to use the high reg of a pair, use this first
-LOHI	MACRO			; 6 cycles, 2 bytes
-	ror	#8,\1
+LOHI	MACRO			; 22 cycles, 2 bytes
+	ror.w	#8,\1
 	ENDM
 
 	;; Then do your shit and finish with this
-HILO	MACRO			; 6 cycles, 2 bytes
-	rol	#8,\1
+HILO	MACRO			; 22 cycles, 2 bytes
+	rol.w	#8,\1
 	ENDM
 
 	;; calc84maniac suggests putting emu_fetch into this in order
@@ -163,6 +164,7 @@ DONE	MACRO			; 8 cycles, 2 bytes
 
 	;; Do a SUB \2,\1
 F_SUB_B	MACRO			;14 bytes?
+;; XXX use lea and then d(an) if you have a spare register.
 	move.b	\1,f_tmp_src_b	; preserve operands for flagging
 	move.b	\2,f_tmp_dst_b
 	move.b	#1,flag_n
@@ -207,6 +209,7 @@ F_DEC_W	MACRO
 
 
 _main:
+;; XXX in the current state of the code, you could just make _main and emu_setup point to the same address.
 	bsr	emu_setup
 	rts
 
@@ -214,7 +217,7 @@ _main:
 
 emu_setup:
 	movea	emu_plain_op,a5
-	movea	emu_fetch(pc),a2
+	lea	emu_fetch(pc),a2
 	;; XXX finish
 	rts
 
@@ -222,26 +225,30 @@ emu_setup:
 
 	;; Take a virtual address in d1 and dereference it.  Returns the
 	;; host address in a0.  Destroys a0, d0.
+;; XXX I added a masking of the upper bits of the Z80 address (d1) before translating them to host address.
+;; Please double-check, but AFAICT, it's the right thing to do.
 deref:
+	move.w	d1,d0
+	andi.w	#$3FFF,d0
+	movea.w	d0,a0		; a0 = d1 & $3FFF, the offset inside the 16K bank
 	move.w	d1,d0
 	andi.w	#$C000,d0
 	rol.w	#5,d0
+	jmp	0(pc,d0.w)	; NOTE(review): d0 = bank*8 after rol #5, but each entry below looks like 4 bytes (adda.l + rts); also confirm a displacement of 0 reaches entry 00
-	jmp	0(pc,d0)
 	;; 00
-	movea	a1,a0
-	bra	deref_go
+	adda.l	a1,a0		; a0 = a1 (bank 0 base per the register list) + offset
+	rts
 	;; 01
-	movea	a2,a0
-	bra	deref_go
+	adda.l	a2,a0		; a0 = a2 (bank 1 base) + offset
+	rts
 	;; 02
-	movea	a3,a0
-	bra	deref_go
+	adda.l	a3,a0		; a0 = a3 (bank 2 base) + offset
+	rts
 	;; 03
-	movea	a4,a0
-deref_go:
-	adda	d1,a0
+	adda.l	a4,a0		; a0 = a4 (bank 3 base) + offset
 	rts
 
+
 ;; =========================================================================
 ;; instruction   instruction   instruction  ================================
 ;;      _ _                 _       _       ================================
@@ -257,12 +264,12 @@ emu_fetch:
 	;; Move this into DONE, saving 8 more cycles but using extra
 	;; space.
 	;;
-	;; See if I can get rid of the eor
-	eor.w	d0,d0		; 4 cycles
-	move.b	(a4)+,d0	; 8 cycles
-	rol.w	#5,d0		; 4 cycles   adjust to actual alignment
-	jmp	0(a5,d0)	;14 cycles
-	;; overhead:		 30 cycles
+	;; Likely impossible to get rid of the clr
+	clr.w	d0,d0		;  4 cycles
+	move.b	(a4)+,d0	;  8 cycles
+	rol.w	#5,d0		; 16 cycles   adjust to actual alignment
+	jmp	0(a5,d0.w)	; 14 cycles
+	;; overhead:		  42 cycles
 
 ;;; ========================================================================
 ;;; ========================================================================
-- 
cgit v1.2.3


From 7e93257ff6bc6c456001916db480e543ab65faf9 Mon Sep 17 00:00:00 2001
From: Duncan Smith
Date: Sat, 12 Jun 2010 10:12:34 -0700
Subject: Worked in Lionel's changes to main.asm

---
 main.asm | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

(limited to 'main.asm')

diff --git a/main.asm b/main.asm
index 685d74e..28ac5e4 100644
--- a/main.asm
+++ b/main.asm
@@ -11,7 +11,7 @@
 ;;; Registers used:
 ;;;
 ;;; A7 = sp
-;;; A6 =
+;;; A6 = emulated PC
 ;;; A5 = instruction table base pointer
 ;;; A4 = bank 3 base
 ;;; A3 = bank 2 base
@@ -59,9 +59,14 @@ PUTB	MACRO			; 14 cycles, 4 bytes
 
 	;; Macro to read a word from main memory at register \1
 	;; (unaligned).  Puts the word read in \2.
+	;;
+	;; XXX deref
+	;;
+	;; <debrouxl> A byte move to -(sp) decrements sp by 2, but
+	;; stores the byte at (sp), not at 1(sp).  Popping a word
+	;; therefore gives you a "free" shift left by 8 bits.  Much
+	;; faster than lsl.w / rol.w #8, at least.
 FETCHW	MACRO
-	;; XXX call deref
-	
 	move.b	1(a6,\1.w),-(sp); 18/4
 	move.w	(sp)+,\2	;  8/2
 	move.b	0(a6,\1.w),\2	; 14/4
@@ -125,12 +130,11 @@ FETCHBI	MACRO			; 40 cycles, 14 bytes
 
 	;; Macro to read an immediate word (unaligned) into \1.
 FETCHWI	MACRO			; 36 cycles, 12 bytes
-	;; XXX use deref
 	addq.w	#2,d2		;  4/2
-	move.b	-1(a6,d2.w),\1	; 14/4
-;; XXX why not rol #8,\1 ?? (and then you would be able to use the same trick as in FETCHW).
-	rol.w	#8,d2		;  4/2
-	move.b	-2(a6,d2.w),\1	; 14/4
+	;; d2 is now past the word (hi byte at -1, lo byte at -2).  See FETCHW for the stack trick.
+	move.b	-1(a6,d2.w),-(sp); 18/4  push high byte; it lands in the upper half of the stacked word
+	move.w	(sp)+,\1	;  8/2  \1 = high byte << 8 (low byte not yet meaningful)
+	move.b	-2(a6,d2.w),\1	; 14/4  fill in the low byte
 	ENDM
 
 	;; == Common Opcode Macros =========================================
@@ -265,7 +269,7 @@ emu_fetch:
 	;; space.
 	;;
 	;; Likely impossible to get rid of the clr
-	clr.w	d0,d0		;  4 cycles
+	clr.w	d0		;  4 cycles
 	move.b	(a4)+,d0	;  8 cycles
 	rol.w	#5,d0		; 16 cycles   adjust to actual alignment
 	jmp	0(a5,d0.w)	; 14 cycles
@@ -1031,19 +1035,12 @@ emu_op_59:
 	START
 emu_op_5a:
 	;; LD	E,D
-	LOHI	d5
-	move.b	d5,d1
-	HILO	d5
-	move.b	d1,d5
-	DONE
-
-	;; Is this faster or slower?
-
-	andi.w	#$ff00,d5
-	move.b	d5,d1
-	lsr	#8,d1
-	or.w	d1,d5
+	andi.w	#$ff00,d5	; 8/4  d5 = D:00 (clear E, the low byte)
+	move.w	d5,d1		; 4/2  copy the whole pair; a byte move would only copy the zeroed E
+	lsr.w	#8,d1		;22/2  d1 = 00:D
+	or.w	d1,d5		; 4/2  d5 = D:D, i.e. E <- D
 	DONE
+				;38/10
 
 	START
 emu_op_5b:
-- 
cgit v1.2.3


From cf1021b481694b00e1b18bbc9dab67ab8c6c7553 Mon Sep 17 00:00:00 2001
From: Duncan Smith
Date: Sat, 12 Jun 2010 11:06:51 -0700
Subject: Worked in Lionel's changes to flags.asm, patched up main.asm slightly
 to fit

---
 flags.asm | 196 +++++++++++++++++++++++++++-----------------------------------
 main.asm  |  15 +++--
 2 files changed, 95 insertions(+), 116 deletions(-)

(limited to 'main.asm')

diff --git a/flags.asm b/flags.asm
index 0b85267..9c25dd7 100644
--- a/flags.asm
+++ b/flags.asm
@@ -1,43 +1,3 @@
-;;  N   =S
-;;   Z  = Z
-;;    V ~     P
-;;     C=       C
-;; 
-;; =CCR= == z80==
-;; XNZVC SZ5H3PNC
-;; 00000 00000000
-;; 00001 00000001
-;; 00010 00000100
-;; 00011 00000101
-;; 00100 01000000
-;; 00101 01000001
-;; 00110 01000100
-;; 00111 01000101
-;; 01000 10000000
-;; 01001 10000001
-;; 01010 10000100
-;; 01011 10000101
-;; 01100 11000000
-;; 01101 11000001
-;; 01110 11000100
-;; 01111 11000101
-;; 10000 00000000
-;; 10001 00000001
-;; 10010 00000100
-;; 10011 00000101
-;; 10100 01000000
-;; 10101 01000001
-;; 10110 01000100
-;; 10111 01000101
-;; 11000 10000000
-;; 11001 10000001
-;; 11010 10000100
-;; 11011 10000101
-;; 11100 11000000
-;; 11101 11000001
-;; 11110 11000100
-;; 11111 11000101
-
 	;; Routine to set the given flags
 	;;   Noted in \1 by a 1 bit
 F_SET	MACRO
@@ -56,28 +16,17 @@ F_CLEAR	MACRO
 	;; Use this when an instruction uses the P/V bit as Parity.
 	;; Sets or clears the bit explicitly.
 	;;
-	;; Byte for which parity is calculated must be in \1. (d1
+	;; Byte for which parity is calculated must be in \1.  High
+	;; byte of \1.w must be zero, using d0 is suggested. (d1
 	;; destroyed)
-;; XXX that's expensive. After making this a subroutine, to speed up parity computation, maybe you could use a 256-byte lookup table accessed by d(pc,ix.w).
-;; And if you have a spare address register, since xxx.l addressing mode is expensive speed-wise and size-wise (4 bytes + relocation),
-;; you should use lea d(pc) to preload the address of flag_valid into an address register,
-;; and then use (an) and d(an) to write to flag_valid and flag_byte.
+
 F_PAR	MACRO
-	move.b	\1,d1			;  4  2
-	lsr.w	#4,d1			; 14  2
-	eor.b	\1,d1			;  4  2
-	lsr.w	#2,d1			; 10  2
-	eor.b	\1,d1			;  4  2
-	lsr.w	#1,d1			;  8  2
-	eor.b	\1,d1			;  4  2
-	andi.b	#$01,d1			;  8  4
-	;; odd parity is now in d1
-	ori.b	#%00000100,flag_valid	; 20  8
-	andi.b	#%11111011,flag_byte	; 20  8
-	rol.b	#2,d1			;  6  2
-	or.b	d1,flag_byte		; 16  4
+	ori.b	#%00000100,flag_valid-flag_storage(a3)	; 20/6  mark the P/V bit as valid
+	move.b	flag_byte-flag_storage(a3),d1		; 12/4  d1 = current flag byte
+	andi.b	#%11111011,d1				;  8/4  clear the old P/V bit
+	or.b	lut_parity-flag_storage(a3,\1.w),d1	; 14/4  merge in lut_parity[\1.w] (0 or 4)
+	move.b	d1,flag_byte-flag_storage(a3)		; 12/4  store the updated flag byte
 	ENDM				;xxx cycles (!)
-					;    xx bytes (make this a subroutine)
 
 
 	;; Use this when an instruction uses the P/V bit as Overflow.
@@ -88,42 +37,41 @@ F_OVFL	MACRO
 
 	;; Save the two operands from ADD \1,\2
 F_ADD_SAVE	MACRO
-	move.b	\1,f_tmp_src_b
-	move.b	\2,f_tmp_dst_b
-	move.b	#$01,f_tmp_byte
+	move.b	\1,f_tmp_src_b-flag_storage(a3)
+	move.b	\2,f_tmp_dst_b-flag_storage(a3)
+	move.b	#$01,f_tmp_byte-flag_storage(a3)
 	F_SET	#%
 	ENDM
 
 	;; Normalize and return carry bit (is loaded into Z bit)
 	;; Destroys d1
 f_norm_c:
-	move.b	flag_valid(pc),d1
-;; XXX you could use lsr #1 (same number of cycles, smaller) + bcc.s or bcs.s here.
+	move.b	flag_valid-flag_storage(a3),d1
 	andi.b	#%00000001,d1
 	bne.s	FNC_ok		; Bit is valid
-	move.b	f_host_ccr(pc),d1
+	move.b	f_host_ccr-flag_storage(a3),d1
 	andi.b	#%00000001,d1
 ;; XXX see above comment for using lea and then d(an) if you have a spare register.
-	or.b	d1,flag_byte
+	or.b	d1,flag_byte-flag_storage(a3)
 	ori.b	#%00000001,flag_valid
 FNC_ok:
-	move.b	flag_byte(pc),d1
+	move.b	flag_byte-flag_storage(a3),d1
 	andi.b	#%00000001,d1
 	rts
 
 	;; Normalize and return zero bit (loaded into Z bit)
 	;; Destroys d1
 f_norm_z:
-	move.b	flag_valid(pc),d1
+	move.b	flag_valid-flag_storage(a3),d1
 	andi.b	#%01000000,d1
 	bne.s	FNZ_ok		; Bit is valid
-	move.b	f_host_ccr(pc),d1
+	move.b	f_host_ccr-flag_storage(a3),d1
 	andi.b	#%01000000,d1
 ;; XXX see above comment for using lea and then d(an) if you have a spare register.
-	or.b	d1,flag_byte
-	ori.b	#%01000000,flag_valid
+	or.b	d1,flag_byte-flag_storage(a3)
+	ori.b	#%01000000,flag_valid-flag_storage(a3)
 FNZ_ok:
-	move.b	flag_byte(pc),d1
+	move.b	flag_byte-flag_storage(a3),d1
 	andi.b	#%01000000,d1
 	rts
 
@@ -132,15 +80,14 @@ FNZ_ok:
 	;; Preconditions:
 	;;   Flags to change are noted in d0 by a 1 bit
 flags_normalize:
-	move.b	f_host_ccr(pc),d1
-;; XXX .w because you don't want garbage in bits 8-15 when using d(pc,ix.w) or d(an,ix.w) ea mode. 
-	andi.w	#%00011111,d1	; Maybe TI uses the reserved bits for
-				; something ...
-	move.b	lut_ccr(pc,d1.w),d1
+	move.b	f_host_ccr-flag_storage(a3),d1	; 12/4  d1 = f_host_ccr byte
+	;; .w clears bits 5-15 so d1.w is a clean index into lut_ccr
+	andi.w	#%00011111,d1			;  8/4  keep the five CCR bits (XNZVC)
+	move.b	lut_ccr(pc,d1.w),d1 		; 14/4  d1 = lut_ccr entry for those bits
 	;; XXX do this
 	rts
 
-storage:
+flag_storage:
 	;; 1 if tmp_???b is valid, 0 if tmp_???w is valid
 f_tmp_byte:	ds.b	0
 	;; 2 if P is 0, 3 if P is 1, 4 if P is Parity, 5 if P is oVerflow
@@ -169,36 +116,65 @@ flag_valid:	ds.b	0	; Validity mask -- 1 if valid.
 
 	;; LUT for the CCR -> F mapping
 lut_ccr:
-	dc.b	%00000000
-	dc.b	%00000001
-	dc.b	%00000100
-	dc.b	%00000101
-	dc.b	%01000000
-	dc.b	%01000001
-	dc.b	%01000100
-	dc.b	%01000101
-	dc.b	%10000000
-	dc.b	%10000001
-	dc.b	%10000100
-	dc.b	%10000101
-	dc.b	%11000000
-	dc.b	%11000001
-	dc.b	%11000100
-	dc.b	%11000101
-	dc.b	%00000000
-	dc.b	%00000001
-	dc.b	%00000100
-	dc.b	%00000101
-	dc.b	%01000000
-	dc.b	%01000001
-	dc.b	%01000100
-	dc.b	%01000101
-	dc.b	%10000000
-	dc.b	%10000001
-	dc.b	%10000100
-	dc.b	%10000101
-	dc.b	%11000000
-	dc.b	%11000001
-	dc.b	%11000100
-	dc.b	%11000101
+				;;  N   =S
+				;;   Z  = Z
+				;;    V ~     P
+				;;     C=       C
+				;;
+				;; =CCR= == z80==
+				;; XNZVC SZ5H3PNC
+	dc.b	%00000000	;; 00000 00000000
+	dc.b	%00000001	;; 00001 00000001
+	dc.b	%00000100	;; 00010 00000100
+	dc.b	%00000101	;; 00011 00000101
+	dc.b	%01000000	;; 00100 01000000
+	dc.b	%01000001	;; 00101 01000001
+	dc.b	%01000100	;; 00110 01000100
+	dc.b	%01000101	;; 00111 01000101
+	dc.b	%10000000	;; 01000 10000000
+	dc.b	%10000001	;; 01001 10000001
+	dc.b	%10000100	;; 01010 10000100
+	dc.b	%10000101	;; 01011 10000101
+	dc.b	%11000000	;; 01100 11000000
+	dc.b	%11000001	;; 01101 11000001
+	dc.b	%11000100	;; 01110 11000100
+	dc.b	%11000101	;; 01111 11000101
+	dc.b	%00000000	;; 10000 00000000
+	dc.b	%00000001	;; 10001 00000001
+	dc.b	%00000100	;; 10010 00000100
+	dc.b	%00000101	;; 10011 00000101
+	dc.b	%01000000	;; 10100 01000000
+	dc.b	%01000001	;; 10101 01000001
+	dc.b	%01000100	;; 10110 01000100
+	dc.b	%01000101	;; 10111 01000101
+	dc.b	%10000000	;; 11000 10000000
+	dc.b	%10000001	;; 11001 10000001
+	dc.b	%10000100	;; 11010 10000100
+	dc.b	%10000101	;; 11011 10000101
+	dc.b	%11000000	;; 11100 11000000
+	dc.b	%11000001	;; 11101 11000001
+	dc.b	%11000100	;; 11110 11000100
+	dc.b	%11000101	;; 11111 11000101
+
+	;; 256-byte LUT for the Parity bit.
+	;; Keep this last so all storage references require only one
+	;; extension word.
+lut_parity:
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+
 
diff --git a/main.asm b/main.asm
index 28ac5e4..1431220 100644
--- a/main.asm
+++ b/main.asm
@@ -11,12 +11,12 @@
 ;;; Registers used:
 ;;;
 ;;; A7 = sp
-;;; A6 = emulated PC
+;;; A6 = emulated PC XXX
 ;;; A5 = instruction table base pointer
-;;; A4 = bank 3 base
-;;; A3 = bank 2 base
-;;; A2 = bank 1 base
-;;; A1 = bank 0 base
+;;; A4 = emulated SP XXX
+;;; A3 = constants address (see flags.asm)
+;;; A2 =
+;;; A1 =
 ;;; A0 =
 ;;;
 ;;; D0 = current instruction, scratch for macros
@@ -213,7 +213,6 @@ F_DEC_W	MACRO
 
 
 _main:
-;; XXX in the current state of the code, you could just make _main and emu_setup point to the same address.
 	bsr	emu_setup
 	rts
 
@@ -222,6 +221,7 @@ _main:
 emu_setup:
 	movea	emu_plain_op,a5
 	lea	emu_fetch(pc),a2
+	lea	flag_storage(pc),a3	; Thanks to Lionel
 	;; XXX finish
 	rts
 
@@ -231,6 +231,9 @@ emu_setup:
 	;; host address in a0.  Destroys a0, d0.
 ;; XXX I added a masking of the upper bits of the Z80 address (d1) before translating them to host address.
 ;; Please double-check, but AFAICT, it's the right thing to do.
+
+	;; XXX these use the old setup, replace this with a writable
+	;; LUT.
 deref:
 	move.w	d1,d0
 	andi.w	#$3FFF,d0
-- 
cgit v1.2.3