2 files changed, 58 insertions, 44 deletions
diff --git a/flags.asm b/flags.asm
index 8df7c5d..0b85267 100644
--- a/flags.asm
+++ b/flags.asm
@@ -58,22 +58,26 @@ F_CLEAR	MACRO
 	;;
 	;; Byte for which parity is calculated must be in \1. (d1
 	;; destroyed)
+;; XXX This is expensive, and each eor folds against \1 instead of the running value in d1, so bit 0 ends up as bits 7^3^1^0 of \1 rather than the parity of all eight bits. After making this a subroutine, a 256-byte lookup table accessed by d(pc,ix.w) would be both faster and correct.
+;; Also, the xxx.l addressing mode is expensive both speed-wise and size-wise (4 bytes + relocation), so if you have a spare address register,
+;; use lea d(pc) to preload the address of flag_valid into it,
+;; and then use (an) and d(an) to write to flag_valid and flag_byte.
 F_PAR	MACRO
 	move.b	\1,d1			;  4  2
-	lsr	#4,d1			;  6  2
+	lsr.w	#4,d1			; 14  2
 	eor.b	\1,d1			;  4  2
-	lsr	#2,d1			;  6  2
+	lsr.w	#2,d1			; 10  2
 	eor.b	\1,d1			;  4  2
-	lsr	#1,d1			;  6  2
+	lsr.w	#1,d1			;  8  2
 	eor.b	\1,d1			;  4  2
 	andi.b	#$01,d1			;  8  4
 	;; odd parity is now in d1
-	ori.b	#%00000100,flag_valid	; 20  6
-	andi.b	#%11111011,flag_byte	; 20  6
+	ori.b	#%00000100,flag_valid	; 20  8
+	andi.b	#%11111011,flag_byte	; 20  8
 	rol.b	#2,d1			;  6  2
-	or.b	d1,flag_byte		;  8  4
-	ENDM				; 86 cycles (!)
-					;    36 bytes (make this a subroutine)
+	or.b	d1,flag_byte		; 16  4
+	ENDM				;xxx cycles (!)
+					;    xx bytes (make this a subroutine)
 
 
 	;; Use this when an instruction uses the P/V bit as Overflow.
@@ -86,37 +90,40 @@ F_OVFL	MACRO
 F_ADD_SAVE	MACRO
 	move.b	\1,f_tmp_src_b
 	move.b	\2,f_tmp_dst_b
-	movei.b	#$01,f_tmp_byte
+	move.b	#$01,f_tmp_byte
 	F_SET	#%
 	ENDM
 
 	;; Normalize and return carry bit (is loaded into Z bit)
 	;; Destroys d1
 f_norm_c:
-	move.b	flag_valid,d1
+	move.b	flag_valid(pc),d1
+;; XXX lsr.b #1,d1 followed by bcs.s FNC_ok would do this bit-0 test as well (same number of cycles, smaller).
 	andi.b	#%00000001,d1
-	bne	FNC_ok		; Bit is valid
-	move.b	f_host_ccr,d1
+	bne.s	FNC_ok		; Bit is valid
+	move.b	f_host_ccr(pc),d1
 	andi.b	#%00000001,d1
+;; XXX see the XXX note above F_PAR about using lea and then d(an) if you have a spare register.
 	or.b	d1,flag_byte
 	ori.b	#%00000001,flag_valid
 FNC_ok:
-	move.b	flag_byte,d1
+	move.b	flag_byte(pc),d1
 	andi.b	#%00000001,d1
 	rts
 
 	;; Normalize and return zero bit (loaded into Z bit)
 	;; Destroys d1
 f_norm_z:
-	move.b	flag_valid,d1
+	move.b	flag_valid(pc),d1
 	andi.b	#%01000000,d1
-	bne	FNZ_ok		; Bit is valid
-	move.b	f_host_ccr,d1
+	bne.s	FNZ_ok		; Bit is valid
+	move.b	f_host_ccr(pc),d1
 	andi.b	#%01000000,d1
+;; XXX see the XXX note above F_PAR about using lea and then d(an) if you have a spare register.
 	or.b	d1,flag_byte
 	ori.b	#%01000000,flag_valid
 FNZ_ok:
-	move.b	flag_byte,d1
+	move.b	flag_byte(pc),d1
 	andi.b	#%01000000,d1
 	rts
 
@@ -125,11 +132,11 @@ FNZ_ok:
 	;; Preconditions:
 	;;   Flags to change are noted in d0 by a 1 bit
 flags_normalize:
-	move.b	f_host_ccr,d1
-	andi.b	#%00011111,d1	; Maybe TI uses the reserved bits for
+	move.b	f_host_ccr(pc),d1
+;; XXX andi.w rather than andi.b, because bits 8-15 of d1 must not hold garbage when d1 is used as a .w index in d(pc,ix.w) or d(an,ix.w) addressing.
+	andi.w	#%00011111,d1	; Maybe TI uses the reserved bits for
 				; something ...
-	movea	lut_ccr(pc),a1
-	move.b	0(a1,d1),d1
+	move.b	lut_ccr(pc,d1.w),d1
 	;; XXX do this
 	rts
 
diff --git a/main.asm b/main.asm
index 66959db..e6febbb 100644
--- a/main.asm
+++ b/main.asm
@@ -62,8 +62,8 @@ PUTB	MACRO			; 14 cycles, 4 bytes
 FETCHW	MACRO
 	;; XXX call deref
 	
-	move.b	1(a6,\1.w),\2	; 14/4
-	ror.w	#8,\2		;  4/2
+	move.b	1(a6,\1.w),-(sp); 18/4
+	move.w	(sp)+,\2	;  8/2
 	move.b	0(a6,\1.w),\2	; 14/4
 	ENDM
 
@@ -128,6 +128,7 @@ FETCHWI	MACRO			; 36 cycles, 12 bytes
 	;; XXX use deref
 	addq.w	#2,d2		;  4/2
 	move.b	-1(a6,d2.w),\1	; 14/4
+;; XXX this rotates d2, the index register; was rol.w #8,\1 intended? If so, the -(sp)/(sp)+ trick from FETCHW could be used here as well.
 	rol.w	#8,d2		;  4/2
 	move.b	-2(a6,d2.w),\1	; 14/4
 	ENDM
@@ -143,13 +144,13 @@ _align	SET	_align+$20
 	ENDM
 
 	;; When you want to use the high reg of a pair, use this first
-LOHI	MACRO			; 6 cycles, 2 bytes
-	ror	#8,\1
+LOHI	MACRO			; 22 cycles, 2 bytes
+	ror.w	#8,\1
 	ENDM
 
 	;; Then do your shit and finish with this
-HILO	MACRO			; 6 cycles, 2 bytes
-	rol	#8,\1
+HILO	MACRO			; 22 cycles, 2 bytes
+	rol.w	#8,\1
 	ENDM
 
 	;; calc84maniac suggests putting emu_fetch into this in order
@@ -163,6 +164,7 @@ DONE	MACRO			; 8 cycles, 2 bytes
 
 	;; Do a SUB \2,\1
 F_SUB_B	MACRO			;14 bytes?
+;; XXX use lea and then d(an) if you have a spare register.
 	move.b	\1,f_tmp_src_b	; preserve operands for flagging
 	move.b	\2,f_tmp_dst_b
 	move.b	#1,flag_n
@@ -207,6 +209,7 @@ F_DEC_W	MACRO
 
 
 _main:
+;; XXX in the current state of the code, you could just make _main and emu_setup point to the same address.
 	bsr	emu_setup
 	rts
 
@@ -214,7 +217,7 @@ _main:
 
 emu_setup:
-	movea	emu_plain_op,a5
-	movea	emu_fetch(pc),a2
+	lea	emu_plain_op,a5		; a5 = address of emu_plain_op (base of emu_fetch's jmp 0(a5,d0.w)), not the word stored there
+	lea	emu_fetch(pc),a2	; a2 = address of emu_fetch
 	;; XXX finish
 	rts
 
@@ -222,26 +225,30 @@ emu_setup:
 
 	;; Take a virtual address in d1 and dereference it.  Returns the
 	;; host address in a0.  Destroys a0, d0.
+;; XXX Only the low 14 bits of d1 (d1 & $3FFF) are now added to the bank base selected by bits 15-14 of d1; before, the whole of d1 was added. Please double-check, but AFAICT it's the right thing to do.
+;; Each table entry below is 4 bytes (adda.l + rts), hence rol.w #4 (index 0/4/8/12); the jmp goes through a label so the displacement does not depend on how the assembler treats a bare 0(pc,...).
 deref:
 	move.w	d1,d0
+	andi.w	#$3FFF,d0
+	movea.w	d0,a0
+	move.w	d1,d0
 	andi.w	#$C000,d0
-	rol.w	#5,d0
-	jmp	0(pc,d0)
+	rol.w	#4,d0
+	jmp	deref_tbl(pc,d0.w)
 	;; 00
-	movea	a1,a0
-	bra	deref_go
+deref_tbl:	adda.l	a1,a0
+	rts
 	;; 01
-	movea	a2,a0
-	bra	deref_go
+	adda.l	a2,a0
+	rts
 	;; 02
-	movea	a3,a0
-	bra	deref_go
+	adda.l	a3,a0
+	rts
 	;; 03
-	movea	a4,a0
-deref_go:
-	adda	d1,a0
+	adda.l	a4,a0
 	rts
 
+
 ;; =========================================================================
 ;; instruction   instruction   instruction  ================================
 ;;      _ _                 _       _       ================================
@@ -257,12 +264,12 @@ emu_fetch:
 	;; Move this into DONE, saving 8 more cycles but using extra
 	;; space.
 	;;
-	;; See if I can get rid of the eor
-	eor.w	d0,d0		; 4 cycles
-	move.b	(a4)+,d0	; 8 cycles
-	rol.w	#5,d0		; 4 cycles   adjust to actual alignment
-	jmp	0(a5,d0)	;14 cycles
-	;; overhead:		 30 cycles
+	;; Likely impossible to get rid of the clr: move.b leaves bits 8-15 of d0 untouched and d0 is used as a .w index
+	clr.w	d0		;  4 cycles
+	move.b	(a4)+,d0	;  8 cycles
+	rol.w	#5,d0		; 16 cycles   adjust to actual alignment
+	jmp	0(a5,d0.w)	; 14 cycles
+	;; overhead:		  42 cycles
 
 ;;; ========================================================================
 ;;; ========================================================================