Merge branch 'debrouxl'

author: Astrid Smith 2010-06-12 11:07:10 -0700
committer: Astrid Smith 2010-06-12 11:07:10 -0700
commit: 32ce71ce4ef22abbb228425e699d2cd2f0a3391b (patch)
tree: 3a6748e2610c0bdd38139e54b3d2ce33aa7f41a7
parent: 52e383ced4997f1ce1f3125499702b8f4531cbca (diff)
parent: f0882c0b92b01a290c62d3a3044bb027014b56c6 (diff)
3 files changed, 150 insertions, 160 deletions
diff --git a/README.markdown b/README.markdown
index 723462b..bd1a7ca 100644
--- a/README.markdown
+++ b/README.markdown
@@ -19,20 +19,20 @@ The most difficult challenge in writing a 68k-hosted emulator
 targeting the z80 is making it _fast_.  TI-83+ calculators have a
 clock rate in the neighborhood of 12MHz, as do TI-89s.  z80
 instructions take from 4 to 17 cycles to execute.  I can dispatch an
-instruction with a fixed 30 cycle overhead:
+instruction with a fixed 42 cycle overhead:
 
 	emu_fetch:
 	  eor.w    d0,d0     ; 4 cycles
 	  move.b   (a4)+,d0  ; 8 cycles
-	  rol.w    #5,d0     ; 4 cycles   adjust to actual alignment
+	  rol.w    #5,d0     ;16 cycles   adjust to actual alignment
 	  jmp      0(a3,d0)  ;14 cycles
-	  ;; overhead:        30 cycles
+	  ;; overhead:        42 cycles
 
 From there, an instruction will take anywhere from 0 to, well, lots of
-additional cycles.  Generally, however, it will take under 50, for 80
+additional cycles.  Generally, however, it will take under 50, for 92
 total.  In the worst reasonable case, a 4 cycle instruction emulated
-in 80 cycles, that's a 20:1 ratio.  In the best possible case, a
-17-cycle instruction emulated in 30 cycles, is more nearly a 1:2
+in 92 cycles, that's a 23:1 ratio.  In the best possible case, a
+17-cycle instruction emulated in 42 cycles, is more nearly a 1:2
 ratio.
 
 I am not aiming for exactly correct relative timing of instructions,
diff --git a/flags.asm b/flags.asm
index 8df7c5d..9c25dd7 100644
--- a/flags.asm
+++ b/flags.asm
@@ -1,43 +1,3 @@
-;;  N   =S
-;;   Z  = Z
-;;    V ~     P
-;;     C=       C
-;; 
-;; =CCR= == z80==
-;; XNZVC SZ5H3PNC
-;; 00000 00000000
-;; 00001 00000001
-;; 00010 00000100
-;; 00011 00000101
-;; 00100 01000000
-;; 00101 01000001
-;; 00110 01000100
-;; 00111 01000101
-;; 01000 10000000
-;; 01001 10000001
-;; 01010 10000100
-;; 01011 10000101
-;; 01100 11000000
-;; 01101 11000001
-;; 01110 11000100
-;; 01111 11000101
-;; 10000 00000000
-;; 10001 00000001
-;; 10010 00000100
-;; 10011 00000101
-;; 10100 01000000
-;; 10101 01000001
-;; 10110 01000100
-;; 10111 01000101
-;; 11000 10000000
-;; 11001 10000001
-;; 11010 10000100
-;; 11011 10000101
-;; 11100 11000000
-;; 11101 11000001
-;; 11110 11000100
-;; 11111 11000101
-
 	;; Routine to set the given flags
 	;;   Noted in \1 by a 1 bit
 F_SET	MACRO
@@ -56,24 +16,17 @@ F_CLEAR	MACRO
 	;; Use this when an instruction uses the P/V bit as Parity.
 	;; Sets or clears the bit explicitly.
 	;;
-	;; Byte for which parity is calculated must be in \1. (d1
+	;; Byte for which parity is calculated must be in \1.  High
+	;; byte of \1.w must be zero, using d0 is suggested. (d1
 	;; destroyed)
+
 F_PAR	MACRO
-	move.b	\1,d1			;  4  2
-	lsr	#4,d1			;  6  2
-	eor.b	\1,d1			;  4  2
-	lsr	#2,d1			;  6  2
-	eor.b	\1,d1			;  4  2
-	lsr	#1,d1			;  6  2
-	eor.b	\1,d1			;  4  2
-	andi.b	#$01,d1			;  8  4
-	;; odd parity is now in d1
-	ori.b	#%00000100,flag_valid	; 20  6
-	andi.b	#%11111011,flag_byte	; 20  6
-	rol.b	#2,d1			;  6  2
-	or.b	d1,flag_byte		;  8  4
-	ENDM				; 86 cycles (!)
-					;    36 bytes (make this a subroutine)
+	ori.b	#%00000100,flag_valid-flag_storage(a3)	; ??/4
+	move.b	flag_byte-flag_storage(a3),d1		; ??/2
+	andi.b	#%11111011,d1				; ??/4
+	or.b	lut_parity-flag_storage(a3,\1.w),d1	; ??/4
+	move.b	d1,flag_byte-flag_storage(a3)		; ??/2
+	ENDM				;xxx cycles (!)
 
 
 	;; Use this when an instruction uses the P/V bit as Overflow.
@@ -84,39 +37,41 @@ F_OVFL	MACRO
 
 	;; Save the two operands from ADD \1,\2
 F_ADD_SAVE	MACRO
-	move.b	\1,f_tmp_src_b
-	move.b	\2,f_tmp_dst_b
-	movei.b	#$01,f_tmp_byte
+	move.b	\1,f_tmp_src_b-flag_storage(a3)
+	move.b	\2,f_tmp_dst_b-flag_storage(a3)
+	move.b	#$01,f_tmp_byte-flag_storage(a3)
 	F_SET	#%
 	ENDM
 
 	;; Normalize and return carry bit (is loaded into Z bit)
 	;; Destroys d1
 f_norm_c:
-	move.b	flag_valid,d1
+	move.b	flag_valid-flag_storage(a3),d1
 	andi.b	#%00000001,d1
-	bne	FNC_ok		; Bit is valid
-	move.b	f_host_ccr,d1
+	bne.s	FNC_ok		; Bit is valid
+	move.b	f_host_ccr-flag_storage(a3),d1
 	andi.b	#%00000001,d1
-	or.b	d1,flag_byte
+;; XXX see above comment for using lea and then d(an) if you have a spare register.
+	or.b	d1,flag_byte-flag_storage(a3)
 	ori.b	#%00000001,flag_valid
 FNC_ok:
-	move.b	flag_byte,d1
+	move.b	flag_byte-flag_storage(a3),d1
 	andi.b	#%00000001,d1
 	rts
 
 	;; Normalize and return zero bit (loaded into Z bit)
 	;; Destroys d1
 f_norm_z:
-	move.b	flag_valid,d1
+	move.b	flag_valid-flag_storage(a3),d1
 	andi.b	#%01000000,d1
-	bne	FNZ_ok		; Bit is valid
-	move.b	f_host_ccr,d1
+	bne.s	FNZ_ok		; Bit is valid
+	move.b	f_host_ccr-flag_storage(a3),d1
 	andi.b	#%01000000,d1
-	or.b	d1,flag_byte
-	ori.b	#%01000000,flag_valid
+;; XXX see above comment for using lea and then d(an) if you have a spare register.
+	or.b	d1,flag_byte-flag_storage(a3)
+	ori.b	#%01000000,flag_valid-flag_storage(a3)
 FNZ_ok:
-	move.b	flag_byte,d1
+	move.b	flag_byte-flag_storage(a3),d1
 	andi.b	#%01000000,d1
 	rts
 
@@ -125,15 +80,14 @@ FNZ_ok:
 	;; Preconditions:
 	;;   Flags to change are noted in d0 by a 1 bit
 flags_normalize:
-	move.b	f_host_ccr,d1
-	andi.b	#%00011111,d1	; Maybe TI uses the reserved bits for
-				; something ...
-	movea	lut_ccr(pc),a1
-	move.b	0(a1,d1),d1
+	move.b	f_host_ccr-flag_storage(a3),d1	;  8/4
+	;; .w keeps d1 clean
+	andi.w	#%00011111,d1			;  8/4
+	move.b	lut_ccr(pc,d1.w),d1 		; 10/4
 	;; XXX do this
 	rts
 
-storage:
+flag_storage:
 	;; 1 if tmp_???b is valid, 0 if tmp_???w is valid
 f_tmp_byte:	ds.b	0
 	;; 2 if P is 0, 3 if P is 1, 4 if P is Parity, 5 if P is oVerflow
@@ -162,36 +116,65 @@ flag_valid:	ds.b	0	; Validity mask -- 1 if valid.
 
 	;; LUT for the CCR -> F mapping
 lut_ccr:
-	dc.b	%00000000
-	dc.b	%00000001
-	dc.b	%00000100
-	dc.b	%00000101
-	dc.b	%01000000
-	dc.b	%01000001
-	dc.b	%01000100
-	dc.b	%01000101
-	dc.b	%10000000
-	dc.b	%10000001
-	dc.b	%10000100
-	dc.b	%10000101
-	dc.b	%11000000
-	dc.b	%11000001
-	dc.b	%11000100
-	dc.b	%11000101
-	dc.b	%00000000
-	dc.b	%00000001
-	dc.b	%00000100
-	dc.b	%00000101
-	dc.b	%01000000
-	dc.b	%01000001
-	dc.b	%01000100
-	dc.b	%01000101
-	dc.b	%10000000
-	dc.b	%10000001
-	dc.b	%10000100
-	dc.b	%10000101
-	dc.b	%11000000
-	dc.b	%11000001
-	dc.b	%11000100
-	dc.b	%11000101
+				;;  N   =S
+				;;   Z  = Z
+				;;    V ~     P
+				;;     C=       C
+				;;
+				;; =CCR= == z80==
+				;; XNZVC SZ5H3PNC
+	dc.b	%00000000	;; 00000 00000000
+	dc.b	%00000001	;; 00001 00000001
+	dc.b	%00000100	;; 00010 00000100
+	dc.b	%00000101	;; 00011 00000101
+	dc.b	%01000000	;; 00100 01000000
+	dc.b	%01000001	;; 00101 01000001
+	dc.b	%01000100	;; 00110 01000100
+	dc.b	%01000101	;; 00111 01000101
+	dc.b	%10000000	;; 01000 10000000
+	dc.b	%10000001	;; 01001 10000001
+	dc.b	%10000100	;; 01010 10000100
+	dc.b	%10000101	;; 01011 10000101
+	dc.b	%11000000	;; 01100 11000000
+	dc.b	%11000001	;; 01101 11000001
+	dc.b	%11000100	;; 01110 11000100
+	dc.b	%11000101	;; 01111 11000101
+	dc.b	%00000000	;; 10000 00000000
+	dc.b	%00000001	;; 10001 00000001
+	dc.b	%00000100	;; 10010 00000100
+	dc.b	%00000101	;; 10011 00000101
+	dc.b	%01000000	;; 10100 01000000
+	dc.b	%01000001	;; 10101 01000001
+	dc.b	%01000100	;; 10110 01000100
+	dc.b	%01000101	;; 10111 01000101
+	dc.b	%10000000	;; 11000 10000000
+	dc.b	%10000001	;; 11001 10000001
+	dc.b	%10000100	;; 11010 10000100
+	dc.b	%10000101	;; 11011 10000101
+	dc.b	%11000000	;; 11100 11000000
+	dc.b	%11000001	;; 11101 11000001
+	dc.b	%11000100	;; 11110 11000100
+	dc.b	%11000101	;; 11111 11000101
+
+	;; 256-byte LUT for the Parity bit.
+	;; Keep this last so all storage references require only one
+	;; extension word.
+lut_parity:
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+	dc.b	4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+
 
diff --git a/main.asm b/main.asm
index 94b4ea6..cafde71 100644
--- a/main.asm
+++ b/main.asm
@@ -11,12 +11,12 @@
 ;;; Registers used:
 ;;;
 ;;; A7 = sp
-;;; A6 =
+;;; A6 = emulated PC XXX
 ;;; A5 = instruction table base pointer
-;;; A4 = bank 3 base
-;;; A3 = bank 2 base
-;;; A2 = bank 1 base
-;;; A1 = bank 0 base
+;;; A4 = emulated SP XXX
+;;; A3 = constants address (see flags.asm)
+;;; A2 =
+;;; A1 =
 ;;; A0 =
 ;;;
 ;;; D0 = current instruction, scratch for macros
@@ -59,11 +59,16 @@ PUTB	MACRO			; 14 cycles, 4 bytes
 
 	;; Macro to read a word from main memory at register \1
 	;; (unaligned).  Puts the word read in \2.
+	;;
+	;; XXX deref
+	;;
+	;; <debrouxl> It decrements sp by 2, but stores the result at
+	;; sp, not at 1(sp). So you essentially get a "free" shift
+	;; left by 8 bits. Much faster than lsl.w / rol.w #8, at
+	;; least.
 FETCHW	MACRO
-	;; XXX call deref
-	
-	move.b	1(a6,\1.w),\2	; 14/4
-	ror.w	#8,\2		;  4/2
+	move.b	1(a6,\1.w),-(sp); 18/4
+	move.w	(sp)+,\2	;  8/2
 	move.b	0(a6,\1.w),\2	; 14/4
 	ENDM
 
@@ -125,11 +130,11 @@ FETCHBI	MACRO			; 40 cycles, 14 bytes
 
 	;; Macro to read an immediate word (unaligned) into \1.
 FETCHWI	MACRO			; 36 cycles, 12 bytes
-	;; XXX use deref
 	addq.w	#2,d2		;  4/2
-	move.b	-1(a6,d2.w),\1	; 14/4
-	rol.w	#8,d2		;  4/2
-	move.b	-2(a6,d2.w),\1	; 14/4
+	;; See FETCHW for an explanation of this trick.
+	move.b	1(a6,d2.w),-(sp); 18/4
+	move.w	(sp)+,\1	;  8/2
+	move.b	0(a6,d2.w),\1	; 14/4
 	ENDM
 
 	;; == Common Opcode Macros =========================================
@@ -143,13 +148,13 @@ _align	SET	_align+$20
 	ENDM
 
 	;; When you want to use the high reg of a pair, use this first
-LOHI	MACRO			; 6 cycles, 2 bytes
-	ror	#8,\1
+LOHI	MACRO			; 22 cycles, 2 bytes
+	ror.w	#8,\1
 	ENDM
 
 	;; Then do your shit and finish with this
-HILO	MACRO			; 6 cycles, 2 bytes
-	rol	#8,\1
+HILO	MACRO			; 22 cycles, 2 bytes
+	rol.w	#8,\1
 	ENDM
 
 	;; calc84maniac suggests putting emu_fetch into this in order
@@ -163,6 +168,7 @@ DONE	MACRO			; 8 cycles, 2 bytes
 
 	;; Do a SUB \2,\1
 F_SUB_B	MACRO			;14 bytes?
+;; XXX use lea and then d(an) if you have a spare register.
 	move.b	\1,f_tmp_src_b	; preserve operands for flagging
 	move.b	\2,f_tmp_dst_b
 	move.b	#1,flag_n
@@ -214,7 +220,8 @@ _main:
 
 emu_setup:
 	movea	emu_plain_op,a5
-	movea	emu_fetch(pc),a2
+	lea	emu_fetch(pc),a2
+	lea	flag_storage(pc),a3	; Thanks to Lionel
 	;; XXX finish
 	rts
 
@@ -222,26 +229,33 @@ emu_setup:
 
 	;; Take a virtual address in d1 and dereference it.  Returns the
 	;; host address in a0.  Destroys a0, d0.
+;; XXX I added a masking of the upper bits of the Z80 address (d1) before translating them to host address.
+;; Please double-check, but AFAICT, it's the right thing to do.
+
+	;; XXX these use the old setup, replace this with a writable
+	;; LUT.
 deref:
 	move.w	d1,d0
+	andi.w	#$3FFF,d0
+	movea.w	d0,a0
+	move.w	d1,d0
 	andi.w	#$C000,d0
 	rol.w	#5,d0
-	jmp	0(pc,d0)
+	jmp	0(pc,d0.w)
 	;; 00
-	movea	a1,a0
-	bra	deref_go
+	adda.l	a1,a0
+	rts
 	;; 01
-	movea	a2,a0
-	bra	deref_go
+	adda.l	a2,a0
+	rts
 	;; 02
-	movea	a3,a0
-	bra	deref_go
+	adda.l	a3,a0
+	rts
 	;; 03
-	movea	a4,a0
-deref_go:
-	adda	d1,a0
+	adda.l	a4,a0
 	rts
 
+
 ;; =========================================================================
 ;; instruction   instruction   instruction  ================================
 ;;      _ _                 _       _       ================================
@@ -257,12 +271,12 @@ emu_fetch:
 	;; Move this into DONE, saving 8 more cycles but using extra
 	;; space.
 	;;
-	;; See if I can get rid of the eor
-	eor.w	d0,d0		; 4 cycles
-	move.b	(a4)+,d0	; 8 cycles
-	rol.w	#5,d0		; 4 cycles   adjust to actual alignment
-	jmp	0(a5,d0)	;14 cycles
-	;; overhead:		 30 cycles
+	;; Likely impossible to get rid of the clr
+	clr.w	d0		;  4 cycles
+	move.b	(a4)+,d0	;  8 cycles
+	rol.w	#5,d0		; 16 cycles   adjust to actual alignment
+	jmp	0(a5,d0.w)	; 14 cycles
+	;; overhead:		  42 cycles
 
 ;;; ========================================================================
 ;;; ========================================================================
@@ -1025,19 +1039,12 @@ emu_op_59:
 	START
 emu_op_5a:
 	;; LD	E,D
-	LOHI	d5
-	move.b	d5,d1
-	HILO	d5
-	move.b	d1,d5
-	DONE
-
-	;; Is this faster or slower?
-
-	andi.w	#$ff00,d5
-	move.b	d5,d1
-	lsr	#8,d1
-	or.w	d1,d5
+	andi.w	#$ff00,d5	; 8/4
+	move.b	d5,d1		; 4/2
+	lsr	#8,d1		;22/2
+	or.w	d1,d5		; 4/2
 	DONE
+				;38/10
 
 	START
 emu_op_5b:
author	Astrid Smith	2010-06-12 11:07:10 -0700
committer	Astrid Smith	2010-06-12 11:07:10 -0700
commit	32ce71ce4ef22abbb228425e699d2cd2f0a3391b (patch)
tree	3a6748e2610c0bdd38139e54b3d2ce33aa7f41a7
parent	52e383ced4997f1ce1f3125499702b8f4531cbca (diff)
parent	f0882c0b92b01a290c62d3a3044bb027014b56c6 (diff)