summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDuncan Smith2010-06-12 11:07:10 -0700
committerDuncan Smith2010-06-12 11:07:10 -0700
commit9540f9351745b0a5386c014031d14c160f0ed6cd (patch)
tree04b1658b2dce54465d9930024e6cae982ac35ce8
parent49f38304e22252b1eb07aba3333eba23cbc5dae8 (diff)
parentcf1021b481694b00e1b18bbc9dab67ab8c6c7553 (diff)
Merge branch 'debrouxl'
-rw-r--r--README.markdown12
-rw-r--r--flags.asm199
-rw-r--r--main.asm99
3 files changed, 150 insertions, 160 deletions
diff --git a/README.markdown b/README.markdown
index 38a9f52..827339c 100644
--- a/README.markdown
+++ b/README.markdown
@@ -19,20 +19,20 @@ The most difficult challenge in writing a 68k-hosted emulator
targeting the z80 is making it _fast_. TI-83+ calculators have a
clock rate in the neighborhood of 12MHz, as do TI-89s. z80
instructions take from 4 to 17 cycles to execute. I can dispatch an
-instruction with a fixed 30 cycle overhead:
+instruction with a fixed 42 cycle overhead:
emu_fetch:
eor.w d0,d0 ; 4 cycles
move.b (a4)+,d0 ; 8 cycles
- rol.w #5,d0 ; 4 cycles adjust to actual alignment
+ rol.w #5,d0 ;16 cycles adjust to actual alignment
jmp 0(a3,d0) ;14 cycles
- ;; overhead: 30 cycles
+ ;; overhead: 42 cycles
From there, an instruction will take anywhere from 0 to, well, lots of
-additional cycles. Generally, however, it will take under 50, for 80
+additional cycles. Generally, however, it will take under 50, for 92
total. In the worst reasonable case, a 4-cycle instruction emulated
-in 80 cycles, that's a 20:1 ratio. In the best possible case, a
-17-cycle instruction emulated in 30 cycles, is more nearly a 1:2
+in 92 cycles, that's a 23:1 ratio. In the best possible case, a
+17-cycle instruction emulated in 42 cycles, is more nearly a 1:2
ratio.
I am not aiming for exactly correct relative timing of instructions,
diff --git a/flags.asm b/flags.asm
index 8df7c5d..9c25dd7 100644
--- a/flags.asm
+++ b/flags.asm
@@ -1,43 +1,3 @@
-;; N =S
-;; Z = Z
-;; V ~ P
-;; C= C
-;;
-;; =CCR= == z80==
-;; XNZVC SZ5H3PNC
-;; 00000 00000000
-;; 00001 00000001
-;; 00010 00000100
-;; 00011 00000101
-;; 00100 01000000
-;; 00101 01000001
-;; 00110 01000100
-;; 00111 01000101
-;; 01000 10000000
-;; 01001 10000001
-;; 01010 10000100
-;; 01011 10000101
-;; 01100 11000000
-;; 01101 11000001
-;; 01110 11000100
-;; 01111 11000101
-;; 10000 00000000
-;; 10001 00000001
-;; 10010 00000100
-;; 10011 00000101
-;; 10100 01000000
-;; 10101 01000001
-;; 10110 01000100
-;; 10111 01000101
-;; 11000 10000000
-;; 11001 10000001
-;; 11010 10000100
-;; 11011 10000101
-;; 11100 11000000
-;; 11101 11000001
-;; 11110 11000100
-;; 11111 11000101
-
;; Routine to set the given flags
;; Noted in \1 by a 1 bit
F_SET MACRO
@@ -56,24 +16,17 @@ F_CLEAR MACRO
;; Use this when an instruction uses the P/V bit as Parity.
;; Sets or clears the bit explicitly.
;;
- ;; Byte for which parity is calculated must be in \1. (d1
+ ;; Byte for which parity is calculated must be in \1. High
+ ;; byte of \1.w must be zero, using d0 is suggested. (d1
;; destroyed)
+
F_PAR MACRO
- move.b \1,d1 ; 4 2
- lsr #4,d1 ; 6 2
- eor.b \1,d1 ; 4 2
- lsr #2,d1 ; 6 2
- eor.b \1,d1 ; 4 2
- lsr #1,d1 ; 6 2
- eor.b \1,d1 ; 4 2
- andi.b #$01,d1 ; 8 4
- ;; odd parity is now in d1
- ori.b #%00000100,flag_valid ; 20 6
- andi.b #%11111011,flag_byte ; 20 6
- rol.b #2,d1 ; 6 2
- or.b d1,flag_byte ; 8 4
- ENDM ; 86 cycles (!)
- ; 36 bytes (make this a subroutine)
+ ori.b #%00000100,flag_valid-flag_storage(a3) ; 20/6 mark bit 2 (P/V) as valid
+ move.b flag_byte-flag_storage(a3),d1 ; 12/4 d1 = current flag byte
+ andi.b #%11111011,d1 ; 8/4 drop the old P/V bit
+ or.b lut_parity-flag_storage(a3,\1.w),d1 ; 14/4 OR in the lut_parity entry indexed by \1.w
+ move.b d1,flag_byte-flag_storage(a3) ; 12/4 store the updated flag byte
+ ENDM ; 66 cycles, 22 bytes (68000 timing tables)
;; Use this when an instruction uses the P/V bit as Overflow.
@@ -84,39 +37,41 @@ F_OVFL MACRO
;; Save the two operands from ADD \1,\2
F_ADD_SAVE MACRO
- move.b \1,f_tmp_src_b
- move.b \2,f_tmp_dst_b
- movei.b #$01,f_tmp_byte
+ move.b \1,f_tmp_src_b-flag_storage(a3)
+ move.b \2,f_tmp_dst_b-flag_storage(a3)
+ move.b #$01,f_tmp_byte-flag_storage(a3)
F_SET #%
ENDM
;; Normalize and return carry bit (is loaded into Z bit)
;; Destroys d1
f_norm_c:
- move.b flag_valid,d1
+ move.b flag_valid-flag_storage(a3),d1
andi.b #%00000001,d1
- bne FNC_ok ; Bit is valid
- move.b f_host_ccr,d1
+ bne.s FNC_ok ; Bit is valid
+ move.b f_host_ccr-flag_storage(a3),d1
andi.b #%00000001,d1
- or.b d1,flag_byte
+;; XXX use lea and then d(an) if you have a spare register (same note as on F_SUB_B in main.asm).
+ or.b d1,flag_byte-flag_storage(a3)
- ori.b #%00000001,flag_valid
+ ;; Address flag_valid through a3 like every other access in this
+ ;; routine, instead of through an absolute address.
+ ori.b #%00000001,flag_valid-flag_storage(a3)
FNC_ok:
- move.b flag_byte,d1
+ move.b flag_byte-flag_storage(a3),d1
andi.b #%00000001,d1
rts
;; Normalize and return zero bit (loaded into Z bit)
;; Destroys d1
f_norm_z:
- move.b flag_valid,d1
+ move.b flag_valid-flag_storage(a3),d1 ; d1 = validity mask
andi.b #%01000000,d1
- bne FNZ_ok ; Bit is valid
- move.b f_host_ccr,d1
+ bne.s FNZ_ok ; Bit is valid
+ ;; NOTE(review): the lut_ccr header (XNZVC) puts the host Z flag in
+ ;; CCR bit 2, so masking bit 6 of f_host_ccr looks suspect -- confirm.
+ move.b f_host_ccr-flag_storage(a3),d1 ; bit not valid: read f_host_ccr instead
andi.b #%01000000,d1
- or.b d1,flag_byte
- ori.b #%01000000,flag_valid
+;; XXX use lea and then d(an) if you have a spare register (same note as on F_SUB_B in main.asm).
+ or.b d1,flag_byte-flag_storage(a3)
+ ori.b #%01000000,flag_valid-flag_storage(a3) ; mark bit 6 as valid from now on
FNZ_ok:
- move.b flag_byte,d1
+ move.b flag_byte-flag_storage(a3),d1 ; d1 = flag byte, masked down to bit 6 below
andi.b #%01000000,d1
rts
@@ -125,15 +80,14 @@ FNZ_ok:
;; Preconditions:
;; Flags to change are noted in d0 by a 1 bit
flags_normalize:
- move.b f_host_ccr,d1
- andi.b #%00011111,d1 ; Maybe TI uses the reserved bits for
- ; something ...
- movea lut_ccr(pc),a1
- move.b 0(a1,d1),d1
+ move.b f_host_ccr-flag_storage(a3),d1 ; 8/4
+ ;; .w keeps d1 clean
+ andi.w #%00011111,d1 ; 8/4
+ move.b lut_ccr(pc,d1.w),d1 ; 10/4
;; XXX do this
rts
-storage:
+flag_storage:
;; 1 if tmp_???b is valid, 0 if tmp_???w is valid
f_tmp_byte: ds.b 0
;; 2 if P is 0, 3 if P is 1, 4 if P is Parity, 5 if P is oVerflow
@@ -162,36 +116,65 @@ flag_valid: ds.b 0 ; Validity mask -- 1 if valid.
;; LUT for the CCR -> F mapping
lut_ccr:
- dc.b %00000000
- dc.b %00000001
- dc.b %00000100
- dc.b %00000101
- dc.b %01000000
- dc.b %01000001
- dc.b %01000100
- dc.b %01000101
- dc.b %10000000
- dc.b %10000001
- dc.b %10000100
- dc.b %10000101
- dc.b %11000000
- dc.b %11000001
- dc.b %11000100
- dc.b %11000101
- dc.b %00000000
- dc.b %00000001
- dc.b %00000100
- dc.b %00000101
- dc.b %01000000
- dc.b %01000001
- dc.b %01000100
- dc.b %01000101
- dc.b %10000000
- dc.b %10000001
- dc.b %10000100
- dc.b %10000101
- dc.b %11000000
- dc.b %11000001
- dc.b %11000100
- dc.b %11000101
+ ;; N =S
+ ;; Z = Z
+ ;; V ~ P
+ ;; C= C
+ ;;
+ ;; =CCR= == z80==
+ ;; XNZVC SZ5H3PNC
+ dc.b %00000000 ;; 00000 00000000
+ dc.b %00000001 ;; 00001 00000001
+ dc.b %00000100 ;; 00010 00000100
+ dc.b %00000101 ;; 00011 00000101
+ dc.b %01000000 ;; 00100 01000000
+ dc.b %01000001 ;; 00101 01000001
+ dc.b %01000100 ;; 00110 01000100
+ dc.b %01000101 ;; 00111 01000101
+ dc.b %10000000 ;; 01000 10000000
+ dc.b %10000001 ;; 01001 10000001
+ dc.b %10000100 ;; 01010 10000100
+ dc.b %10000101 ;; 01011 10000101
+ dc.b %11000000 ;; 01100 11000000
+ dc.b %11000001 ;; 01101 11000001
+ dc.b %11000100 ;; 01110 11000100
+ dc.b %11000101 ;; 01111 11000101
+ dc.b %00000000 ;; 10000 00000000
+ dc.b %00000001 ;; 10001 00000001
+ dc.b %00000100 ;; 10010 00000100
+ dc.b %00000101 ;; 10011 00000101
+ dc.b %01000000 ;; 10100 01000000
+ dc.b %01000001 ;; 10101 01000001
+ dc.b %01000100 ;; 10110 01000100
+ dc.b %01000101 ;; 10111 01000101
+ dc.b %10000000 ;; 11000 10000000
+ dc.b %10000001 ;; 11001 10000001
+ dc.b %10000100 ;; 11010 10000100
+ dc.b %10000101 ;; 11011 10000101
+ dc.b %11000000 ;; 11100 11000000
+ dc.b %11000001 ;; 11101 11000001
+ dc.b %11000100 ;; 11110 11000100
+ dc.b %11000101 ;; 11111 11000101
+
+ ;; 256-byte LUT for the Parity bit, indexed by the byte value.
+ ;; Entry n is 4 (%00000100, the P position in SZ5H3PNC) when n
+ ;; has an even number of 1 bits, and 0 otherwise, so F_PAR can
+ ;; OR the entry straight into the flag byte.
+ ;; Keep this last so all storage references require only one
+ ;; extension word.
+lut_parity:
+ dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+ dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+ dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+ dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+ dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+ dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+ dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+ dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+ dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+ dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+ dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+ dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+ dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+ dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+ dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0
+ dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4
+
diff --git a/main.asm b/main.asm
index 3e0fdfa..507e8ef 100644
--- a/main.asm
+++ b/main.asm
@@ -11,12 +11,12 @@
;;; Registers used:
;;;
;;; A7 = sp
-;;; A6 =
+;;; A6 = emulated PC XXX
;;; A5 = instruction table base pointer
-;;; A4 = bank 3 base
-;;; A3 = bank 2 base
-;;; A2 = bank 1 base
-;;; A1 = bank 0 base
+;;; A4 = emulated SP XXX
+;;; A3 = constants address (see flags.asm)
+;;; A2 =
+;;; A1 =
;;; A0 =
;;;
;;; D0 = current instruction, scratch for macros
@@ -59,11 +59,16 @@ PUTB MACRO ; 14 cycles, 4 bytes
;; Macro to read a word from main memory at register \1
;; (unaligned). Puts the word read in \2.
+ ;;
+ ;; XXX deref
+ ;;
+ ;; <debrouxl> It decrements sp by 2, but stores the result at
+ ;; sp, not at 1(sp). So you essentially get a "free" shift
+ ;; left by 8 bits. Much faster than lsl.w / rol.w #8, at
+ ;; least.
FETCHW MACRO
- ;; XXX call deref
-
- move.b 1(a6,\1.w),\2 ; 14/4
- ror.w #8,\2 ; 4/2
+ move.b 1(a6,\1.w),-(sp); 18/4 push byte at \1+1; sp drops by 2 and the byte is stored at (sp)
+ move.w (sp)+,\2 ; 8/2 pop the word: the pushed byte is now the high byte of \2
move.b 0(a6,\1.w),\2 ; 14/4
ENDM
@@ -125,11 +130,11 @@ FETCHBI MACRO ; 40 cycles, 14 bytes
;; Macro to read an immediate word (unaligned) into \1.
FETCHWI MACRO ; 36 cycles, 12 bytes
- ;; XXX use deref
addq.w #2,d2 ; 4/2
- move.b -1(a6,d2.w),\1 ; 14/4
- rol.w #8,d2 ; 4/2
- move.b -2(a6,d2.w),\1 ; 14/4
+ ;; d2 has already been advanced past the immediate word, so its two
+ ;; bytes sit at -2 (low) and -1 (high). The high byte goes through
+ ;; the stack: a byte pushed with -(sp) lands in the upper half of a
+ ;; stack word, which saves an 8-bit shift.
+ move.b -1(a6,d2.w),-(sp); 18/4 high byte
+ move.w (sp)+,\1 ; 8/2
+ move.b -2(a6,d2.w),\1 ; 14/4 low byte
ENDM
;; == Common Opcode Macros =========================================
@@ -143,13 +148,13 @@ _align SET _align+$20
ENDM
;; When you want to use the high reg of a pair, use this first
-LOHI MACRO ; 6 cycles, 2 bytes
- ror #8,\1
+LOHI MACRO ; 22 cycles, 2 bytes
+ ror.w #8,\1
ENDM
;; Then do your shit and finish with this
-HILO MACRO ; 6 cycles, 2 bytes
- rol #8,\1
+HILO MACRO ; 22 cycles, 2 bytes
+ rol.w #8,\1
ENDM
;; calc84maniac suggests putting emu_fetch into this in order
@@ -163,6 +168,7 @@ DONE MACRO ; 8 cycles, 2 bytes
;; Do a SUB \2,\1
F_SUB_B MACRO ;14 bytes?
+;; XXX use lea and then d(an) if you have a spare register.
move.b \1,f_tmp_src_b ; preserve operands for flagging
move.b \2,f_tmp_dst_b
move.b #1,flag_n
@@ -214,7 +220,8 @@ _main:
emu_setup:
- movea emu_plain_op,a5
+ ;; Load the ADDRESS of the instruction table. movea would fetch
+ ;; a word stored at emu_plain_op and sign-extend it into a5.
+ lea emu_plain_op(pc),a5
- movea emu_fetch(pc),a2
+ lea emu_fetch(pc),a2
+ lea flag_storage(pc),a3 ; Thanks to Lionel
;; XXX finish
rts
@@ -222,26 +229,33 @@ emu_setup:
;; Take a virtual address in d1 and dereference it. Returns the
;; host address in a0. Destroys a0, d0.
+;; XXX I added a masking of the upper bits of the Z80 address (d1) before translating them to host address.
+;; Please double-check, but AFAICT, it's the right thing to do.
+
+ ;; XXX these use the old setup, replace this with a writable
+ ;; LUT.
deref:
move.w d1,d0
+ andi.w #$3FFF,d0 ; d0 = offset inside the 16K bank
+ movea.w d0,a0
+ move.w d1,d0
andi.w #$C000,d0
- rol.w #5,d0
- jmp 0(pc,d0)
+ ;; Bits 15-14 rotate into bits 3-2, giving d0 = bank * 4. Each
+ ;; entry below (adda.l + rts) is 4 bytes, so the stride must
+ ;; be 4; rol.w #5 produced bank * 8 and skipped entries.
+ rol.w #4,d0
+ ;; Use a label as the base so the displacement is unambiguous.
+ jmp deref_bank(pc,d0.w)
+deref_bank:
;; 00
- movea a1,a0
- bra deref_go
+ adda.l a1,a0
+ rts
;; 01
- movea a2,a0
- bra deref_go
+ adda.l a2,a0
+ rts
;; 02
- movea a3,a0
- bra deref_go
+ adda.l a3,a0
+ rts
;; 03
- movea a4,a0
-deref_go:
- adda d1,a0
+ adda.l a4,a0
rts
+
;; =========================================================================
;; instruction instruction instruction ================================
;; _ _ _ _ ================================
@@ -257,12 +271,12 @@ emu_fetch:
;; Move this into DONE, saving 8 more cycles but using extra
;; space.
;;
- ;; See if I can get rid of the eor
- eor.w d0,d0 ; 4 cycles
- move.b (a4)+,d0 ; 8 cycles
- rol.w #5,d0 ; 4 cycles adjust to actual alignment
- jmp 0(a5,d0) ;14 cycles
- ;; overhead: 30 cycles
+ ;; Likely impossible to get rid of the clr
+ clr.w d0 ; 4 cycles
+ move.b (a4)+,d0 ; 8 cycles
+ rol.w #5,d0 ; 16 cycles adjust to actual alignment
+ jmp 0(a5,d0.w) ; 14 cycles
+ ;; overhead: 42 cycles
;;; ========================================================================
;;; ========================================================================
@@ -1025,19 +1039,12 @@ emu_op_59:
START
emu_op_5a:
;; LD E,D
- LOHI d5
- move.b d5,d1
- HILO d5
- move.b d1,d5
- DONE
-
- ;; Is this faster or slower?
-
- andi.w #$ff00,d5
- move.b d5,d1
- lsr #8,d1
- or.w d1,d5
+ ;; Copy the high byte of d5 into its low byte. The copy to d1
+ ;; must be word-sized: the low byte of d5 was just cleared,
+ ;; so move.b would carry no data into the shift.
+ andi.w #$ff00,d5 ; 8/4 clear the low byte
+ move.w d5,d1 ; 4/2 d1.w = high byte << 8
+ lsr.w #8,d1 ;22/2 d1.w = high byte
+ or.w d1,d5 ; 4/2 low byte := high byte
DONE
+ ;38/10
START
emu_op_5b: