From aff789af91b14462ae9e6df79720b1f71e4947ca Mon Sep 17 00:00:00 2001 From: Duncan Smith Date: Sat, 12 Jun 2010 08:32:53 -0700 Subject: Patch received in email from Lionel Debroux --- main.asm | 53 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 23 deletions(-) (limited to 'main.asm') diff --git a/main.asm b/main.asm index abc7b89..685d74e 100644 --- a/main.asm +++ b/main.asm @@ -62,8 +62,8 @@ PUTB MACRO ; 14 cycles, 4 bytes FETCHW MACRO ;; XXX call deref - move.b 1(a6,\1.w),\2 ; 14/4 - ror.w #8,\2 ; 4/2 + move.b 1(a6,\1.w),-(sp); 18/4 + move.w (sp)+,\2 ; 8/2 move.b 0(a6,\1.w),\2 ; 14/4 ENDM @@ -128,6 +128,7 @@ FETCHWI MACRO ; 36 cycles, 12 bytes ;; XXX use deref addq.w #2,d2 ; 4/2 move.b -1(a6,d2.w),\1 ; 14/4 +;; XXX why not rol #8,\1 ?? (and then you would be able to use the same trick as in FETCHW). rol.w #8,d2 ; 4/2 move.b -2(a6,d2.w),\1 ; 14/4 ENDM @@ -143,13 +144,13 @@ _align SET _align+$20 ENDM ;; When you want to use the high reg of a pair, use this first -LOHI MACRO ; 6 cycles, 2 bytes - ror #8,\1 +LOHI MACRO ; 22 cycles, 2 bytes + ror.w #8,\1 ENDM ;; Then do your shit and finish with this -HILO MACRO ; 6 cycles, 2 bytes - rol #8,\1 +HILO MACRO ; 22 cycles, 2 bytes + rol.w #8,\1 ENDM ;; calc84maniac suggests putting emu_fetch into this in order @@ -163,6 +164,7 @@ DONE MACRO ; 8 cycles, 2 bytes ;; Do a SUB \2,\1 F_SUB_B MACRO ;14 bytes? +;; XXX use lea and then d(an) if you have a spare register. move.b \1,f_tmp_src_b ; preserve operands for flagging move.b \2,f_tmp_dst_b move.b #1,flag_n @@ -207,6 +209,7 @@ F_DEC_W MACRO _main: +;; XXX in the current state of the code, you could just make _main and emu_setup point to the same address. bsr emu_setup rts @@ -214,7 +217,7 @@ _main: emu_setup: movea emu_plain_op,a5 - movea emu_fetch(pc),a2 + lea emu_fetch(pc),a2 ;; XXX finish rts @@ -222,26 +225,30 @@ emu_setup: ;; Take a virtual address in d1 and dereference it. Returns the ;; host address in a0. Destroys a0, d0. +;; XXX I added a masking of the upper bits of the Z80 address (d1) before translating them to host address. +;; Please double-check, but AFAICT, it's the right thing to do. deref: + move.w d1,d0 + andi.w #$3FFF,d0 + movea.w d0,a0 move.w d1,d0 andi.w #$C000,d0 rol.w #5,d0 - jmp 0(pc,d0) + jmp 0(pc,d0.w) ;; 00 - movea a1,a0 - bra deref_go + adda.l a1,a0 + rts ;; 01 - movea a2,a0 - bra deref_go + adda.l a2,a0 + rts ;; 02 - movea a3,a0 - bra deref_go + adda.l a3,a0 + rts ;; 03 - movea a4,a0 -deref_go: - adda d1,a0 + adda.l a4,a0 rts + ;; ========================================================================= ;; instruction instruction instruction ================================ ;; _ _ _ _ ================================ @@ -257,12 +264,12 @@ emu_fetch: ;; Move this into DONE, saving 8 more cycles but using extra ;; space. ;; - ;; See if I can get rid of the eor - eor.w d0,d0 ; 4 cycles - move.b (a4)+,d0 ; 8 cycles - rol.w #5,d0 ; 4 cycles adjust to actual alignment - jmp 0(a5,d0) ;14 cycles - ;; overhead: 30 cycles + ;; Likely impossible to get rid of the clr + clr.w d0,d0 ; 4 cycles + move.b (a4)+,d0 ; 8 cycles + rol.w #5,d0 ; 16 cycles adjust to actual alignment + jmp 0(a5,d0.w) ; 14 cycles + ;; overhead: 42 cycles ;;; ======================================================================== ;;; ======================================================================== -- cgit v1.2.3 From 7e93257ff6bc6c456001916db480e543ab65faf9 Mon Sep 17 00:00:00 2001 From: Duncan Smith Date: Sat, 12 Jun 2010 10:12:34 -0700 Subject: Worked in Lionel's changes to main.asm --- main.asm | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'main.asm') diff --git a/main.asm b/main.asm index 685d74e..28ac5e4 100644 --- a/main.asm +++ b/main.asm @@ -11,7 +11,7 @@ ;;; Registers used: ;;; ;;; A7 = sp -;;; A6 = +;;; A6 = emulated PC ;;; A5 = instruction table base pointer ;;; A4 = bank 3 base ;;; A3 = bank 2 base @@ -59,9 +59,14 @@ PUTB MACRO ; 14 cycles, 4 bytes ;; Macro to read a word from main memory at register \1 ;; (unaligned). Puts the word read in \2. + ;; + ;; XXX deref + ;; + ;; It decrements sp by 2, but stores the result at + ;; sp, not at 1(sp). So you essentially get a "free" shift + ;; left by 8 bits. Much faster than lsl.w / rol.w #8, at + ;; least. FETCHW MACRO - ;; XXX call deref - move.b 1(a6,\1.w),-(sp); 18/4 move.w (sp)+,\2 ; 8/2 move.b 0(a6,\1.w),\2 ; 14/4 @@ -125,12 +130,11 @@ FETCHBI MACRO ; 40 cycles, 14 bytes ;; Macro to read an immediate word (unaligned) into \1. FETCHWI MACRO ; 36 cycles, 12 bytes - ;; XXX use deref addq.w #2,d2 ; 4/2 - move.b -1(a6,d2.w),\1 ; 14/4 -;; XXX why not rol #8,\1 ?? (and then you would be able to use the same trick as in FETCHW). - rol.w #8,d2 ; 4/2 - move.b -2(a6,d2.w),\1 ; 14/4 + ;; See FETCHW for an explanation of this trick. + move.b 1(a6,d2.w),-(sp); 18/4 + move.w (sp)+,\1 ; 8/2 + move.b 0(a6,d2.w),\1 ; 14/4 ENDM ;; == Common Opcode Macros ========================================= @@ -265,7 +269,7 @@ emu_fetch: ;; space. ;; ;; Likely impossible to get rid of the clr - clr.w d0,d0 ; 4 cycles + clr.w d0 ; 4 cycles move.b (a4)+,d0 ; 8 cycles rol.w #5,d0 ; 16 cycles adjust to actual alignment jmp 0(a5,d0.w) ; 14 cycles @@ -1031,19 +1035,12 @@ emu_op_59: START emu_op_5a: ;; LD E,D - LOHI d5 - move.b d5,d1 - HILO d5 - move.b d1,d5 - DONE - - ;; Is this faster or slower? - - andi.w #$ff00,d5 - move.b d5,d1 - lsr #8,d1 - or.w d1,d5 + andi.w #$ff00,d5 ; 8/4 + move.b d5,d1 ; 4/2 + lsr #8,d1 ;22/2 + or.w d1,d5 ; 4/2 DONE + ;38/2 START emu_op_5b: -- cgit v1.2.3 From cf1021b481694b00e1b18bbc9dab67ab8c6c7553 Mon Sep 17 00:00:00 2001 From: Duncan Smith Date: Sat, 12 Jun 2010 11:06:51 -0700 Subject: Worked in Lionel's changes to flags.asm, patched up main.asm slightly to fit --- flags.asm | 196 +++++++++++++++++++++++++++----------------------------------- main.asm | 15 +++-- 2 files changed, 95 insertions(+), 116 deletions(-) (limited to 'main.asm') diff --git a/flags.asm b/flags.asm index 0b85267..9c25dd7 100644 --- a/flags.asm +++ b/flags.asm @@ -1,43 +1,3 @@ -;; N =S -;; Z = Z -;; V ~ P -;; C= C -;; -;; =CCR= == z80== -;; XNZVC SZ5H3PNC -;; 00000 00000000 -;; 00001 00000001 -;; 00010 00000100 -;; 00011 00000101 -;; 00100 01000000 -;; 00101 01000001 -;; 00110 01000100 -;; 00111 01000101 -;; 01000 10000000 -;; 01001 10000001 -;; 01010 10000100 -;; 01011 10000101 -;; 01100 11000000 -;; 01101 11000001 -;; 01110 11000100 -;; 01111 11000101 -;; 10000 00000000 -;; 10001 00000001 -;; 10010 00000100 -;; 10011 00000101 -;; 10100 01000000 -;; 10101 01000001 -;; 10110 01000100 -;; 10111 01000101 -;; 11000 10000000 -;; 11001 10000001 -;; 11010 10000100 -;; 11011 10000101 -;; 11100 11000000 -;; 11101 11000001 -;; 11110 11000100 -;; 11111 11000101 - ;; Routine to set the given flags ;; Noted in \1 by a 1 bit F_SET MACRO @@ -56,28 +16,17 @@ F_CLEAR MACRO ;; Use this when an instruction uses the P/V bit as Parity. ;; Sets or clears the bit explicitly. ;; - ;; Byte for which parity is calculated must be in \1. (d1 + ;; Byte for which parity is calculated must be in \1. High + ;; byte of \1.w must be zero, using d0 is suggested. (d1 ;; destroyed) -;; XXX that's expensive. After making this a subroutine, to speed up parity computation, maybe you could use a 256-byte lookup table accessed by d(pc,ix.w). -;; And if you have a spare address register, since xxx.l addressing mode is expensive speed-wise and size-wise (4 bytes + relocation), -;; you should use lea d(pc) to preload the address of flag_valid into an address register, -;; and then use (an) and d(an) to write to flag_valid and flag_byte. + F_PAR MACRO - move.b \1,d1 ; 4 2 - lsr.w #4,d1 ; 14 2 - eor.b \1,d1 ; 4 2 - lsr.w #2,d1 ; 10 2 - eor.b \1,d1 ; 4 2 - lsr.w #1,d1 ; 8 2 - eor.b \1,d1 ; 4 2 - andi.b #$01,d1 ; 8 4 - ;; odd parity is now in d1 - ori.b #%00000100,flag_valid ; 20 8 - andi.b #%11111011,flag_byte ; 20 8 - rol.b #2,d1 ; 6 2 - or.b d1,flag_byte ; 16 4 + ori.b #%00000100,flag_valid-flag_storage(a3) ; ??/4 + move.b flag_byte-flag_storage(a3),d1 ; ??/2 + andi.b #%11111011,d1 ; ??/4 + or.b lut_parity-flag_storage(a3,\1.w),d1 ; ??/4 + move.b d1,flag_byte-flag_storage(a3) ; ??/2 ENDM ;xxx cycles (!) - ; xx bytes (make this a subroutine) ;; Use this when an instruction uses the P/V bit as Overflow. @@ -88,42 +37,41 @@ F_OVFL MACRO ;; Save the two operands from ADD \1,\2 F_ADD_SAVE MACRO - move.b \1,f_tmp_src_b - move.b \2,f_tmp_dst_b - move.b #$01,f_tmp_byte + move.b \1,f_tmp_src_b-flag_storage(a3) + move.b \2,f_tmp_dst_b-flag_storage(a3) + move.b #$01,f_tmp_byte-flag_storage(a3) F_SET #% ENDM ;; Normalize and return carry bit (is loaded into Z bit) ;; Destroys d1 f_norm_c: - move.b flag_valid(pc),d1 -;; XXX you could use lsr #1 (same number of cycles, smaller) + bcc.s or bcs.s here. + move.b flag_valid-flag_storage(a3),d1 andi.b #%00000001,d1 bne.s FNC_ok ; Bit is valid - move.b f_host_ccr(pc),d1 + move.b f_host_ccr-flag_storage(a3),d1 andi.b #%00000001,d1 ;; XXX see above comment for using lea and then d(an) if you have a spare register. - or.b d1,flag_byte + or.b d1,flag_byte-flag_storage(a3) ori.b #%00000001,flag_valid FNC_ok: - move.b flag_byte(pc),d1 + move.b flag_byte-flag_storage(a3),d1 andi.b #%00000001,d1 rts ;; Normalize and return zero bit (loaded into Z bit) ;; Destroys d1 f_norm_z: - move.b flag_valid(pc),d1 + move.b flag_valid-flag_storage(a3),d1 andi.b #%01000000,d1 bne.s FNZ_ok ; Bit is valid - move.b f_host_ccr(pc),d1 + move.b f_host_ccr-flag_storage(a3),d1 andi.b #%01000000,d1 ;; XXX see above comment for using lea and then d(an) if you have a spare register. - or.b d1,flag_byte - ori.b #%01000000,flag_valid + or.b d1,flag_byte-flag_storage(a3) + ori.b #%01000000,flag_valid-flag_storage(a3) FNZ_ok: - move.b flag_byte(pc),d1 + move.b flag_byte-flag_storage(a3),d1 andi.b #%01000000,d1 rts @@ -132,15 +80,14 @@ FNZ_ok: ;; Preconditions: ;; Flags to change are noted in d0 by a 1 bit flags_normalize: - move.b f_host_ccr(pc),d1 -;; XXX .w because you don't want garbage in bits 8-15 when using d(pc,ix.w) or d(an,ix.w) ea mode. - andi.w #%00011111,d1 ; Maybe TI uses the reserved bits for - ; something ... - move.b lut_ccr(pc,d1.w),d1 + move.b f_host_ccr-flag_storage(a3),d1 ; 8/4 + ;; .w keeps d1 clean + andi.w #%00011111,d1 ; 8/4 + move.b lut_ccr(pc,d1.w),d1 ; 10/4 ;; XXX do this rts -storage: +flag_storage: ;; 1 if tmp_???b is valid, 0 if tmp_???w is valid f_tmp_byte: ds.b 0 ;; 2 if P is 0, 3 if P is 1, 4 if P is Parity, 5 if P is oVerflow @@ -169,36 +116,65 @@ flag_valid: ds.b 0 ; Validity mask -- 1 if valid. ;; LUT for the CCR -> F mapping lut_ccr: - dc.b %00000000 - dc.b %00000001 - dc.b %00000100 - dc.b %00000101 - dc.b %01000000 - dc.b %01000001 - dc.b %01000100 - dc.b %01000101 - dc.b %10000000 - dc.b %10000001 - dc.b %10000100 - dc.b %10000101 - dc.b %11000000 - dc.b %11000001 - dc.b %11000100 - dc.b %11000101 - dc.b %00000000 - dc.b %00000001 - dc.b %00000100 - dc.b %00000101 - dc.b %01000000 - dc.b %01000001 - dc.b %01000100 - dc.b %01000101 - dc.b %10000000 - dc.b %10000001 - dc.b %10000100 - dc.b %10000101 - dc.b %11000000 - dc.b %11000001 - dc.b %11000100 - dc.b %11000101 + ;; N =S + ;; Z = Z + ;; V ~ P + ;; C= C + ;; + ;; =CCR= == z80== + ;; XNZVC SZ5H3PNC + dc.b %00000000 ;; 00000 00000000 + dc.b %00000001 ;; 00001 00000001 + dc.b %00000100 ;; 00010 00000100 + dc.b %00000101 ;; 00011 00000101 + dc.b %01000000 ;; 00100 01000000 + dc.b %01000001 ;; 00101 01000001 + dc.b %01000100 ;; 00110 01000100 + dc.b %01000101 ;; 00111 01000101 + dc.b %10000000 ;; 01000 10000000 + dc.b %10000001 ;; 01001 10000001 + dc.b %10000100 ;; 01010 10000100 + dc.b %10000101 ;; 01011 10000101 + dc.b %11000000 ;; 01100 11000000 + dc.b %11000001 ;; 01101 11000001 + dc.b %11000100 ;; 01110 11000100 + dc.b %11000101 ;; 01111 11000101 + dc.b %00000000 ;; 10000 00000000 + dc.b %00000001 ;; 10001 00000001 + dc.b %00000100 ;; 10010 00000100 + dc.b %00000101 ;; 10011 00000101 + dc.b %01000000 ;; 10100 01000000 + dc.b %01000001 ;; 10101 01000001 + dc.b %01000100 ;; 10110 01000100 + dc.b %01000101 ;; 10111 01000101 + dc.b %10000000 ;; 11000 10000000 + dc.b %10000001 ;; 11001 10000001 + dc.b %10000100 ;; 11010 10000100 + dc.b %10000101 ;; 11011 10000101 + dc.b %11000000 ;; 11100 11000000 + dc.b %11000001 ;; 11101 11000001 + dc.b %11000100 ;; 11110 11000100 + dc.b %11000101 ;; 11111 11000101 + + ;; 256-byte LUT for the Parity bit. + ;; Keep this last so all storage references require only one + ;; extension word. +lut_parity: + dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4 + dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0 + dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0 + dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4 + dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0 + dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4 + dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4 + dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0 + dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0 + dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4 + dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4 + dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0 + dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4 + dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0 + dc.b 0,4,4,0,4,0,0,4,4,0,0,4,0,4,4,0 + dc.b 4,0,0,4,0,4,4,0,0,4,4,0,4,0,0,4 + diff --git a/main.asm b/main.asm index 28ac5e4..1431220 100644 --- a/main.asm +++ b/main.asm @@ -11,12 +11,12 @@ ;;; Registers used: ;;; ;;; A7 = sp -;;; A6 = emulated PC +;;; A6 = emulated PC XXX ;;; A5 = instruction table base pointer -;;; A4 = bank 3 base -;;; A3 = bank 2 base -;;; A2 = bank 1 base -;;; A1 = bank 0 base +;;; A4 = emulated SP XXX +;;; A3 = constants address (see flags.asm) +;;; A2 = +;;; A1 = ;;; A0 = ;;; ;;; D0 = current instruction, scratch for macros @@ -213,7 +213,6 @@ F_DEC_W MACRO _main: -;; XXX in the current state of the code, you could just make _main and emu_setup point to the same address. bsr emu_setup rts @@ -222,6 +221,7 @@ _main: emu_setup: movea emu_plain_op,a5 lea emu_fetch(pc),a2 + lea flag_storage(pc),a3 ; Thanks to Lionel ;; XXX finish rts @@ -231,6 +231,9 @@ emu_setup: ;; host address in a0. Destroys a0, d0. ;; XXX I added a masking of the upper bits of the Z80 address (d1) before translating them to host address. ;; Please double-check, but AFAICT, it's the right thing to do. + + ;; XXX these use the old setup, replace this with a writable + ;; LUT. deref: move.w d1,d0 andi.w #$3FFF,d0 -- cgit v1.2.3