diff --git a/src/nba/include/nba/common/dsp/ring_buffer.hpp b/src/nba/include/nba/common/dsp/ring_buffer.hpp index d2dabde..c9018fa 100644 --- a/src/nba/include/nba/common/dsp/ring_buffer.hpp +++ b/src/nba/include/nba/common/dsp/ring_buffer.hpp @@ -33,11 +33,12 @@ struct RingBuffer : Stream { } } - auto Peek(int offset) -> T const { + // 这仨巨头也要inline一下! + inline auto Peek(int offset) -> T const { return data[(rd_ptr + offset) % length]; } - auto Read() -> T { + inline auto Read() -> T { T value = data[rd_ptr]; if(count > 0) { rd_ptr = (rd_ptr + 1) % length; @@ -46,7 +47,7 @@ struct RingBuffer : Stream { return value; } - void Write(T const& value) { + inline void Write(T const& value) { if(blocking && count == length) { return; } diff --git a/src/nba/include/nba/common/punning.hpp b/src/nba/include/nba/common/punning.hpp index a93e7b7..b929514 100644 --- a/src/nba/include/nba/common/punning.hpp +++ b/src/nba/include/nba/common/punning.hpp @@ -13,15 +13,33 @@ namespace nba { template -auto read(void const* data, uint offset) -> T { +T read(void const* data, uint offset) { T value; - memcpy(&value, (u8*)data + offset, sizeof(T)); + std::memcpy(&value, (u8*)data + offset, sizeof(T)); return value; } +template<> +inline u8 read(void const* data, uint offset){ + u8 const *p = (u8*)data; + return p[offset]; +} + +template<> +inline u16 read(void const* data, uint offset){ + u16 const *p = (u16*)data; + return p[ offset>>1 ]; +} + +template<> +inline u32 read(void const* data, uint offset) { + u32 const *p = (u32*)data; + return p[ offset>>2 ]; +} + template -void write(void* data, uint offset, T value) { - memcpy((u8*)data + offset, &value, sizeof(T)); +inline void write(void* data, uint offset, T value) { + std::memcpy((u8*)data + offset, &value, sizeof(T)); } } // namespace nba diff --git a/src/nba/include/nba/scheduler.hpp b/src/nba/include/nba/scheduler.hpp index d09dfa1..c79db49 100644 --- a/src/nba/include/nba/scheduler.hpp +++ b/src/nba/include/nba/scheduler.hpp @@ -135,6 +135,7 @@ struct Scheduler { } void AddCycles(int cycles) { + if( !cycles ) return; auto timestamp_next = timestamp_now + cycles; Step(timestamp_next); timestamp_now = timestamp_next; diff --git a/src/nba/src/arm/handlers/memory.inl b/src/nba/src/arm/handlers/memory.inl index 99104d2..ece0d46 100644 --- a/src/nba/src/arm/handlers/memory.inl +++ b/src/nba/src/arm/handlers/memory.inl @@ -5,15 +5,15 @@ * Refer to the included LICENSE file. */ -u32 ReadByte(u32 address, int access) { +ALWAYS_INLINE u32 ReadByte(u32 address, int access) { return bus.ReadByte(address, access); } -u32 ReadHalf(u32 address, int access) { +ALWAYS_INLINE u32 ReadHalf(u32 address, int access) { return bus.ReadHalf(address, access); } -u32 ReadWord(u32 address, int access) { +ALWAYS_INLINE u32 ReadWord(u32 address, int access) { return bus.ReadWord(address, access); } diff --git a/src/nba/src/bus/bus.cpp b/src/nba/src/bus/bus.cpp index 9d0f72e..ecc9069 100644 --- a/src/nba/src/bus/bus.cpp +++ b/src/nba/src/bus/bus.cpp @@ -96,90 +96,128 @@ auto Bus::Read(u32 address, int access) -> T { parallel_internal_cpu_cycle_limit = 0; - switch(page) { - // BIOS - case 0x00: { - Step(1); - return ReadBIOS(Align(address)); - } - // EWRAM (external work RAM) - case 0x02: { - Step(is_u32 ? 6 : 3); - return read(memory.wram.data(), Align(address) & 0x3FFFF); - } - // IWRAM (internal work RAM) - case 0x03: { - Step(1); - return read(memory.iram.data(), Align(address) & 0x7FFF); - } - // MMIO - case 0x04: { - Step(1); - address = Align(address); - if constexpr(std::is_same_v) return hw.ReadByte(address); - if constexpr(std::is_same_v) return hw.ReadHalf(address); - if constexpr(std::is_same_v) return hw.ReadWord(address); - return 0; - } - // PRAM (palette RAM) - case 0x05: { - return ReadPRAM(Align(address)); - } - // VRAM (video RAM) - case 0x06: { - return ReadVRAM(Align(address)); - } - // OAM (object attribute map) - case 0x07: { - return ReadOAM(Align(address)); - } + // 这个switch-case不好进行分支预测,所以要根据访问频度改成if likely + if( likely( page == 0x08 ) ){ // ROM (WS0, WS1, WS2) - case 0x08 ... 0x0D: { - address = Align(address); + address = Align(address); - auto sequential = access & Sequential; - bool code = access & Code; + auto sequential = access & Sequential; + bool code = access & Code; - if((address & 0x1'FFFF) == 0 || ((last_access & Dma) && !(access & Dma))) { - sequential = 0; - } - - if constexpr(std::is_same_v) { - auto shift = ((address & 1) << 3); - Prefetch(address, code, wait16[sequential][page]); - return memory.rom.ReadROM16(address) >> shift; - } - - if constexpr(std::is_same_v) { - Prefetch(address, code, wait16[sequential][page]); - return memory.rom.ReadROM16(address); - } - - if constexpr(std::is_same_v) { - Prefetch(address, code, wait32[sequential][page]); - return memory.rom.ReadROM32(address); - } - - return 0; + if((address & 0x1'FFFF) == 0 || ((last_access & Dma) && !(access & Dma))) { + sequential = 0; } - // SRAM or FLASH backup - case 0x0E ... 0x0F: { - StopPrefetch(); - Step(wait16[0][0xE]); - u32 value = memory.rom.ReadSRAM(address); - - if constexpr(std::is_same_v) value *= 0x0101; - if constexpr(std::is_same_v) value *= 0x01010101; - - return T(value); + if constexpr(std::is_same_v) { + auto shift = ((address & 1) << 3); + Prefetch(address, code, wait16[sequential][page]); + return memory.rom.ReadROM16(address) >> shift; } - // Unmapped memory - default: { - Step(1); - return ReadOpenBus(Align(address)); + + if constexpr(std::is_same_v) { + Prefetch(address, code, wait16[sequential][page]); + return memory.rom.ReadROM16(address); } - } + + if constexpr(std::is_same_v) { + Prefetch(address, code, wait32[sequential][page]); + return memory.rom.ReadROM32(address); + } + + return 0; + } + else if( likely( page == 0x03 ) ){ + // IWRAM (internal work RAM) + Step(1); + return read(memory.iram.data(), Align(address) & 0x7FFF); + } + else { + switch(page) { + // BIOS + case 0x00: { + Step(1); + return ReadBIOS(Align(address)); + } + // EWRAM (external work RAM) + case 0x02: { + Step(is_u32 ? 6 : 3); + return read(memory.wram.data(), Align(address) & 0x3FFFF); + } + /* 已经前置 // IWRAM (internal work RAM) + case 0x03: { + Step(1); + return read(memory.iram.data(), Align(address) & 0x7FFF); + }*/ + // MMIO + case 0x04: { + Step(1); + address = Align(address); + if constexpr(std::is_same_v) return hw.ReadByte(address); + if constexpr(std::is_same_v) return hw.ReadHalf(address); + if constexpr(std::is_same_v) return hw.ReadWord(address); + return 0; + } + // PRAM (palette RAM) + case 0x05: { + return ReadPRAM(Align(address)); + } + // VRAM (video RAM) + case 0x06: { + return ReadVRAM(Align(address)); + } + // OAM (object attribute map) + case 0x07: { + return ReadOAM(Align(address)); + } + // ROM (WS0, WS1, WS2) + // 08 已经前置 + case 0x09 ... 0x0D: { + address = Align(address); + + auto sequential = access & Sequential; + bool code = access & Code; + + if((address & 0x1'FFFF) == 0 || ((last_access & Dma) && !(access & Dma))) { + sequential = 0; + } + + if constexpr(std::is_same_v) { + auto shift = ((address & 1) << 3); + Prefetch(address, code, wait16[sequential][page]); + return memory.rom.ReadROM16(address) >> shift; + } + + if constexpr(std::is_same_v) { + Prefetch(address, code, wait16[sequential][page]); + return memory.rom.ReadROM16(address); + } + + if constexpr(std::is_same_v) { + Prefetch(address, code, wait32[sequential][page]); + return memory.rom.ReadROM32(address); + } + + return 0; + } + // SRAM or FLASH backup + case 0x0E ... 0x0F: { + StopPrefetch(); + Step(wait16[0][0xE]); + + u32 value = memory.rom.ReadSRAM(address); + + if constexpr(std::is_same_v) value *= 0x0101; + if constexpr(std::is_same_v) value *= 0x01010101; + + return T(value); + } + // Unmapped memory + default: { + Step(1); + return ReadOpenBus(Align(address)); + } + } + } return 0; } diff --git a/src/nba/src/bus/bus.hpp b/src/nba/src/bus/bus.hpp index 88df73b..14970db 100644 --- a/src/nba/src/bus/bus.hpp +++ b/src/nba/src/bus/bus.hpp @@ -130,7 +130,7 @@ struct Bus { void Write(u32 address, int access, T value); template - auto Align(u32 address) -> u32 { + auto ALWAYS_INLINE Align(u32 address) -> u32 { return address & ~(sizeof(T) - 1); } diff --git a/src/nba/src/bus/timing.cpp b/src/nba/src/bus/timing.cpp index 1aa3e10..ce75279 100644 --- a/src/nba/src/bus/timing.cpp +++ b/src/nba/src/bus/timing.cpp @@ -117,6 +117,8 @@ void Bus::StopPrefetch() { } void Bus::Step(int cycles) { + if( !cycles ) return; + scheduler.AddCycles(cycles); if(prefetch.active) { diff --git a/src/nba/src/hw/ppu/background.cpp b/src/nba/src/hw/ppu/background.cpp index 468989e..8ff870d 100644 --- a/src/nba/src/hw/ppu/background.cpp +++ b/src/nba/src/hw/ppu/background.cpp @@ -136,6 +136,7 @@ template void PPU::DrawBackgroundImpl(int cycles) { auto& bgpb = mmio.bgpb; auto& bgpd = mmio.bgpd; + // 这里用lambda不好inline话,看代码没循环,直接给它手动INLINE const auto AdvanceBGXY = [&](int id) { auto bg_id = 2 + id; @@ -156,11 +157,45 @@ template void PPU::DrawBackgroundImpl(int cycles) { }; if constexpr(mode >= 1 && mode <= 5) { - AdvanceBGXY(0); + // AdvanceBGXY(0); + int id = 0; + auto bg_id = 2 + id; + + /* Do not update internal X/Y unless the latched BG enable bit is set. + * This behavior was confirmed on real hardware. + */ + if(latched_dispcnt_and_current_dispcnt & (256U << bg_id)) { + if(mmio.bgcnt[bg_id].mosaic_enable) { + if(mosaic.bg._counter_y == 0) { + bgx[id]._current += mosaic.bg.size_y * bgpb[id]; + bgy[id]._current += mosaic.bg.size_y * bgpd[id]; + } + } else { + bgx[id]._current += bgpb[id]; + bgy[id]._current += bgpd[id]; + } + } } if constexpr(mode == 2) { - AdvanceBGXY(1); + // AdvanceBGXY(1); + int id = 1; + auto bg_id = 2 + id; + + /* Do not update internal X/Y unless the latched BG enable bit is set. + * This behavior was confirmed on real hardware. + */ + if(latched_dispcnt_and_current_dispcnt & (256U << bg_id)) { + if(mmio.bgcnt[bg_id].mosaic_enable) { + if(mosaic.bg._counter_y == 0) { + bgx[id]._current += mosaic.bg.size_y * bgpb[id]; + bgy[id]._current += mosaic.bg.size_y * bgpd[id]; + } + } else { + bgx[id]._current += bgpb[id]; + bgy[id]._current += bgpd[id]; + } + } } } diff --git a/src/nba/src/hw/ppu/merge.cpp b/src/nba/src/hw/ppu/merge.cpp index ec67197..2ade7df 100644 --- a/src/nba/src/hw/ppu/merge.cpp +++ b/src/nba/src/hw/ppu/merge.cpp @@ -11,7 +11,59 @@ namespace nba::core { -static u32 RGB555(u16 rgb555) { +// 原作者代码非常牛的,这里这仨巨头放这里inline掉可以快不少的,毕竟wasm可是要抠效率的 +ALWAYS_INLINE static u16 Blend(u16 color_a, u16 color_b, int eva, int evb) { + const int r_a = (color_a >> 0) & 31; + const int g_a = ((color_a >> 4) & 62) | (color_a >> 15); + const int b_a = (color_a >> 10) & 31; + + const int r_b = (color_b >> 0) & 31; + const int g_b = ((color_b >> 4) & 62) | (color_b >> 15); + const int b_b = (color_b >> 10) & 31; + + eva = std::min(16, eva); + evb = std::min(16, evb); + + const int r = std::min((r_a * eva + r_b * evb + 8) >> 4, 31); + const int g = std::min((g_a * eva + g_b * evb + 8) >> 4, 63) >> 1; + const int b = std::min((b_a * eva + b_b * evb + 8) >> 4, 31); + + return (u16)((b << 10) | (g << 5) | r); +} + +ALWAYS_INLINE static u16 Brighten(u16 color, int evy) { + evy = std::min(16, evy); + + int r = (color >> 0) & 31; + int g = ((color >> 4) & 62) | (color >> 15); + int b = (color >> 10) & 31; + + r += ((31 - r) * evy + 8) >> 4; + g += ((63 - g) * evy + 8) >> 4; + b += ((31 - b) * evy + 8) >> 4; + + g >>= 1; + + return (u16)((b << 10) | (g << 5) | r); +} + +ALWAYS_INLINE static u16 Darken(u16 color, int evy) { + evy = std::min(16, evy); + + int r = (color >> 0) & 31; + int g = ((color >> 4) & 62) | (color >> 15); + int b = (color >> 10) & 31; + + r -= (r * evy + 7) >> 4; + g -= (g * evy + 7) >> 4; + b -= (b * evy + 7) >> 4; + + g >>= 1; + + return (u16)((b << 10) | (g << 5) | r); +} + +ALWAYS_INLINE static u32 RGB555(u16 rgb555) { const uint r = (rgb555 >> 0) & 31U; const uint g = (rgb555 >> 5) & 31U; const uint b = (rgb555 >> 10) & 31U; @@ -265,55 +317,4 @@ void PPU::DrawMergeImpl(int cycles) { } } -auto PPU::Blend(u16 color_a, u16 color_b, int eva, int evb) -> u16 { - const int r_a = (color_a >> 0) & 31; - const int g_a = ((color_a >> 4) & 62) | (color_a >> 15); - const int b_a = (color_a >> 10) & 31; - - const int r_b = (color_b >> 0) & 31; - const int g_b = ((color_b >> 4) & 62) | (color_b >> 15); - const int b_b = (color_b >> 10) & 31; - - eva = std::min(16, eva); - evb = std::min(16, evb); - - const int r = std::min((r_a * eva + r_b * evb + 8) >> 4, 31); - const int g = std::min((g_a * eva + g_b * evb + 8) >> 4, 63) >> 1; - const int b = std::min((b_a * eva + b_b * evb + 8) >> 4, 31); - - return (u16)((b << 10) | (g << 5) | r); -} - -auto PPU::Brighten(u16 color, int evy) -> u16 { - evy = std::min(16, evy); - - int r = (color >> 0) & 31; - int g = ((color >> 4) & 62) | (color >> 15); - int b = (color >> 10) & 31; - - r += ((31 - r) * evy + 8) >> 4; - g += ((63 - g) * evy + 8) >> 4; - b += ((31 - b) * evy + 8) >> 4; - - g >>= 1; - - return (u16)((b << 10) | (g << 5) | r); -} - -auto PPU::Darken(u16 color, int evy) -> u16 { - evy = std::min(16, evy); - - int r = (color >> 0) & 31; - int g = ((color >> 4) & 62) | (color >> 15); - int b = (color >> 10) & 31; - - r -= (r * evy + 7) >> 4; - g -= (g * evy + 7) >> 4; - b -= (b * evy + 7) >> 4; - - g >>= 1; - - return (u16)((b << 10) | (g << 5) | r); -} - } // namespace nba::core diff --git a/src/nba/src/hw/ppu/ppu.hpp b/src/nba/src/hw/ppu/ppu.hpp index ec07f47..2c2388d 100644 --- a/src/nba/src/hw/ppu/ppu.hpp +++ b/src/nba/src/hw/ppu/ppu.hpp @@ -396,9 +396,9 @@ private: void DrawMerge(); void DrawMergeImpl(int cycles); - static auto Blend(u16 color_a, u16 color_b, int eva, int evb) -> u16; - static auto Brighten(u16 color, int evy) -> u16; - static auto Darken(u16 color, int evy) -> u16; + //static auto Blend(u16 color_a, u16 color_b, int eva, int evb) -> u16; + //static auto Brighten(u16 color, int evy) -> u16; + //static auto Darken(u16 color, int evy) -> u16; bool ALWAYS_INLINE ForcedBlank() const { return (mmio.dispcnt_latch[0] | mmio.dispcnt.hword) & 0x80U;