根据profile得到的数据,对

DrawBackgroundImpl
DrawMergeImpl
Bus::ReadHalf
进行了手动内联和分支预测方面的优化,似乎有一点效果(虽然还达不到恒定 60fps,但也算得上"4个9"(约 99.99%)的 60fps 了)
This commit is contained in:
root 2024-05-09 14:15:47 +08:00
parent 20879ab70a
commit 72e3027fd0
10 changed files with 241 additions and 145 deletions

View File

@ -33,11 +33,12 @@ struct RingBuffer : Stream<T> {
}
}
auto Peek(int offset) -> T const {
// Peek, Read and Write (the "big three" accessors) get marked inline as well.
//
// Peek returns the element `offset` slots past the read pointer, wrapping
// the index modulo `length`. It does not modify rd_ptr.
// NOTE(review): if rd_ptr/offset are signed and their sum is negative, the
// remainder (and thus the index) would be negative — confirm callers only
// pass offset >= 0.
inline auto Peek(int offset) -> T const {
return data[(rd_ptr + offset) % length];
}
auto Read() -> T {
inline auto Read() -> T {
T value = data[rd_ptr];
if(count > 0) {
rd_ptr = (rd_ptr + 1) % length;
@ -46,7 +47,7 @@ struct RingBuffer : Stream<T> {
return value;
}
void Write(T const& value) {
inline void Write(T const& value) {
if(blocking && count == length) {
return;
}

View File

@ -13,15 +13,33 @@
namespace nba {
// Reads a value of type T from `data` at byte offset `offset`.
// The bytes are copied out with memcpy, so `data + offset` does not need to
// be aligned for T.
template<typename T>
auto read(void const* data, uint offset) -> T {
T read(void const* data, uint offset) {
T value;
memcpy(&value, (u8*)data + offset, sizeof(T));
std::memcpy(&value, (u8*)data + offset, sizeof(T));
return value;
}
// Explicit specializations of read<T>() for the fixed-width integer types.
//
// `offset` is a byte offset, exactly as in the primary template and in
// write(). The 16- and 32-bit loads go through std::memcpy instead of
// indexing a casted u16*/u32*: dereferencing such a pointer into a byte
// buffer violates the strict-aliasing rules and requires aligned storage,
// and indexing with offset >> 1 / offset >> 2 would silently round a
// misaligned offset down. For offsets that are multiples of sizeof(T) the
// value read is the same either way, and a fixed-size memcpy is typically
// lowered to a single load by optimizing compilers.
template<>
inline u8 read<u8>(void const* data, uint offset) {
  return static_cast<u8 const*>(data)[offset];
}

template<>
inline u16 read<u16>(void const* data, uint offset) {
  u16 value;
  std::memcpy(&value, static_cast<u8 const*>(data) + offset, sizeof(value));
  return value;
}

template<>
inline u32 read<u32>(void const* data, uint offset) {
  u32 value;
  std::memcpy(&value, static_cast<u8 const*>(data) + offset, sizeof(value));
  return value;
}
// Stores `value` into `data` at byte offset `offset`.
// The bytes are copied in with memcpy, so `data + offset` does not need to
// be aligned for T.
template<typename T>
void write(void* data, uint offset, T value) {
memcpy((u8*)data + offset, &value, sizeof(T));
inline void write(void* data, uint offset, T value) {
std::memcpy((u8*)data + offset, &value, sizeof(T));
}
} // namespace nba

View File

@ -135,6 +135,7 @@ struct Scheduler {
}
void AddCycles(int cycles) {
if( !cycles ) return;
auto timestamp_next = timestamp_now + cycles;
Step(timestamp_next);
timestamp_now = timestamp_next;

View File

@ -5,15 +5,15 @@
* Refer to the included LICENSE file.
*/
// Forwards a byte read at `address` (with the given access flags) to the bus
// and returns the bus result.
u32 ReadByte(u32 address, int access) {
ALWAYS_INLINE u32 ReadByte(u32 address, int access) {
return bus.ReadByte(address, access);
}
// Forwards a half-word read at `address` (with the given access flags) to the
// bus and returns the bus result.
u32 ReadHalf(u32 address, int access) {
ALWAYS_INLINE u32 ReadHalf(u32 address, int access) {
return bus.ReadHalf(address, access);
}
// Forwards a word read at `address` (with the given access flags) to the bus
// and returns the bus result.
u32 ReadWord(u32 address, int access) {
ALWAYS_INLINE u32 ReadWord(u32 address, int access) {
return bus.ReadWord(address, access);
}

View File

@ -96,90 +96,128 @@ auto Bus::Read(u32 address, int access) -> T {
parallel_internal_cpu_cycle_limit = 0;
switch(page) {
// BIOS
case 0x00: {
Step(1);
return ReadBIOS(Align<T>(address));
}
// EWRAM (external work RAM)
case 0x02: {
Step(is_u32 ? 6 : 3);
return read<T>(memory.wram.data(), Align<T>(address) & 0x3FFFF);
}
// IWRAM (internal work RAM)
case 0x03: {
Step(1);
return read<T>(memory.iram.data(), Align<T>(address) & 0x7FFF);
}
// MMIO
case 0x04: {
Step(1);
address = Align<T>(address);
if constexpr(std::is_same_v<T, u8>) return hw.ReadByte(address);
if constexpr(std::is_same_v<T, u16>) return hw.ReadHalf(address);
if constexpr(std::is_same_v<T, u32>) return hw.ReadWord(address);
return 0;
}
// PRAM (palette RAM)
case 0x05: {
return ReadPRAM<T>(Align<T>(address));
}
// VRAM (video RAM)
case 0x06: {
return ReadVRAM<T>(Align<T>(address));
}
// OAM (object attribute map)
case 0x07: {
return ReadOAM<T>(Align<T>(address));
}
// This switch-case is hard on the branch predictor, so the most frequently accessed pages are tested first with if(likely(...)).
if( likely( page == 0x08 ) ){
// ROM (WS0, WS1, WS2)
case 0x08 ... 0x0D: {
address = Align<T>(address);
address = Align<T>(address);
auto sequential = access & Sequential;
bool code = access & Code;
auto sequential = access & Sequential;
bool code = access & Code;
if((address & 0x1'FFFF) == 0 || ((last_access & Dma) && !(access & Dma))) {
sequential = 0;
}
if constexpr(std::is_same_v<T, u8>) {
auto shift = ((address & 1) << 3);
Prefetch(address, code, wait16[sequential][page]);
return memory.rom.ReadROM16(address) >> shift;
}
if constexpr(std::is_same_v<T, u16>) {
Prefetch(address, code, wait16[sequential][page]);
return memory.rom.ReadROM16(address);
}
if constexpr(std::is_same_v<T, u32>) {
Prefetch(address, code, wait32[sequential][page]);
return memory.rom.ReadROM32(address);
}
return 0;
if((address & 0x1'FFFF) == 0 || ((last_access & Dma) && !(access & Dma))) {
sequential = 0;
}
// SRAM or FLASH backup
case 0x0E ... 0x0F: {
StopPrefetch();
Step(wait16[0][0xE]);
u32 value = memory.rom.ReadSRAM(address);
if constexpr(std::is_same_v<T, u16>) value *= 0x0101;
if constexpr(std::is_same_v<T, u32>) value *= 0x01010101;
return T(value);
if constexpr(std::is_same_v<T, u8>) {
auto shift = ((address & 1) << 3);
Prefetch(address, code, wait16[sequential][page]);
return memory.rom.ReadROM16(address) >> shift;
}
// Unmapped memory
default: {
Step(1);
return ReadOpenBus(Align<T>(address));
if constexpr(std::is_same_v<T, u16>) {
Prefetch(address, code, wait16[sequential][page]);
return memory.rom.ReadROM16(address);
}
}
if constexpr(std::is_same_v<T, u32>) {
Prefetch(address, code, wait32[sequential][page]);
return memory.rom.ReadROM32(address);
}
return 0;
}
else if( likely( page == 0x03 ) ){
// IWRAM (internal work RAM)
Step(1);
return read<T>(memory.iram.data(), Align<T>(address) & 0x7FFF);
}
else {
switch(page) {
// BIOS
case 0x00: {
Step(1);
return ReadBIOS(Align<T>(address));
}
// EWRAM (external work RAM)
case 0x02: {
Step(is_u32 ? 6 : 3);
return read<T>(memory.wram.data(), Align<T>(address) & 0x3FFFF);
}
/* Already handled by the early check above // IWRAM (internal work RAM)
case 0x03: {
Step(1);
return read<T>(memory.iram.data(), Align<T>(address) & 0x7FFF);
}*/
// MMIO
case 0x04: {
Step(1);
address = Align<T>(address);
if constexpr(std::is_same_v<T, u8>) return hw.ReadByte(address);
if constexpr(std::is_same_v<T, u16>) return hw.ReadHalf(address);
if constexpr(std::is_same_v<T, u32>) return hw.ReadWord(address);
return 0;
}
// PRAM (palette RAM)
case 0x05: {
return ReadPRAM<T>(Align<T>(address));
}
// VRAM (video RAM)
case 0x06: {
return ReadVRAM<T>(Align<T>(address));
}
// OAM (object attribute map)
case 0x07: {
return ReadOAM<T>(Align<T>(address));
}
// ROM (WS0, WS1, WS2)
// Page 0x08 is already handled by the early check above
case 0x09 ... 0x0D: {
address = Align<T>(address);
auto sequential = access & Sequential;
bool code = access & Code;
if((address & 0x1'FFFF) == 0 || ((last_access & Dma) && !(access & Dma))) {
sequential = 0;
}
if constexpr(std::is_same_v<T, u8>) {
auto shift = ((address & 1) << 3);
Prefetch(address, code, wait16[sequential][page]);
return memory.rom.ReadROM16(address) >> shift;
}
if constexpr(std::is_same_v<T, u16>) {
Prefetch(address, code, wait16[sequential][page]);
return memory.rom.ReadROM16(address);
}
if constexpr(std::is_same_v<T, u32>) {
Prefetch(address, code, wait32[sequential][page]);
return memory.rom.ReadROM32(address);
}
return 0;
}
// SRAM or FLASH backup
case 0x0E ... 0x0F: {
StopPrefetch();
Step(wait16[0][0xE]);
u32 value = memory.rom.ReadSRAM(address);
if constexpr(std::is_same_v<T, u16>) value *= 0x0101;
if constexpr(std::is_same_v<T, u32>) value *= 0x01010101;
return T(value);
}
// Unmapped memory
default: {
Step(1);
return ReadOpenBus(Align<T>(address));
}
}
}
return 0;
}

View File

@ -130,7 +130,7 @@ struct Bus {
void Write(u32 address, int access, T value);
// Clears the low bits of `address` selected by sizeof(T) - 1, i.e. rounds the
// address down to a multiple of sizeof(T) when sizeof(T) is a power of two.
template<typename T>
auto Align(u32 address) -> u32 {
auto ALWAYS_INLINE Align(u32 address) -> u32 {
return address & ~(sizeof(T) - 1);
}

View File

@ -117,6 +117,8 @@ void Bus::StopPrefetch() {
}
void Bus::Step(int cycles) {
if( !cycles ) return;
scheduler.AddCycles(cycles);
if(prefetch.active) {

View File

@ -136,6 +136,7 @@ template<int mode> void PPU::DrawBackgroundImpl(int cycles) {
auto& bgpb = mmio.bgpb;
auto& bgpd = mmio.bgpd;
// A lambda is hard to get inlined here, and since its body contains no loop it is simply inlined by hand below.
const auto AdvanceBGXY = [&](int id) {
auto bg_id = 2 + id;
@ -156,11 +157,45 @@ template<int mode> void PPU::DrawBackgroundImpl(int cycles) {
};
if constexpr(mode >= 1 && mode <= 5) {
AdvanceBGXY(0);
// AdvanceBGXY(0);
int id = 0;
auto bg_id = 2 + id;
/* Do not update internal X/Y unless the latched BG enable bit is set.
* This behavior was confirmed on real hardware.
*/
if(latched_dispcnt_and_current_dispcnt & (256U << bg_id)) {
if(mmio.bgcnt[bg_id].mosaic_enable) {
if(mosaic.bg._counter_y == 0) {
bgx[id]._current += mosaic.bg.size_y * bgpb[id];
bgy[id]._current += mosaic.bg.size_y * bgpd[id];
}
} else {
bgx[id]._current += bgpb[id];
bgy[id]._current += bgpd[id];
}
}
}
if constexpr(mode == 2) {
AdvanceBGXY(1);
// AdvanceBGXY(1);
int id = 1;
auto bg_id = 2 + id;
/* Do not update internal X/Y unless the latched BG enable bit is set.
* This behavior was confirmed on real hardware.
*/
if(latched_dispcnt_and_current_dispcnt & (256U << bg_id)) {
if(mmio.bgcnt[bg_id].mosaic_enable) {
if(mosaic.bg._counter_y == 0) {
bgx[id]._current += mosaic.bg.size_y * bgpb[id];
bgy[id]._current += mosaic.bg.size_y * bgpd[id];
}
} else {
bgx[id]._current += bgpb[id];
bgy[id]._current += bgpd[id];
}
}
}
}

View File

@ -11,7 +11,59 @@
namespace nba::core {
static u32 RGB555(u16 rgb555) {
// The original author's code is excellent. Blend, Brighten and Darken are
// placed here so that they can be inlined, which is noticeably faster; on
// wasm every bit of efficiency counts.
//
// Blend mixes two colors with weights eva/16 and evb/16 (each weight is
// capped at 16). Red is read from bits 0-4 and blue from bits 10-14; green is
// treated as a 6-bit value built from bits 5-9 with bit 15 as its lowest bit.
// Each weighted sum is rounded (+8, then >> 4) and capped at the channel
// maximum. The result carries 5 bits per channel: the lowest green bit is
// dropped again before packing.
ALWAYS_INLINE static u16 Blend(u16 color_a, u16 color_b, int eva, int evb) {
  const int weight_a = eva < 16 ? eva : 16;
  const int weight_b = evb < 16 ? evb : 16;

  const int red_a   = color_a & 31;
  const int red_b   = color_b & 31;
  const int green_a = ((color_a >> 4) & 62) | (color_a >> 15);
  const int green_b = ((color_b >> 4) & 62) | (color_b >> 15);
  const int blue_a  = (color_a >> 10) & 31;
  const int blue_b  = (color_b >> 10) & 31;

  const int red_sum   = (red_a   * weight_a + red_b   * weight_b + 8) >> 4;
  const int green_sum = (green_a * weight_a + green_b * weight_b + 8) >> 4;
  const int blue_sum  = (blue_a  * weight_a + blue_b  * weight_b + 8) >> 4;

  const int red   = std::min<u8>(red_sum, 31);
  const int green = std::min<u8>(green_sum, 63) >> 1;
  const int blue  = std::min<u8>(blue_sum, 31);

  return static_cast<u16>((blue << 10) | (green << 5) | red);
}
// Moves every channel of `color` towards its maximum by evy/16 of the
// remaining distance (evy is capped at 16), rounding with +8 before >> 4.
// Green is processed as a 6-bit value (bits 5-9 plus bit 15 as lowest bit)
// and reduced back to 5 bits before the channels are packed again.
ALWAYS_INLINE static u16 Brighten(u16 color, int evy) {
  const int level = evy < 16 ? evy : 16;

  const int red_in   = color & 31;
  const int green_in = ((color >> 4) & 62) | (color >> 15);
  const int blue_in  = (color >> 10) & 31;

  const int red   = red_in + (((31 - red_in) * level + 8) >> 4);
  const int green = (green_in + (((63 - green_in) * level + 8) >> 4)) >> 1;
  const int blue  = blue_in + (((31 - blue_in) * level + 8) >> 4);

  return static_cast<u16>((blue << 10) | (green << 5) | red);
}
// Reduces every channel of `color` by evy/16 of its value (evy is capped at
// 16), using +7 before >> 4 for rounding. Green is processed as a 6-bit
// value (bits 5-9 plus bit 15 as lowest bit) and reduced back to 5 bits
// before the channels are packed again.
ALWAYS_INLINE static u16 Darken(u16 color, int evy) {
  const int level = evy < 16 ? evy : 16;

  const int red_in   = color & 31;
  const int green_in = ((color >> 4) & 62) | (color >> 15);
  const int blue_in  = (color >> 10) & 31;

  const int red   = red_in - ((red_in * level + 7) >> 4);
  const int green = (green_in - ((green_in * level + 7) >> 4)) >> 1;
  const int blue  = blue_in - ((blue_in * level + 7) >> 4);

  return static_cast<u16>((blue << 10) | (green << 5) | red);
}
ALWAYS_INLINE static u32 RGB555(u16 rgb555) {
const uint r = (rgb555 >> 0) & 31U;
const uint g = (rgb555 >> 5) & 31U;
const uint b = (rgb555 >> 10) & 31U;
@ -265,55 +317,4 @@ void PPU::DrawMergeImpl(int cycles) {
}
}
// Mixes two colors with weights eva/16 and evb/16 (each weight is capped at
// 16). Red is read from bits 0-4 and blue from bits 10-14; green is treated
// as a 6-bit value built from bits 5-9 with bit 15 as its lowest bit. Each
// weighted sum is rounded (+8, then >> 4) and capped at the channel maximum;
// the lowest green bit is dropped again before packing.
auto PPU::Blend(u16 color_a, u16 color_b, int eva, int evb) -> u16 {
  const int weight_a = eva < 16 ? eva : 16;
  const int weight_b = evb < 16 ? evb : 16;

  const int red_a   = color_a & 31;
  const int red_b   = color_b & 31;
  const int green_a = ((color_a >> 4) & 62) | (color_a >> 15);
  const int green_b = ((color_b >> 4) & 62) | (color_b >> 15);
  const int blue_a  = (color_a >> 10) & 31;
  const int blue_b  = (color_b >> 10) & 31;

  const int red_sum   = (red_a   * weight_a + red_b   * weight_b + 8) >> 4;
  const int green_sum = (green_a * weight_a + green_b * weight_b + 8) >> 4;
  const int blue_sum  = (blue_a  * weight_a + blue_b  * weight_b + 8) >> 4;

  const int red   = std::min<u8>(red_sum, 31);
  const int green = std::min<u8>(green_sum, 63) >> 1;
  const int blue  = std::min<u8>(blue_sum, 31);

  return static_cast<u16>((blue << 10) | (green << 5) | red);
}
// Moves every channel of `color` towards its maximum by evy/16 of the
// remaining distance (evy is capped at 16), rounding with +8 before >> 4.
// Green is processed as a 6-bit value (bits 5-9 plus bit 15 as lowest bit)
// and reduced back to 5 bits before the channels are packed again.
auto PPU::Brighten(u16 color, int evy) -> u16 {
  const int level = evy < 16 ? evy : 16;

  const int red_in   = color & 31;
  const int green_in = ((color >> 4) & 62) | (color >> 15);
  const int blue_in  = (color >> 10) & 31;

  const int red   = red_in + (((31 - red_in) * level + 8) >> 4);
  const int green = (green_in + (((63 - green_in) * level + 8) >> 4)) >> 1;
  const int blue  = blue_in + (((31 - blue_in) * level + 8) >> 4);

  return static_cast<u16>((blue << 10) | (green << 5) | red);
}
// Reduces every channel of `color` by evy/16 of its value (evy is capped at
// 16), using +7 before >> 4 for rounding. Green is processed as a 6-bit
// value (bits 5-9 plus bit 15 as lowest bit) and reduced back to 5 bits
// before the channels are packed again.
auto PPU::Darken(u16 color, int evy) -> u16 {
  const int level = evy < 16 ? evy : 16;

  const int red_in   = color & 31;
  const int green_in = ((color >> 4) & 62) | (color >> 15);
  const int blue_in  = (color >> 10) & 31;

  const int red   = red_in - ((red_in * level + 7) >> 4);
  const int green = (green_in - ((green_in * level + 7) >> 4)) >> 1;
  const int blue  = blue_in - ((blue_in * level + 7) >> 4);

  return static_cast<u16>((blue << 10) | (green << 5) | red);
}
} // namespace nba::core

View File

@ -396,9 +396,9 @@ private:
void DrawMerge();
void DrawMergeImpl(int cycles);
static auto Blend(u16 color_a, u16 color_b, int eva, int evb) -> u16;
static auto Brighten(u16 color, int evy) -> u16;
static auto Darken(u16 color, int evy) -> u16;
//static auto Blend(u16 color_a, u16 color_b, int eva, int evb) -> u16;
//static auto Brighten(u16 color, int evy) -> u16;
//static auto Darken(u16 color, int evy) -> u16;
bool ALWAYS_INLINE ForcedBlank() const {
return (mmio.dispcnt_latch[0] | mmio.dispcnt.hword) & 0x80U;