根据 profile 得到的数据,对 DrawBackgroundImpl、DrawMergeImpl、Bus::ReadHalf
进行了手动内联和分支预测方面的优化,似乎略有效果(虽然还不能恒定 60fps,但 99.99% 的时间已经能达到 60fps)
This commit is contained in:
parent
20879ab70a
commit
72e3027fd0
@ -33,11 +33,12 @@ struct RingBuffer : Stream<T> {
|
||||
}
|
||||
}
|
||||
|
||||
auto Peek(int offset) -> T const {
|
||||
// These three hot accessors (Peek, Read, Write) should be marked inline as well.
|
||||
// Returns the element located `offset` slots past the read pointer, wrapping
// modulo `length`; the read pointer itself is not modified.
inline auto Peek(int offset) -> T const {
|
||||
return data[(rd_ptr + offset) % length];
|
||||
}
|
||||
|
||||
auto Read() -> T {
|
||||
inline auto Read() -> T {
|
||||
T value = data[rd_ptr];
|
||||
if(count > 0) {
|
||||
rd_ptr = (rd_ptr + 1) % length;
|
||||
@ -46,7 +47,7 @@ struct RingBuffer : Stream<T> {
|
||||
return value;
|
||||
}
|
||||
|
||||
void Write(T const& value) {
|
||||
inline void Write(T const& value) {
|
||||
if(blocking && count == length) {
|
||||
return;
|
||||
}
|
||||
|
@ -13,15 +13,33 @@
|
||||
namespace nba {
|
||||
|
||||
// Generic read: copies sizeof(T) bytes starting at byte offset `offset` of
// `data` into a local T with memcpy and returns it by value.
template<typename T>
|
||||
auto read(void const* data, uint offset) -> T {
|
||||
T read(void const* data, uint offset) {
|
||||
T value;
|
||||
memcpy(&value, (u8*)data + offset, sizeof(T));
|
||||
std::memcpy(&value, (u8*)data + offset, sizeof(T));
|
||||
return value;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline u8 read(void const* data, uint offset){
|
||||
u8 const *p = (u8*)data;
|
||||
return p[offset];
|
||||
}
|
||||
|
||||
template<>
|
||||
inline u16 read(void const* data, uint offset){
|
||||
u16 const *p = (u16*)data;
|
||||
return p[ offset>>1 ];
|
||||
}
|
||||
|
||||
template<>
|
||||
inline u32 read(void const* data, uint offset) {
|
||||
u32 const *p = (u32*)data;
|
||||
return p[ offset>>2 ];
|
||||
}
|
||||
|
||||
// Generic write: copies the sizeof(T) bytes of `value` into `data`, starting
// at byte offset `offset`, using memcpy.
template<typename T>
|
||||
void write(void* data, uint offset, T value) {
|
||||
memcpy((u8*)data + offset, &value, sizeof(T));
|
||||
inline void write(void* data, uint offset, T value) {
|
||||
std::memcpy((u8*)data + offset, &value, sizeof(T));
|
||||
}
|
||||
|
||||
} // namespace nba
|
||||
|
@ -135,6 +135,7 @@ struct Scheduler {
|
||||
}
|
||||
|
||||
void AddCycles(int cycles) {
|
||||
if( !cycles ) return;
|
||||
auto timestamp_next = timestamp_now + cycles;
|
||||
Step(timestamp_next);
|
||||
timestamp_now = timestamp_next;
|
||||
|
@ -5,15 +5,15 @@
|
||||
* Refer to the included LICENSE file.
|
||||
*/
|
||||
|
||||
// Thin forwarder: passes `address` and `access` to bus.ReadByte and returns
// its result.
u32 ReadByte(u32 address, int access) {
|
||||
ALWAYS_INLINE u32 ReadByte(u32 address, int access) {
|
||||
return bus.ReadByte(address, access);
|
||||
}
|
||||
|
||||
// Thin forwarder: passes `address` and `access` to bus.ReadHalf and returns
// its result.
u32 ReadHalf(u32 address, int access) {
|
||||
ALWAYS_INLINE u32 ReadHalf(u32 address, int access) {
|
||||
return bus.ReadHalf(address, access);
|
||||
}
|
||||
|
||||
// Thin forwarder: passes `address` and `access` to bus.ReadWord and returns
// its result.
u32 ReadWord(u32 address, int access) {
|
||||
ALWAYS_INLINE u32 ReadWord(u32 address, int access) {
|
||||
return bus.ReadWord(address, access);
|
||||
}
|
||||
|
||||
|
@ -96,90 +96,128 @@ auto Bus::Read(u32 address, int access) -> T {
|
||||
|
||||
parallel_internal_cpu_cycle_limit = 0;
|
||||
|
||||
switch(page) {
|
||||
// BIOS
|
||||
case 0x00: {
|
||||
Step(1);
|
||||
return ReadBIOS(Align<T>(address));
|
||||
}
|
||||
// EWRAM (external work RAM)
|
||||
case 0x02: {
|
||||
Step(is_u32 ? 6 : 3);
|
||||
return read<T>(memory.wram.data(), Align<T>(address) & 0x3FFFF);
|
||||
}
|
||||
// IWRAM (internal work RAM)
|
||||
case 0x03: {
|
||||
Step(1);
|
||||
return read<T>(memory.iram.data(), Align<T>(address) & 0x7FFF);
|
||||
}
|
||||
// MMIO
|
||||
case 0x04: {
|
||||
Step(1);
|
||||
address = Align<T>(address);
|
||||
if constexpr(std::is_same_v<T, u8>) return hw.ReadByte(address);
|
||||
if constexpr(std::is_same_v<T, u16>) return hw.ReadHalf(address);
|
||||
if constexpr(std::is_same_v<T, u32>) return hw.ReadWord(address);
|
||||
return 0;
|
||||
}
|
||||
// PRAM (palette RAM)
|
||||
case 0x05: {
|
||||
return ReadPRAM<T>(Align<T>(address));
|
||||
}
|
||||
// VRAM (video RAM)
|
||||
case 0x06: {
|
||||
return ReadVRAM<T>(Align<T>(address));
|
||||
}
|
||||
// OAM (object attribute map)
|
||||
case 0x07: {
|
||||
return ReadOAM<T>(Align<T>(address));
|
||||
}
|
||||
// This switch-case is hard on the branch predictor, so the most frequently accessed pages are tested first with if (likely(...)).
|
||||
if( likely( page == 0x08 ) ){
|
||||
// ROM (WS0, WS1, WS2)
|
||||
case 0x08 ... 0x0D: {
|
||||
address = Align<T>(address);
|
||||
address = Align<T>(address);
|
||||
|
||||
auto sequential = access & Sequential;
|
||||
bool code = access & Code;
|
||||
auto sequential = access & Sequential;
|
||||
bool code = access & Code;
|
||||
|
||||
if((address & 0x1'FFFF) == 0 || ((last_access & Dma) && !(access & Dma))) {
|
||||
sequential = 0;
|
||||
}
|
||||
|
||||
if constexpr(std::is_same_v<T, u8>) {
|
||||
auto shift = ((address & 1) << 3);
|
||||
Prefetch(address, code, wait16[sequential][page]);
|
||||
return memory.rom.ReadROM16(address) >> shift;
|
||||
}
|
||||
|
||||
if constexpr(std::is_same_v<T, u16>) {
|
||||
Prefetch(address, code, wait16[sequential][page]);
|
||||
return memory.rom.ReadROM16(address);
|
||||
}
|
||||
|
||||
if constexpr(std::is_same_v<T, u32>) {
|
||||
Prefetch(address, code, wait32[sequential][page]);
|
||||
return memory.rom.ReadROM32(address);
|
||||
}
|
||||
|
||||
return 0;
|
||||
if((address & 0x1'FFFF) == 0 || ((last_access & Dma) && !(access & Dma))) {
|
||||
sequential = 0;
|
||||
}
|
||||
// SRAM or FLASH backup
|
||||
case 0x0E ... 0x0F: {
|
||||
StopPrefetch();
|
||||
Step(wait16[0][0xE]);
|
||||
|
||||
u32 value = memory.rom.ReadSRAM(address);
|
||||
|
||||
if constexpr(std::is_same_v<T, u16>) value *= 0x0101;
|
||||
if constexpr(std::is_same_v<T, u32>) value *= 0x01010101;
|
||||
|
||||
return T(value);
|
||||
if constexpr(std::is_same_v<T, u8>) {
|
||||
auto shift = ((address & 1) << 3);
|
||||
Prefetch(address, code, wait16[sequential][page]);
|
||||
return memory.rom.ReadROM16(address) >> shift;
|
||||
}
|
||||
// Unmapped memory
|
||||
default: {
|
||||
Step(1);
|
||||
return ReadOpenBus(Align<T>(address));
|
||||
|
||||
if constexpr(std::is_same_v<T, u16>) {
|
||||
Prefetch(address, code, wait16[sequential][page]);
|
||||
return memory.rom.ReadROM16(address);
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr(std::is_same_v<T, u32>) {
|
||||
Prefetch(address, code, wait32[sequential][page]);
|
||||
return memory.rom.ReadROM32(address);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
else if( likely( page == 0x03 ) ){
|
||||
// IWRAM (internal work RAM)
|
||||
Step(1);
|
||||
return read<T>(memory.iram.data(), Align<T>(address) & 0x7FFF);
|
||||
}
|
||||
else {
|
||||
switch(page) {
|
||||
// BIOS
|
||||
case 0x00: {
|
||||
Step(1);
|
||||
return ReadBIOS(Align<T>(address));
|
||||
}
|
||||
// EWRAM (external work RAM)
|
||||
case 0x02: {
|
||||
Step(is_u32 ? 6 : 3);
|
||||
return read<T>(memory.wram.data(), Align<T>(address) & 0x3FFFF);
|
||||
}
|
||||
/* Already handled above // IWRAM (internal work RAM)
|
||||
case 0x03: {
|
||||
Step(1);
|
||||
return read<T>(memory.iram.data(), Align<T>(address) & 0x7FFF);
|
||||
}*/
|
||||
// MMIO
|
||||
case 0x04: {
|
||||
Step(1);
|
||||
address = Align<T>(address);
|
||||
if constexpr(std::is_same_v<T, u8>) return hw.ReadByte(address);
|
||||
if constexpr(std::is_same_v<T, u16>) return hw.ReadHalf(address);
|
||||
if constexpr(std::is_same_v<T, u32>) return hw.ReadWord(address);
|
||||
return 0;
|
||||
}
|
||||
// PRAM (palette RAM)
|
||||
case 0x05: {
|
||||
return ReadPRAM<T>(Align<T>(address));
|
||||
}
|
||||
// VRAM (video RAM)
|
||||
case 0x06: {
|
||||
return ReadVRAM<T>(Align<T>(address));
|
||||
}
|
||||
// OAM (object attribute map)
|
||||
case 0x07: {
|
||||
return ReadOAM<T>(Align<T>(address));
|
||||
}
|
||||
// ROM (WS0, WS1, WS2)
|
||||
// Page 0x08 is already handled above
|
||||
case 0x09 ... 0x0D: {
|
||||
address = Align<T>(address);
|
||||
|
||||
auto sequential = access & Sequential;
|
||||
bool code = access & Code;
|
||||
|
||||
if((address & 0x1'FFFF) == 0 || ((last_access & Dma) && !(access & Dma))) {
|
||||
sequential = 0;
|
||||
}
|
||||
|
||||
if constexpr(std::is_same_v<T, u8>) {
|
||||
auto shift = ((address & 1) << 3);
|
||||
Prefetch(address, code, wait16[sequential][page]);
|
||||
return memory.rom.ReadROM16(address) >> shift;
|
||||
}
|
||||
|
||||
if constexpr(std::is_same_v<T, u16>) {
|
||||
Prefetch(address, code, wait16[sequential][page]);
|
||||
return memory.rom.ReadROM16(address);
|
||||
}
|
||||
|
||||
if constexpr(std::is_same_v<T, u32>) {
|
||||
Prefetch(address, code, wait32[sequential][page]);
|
||||
return memory.rom.ReadROM32(address);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
// SRAM or FLASH backup
|
||||
case 0x0E ... 0x0F: {
|
||||
StopPrefetch();
|
||||
Step(wait16[0][0xE]);
|
||||
|
||||
u32 value = memory.rom.ReadSRAM(address);
|
||||
|
||||
if constexpr(std::is_same_v<T, u16>) value *= 0x0101;
|
||||
if constexpr(std::is_same_v<T, u32>) value *= 0x01010101;
|
||||
|
||||
return T(value);
|
||||
}
|
||||
// Unmapped memory
|
||||
default: {
|
||||
Step(1);
|
||||
return ReadOpenBus(Align<T>(address));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -130,7 +130,7 @@ struct Bus {
|
||||
void Write(u32 address, int access, T value);
|
||||
|
||||
// Clears the low bits of `address` selected by (sizeof(T) - 1), i.e. rounds
// the address down to a multiple of sizeof(T) when sizeof(T) is a power of two.
template<typename T>
|
||||
auto Align(u32 address) -> u32 {
|
||||
auto ALWAYS_INLINE Align(u32 address) -> u32 {
|
||||
return address & ~(sizeof(T) - 1);
|
||||
}
|
||||
|
||||
|
@ -117,6 +117,8 @@ void Bus::StopPrefetch() {
|
||||
}
|
||||
|
||||
void Bus::Step(int cycles) {
|
||||
if( !cycles ) return;
|
||||
|
||||
scheduler.AddCycles(cycles);
|
||||
|
||||
if(prefetch.active) {
|
||||
|
@ -136,6 +136,7 @@ template<int mode> void PPU::DrawBackgroundImpl(int cycles) {
|
||||
auto& bgpb = mmio.bgpb;
|
||||
auto& bgpd = mmio.bgpd;
|
||||
|
||||
// A lambda may not get inlined here; since its body contains no loop, it is inlined by hand at the call sites below.
|
||||
const auto AdvanceBGXY = [&](int id) {
|
||||
auto bg_id = 2 + id;
|
||||
|
||||
@ -156,11 +157,45 @@ template<int mode> void PPU::DrawBackgroundImpl(int cycles) {
|
||||
};
|
||||
|
||||
if constexpr(mode >= 1 && mode <= 5) {
|
||||
AdvanceBGXY(0);
|
||||
// AdvanceBGXY(0);
|
||||
int id = 0;
|
||||
auto bg_id = 2 + id;
|
||||
|
||||
/* Do not update internal X/Y unless the latched BG enable bit is set.
|
||||
* This behavior was confirmed on real hardware.
|
||||
*/
|
||||
if(latched_dispcnt_and_current_dispcnt & (256U << bg_id)) {
|
||||
if(mmio.bgcnt[bg_id].mosaic_enable) {
|
||||
if(mosaic.bg._counter_y == 0) {
|
||||
bgx[id]._current += mosaic.bg.size_y * bgpb[id];
|
||||
bgy[id]._current += mosaic.bg.size_y * bgpd[id];
|
||||
}
|
||||
} else {
|
||||
bgx[id]._current += bgpb[id];
|
||||
bgy[id]._current += bgpd[id];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr(mode == 2) {
|
||||
AdvanceBGXY(1);
|
||||
// AdvanceBGXY(1);
|
||||
int id = 1;
|
||||
auto bg_id = 2 + id;
|
||||
|
||||
/* Do not update internal X/Y unless the latched BG enable bit is set.
|
||||
* This behavior was confirmed on real hardware.
|
||||
*/
|
||||
if(latched_dispcnt_and_current_dispcnt & (256U << bg_id)) {
|
||||
if(mmio.bgcnt[bg_id].mosaic_enable) {
|
||||
if(mosaic.bg._counter_y == 0) {
|
||||
bgx[id]._current += mosaic.bg.size_y * bgpb[id];
|
||||
bgy[id]._current += mosaic.bg.size_y * bgpd[id];
|
||||
}
|
||||
} else {
|
||||
bgx[id]._current += bgpb[id];
|
||||
bgy[id]._current += bgpd[id];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -11,7 +11,59 @@
|
||||
|
||||
namespace nba::core {
|
||||
|
||||
static u32 RGB555(u16 rgb555) {
|
||||
// 原作者代码非常牛的,这里这仨巨头放这里inline掉可以快不少的,毕竟wasm可是要抠效率的
|
||||
ALWAYS_INLINE static u16 Blend(u16 color_a, u16 color_b, int eva, int evb) {
|
||||
const int r_a = (color_a >> 0) & 31;
|
||||
const int g_a = ((color_a >> 4) & 62) | (color_a >> 15);
|
||||
const int b_a = (color_a >> 10) & 31;
|
||||
|
||||
const int r_b = (color_b >> 0) & 31;
|
||||
const int g_b = ((color_b >> 4) & 62) | (color_b >> 15);
|
||||
const int b_b = (color_b >> 10) & 31;
|
||||
|
||||
eva = std::min<int>(16, eva);
|
||||
evb = std::min<int>(16, evb);
|
||||
|
||||
const int r = std::min<u8>((r_a * eva + r_b * evb + 8) >> 4, 31);
|
||||
const int g = std::min<u8>((g_a * eva + g_b * evb + 8) >> 4, 63) >> 1;
|
||||
const int b = std::min<u8>((b_a * eva + b_b * evb + 8) >> 4, 31);
|
||||
|
||||
return (u16)((b << 10) | (g << 5) | r);
|
||||
}
|
||||
|
||||
// Moves every channel of `color` towards its maximum by evy/16.
// `evy` is capped at 16. Green is processed with six bits (bit 15 of the color
// supplies its lowest bit) and reduced back to five bits before packing.
ALWAYS_INLINE static u16 Brighten(u16 color, int evy) {
  const int factor = std::min<int>(16, evy);

  // Adds the rounded share of the remaining headroom (ceiling - channel).
  const auto lift = [factor](int channel, int ceiling) -> int {
    return channel + (((ceiling - channel) * factor + 8) >> 4);
  };

  const int red = lift(color & 31, 31);
  const int green = lift(((color >> 4) & 62) | (color >> 15), 63) >> 1;
  const int blue = lift((color >> 10) & 31, 31);

  return static_cast<u16>((blue << 10) | (green << 5) | red);
}
|
||||
|
||||
// Moves every channel of `color` towards zero by evy/16.
// `evy` is capped at 16. Green is processed with six bits (bit 15 of the color
// supplies its lowest bit) and reduced back to five bits before packing.
ALWAYS_INLINE static u16 Darken(u16 color, int evy) {
  const int factor = std::min<int>(16, evy);

  // Subtracts the rounded share (bias 7) of the channel's current value.
  const auto dim = [factor](int channel) -> int {
    return channel - ((channel * factor + 7) >> 4);
  };

  const int red = dim(color & 31);
  const int green = dim(((color >> 4) & 62) | (color >> 15)) >> 1;
  const int blue = dim((color >> 10) & 31);

  return static_cast<u16>((blue << 10) | (green << 5) | red);
}
|
||||
|
||||
ALWAYS_INLINE static u32 RGB555(u16 rgb555) {
|
||||
const uint r = (rgb555 >> 0) & 31U;
|
||||
const uint g = (rgb555 >> 5) & 31U;
|
||||
const uint b = (rgb555 >> 10) & 31U;
|
||||
@ -265,55 +317,4 @@ void PPU::DrawMergeImpl(int cycles) {
|
||||
}
|
||||
}
|
||||
|
||||
// Blends two packed colors: every channel becomes
// (a * eva + b * evb + 8) >> 4, clamped to the channel maximum, with both
// weights capped at 16. Green uses six bits internally (bit 15 of the color is
// its lowest bit) and is shifted back to five bits for the packed result.
auto PPU::Blend(u16 color_a, u16 color_b, int eva, int evb) -> u16 {
  const int weight_a = std::min<int>(16, eva);
  const int weight_b = std::min<int>(16, evb);

  const int red_sum = (color_a & 31) * weight_a + (color_b & 31) * weight_b;
  const int green_sum = (((color_a >> 4) & 62) | (color_a >> 15)) * weight_a
                      + (((color_b >> 4) & 62) | (color_b >> 15)) * weight_b;
  const int blue_sum = ((color_a >> 10) & 31) * weight_a + ((color_b >> 10) & 31) * weight_b;

  const int red = std::min<u8>((red_sum + 8) >> 4, 31);
  const int green = std::min<u8>((green_sum + 8) >> 4, 63) >> 1;
  const int blue = std::min<u8>((blue_sum + 8) >> 4, 31);

  return static_cast<u16>((blue << 10) | (green << 5) | red);
}
|
||||
|
||||
// Raises each channel of `color` towards its maximum by evy/16 (evy capped at
// 16). Green uses six bits internally (bit 15 of the color is its lowest bit)
// and is shifted back to five bits for the packed result.
auto PPU::Brighten(u16 color, int evy) -> u16 {
  const int factor = std::min<int>(16, evy);

  const int red_in = color & 31;
  const int green_in = ((color >> 4) & 62) | (color >> 15);
  const int blue_in = (color >> 10) & 31;

  const int red = red_in + (((31 - red_in) * factor + 8) >> 4);
  const int green = (green_in + (((63 - green_in) * factor + 8) >> 4)) >> 1;
  const int blue = blue_in + (((31 - blue_in) * factor + 8) >> 4);

  return static_cast<u16>((blue << 10) | (green << 5) | red);
}
|
||||
|
||||
// Lowers each channel of `color` towards zero by evy/16 (evy capped at 16).
// Green uses six bits internally (bit 15 of the color is its lowest bit) and
// is shifted back to five bits for the packed result.
auto PPU::Darken(u16 color, int evy) -> u16 {
  const int factor = std::min<int>(16, evy);

  const int red_in = color & 31;
  const int green_in = ((color >> 4) & 62) | (color >> 15);
  const int blue_in = (color >> 10) & 31;

  const int red = red_in - ((red_in * factor + 7) >> 4);
  const int green = (green_in - ((green_in * factor + 7) >> 4)) >> 1;
  const int blue = blue_in - ((blue_in * factor + 7) >> 4);

  return static_cast<u16>((blue << 10) | (green << 5) | red);
}
|
||||
|
||||
} // namespace nba::core
|
||||
|
@ -396,9 +396,9 @@ private:
|
||||
void DrawMerge();
|
||||
void DrawMergeImpl(int cycles);
|
||||
|
||||
static auto Blend(u16 color_a, u16 color_b, int eva, int evb) -> u16;
|
||||
static auto Brighten(u16 color, int evy) -> u16;
|
||||
static auto Darken(u16 color, int evy) -> u16;
|
||||
//static auto Blend(u16 color_a, u16 color_b, int eva, int evb) -> u16;
|
||||
//static auto Brighten(u16 color, int evy) -> u16;
|
||||
//static auto Darken(u16 color, int evy) -> u16;
|
||||
|
||||
bool ALWAYS_INLINE ForcedBlank() const {
|
||||
return (mmio.dispcnt_latch[0] | mmio.dispcnt.hword) & 0x80U;
|
||||
|
Loading…
x
Reference in New Issue
Block a user