根据profile得到的数据，对

DrawBackgroundImpl DrawMergeImpl Bus::ReadHalf 进行了关于代码手动内联和分支预测的优化，似乎有点点效果（总之即使没恒定60fps，但是也是4个9的60fps了吧）
2024-05-09 14:15:47 +08:00 · 2024-05-09 14:15:47 +08:00 · 72e3027fd0
commit 72e3027fd0
parent 20879ab70a
10 changed files with 241 additions and 145 deletions
--- a/src/nba/include/nba/common/dsp/ring_buffer.hpp
+++ b/src/nba/include/nba/common/dsp/ring_buffer.hpp
@ -33,11 +33,12 @@ struct RingBuffer : Stream<T> {
    }
  }

-  auto Peek(int offset) -> T const {
+  // These three hot accessors (Peek/Read/Write) should be inlined as well!
+  inline auto Peek(int offset) -> T const {
    return data[(rd_ptr + offset) % length]; // slot `offset` past the read pointer, wrapped modulo length; rd_ptr is not advanced
  }

-  auto Read() -> T {
+  inline auto Read() -> T {
    T value = data[rd_ptr];
    if(count > 0) {
      rd_ptr = (rd_ptr + 1) % length;
@ -46,7 +47,7 @@ struct RingBuffer : Stream<T> {
    return value;
  }

-  void Write(T const& value) {
+  inline void Write(T const& value) {
    if(blocking && count == length) {
      return;
    }
--- a/src/nba/include/nba/common/punning.hpp
+++ b/src/nba/include/nba/common/punning.hpp
@ -13,15 +13,33 @@
 namespace nba {

 template<typename T>
-auto read(void const* data, uint offset) -> T {
+T read(void const* data, uint offset) { // generic read: `offset` is a byte offset into `data`
  T value;
-  memcpy(&value, (u8*)data + offset, sizeof(T));
+  std::memcpy(&value, (u8*)data + offset, sizeof(T)); // byte-wise copy of sizeof(T) bytes, so no alignment requirement on data + offset
  return value;
 }

+// Fixed-width fast paths for the generic read<T>() above. `offset` is a byte
+// offset here as well, so each specialization returns exactly what the generic
+// template would, including for offsets that are not a multiple of sizeof(T).
+// The multi-byte variants copy through a byte pointer instead of indexing a
+// u16*/u32* view of the buffer: that keeps unaligned offsets exact and avoids
+// reading the storage through a mismatched pointer type. A constant-size
+// memcpy is typically lowered to a single load by optimizing compilers.
+template<>
+inline u8 read(void const* data, uint offset) {
+  // A single byte needs no copy; plain indexing is already exact.
+  return static_cast<u8 const*>(data)[offset];
+}
+
+template<>
+inline u16 read(void const* data, uint offset) {
+  u16 value;
+  std::memcpy(&value, static_cast<u8 const*>(data) + offset, sizeof(value));
+  return value;
+}
+
+template<>
+inline u32 read(void const* data, uint offset) {
+  u32 value;
+  std::memcpy(&value, static_cast<u8 const*>(data) + offset, sizeof(value));
+  return value;
+}
+
 template<typename T>
-void write(void* data, uint offset, T value) {
-  memcpy((u8*)data + offset, &value, sizeof(T));
+inline void write(void* data, uint offset, T value) { // generic write: `offset` is a byte offset into `data`
+  std::memcpy((u8*)data + offset, &value, sizeof(T)); // byte-wise copy of sizeof(T) bytes from `value` into the buffer
 }

 } // namespace nba
--- a/src/nba/include/nba/scheduler.hpp
+++ b/src/nba/include/nba/scheduler.hpp
@ -135,6 +135,7 @@ struct Scheduler {
  }

  void AddCycles(int cycles) {
+    if( !cycles ) return;
    auto timestamp_next = timestamp_now + cycles;
    Step(timestamp_next);
    timestamp_now = timestamp_next;
--- a/src/nba/src/arm/handlers/memory.inl
+++ b/src/nba/src/arm/handlers/memory.inl
@ -5,15 +5,15 @@
 * Refer to the included LICENSE file.
 */

-u32 ReadByte(u32 address, int access) {
+ALWAYS_INLINE u32 ReadByte(u32 address, int access) {
  return bus.ReadByte(address, access); // thin forwarder to the bus; the result is returned as u32
 }

-u32 ReadHalf(u32 address, int access) {
+ALWAYS_INLINE u32 ReadHalf(u32 address, int access) {
  return bus.ReadHalf(address, access); // thin forwarder to the bus; the result is returned as u32
 }

-u32 ReadWord(u32 address, int access) {
+ALWAYS_INLINE u32 ReadWord(u32 address, int access) {
  return bus.ReadWord(address, access); // thin forwarder to the bus
 }

--- a/src/nba/src/bus/bus.cpp
+++ b/src/nba/src/bus/bus.cpp
@ -96,90 +96,128 @@ auto Bus::Read(u32 address, int access) -> T {

  parallel_internal_cpu_cycle_limit = 0;

-  switch(page) {
-    // BIOS
-    case 0x00: {
-      Step(1);
-      return ReadBIOS(Align<T>(address));
-    }
-    // EWRAM (external work RAM)
-    case 0x02: {
-      Step(is_u32 ? 6 : 3);
-      return read<T>(memory.wram.data(), Align<T>(address) & 0x3FFFF);
-    }
-    // IWRAM (internal work RAM)
-    case 0x03: {
-      Step(1);
-      return read<T>(memory.iram.data(), Align<T>(address) & 0x7FFF);
-    }
-    // MMIO
-    case 0x04: {
-      Step(1);
-      address = Align<T>(address);
-      if constexpr(std::is_same_v<T,  u8>) return hw.ReadByte(address);
-      if constexpr(std::is_same_v<T, u16>) return hw.ReadHalf(address);
-      if constexpr(std::is_same_v<T, u32>) return hw.ReadWord(address);
-      return 0;
-    }
-    // PRAM (palette RAM)
-    case 0x05: {
-      return ReadPRAM<T>(Align<T>(address));
-    }
-    // VRAM (video RAM)
-    case 0x06: {
-      return ReadVRAM<T>(Align<T>(address));
-    }
-    // OAM (object attribute map)
-    case 0x07: {
-      return ReadOAM<T>(Align<T>(address));
-    }
+  // This switch-case is hard to branch-predict, so the most frequently accessed pages are tested first with if(likely(...)).
+  if( likely( page == 0x08 ) ){
    // ROM (WS0, WS1, WS2)
-    case 0x08 ... 0x0D: {
-      address = Align<T>(address);
+    address = Align<T>(address);

-      auto sequential = access & Sequential;
-      bool code = access & Code;
+    auto sequential = access & Sequential;
+    bool code = access & Code;

-      if((address & 0x1'FFFF) == 0 || ((last_access & Dma) && !(access & Dma))) {
-        sequential = 0;
-      }
-
-      if constexpr(std::is_same_v<T,  u8>) {
-        auto shift = ((address & 1) << 3);
-        Prefetch(address, code, wait16[sequential][page]);
-        return memory.rom.ReadROM16(address) >> shift;
-      }
-
-      if constexpr(std::is_same_v<T, u16>) {
-        Prefetch(address, code, wait16[sequential][page]);
-        return memory.rom.ReadROM16(address);
-      }
-
-      if constexpr(std::is_same_v<T, u32>) {
-        Prefetch(address, code, wait32[sequential][page]);
-        return memory.rom.ReadROM32(address);  
-      }
-
-      return 0;
+    if((address & 0x1'FFFF) == 0 || ((last_access & Dma) && !(access & Dma))) {
+      sequential = 0;
    }
-    // SRAM or FLASH backup
-    case 0x0E ... 0x0F: {
-      StopPrefetch();
-      Step(wait16[0][0xE]);

-      u32 value = memory.rom.ReadSRAM(address);
-
-      if constexpr(std::is_same_v<T, u16>) value *= 0x0101;
-      if constexpr(std::is_same_v<T, u32>) value *= 0x01010101;
-
-      return T(value);
+    if constexpr(std::is_same_v<T,  u8>) {
+      auto shift = ((address & 1) << 3);
+      Prefetch(address, code, wait16[sequential][page]);
+      return memory.rom.ReadROM16(address) >> shift;
    }
-    // Unmapped memory
-    default: {
-      Step(1);
-      return ReadOpenBus(Align<T>(address));
+
+    if constexpr(std::is_same_v<T, u16>) {
+      Prefetch(address, code, wait16[sequential][page]);
+      return memory.rom.ReadROM16(address);
    }
-  }  
+
+    if constexpr(std::is_same_v<T, u32>) {
+      Prefetch(address, code, wait32[sequential][page]);
+      return memory.rom.ReadROM32(address);  
+    }
+
+    return 0;
+  }
+  else if( likely( page == 0x03 ) ){
+    // IWRAM (internal work RAM)
+    Step(1);
+    return read<T>(memory.iram.data(), Align<T>(address) & 0x7FFF);
+  }
+  else {
+    switch(page) {
+      // BIOS
+      case 0x00: {
+        Step(1);
+        return ReadBIOS(Align<T>(address));
+      }
+      // EWRAM (external work RAM)
+      case 0x02: {
+        Step(is_u32 ? 6 : 3);
+        return read<T>(memory.wram.data(), Align<T>(address) & 0x3FFFF);
+      }
+      /* Already handled by the fast path above // IWRAM (internal work RAM)
+      case 0x03: {
+        Step(1);
+        return read<T>(memory.iram.data(), Align<T>(address) & 0x7FFF);
+      }*/
+      // MMIO
+      case 0x04: {
+        Step(1);
+        address = Align<T>(address);
+        if constexpr(std::is_same_v<T,  u8>) return hw.ReadByte(address);
+        if constexpr(std::is_same_v<T, u16>) return hw.ReadHalf(address);
+        if constexpr(std::is_same_v<T, u32>) return hw.ReadWord(address);
+        return 0;
+      }
+      // PRAM (palette RAM)
+      case 0x05: {
+        return ReadPRAM<T>(Align<T>(address));
+      }
+      // VRAM (video RAM)
+      case 0x06: {
+        return ReadVRAM<T>(Align<T>(address));
+      }
+      // OAM (object attribute map)
+      case 0x07: {
+        return ReadOAM<T>(Align<T>(address));
+      }
+      // ROM (WS0, WS1, WS2)
+      // Page 0x08 is already handled by the fast path above.
+      case 0x09 ... 0x0D: {
+        address = Align<T>(address);
+
+        auto sequential = access & Sequential;
+        bool code = access & Code;
+
+        if((address & 0x1'FFFF) == 0 || ((last_access & Dma) && !(access & Dma))) {
+          sequential = 0;
+        }
+
+        if constexpr(std::is_same_v<T,  u8>) {
+          auto shift = ((address & 1) << 3);
+          Prefetch(address, code, wait16[sequential][page]);
+          return memory.rom.ReadROM16(address) >> shift;
+        }
+
+        if constexpr(std::is_same_v<T, u16>) {
+          Prefetch(address, code, wait16[sequential][page]);
+          return memory.rom.ReadROM16(address);
+        }
+
+        if constexpr(std::is_same_v<T, u32>) {
+          Prefetch(address, code, wait32[sequential][page]);
+          return memory.rom.ReadROM32(address);  
+        }
+
+        return 0;
+      }
+      // SRAM or FLASH backup
+      case 0x0E ... 0x0F: {
+        StopPrefetch();
+        Step(wait16[0][0xE]);
+
+        u32 value = memory.rom.ReadSRAM(address);
+
+        if constexpr(std::is_same_v<T, u16>) value *= 0x0101;
+        if constexpr(std::is_same_v<T, u32>) value *= 0x01010101;
+
+        return T(value);
+      }
+      // Unmapped memory
+      default: {
+        Step(1);
+        return ReadOpenBus(Align<T>(address));
+      }
+    } 
+  }

  return 0;
 }
--- a/src/nba/src/bus/bus.hpp
+++ b/src/nba/src/bus/bus.hpp
@ -130,7 +130,7 @@ struct Bus {
  void Write(u32 address, int access, T value);

  template<typename T>
-  auto Align(u32 address) -> u32 {
+  auto ALWAYS_INLINE Align(u32 address) -> u32 {
    return address & ~(sizeof(T) - 1); // clears the low bits: rounds address down to a multiple of sizeof(T) (for power-of-two sizes)
  }

--- a/src/nba/src/bus/timing.cpp
+++ b/src/nba/src/bus/timing.cpp
@ -117,6 +117,8 @@ void Bus::StopPrefetch() {
 }

 void Bus::Step(int cycles) {
+  if( !cycles ) return;
+  
  scheduler.AddCycles(cycles);

  if(prefetch.active) {
--- a/src/nba/src/hw/ppu/background.cpp
+++ b/src/nba/src/hw/ppu/background.cpp
@ -136,6 +136,7 @@ template<int mode> void PPU::DrawBackgroundImpl(int cycles) {
      auto& bgpb = mmio.bgpb;
      auto& bgpd = mmio.bgpd;

+      // A lambda is hard to get inlined here; its body has no loops, so it is manually inlined at the call sites below.
      const auto AdvanceBGXY = [&](int id) {
        auto bg_id = 2 + id;

@ -156,11 +157,45 @@ template<int mode> void PPU::DrawBackgroundImpl(int cycles) {
      };

      if constexpr(mode >= 1 && mode <= 5) {
-        AdvanceBGXY(0);
+        // AdvanceBGXY(0);
+        int id = 0;
+        auto bg_id = 2 + id;
+
+        /* Do not update internal X/Y unless the latched BG enable bit is set.
+         * This behavior was confirmed on real hardware.
+         */
+        if(latched_dispcnt_and_current_dispcnt & (256U << bg_id)) {
+          if(mmio.bgcnt[bg_id].mosaic_enable) {
+            if(mosaic.bg._counter_y == 0) {
+              bgx[id]._current += mosaic.bg.size_y * bgpb[id];
+              bgy[id]._current += mosaic.bg.size_y * bgpd[id];
+            }
+          } else {
+            bgx[id]._current += bgpb[id];
+            bgy[id]._current += bgpd[id];
+          }
+        }
      }

      if constexpr(mode == 2) {
-        AdvanceBGXY(1);
+        // AdvanceBGXY(1);
+        int id = 1;
+        auto bg_id = 2 + id;
+
+        /* Do not update internal X/Y unless the latched BG enable bit is set.
+         * This behavior was confirmed on real hardware.
+         */
+        if(latched_dispcnt_and_current_dispcnt & (256U << bg_id)) {
+          if(mmio.bgcnt[bg_id].mosaic_enable) {
+            if(mosaic.bg._counter_y == 0) {
+              bgx[id]._current += mosaic.bg.size_y * bgpb[id];
+              bgy[id]._current += mosaic.bg.size_y * bgpd[id];
+            }
+          } else {
+            bgx[id]._current += bgpb[id];
+            bgy[id]._current += bgpd[id];
+          }
+        }
      }
    }

--- a/src/nba/src/hw/ppu/merge.cpp
+++ b/src/nba/src/hw/ppu/merge.cpp
@ -11,7 +11,59 @@

 namespace nba::core {

-static u32 RGB555(u16 rgb555) {
+// The original author's code is excellent; defining these three hot helpers here so they get inlined is noticeably faster, since every bit of efficiency counts on wasm.
+ALWAYS_INLINE static u16 Blend(u16 color_a, u16 color_b, int eva, int evb) {
+  const int r_a =  (color_a >>  0) & 31; // red: bits 0-4
+  const int g_a = ((color_a >>  4) & 62) | (color_a >> 15); // green widened to 6 bits: bits 5-9 plus bit 15 as the lowest bit
+  const int b_a =  (color_a >> 10) & 31; // blue: bits 10-14
+
+  const int r_b =  (color_b >>  0) & 31;
+  const int g_b = ((color_b >>  4) & 62) | (color_b >> 15);
+  const int b_b =  (color_b >> 10) & 31;
+
+  eva = std::min<int>(16, eva); // both weights are capped at 16; the weighted sum is divided by 16 below
+  evb = std::min<int>(16, evb);
+
+  const int r = std::min<u8>((r_a * eva + r_b * evb + 8) >> 4, 31); // weighted sum, +8 before >>4, clamped to 31
+  const int g = std::min<u8>((g_a * eva + g_b * evb + 8) >> 4, 63) >> 1; // clamped to 63, then narrowed back to 5 bits
+  const int b = std::min<u8>((b_a * eva + b_b * evb + 8) >> 4, 31);
+
+  return (u16)((b << 10) | (g << 5) | r); // repack as b<<10 | g<<5 | r
+}
+
+ALWAYS_INLINE static u16 Brighten(u16 color, int evy) {
+  evy = std::min<int>(16, evy); // the factor is capped at 16; products are divided by 16 below
+
+  int r =  (color >>  0) & 31; // red: bits 0-4
+  int g = ((color >>  4) & 62) | (color >> 15); // green widened to 6 bits: bits 5-9 plus bit 15 as the lowest bit
+  int b =  (color >> 10) & 31; // blue: bits 10-14
+
+  r += ((31 - r) * evy + 8) >> 4; // move each channel toward its maximum (31, or 63 for green) by evy/16 of the remaining distance
+  g += ((63 - g) * evy + 8) >> 4;
+  b += ((31 - b) * evy + 8) >> 4;
+
+  g >>= 1; // narrow green back to 5 bits
+  
+  return (u16)((b << 10) | (g << 5) | r); // repack as b<<10 | g<<5 | r
+}
+
+ALWAYS_INLINE static u16 Darken(u16 color, int evy) {
+  evy = std::min<int>(16, evy); // the factor is capped at 16; products are divided by 16 below
+
+  int r =  (color >>  0) & 31; // red: bits 0-4
+  int g = ((color >>  4) & 62) | (color >> 15); // green widened to 6 bits: bits 5-9 plus bit 15 as the lowest bit
+  int b =  (color >> 10) & 31; // blue: bits 10-14
+
+  r -= (r * evy + 7) >> 4; // subtract evy/16 of each channel (+7 before the >>4)
+  g -= (g * evy + 7) >> 4;
+  b -= (b * evy + 7) >> 4;
+
+  g >>= 1; // narrow green back to 5 bits
+
+  return (u16)((b << 10) | (g << 5) | r); // repack as b<<10 | g<<5 | r
+}
+
+ALWAYS_INLINE static u32 RGB555(u16 rgb555) {
  const uint r = (rgb555 >>  0) & 31U;
  const uint g = (rgb555 >>  5) & 31U;
  const uint b = (rgb555 >> 10) & 31U;
@ -265,55 +317,4 @@ void PPU::DrawMergeImpl(int cycles) {
  }
 }

-auto PPU::Blend(u16 color_a, u16 color_b, int eva, int evb) -> u16 {
-  const int r_a =  (color_a >>  0) & 31;
-  const int g_a = ((color_a >>  4) & 62) | (color_a >> 15);
-  const int b_a =  (color_a >> 10) & 31;
-
-  const int r_b =  (color_b >>  0) & 31;
-  const int g_b = ((color_b >>  4) & 62) | (color_b >> 15);
-  const int b_b =  (color_b >> 10) & 31;
-
-  eva = std::min<int>(16, eva);
-  evb = std::min<int>(16, evb);
-
-  const int r = std::min<u8>((r_a * eva + r_b * evb + 8) >> 4, 31);
-  const int g = std::min<u8>((g_a * eva + g_b * evb + 8) >> 4, 63) >> 1;
-  const int b = std::min<u8>((b_a * eva + b_b * evb + 8) >> 4, 31);
-
-  return (u16)((b << 10) | (g << 5) | r);
-}
-
-auto PPU::Brighten(u16 color, int evy) -> u16 {
-  evy = std::min<int>(16, evy);
-
-  int r =  (color >>  0) & 31;
-  int g = ((color >>  4) & 62) | (color >> 15);
-  int b =  (color >> 10) & 31;
-
-  r += ((31 - r) * evy + 8) >> 4;
-  g += ((63 - g) * evy + 8) >> 4;
-  b += ((31 - b) * evy + 8) >> 4;
-
-  g >>= 1;
-  
-  return (u16)((b << 10) | (g << 5) | r);
-}
-
-auto PPU::Darken(u16 color, int evy) -> u16 {
-  evy = std::min<int>(16, evy);
-
-  int r =  (color >>  0) & 31;
-  int g = ((color >>  4) & 62) | (color >> 15);
-  int b =  (color >> 10) & 31;
-
-  r -= (r * evy + 7) >> 4;
-  g -= (g * evy + 7) >> 4;
-  b -= (b * evy + 7) >> 4;
-
-  g >>= 1;
-
-  return (u16)((b << 10) | (g << 5) | r);
-}
-
 } // namespace nba::core
--- a/src/nba/src/hw/ppu/ppu.hpp
+++ b/src/nba/src/hw/ppu/ppu.hpp
@ -396,9 +396,9 @@ private:
  void DrawMerge();
  void DrawMergeImpl(int cycles);
  
-  static auto Blend(u16 color_a, u16 color_b, int eva, int evb) -> u16;
-  static auto Brighten(u16 color, int evy) -> u16;
-  static auto Darken(u16 color, int evy) -> u16;
+  //static auto Blend(u16 color_a, u16 color_b, int eva, int evb) -> u16;
+  //static auto Brighten(u16 color, int evy) -> u16;
+  //static auto Darken(u16 color, int evy) -> u16;

  bool ALWAYS_INLINE ForcedBlank() const {
    return (mmio.dispcnt_latch[0] | mmio.dispcnt.hword) & 0x80U;