diff --git a/src/nba/src/hw/ppu/merge.cpp b/src/nba/src/hw/ppu/merge.cpp index 2ade7df..8b2cb69 100644 --- a/src/nba/src/hw/ppu/merge.cpp +++ b/src/nba/src/hw/ppu/merge.cpp @@ -4,7 +4,7 @@ * Licensed under GPLv3 or any later version. * Refer to the included LICENSE file. */ - +#include <wasm_simd128.h> #include <algorithm> #include "ppu.hpp" @@ -24,10 +24,29 @@ ALWAYS_INLINE static u16 Blend(u16 color_a, u16 color_b, int eva, int evb) { eva = std::min(16, eva); evb = std::min(16, evb); - const int r = std::min((r_a * eva + r_b * evb + 8) >> 4, 31); - const int g = std::min((g_a * eva + g_b * evb + 8) >> 4, 63) >> 1; - const int b = std::min((b_a * eva + b_b * evb + 8) >> 4, 31); + auto colora = wasm_i32x4_make(r_a, g_a, b_a, 0); + auto colorb = wasm_i32x4_make(r_b, g_b, b_b, 0); + auto evavec = wasm_i32x4_make(eva, eva, eva, 0); + auto evbvec = wasm_i32x4_make(evb, evb, evb, 0); + auto number = wasm_i32x4_make(8, 8, 8, 0); + auto factor = wasm_i32x4_mul( colora, evavec ); + auto result = wasm_i32x4_mul( colorb, evbvec ); + result = wasm_i32x4_add( result, factor ); + result = wasm_i32x4_add( result, number ); + result = wasm_i32x4_shr( result, 4 ); + number = wasm_i32x4_make( 31, 63, 31, 0 ); + result = wasm_i32x4_min( result, number ); + int r = wasm_i32x4_extract_lane( result, 0 ); + int g = wasm_i32x4_extract_lane( result, 1 ); + int b = wasm_i32x4_extract_lane( result, 2 ); + + g >>= 1; + + // const int r = std::min((r_a * eva + r_b * evb + 8) >> 4, 31); + // const int g = std::min((g_a * eva + g_b * evb + 8) >> 4, 63) >> 1; + // const int b = std::min((b_a * eva + b_b * evb + 8) >> 4, 31); + return (u16)((b << 10) | (g << 5) | r); } @@ -38,9 +57,24 @@ ALWAYS_INLINE static u16 Brighten(u16 color, int evy) { int g = ((color >> 4) & 62) | (color >> 15); int b = (color >> 10) & 31; - r += ((31 - r) * evy + 8) >> 4; - g += ((63 - g) * evy + 8) >> 4; - b += ((31 - b) * evy + 8) >> 4; + auto source = wasm_i32x4_make(r, g, b, 0); + auto maxume = wasm_i32x4_make(31, 63, 31, 0); + auto factor = 
wasm_i32x4_make(r, g, b, 0); + auto evyvec = wasm_i32x4_make(evy, evy, evy, 0); + auto number = wasm_i32x4_make(8, 8, 8, 0); + auto result = wasm_i32x4_sub( maxume, factor ); + result = wasm_i32x4_mul( result, evyvec ); + result = wasm_i32x4_add( result, number ); + result = wasm_i32x4_shr( result, 4 ); + result = wasm_i32x4_add( source, result ); + + r = wasm_i32x4_extract_lane( result, 0 ); + g = wasm_i32x4_extract_lane( result, 1 ); + b = wasm_i32x4_extract_lane( result, 2 ); + + // r += ((31 - r) * evy + 8) >> 4; + // g += ((63 - g) * evy + 8) >> 4; + // b += ((31 - b) * evy + 8) >> 4; g >>= 1; @@ -54,9 +88,22 @@ ALWAYS_INLINE static u16 Darken(u16 color, int evy) { int g = ((color >> 4) & 62) | (color >> 15); int b = (color >> 10) & 31; - r -= (r * evy + 7) >> 4; - g -= (g * evy + 7) >> 4; - b -= (b * evy + 7) >> 4; + auto source = wasm_i32x4_make(r, g, b, 0); + auto factor = wasm_i32x4_make(r, g, b, 0); + auto evyvec = wasm_i32x4_make(evy, evy, evy, 0); + auto number = wasm_i32x4_make(7, 7, 7, 0); + auto result = wasm_i32x4_mul( factor, evyvec ); + result = wasm_i32x4_add( result, number ); + result = wasm_i32x4_shr( result, 4 ); + result = wasm_i32x4_sub( source, result ); + + r = wasm_i32x4_extract_lane( result, 0 ); + g = wasm_i32x4_extract_lane( result, 1 ); + b = wasm_i32x4_extract_lane( result, 2 ); + + // r -= (r * evy + 7) >> 4; + // g -= (g * evy + 7) >> 4; + // b -= (b * evy + 7) >> 4; g >>= 1;