WinNative-Emu · maxjivi05 · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026
diff --git a/app/src/main/cpp/CMakeLists.txt b/app/src/main/cpp/CMakeLists.txt
@@ -6,8 +6,6 @@ include(FetchContent)
 
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -Wno-unused-function -Wimplicit-function-declaration")
 
-# Zstandard is used by winlator/native_content_io.cpp. Keep this dependency in
-# the parent build instead of relying on the Steam client subproject to create it.
 FetchContent_Declare(
     zstd
     GIT_REPOSITORY https://github.com/facebook/zstd.git
@@ -46,14 +44,6 @@ add_subdirectory(wn-libsteamclient)
 
 find_package(curl REQUIRED CONFIG)
 
-# ----------------------------------------------------------------------------
-# SPIR-V shader compilation
-# Each .glsl is compiled by glslc (shipped with the NDK) into a .spv binary,
-# then converted to a C uint32_t array via bin2c.cmake. Headers are emitted
-# under ${CMAKE_CURRENT_BINARY_DIR}/shaders/*.spv.h and included from vk_renderer.c.
-# ----------------------------------------------------------------------------
-
-# Locate glslc shipped with the NDK. ANDROID_NDK is provided by the Android Gradle plugin.
 if(NOT DEFINED ANDROID_NDK)
     message(FATAL_ERROR "ANDROID_NDK not defined; this project must be built via the Android Gradle plugin")
 endif()
@@ -101,6 +91,20 @@ set(SHADER_LIST
     "effect_colorblind:frag:effect_colorblind_frag"
     "effect_pixelate:frag:effect_pixelate_frag"
     "sgsr1:frag:sgsr1_frag"
+    "motion:comp:motion_comp"
+    "motion_fp32:comp:motion_fp32_comp"
+    "interpolate:frag:interpolate_frag"
+    "cnn_pyramid:comp:cnn_pyramid_comp"
+    "cnn_conv:comp:cnn_conv_comp"
+    "cnn_conv_2pass:comp:cnn_conv_2pass_comp"
+    "cnn_correlation:comp:cnn_correlation_comp"
+    "cnn_correlation_cost9:comp:cnn_correlation_cost9_comp"
+    "cnn_correlation_g09:comp:cnn_correlation_g09_comp"
+    "cnn_correlation_warpfollow:comp:cnn_correlation_warpfollow_comp"
+    "cnn_flowreg:comp:cnn_flowreg_comp"
+    "cnn_occlusion:comp:cnn_occlusion_comp"
+    "cnn_generate:comp:cnn_generate_comp"
+    "fg_synthshift:comp:fg_synthshift_comp"
 )
 
 set(SHADER_HEADERS "")
@@ -128,11 +132,34 @@ foreach(entry ${SHADER_LIST})
     list(APPEND SHADER_HEADERS "${hdr}")
 endforeach()
 
-add_custom_target(winlator_shaders DEPENDS ${SHADER_HEADERS})
+# Embedded weight blobs: every weights_v2/wnfg_<id>.weights.fp16 listed in
+# WEIGHTS_LIST is converted by bin2c_bytes.cmake into
+# ${SHADER_OUT_DIR}/wnfg_<id>_weights.h, which defines the byte array
+# wnfg_<id>_weights and its size wnfg_<id>_weights_size.
+set(WEIGHTS_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/winlator/vk/weights_v2")
+set(BIN2C_BYTES_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/winlator/vk/bin2c_bytes.cmake")
+set(WEIGHTS_LIST 05 06 07 14 20 21 22 24 25 26 27 28 29 36 37 42 45 51)
+foreach(id ${WEIGHTS_LIST})
+    set(winput "${WEIGHTS_SRC_DIR}/wnfg_${id}.weights.fp16")
+    set(whdr   "${SHADER_OUT_DIR}/wnfg_${id}_weights.h")
+    add_custom_command(
+        OUTPUT  "${whdr}"
+        # Each -D definition is quoted so a path containing ';' stays one argument.
+        COMMAND "${CMAKE_COMMAND}"
+            "-DINPUT_FILE=${winput}"
+            "-DOUTPUT_FILE=${whdr}"
+            "-DVAR_NAME=wnfg_${id}_weights"
+            -P "${BIN2C_BYTES_SCRIPT}"
+        # Re-run when either the blob or the converter script changes.
+        DEPENDS "${winput}" "${BIN2C_BYTES_SCRIPT}"
+        COMMENT "Embedding weights wnfg_${id}.weights.fp16 -> wnfg_${id}_weights.h"
+        VERBATIM
+    )
+    # The headers join SHADER_HEADERS, the dependency list of winlator_shaders.
+    list(APPEND SHADER_HEADERS "${whdr}")
+endforeach()
 
-# ----------------------------------------------------------------------------
-# Winlator native library (X-server, AHB, Vulkan compositor, helpers)
-# ----------------------------------------------------------------------------
+add_custom_target(winlator_shaders DEPENDS ${SHADER_HEADERS})
 
 add_library(winlator SHARED
         winlator/drawable.c

diff --git a/app/src/main/cpp/winlator/vk/bin2c_bytes.cmake b/app/src/main/cpp/winlator/vk/bin2c_bytes.cmake
@@ -0,0 +1,44 @@
+# bin2c_bytes.cmake - embed a binary file in a C header as a uint8_t array.
+#
+# Script-mode usage:
+#   cmake -DINPUT_FILE=<blob> -DOUTPUT_FILE=<header> -DVAR_NAME=<identifier>
+#         -P bin2c_bytes.cmake
+#
+# The generated header defines `static const uint8_t <VAR_NAME>[]` with the
+# bytes of INPUT_FILE and `static const size_t <VAR_NAME>_size` with their count.
+if(NOT INPUT_FILE OR NOT OUTPUT_FILE OR NOT VAR_NAME)
+    message(FATAL_ERROR "bin2c_bytes.cmake requires INPUT_FILE, OUTPUT_FILE, VAR_NAME")
+endif()
+
+# file(READ ... HEX) returns two lowercase hex digits per input byte.
+file(READ "${INPUT_FILE}" hex_data HEX)
+string(LENGTH "${hex_data}" hex_len)
+
+# The array body is built with whole-string passes, which keeps the cost linear
+# in the blob size; a per-byte while() loop would re-expand the full hex string
+# on every iteration.
+set(bytes "")
+if(hex_len GREATER 0)
+    # 1) Split the hex stream into rows of 32 digits (16 bytes per row).
+    string(REGEX REPLACE "(................................)" "\\1\n" bytes "${hex_data}")
+    # 2) Turn every digit pair into a "0xNN, " array element.
+    string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1, " bytes "${bytes}")
+    # 3) Indent every row by four spaces.
+    string(REPLACE "\n0x" "\n    0x" bytes "${bytes}")
+    set(bytes "    ${bytes}")
+    # Full rows already end in a newline; terminate a trailing partial row.
+    math(EXPR partial_row "${hex_len} % 32")
+    if(NOT partial_row EQUAL 0)
+        string(APPEND bytes "\n")
+    endif()
+endif()
+
+file(WRITE "${OUTPUT_FILE}"
+"#pragma once
+#include <stdint.h>
+#include <stddef.h>
+
+static const uint8_t ${VAR_NAME}[] = {
+${bytes}};
+static const size_t ${VAR_NAME}_size = sizeof(${VAR_NAME});
+")
diff --git a/app/src/main/cpp/winlator/vk/shaders/cnn_conv.comp b/app/src/main/cpp/winlator/vk/shaders/cnn_conv.comp
@@ -0,0 +1,124 @@
+#version 450
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+
+// 3x3 convolution with a per-channel affine epilogue. Each invocation
+// computes one texel for up to MAX_T four-channel output layers of uDst.
+// Coefficients are fp16 values read from W[] starting at pc.wBase: first
+// 9 * cinT * coutT blocks of 16 values (one 4x4 matrix per tap, input layer
+// and output layer), then three tables of coutT vec4s (bias, scale, offset).
+
+layout(local_size_x = 16, local_size_y = 16, local_size_z = 1) in;
+
+layout(set = 0, binding = 0) uniform sampler2DArray uSrc;   // input layers
+layout(set = 0, binding = 1, rgba8) uniform writeonly image2DArray uDst;  // output layers
+layout(set = 0, binding = 3) uniform sampler2D       uLuma;  // input of the stem path
+
+layout(set = 0, binding = 8, std430) readonly buffer Weights { float16_t W[]; };
+
+layout(push_constant) uniform PC {
+    ivec2 size;     // output extent; invocations at or beyond it exit early
+    float t;        // not read by this shader
+    float mvScale;  // not read by this shader
+    uint  wBase;    // index in W[] of this layer's first coefficient
+    int   cinT;     // number of four-channel input layers read from uSrc
+    int   coutT;    // number of four-channel output layers written to uDst
+    int   flags;    // bit 0 stem, bit 1 stride2, bit 2 resid, bit 3 ReLU, bit 4 stem2x
+} pc;
+
+// Offsets of the 3x3 window, listed row by row.
+const ivec2 TAP[9] = ivec2[](
+    ivec2(-1,-1), ivec2(0,-1), ivec2(1,-1),
+    ivec2(-1, 0), ivec2(0, 0), ivec2(1, 0),
+    ivec2(-1, 1), ivec2(0, 1), ivec2(1, 1));
+
+// Capacity of the per-invocation accumulator array.
+const int MAX_T = 4;
+
+// Loads the 4x4 coefficient block for tap k, input layer ci, output layer co.
+// With b the block's first index, the result satisfies
+// (M * x)[row] = sum over col of W[b + 4 * row + col] * x[col].
+mat4 convMat(int k, int ci, int co) {
+    uint b = pc.wBase + uint((((k * pc.cinT + ci) * pc.coutT + co) * 4) * 4);
+    mat4 M;
+    for (int c = 0; c < 4; ++c) {
+        uint cb = b + uint(c);
+        M[c] = vec4(float(W[cb +  0u]),
+                    float(W[cb +  4u]),
+                    float(W[cb +  8u]),
+                    float(W[cb + 12u]));
+    }
+    return M;
+}
+
+void main() {
+    ivec2 p = ivec2(gl_GlobalInvocationID.xy);
+    if (any(greaterThanEqual(p, pc.size))) return;
+
+    bool stem    = (pc.flags & 1) != 0;
+    bool stride2 = (pc.flags & 2) != 0;
+    bool resid   = (pc.flags & 4) != 0;
+    bool doRelu  = (pc.flags & 8) != 0;
+    bool stem2x  = (pc.flags & 16) != 0;
+
+    // acc[] has MAX_T slots: cap the loop bound so a larger pc.coutT cannot
+    // index past the array. Weight strides below still use pc.coutT itself.
+    int nOut = min(pc.coutT, MAX_T);
+
+    // Window centre in source coordinates: doubled when stride2 or stem2x is set.
+    ivec2 sp = (stride2 || stem2x) ? (p * 2) : p;
+
+    // Largest source coordinate a tap may read; taps are clamped to [0, hi].
+    ivec2 hi = stem    ? (textureSize(uLuma, 0) - ivec2(1))
+             : stride2 ? (textureSize(uSrc, 0).xy - ivec2(1))
+             :           (pc.size - ivec2(1));
+
+    vec4 acc[MAX_T];
+    for (int co = 0; co < nOut; ++co) acc[co] = vec4(0.0);
+
+    // The affine tables follow the 9 * cinT * coutT coefficient blocks.
+    uint affBase = pc.wBase + uint(9 * pc.cinT * pc.coutT * 16);
+
+    for (int k = 0; k < 9; ++k) {
+        ivec2 q = clamp(sp + TAP[k], ivec2(0), hi);
+
+        if (stem) {
+            // Stem: the input vector is the remapped luma sample in x, zeros elsewhere.
+            float luma = texelFetch(uLuma, q, 0).r;
+            vec4 x = vec4((luma - 0.208008) * 1.496094 + 0.769531, 0.0, 0.0, 0.0);
+            for (int co = 0; co < nOut; ++co)
+                acc[co] += convMat(k, 0, co) * x;
+        } else {
+            // General path: accumulate every input layer of uSrc.
+            for (int ci = 0; ci < pc.cinT; ++ci) {
+                vec4 x = texelFetch(uSrc, ivec3(q, ci), 0);
+                for (int co = 0; co < nOut; ++co)
+                    acc[co] += convMat(k, ci, co) * x;
+            }
+        }
+    }
+
+    // Residual: layer 0 of uSrc at p is added to output layer 0 only.
+    if (resid) {
+        acc[0] += texelFetch(uSrc, ivec3(p, 0), 0);
+    }
+
+    for (int co = 0; co < nOut; ++co) {
+        vec4 bias   = vec4(float(W[affBase + uint((0 * pc.coutT + co) * 4 + 0)]),
+                           float(W[affBase + uint((0 * pc.coutT + co) * 4 + 1)]),
+                           float(W[affBase + uint((0 * pc.coutT + co) * 4 + 2)]),
+                           float(W[affBase + uint((0 * pc.coutT + co) * 4 + 3)]));
+        vec4 scale  = vec4(float(W[affBase + uint((1 * pc.coutT + co) * 4 + 0)]),
+                           float(W[affBase + uint((1 * pc.coutT + co) * 4 + 1)]),
+                           float(W[affBase + uint((1 * pc.coutT + co) * 4 + 2)]),
+                           float(W[affBase + uint((1 * pc.coutT + co) * 4 + 3)]));
+        vec4 offset = vec4(float(W[affBase + uint((2 * pc.coutT + co) * 4 + 0)]),
+                           float(W[affBase + uint((2 * pc.coutT + co) * 4 + 1)]),
+                           float(W[affBase + uint((2 * pc.coutT + co) * 4 + 2)]),
+                           float(W[affBase + uint((2 * pc.coutT + co) * 4 + 3)]));
+
+        vec4 v = (acc[co] - bias) * scale + offset;
+        // The store below clamps to [0, 1], which already floors the value at 0.
+        if (doRelu) v = max(v, vec4(0.0));
+        imageStore(uDst, ivec3(p, co), clamp(v, 0.0, 1.0));
+    }
+}
diff --git a/app/src/main/cpp/winlator/vk/shaders/cnn_conv_2pass.comp b/app/src/main/cpp/winlator/vk/shaders/cnn_conv_2pass.comp
@@ -0,0 +1,180 @@
+#version 450
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+
+// Two chained 3x3 convolutions in one dispatch.
+// Phase 1 fills an 18x18 shared tile (the 16x16 workgroup plus a one-texel
+// border) per intermediate layer: with flag bit 0 set the tile holds the
+// affine-mapped 3x3 convolution of uIn0/uIn1, otherwise the inputs are copied
+// into it. Phase 2 convolves the tile and writes the result to uOut0/uOut1.
+
+layout(local_size_x = 16, local_size_y = 16, local_size_z = 1) in;
+
+// Four-channel layer counts: input, intermediate (shared tile) and output.
+layout(constant_id = 0) const int  CIN_T  = 2;
+layout(constant_id = 1) const int  MID_T  = 2;
+layout(constant_id = 2) const int  COUT_T = 2;
+
+const int TILE = 18;   // 16 workgroup texels plus a one-texel border per side
+const int MAX_T = 2;   // capacity of sMid[] and of the acc[] arrays
+
+layout(set = 0, binding = 32) uniform sampler2D uIn0;
+layout(set = 0, binding = 33) uniform sampler2D uIn1;
+
+layout(set = 0, binding = 48, rgba8) uniform writeonly image2D uOut0;
+layout(set = 0, binding = 49, rgba8) uniform writeonly image2D uOut1;
+
+layout(set = 0, binding = 8, std430) readonly buffer Weights { float16_t W[]; };
+
+layout(push_constant) uniform PC {
+    ivec2 size;     // image extent: bounds the phase-1 reads and the phase-2 stores
+    float t;        // not read by this shader
+    float mvScale;  // not read by this shader
+    uint  wBase;    // index in W[] of the first coefficient used here
+    int   cinT;     // not read by this shader (CIN_T is used instead)
+    int   coutT;    // not read by this shader (COUT_T is used instead)
+    int   flags;    // bit 0: run the phase-1 convolution, bit 1: clamp its output to [0, 1]
+} pc;
+
+// Offsets of the 3x3 window with x as the outer (slow) index.
+const ivec2 TAP_XOUTER[9] = ivec2[](
+    ivec2(-1,-1), ivec2(-1, 0), ivec2(-1, 1),
+    ivec2( 0,-1), ivec2( 0, 0), ivec2( 0, 1),
+    ivec2( 1,-1), ivec2( 1, 0), ivec2( 1, 1));
+
+// Intermediate activations, indexed [layer][x][y] in tile coordinates.
+shared f16vec4 sMid[MAX_T][TILE][TILE];
+
+// Loads the 4x4 coefficient block for tap k, input layer ci, output layer co
+// of the convolution whose coefficients start at blockBase. With b the block's
+// first index, (M * x)[row] = sum over col of W[b + 4 * row + col] * x[col].
+mat4 convMatAt(uint blockBase, int k, int ci, int co, int cinT, int coutT) {
+    uint b = blockBase + uint((((k * cinT + ci) * coutT + co) * 4) * 4);
+    mat4 M;
+    for (int c = 0; c < 4; ++c) {
+        uint cb = b + uint(c);
+        M[c] = vec4(float(W[cb +  0u]), float(W[cb +  4u]),
+                    float(W[cb +  8u]), float(W[cb + 12u]));
+    }
+    return M;
+}
+
+// Reads the bias, scale and offset vec4s of output layer co from the three
+// consecutive coutT-entry tables that start at affBase.
+void affineAt(uint affBase, int co, int coutT,
+              out vec4 bias, out vec4 scale, out vec4 offset) {
+    bias   = vec4(float(W[affBase + uint((0*coutT + co)*4 + 0)]),
+                  float(W[affBase + uint((0*coutT + co)*4 + 1)]),
+                  float(W[affBase + uint((0*coutT + co)*4 + 2)]),
+                  float(W[affBase + uint((0*coutT + co)*4 + 3)]));
+    scale  = vec4(float(W[affBase + uint((1*coutT + co)*4 + 0)]),
+                  float(W[affBase + uint((1*coutT + co)*4 + 1)]),
+                  float(W[affBase + uint((1*coutT + co)*4 + 2)]),
+                  float(W[affBase + uint((1*coutT + co)*4 + 3)]));
+    offset = vec4(float(W[affBase + uint((2*coutT + co)*4 + 0)]),
+                  float(W[affBase + uint((2*coutT + co)*4 + 1)]),
+                  float(W[affBase + uint((2*coutT + co)*4 + 2)]),
+                  float(W[affBase + uint((2*coutT + co)*4 + 3)]));
+}
+
+void main() {
+    bool PASS1_CONV  = (pc.flags & 1) != 0;
+    bool PASS1_CLAMP = (pc.flags & 2) != 0;
+
+    // sMid[] and acc[] have MAX_T slots: cap the loop bounds so larger
+    // specialization values cannot index past them. Weight strides keep using
+    // the uncapped constants.
+    int nMid = min(MID_T, MAX_T);
+    int nOut = min(COUT_T, MAX_T);
+
+    // base is the image position of tile texel (0, 0), one texel up-left of
+    // the workgroup's first output texel.
+    ivec2 base = ivec2(gl_WorkGroupID.xy) * 16 - ivec2(1);
+    ivec2 lid  = ivec2(gl_LocalInvocationID.xy);
+    ivec2 p    = base + ivec2(1) + lid;
+    ivec2 hi   = pc.size - ivec2(1);
+
+    // Weight layout from wBase: [phase-1 conv][phase-1 affine] (only when
+    // PASS1_CONV is set) followed by [phase-2 conv][phase-2 affine].
+    uint p1ConvBase = pc.wBase;
+    uint p1AffBase  = p1ConvBase + uint(9 * CIN_T * MID_T * 16);
+
+    uint p2ConvBase = PASS1_CONV ? (p1AffBase + uint(3 * MID_T * 4)) : pc.wBase;
+    uint p2AffBase  = p2ConvBase + uint(9 * MID_T * COUT_T * 16);
+
+    // Phase 1: the 256 invocations cooperatively fill the 18*18 tile texels.
+    uint lindex = gl_LocalInvocationIndex;
+    for (uint idx = lindex; idx < uint(TILE*TILE); idx += 256u) {
+        int tx = int(idx % uint(TILE));
+        int ty = int(idx / uint(TILE));
+        ivec2 sp = base + ivec2(tx, ty);
+
+        if (PASS1_CONV) {
+            // Zero padding: taps outside the image contribute nothing, and tile
+            // texels whose centre is outside the image are stored as zero.
+            bool centerOOB = any(lessThan(sp, ivec2(0))) || any(greaterThan(sp, hi));
+            vec4 acc[MAX_T];
+            for (int co = 0; co < nMid; ++co) acc[co] = vec4(0.0);
+            if (!centerOOB) {
+                for (int k = 0; k < 9; ++k) {
+                    ivec2 q = sp + TAP_XOUTER[k];
+                    bool inb = all(greaterThanEqual(q, ivec2(0))) && all(lessThanEqual(q, hi));
+
+                    vec4 x0 = inb ? texelFetch(uIn0, q, 0) : vec4(0.0);
+                    vec4 x1 = inb ? texelFetch(uIn1, q, 0) : vec4(0.0);
+                    for (int co = 0; co < nMid; ++co) {
+                        acc[co] += convMatAt(p1ConvBase, k, 0, co, CIN_T, MID_T) * x0;
+                        if (CIN_T > 1)
+                            acc[co] += convMatAt(p1ConvBase, k, 1, co, CIN_T, MID_T) * x1;
+                    }
+                }
+            }
+            for (int co = 0; co < nMid; ++co) {
+                vec4 v;
+                if (centerOOB) {
+                    v = vec4(0.0);
+                } else {
+                    vec4 bias, scale, offset;
+                    affineAt(p1AffBase, co, MID_T, bias, scale, offset);
+                    v = (acc[co] - bias) * scale + offset;
+                    if (PASS1_CLAMP) v = clamp(v, 0.0, 1.0);
+                }
+                sMid[co][tx][ty] = f16vec4(v);
+            }
+        } else {
+            // No phase-1 convolution: copy the inputs, clamping reads to the image.
+            ivec2 q = clamp(sp, ivec2(0), hi);
+            sMid[0][tx][ty] = f16vec4(texelFetch(uIn0, q, 0));
+            if (MID_T > 1)
+                sMid[1][tx][ty] = f16vec4(texelFetch(uIn1, q, 0));
+        }
+    }
+
+    // All invocations must finish writing the tile before any of them reads it.
+    barrier();
+
+    if (any(greaterThanEqual(p, pc.size))) return;
+
+    // Phase 2: c is this invocation's texel in tile coordinates (border skipped).
+    ivec2 c = lid + ivec2(1);
+
+    vec4 acc[MAX_T];
+    for (int co = 0; co < nOut; ++co) acc[co] = vec4(0.0);
+
+    for (int k = 0; k < 9; ++k) {
+        ivec2 li = c + TAP_XOUTER[k];
+        for (int ci = 0; ci < nMid; ++ci) {
+            vec4 x = vec4(sMid[ci][li.x][li.y]);
+            for (int co = 0; co < nOut; ++co)
+                acc[co] += convMatAt(p2ConvBase, k, ci, co, MID_T, COUT_T) * x;
+        }
+    }
+
+    // Only two output images are bound, one per output layer index 0 and 1.
+    for (int co = 0; co < nOut; ++co) {
+        vec4 bias, scale, offset;
+        affineAt(p2AffBase, co, COUT_T, bias, scale, offset);
+        vec4 v = (acc[co] - bias) * scale + offset;
+        if (co == 0)      imageStore(uOut0, p, v);
+        else if (co == 1) imageStore(uOut1, p, v);
+    }
+}