Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
83c2550
Native frame generation (optical-flow interpolation)
maxjivi05 Jun 9, 2026
94e1b2e
Fix frame-gen compositor freeze on cursor/UI changes while the game i…
maxjivi05 Jun 9, 2026
b829bc1
Pipeline frame-gen submits: 3-slot history ring + targeted per-slot wait
maxjivi05 Jun 9, 2026
4707488
Frame-gen quality: sub-pixel motion vectors, MV median, swapchain hea…
maxjivi05 Jun 9, 2026
40dbc7f
Frame-gen pacing: MAILBOX over-post + native vsync-anchored present p…
maxjivi05 Jun 9, 2026
14f178e
Frame-gen pacing: present lead, time-based phase, windowed timing log
maxjivi05 Jun 10, 2026
4af1265
FrameGen: deterministic interp phase k/(N+1) + per-period phase telem…
maxjivi05 Jun 10, 2026
09684df
FrameGen: pin display mode to FG target, even present distribution, o…
maxjivi05 Jun 10, 2026
d5ec316
FrameGen: phase-correct interp fallback to fix early frame placement
maxjivi05 Jun 10, 2026
d3d406f
Merge branch 'WinNative-Emu:main' into frame-gen
maxjivi05 Jun 10, 2026
9eb86b4
Merge branch 'WinNative-Emu:main' into frame-gen
maxjivi05 Jun 11, 2026
c5c9d64
Merge branch 'WinNative-Emu:main' into frame-gen
maxjivi05 Jun 11, 2026
2408093
Frame generation: FIFO slot-grid pacing, extrapolation, per-game sett…
maxjivi05 Jun 12, 2026
168b6fe
Frame generation: worker pipeline, content-dedup, pacing + crash fixes
maxjivi05 Jun 17, 2026
f50b626
Merge upstream/main into frame-gen
maxjivi05 Jun 17, 2026
95caca8
Merge branch 'WinNative-Emu:main' into frame-gen
maxjivi05 Jun 17, 2026
3216f95
Merge upstream/main into frame-gen
maxjivi05 Jun 18, 2026
a03b075
Merge origin/frame-gen into frame-gen
maxjivi05 Jun 18, 2026
f755596
Frame generation P0: free content-rate present pacing
maxjivi05 Jun 18, 2026
f31a171
Frame generation: closed-loop content-anchored cadence
maxjivi05 Jun 18, 2026
253b387
Frame generation: steady output gate (fix tween under-production)
maxjivi05 Jun 18, 2026
fd65754
Frame generation: loosen content-dedup + Max preset uses deep flow
maxjivi05 Jun 18, 2026
f4992d2
Merge branch 'WinNative-Emu:main' into frame-gen
maxjivi05 Jun 18, 2026
21037b8
Merge branch 'WinNative-Emu:main' into frame-gen
maxjivi05 Jun 19, 2026
057cda5
Frame generation: clock the cadence off buffer-swap arrival timestamps
maxjivi05 Jun 19, 2026
412bf07
Merge origin/frame-gen into frame-gen
maxjivi05 Jun 19, 2026
383331d
Frame generation: trim verbose comments
maxjivi05 Jun 19, 2026
651bfe4
Frame generation: pipeline the worker for generate-ahead
maxjivi05 Jun 19, 2026
ce307fa
Merge branch 'main' into frame-gen
maxjivi05 Jun 19, 2026
3ff6f7b
Add CNN frame-generation flow path
maxjivi05 Jun 19, 2026
9a6cfcd
Async generate-ahead flow scheduling; CNN path always used
maxjivi05 Jun 19, 2026
70153c3
Feature ring: ingest each frame once, reuse across pairs and directions
maxjivi05 Jun 19, 2026
a99cdd2
Default swapchain to MAILBOX so timestamp pacing is not vsync-requant…
maxjivi05 Jun 19, 2026
2dcdafc
Fix interp flicker on the CNN path; drop the unused forward-flow pass
maxjivi05 Jun 19, 2026
57015c4
Stabilize the interp detail-sharpen term on the CNN path
maxjivi05 Jun 19, 2026
5df9c90
CNN frame-gen trained occlusion-select generate + keep FG live under …
maxjivi05 Jun 20, 2026
30eac9b
Fix FG interp phase: deterministic slot cadence instead of vsync-derived
maxjivi05 Jun 20, 2026
88beebf
Fix FG warp magnitude: preset-correct mvScale + interpolate.frag warp…
maxjivi05 Jun 20, 2026
3304dc6
Fix FG warp magnitude: flow is full-res pixel units, mvScale is const…
maxjivi05 Jun 20, 2026
19bf191
Frame dump: start a burst at the pair boundary for any multiplier
maxjivi05 Jun 20, 2026
74ec3c5
Fix FG warp overshoot: mvScale 0.40 (CNN flow overestimates motion ~25%)
maxjivi05 Jun 20, 2026
1e5c156
FG generate: decisive flow-consistency select to cut back/fwd smear
maxjivi05 Jun 20, 2026
bf3e8cd
Frame dump: correct portrait aspect (270x594) + log gen resolution
maxjivi05 Jun 20, 2026
890a5f2
FG generate: two independent flow fields
maxjivi05 Jun 20, 2026
13fcfa3
FG generate: wire wnfg_53 trained logits pyramid + faithful wnfg_04 (…
maxjivi05 Jun 20, 2026
3b3b1a6
Revert "FG generate: wire wnfg_53 trained logits pyramid + faithful w…
maxjivi05 Jun 20, 2026
06a40cf
Reapply "FG generate: wire wnfg_53 trained logits pyramid + faithful …
maxjivi05 Jun 20, 2026
086054b
FG wnfg_53: harness-validated wiring (b32=hD8,b33=hD7,b34=seed pair)
maxjivi05 Jun 20, 2026
e11cb5b
Fix Adreno 840 AHB import: match tiling to CPU-accessible linear buffers
maxjivi05 Jun 20, 2026
51048ef
Fix Adreno 840 stripe corruption: compositor matches the game's driver
maxjivi05 Jun 20, 2026
2f0f4cd
Compositor uses System driver unless FG is on (restore pre-FG path)
maxjivi05 Jun 21, 2026
4976bcf
Force BCn emulation off on Adreno GPUs (fixes a840 stripe corruption)
maxjivi05 Jun 21, 2026
29e7dae
FG: don't collapse interpolation during motion (UI-recomposite only w…
maxjivi05 Jun 21, 2026
768f34e
FG dump: bump to 636x1386 for character-detail interp inspection
maxjivi05 Jun 21, 2026
8a7f988
Frame-gen: occlusion-select generate on corrected bidirectional warp …
maxjivi05 Jun 21, 2026
9a70d45
Frame-gen: fix intermediate-warp sign + pacing; add controlled-motion…
maxjivi05 Jun 23, 2026
74a4be0
Frame-gen producer re-port (partial): fix degenerate gamma stage
maxjivi05 Jun 24, 2026
d76441d
Merge upstream/main into frame-gen
maxjivi05 Jun 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 34 additions & 14 deletions app/src/main/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ include(FetchContent)

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -Wno-unused-function -Wimplicit-function-declaration")

# Zstandard is used by winlator/native_content_io.cpp. Keep this dependency in
# the parent build instead of relying on the Steam client subproject to create it.
FetchContent_Declare(
zstd
GIT_REPOSITORY https://github.com/facebook/zstd.git
Expand Down Expand Up @@ -46,14 +44,6 @@ add_subdirectory(wn-libsteamclient)

find_package(curl REQUIRED CONFIG)

# ----------------------------------------------------------------------------
# SPIR-V shader compilation
# Each .glsl is compiled by glslc (shipped with the NDK) into a .spv binary,
# then converted to a C uint32_t array via bin2c.cmake. Headers are emitted
# under ${CMAKE_CURRENT_BINARY_DIR}/shaders/*.spv.h and included from vk_renderer.c.
# ----------------------------------------------------------------------------

# Locate glslc shipped with the NDK. ANDROID_NDK is provided by the Android Gradle plugin.
if(NOT DEFINED ANDROID_NDK)
message(FATAL_ERROR "ANDROID_NDK not defined; this project must be built via the Android Gradle plugin")
endif()
Expand Down Expand Up @@ -101,6 +91,20 @@ set(SHADER_LIST
"effect_colorblind:frag:effect_colorblind_frag"
"effect_pixelate:frag:effect_pixelate_frag"
"sgsr1:frag:sgsr1_frag"
"motion:comp:motion_comp"
"motion_fp32:comp:motion_fp32_comp"
"interpolate:frag:interpolate_frag"
"cnn_pyramid:comp:cnn_pyramid_comp"
"cnn_conv:comp:cnn_conv_comp"
"cnn_conv_2pass:comp:cnn_conv_2pass_comp"
"cnn_correlation:comp:cnn_correlation_comp"
"cnn_correlation_cost9:comp:cnn_correlation_cost9_comp"
"cnn_correlation_g09:comp:cnn_correlation_g09_comp"
"cnn_correlation_warpfollow:comp:cnn_correlation_warpfollow_comp"
"cnn_flowreg:comp:cnn_flowreg_comp"
"cnn_occlusion:comp:cnn_occlusion_comp"
"cnn_generate:comp:cnn_generate_comp"
"fg_synthshift:comp:fg_synthshift_comp"
)

set(SHADER_HEADERS "")
Expand Down Expand Up @@ -128,11 +132,27 @@ foreach(entry ${SHADER_LIST})
list(APPEND SHADER_HEADERS "${hdr}")
endforeach()

add_custom_target(winlator_shaders DEPENDS ${SHADER_HEADERS})
# ----------------------------------------------------------------------------
# Embedded weight blobs
# Each wnfg_<id>.weights.fp16 file under WEIGHTS_SRC_DIR is converted into a C
# header (wnfg_<id>_weights.h, array name wnfg_<id>_weights) by running
# bin2c_bytes.cmake in script mode. The headers are written next to the shader
# headers in SHADER_OUT_DIR and appended to SHADER_HEADERS, so any target that
# DEPENDS on ${SHADER_HEADERS} also regenerates them.
# ----------------------------------------------------------------------------
set(WEIGHTS_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/winlator/vk/weights_v2")
set(BIN2C_BYTES_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/winlator/vk/bin2c_bytes.cmake")
# Two-digit ids of the weight files to embed; each id must have a matching
# wnfg_<id>.weights.fp16 file in WEIGHTS_SRC_DIR.
set(WEIGHTS_LIST 05 06 07 14 20 21 22 24 25 26 27 28 29 36 37 42 45 51)
foreach(id ${WEIGHTS_LIST})
set(winput "${WEIGHTS_SRC_DIR}/wnfg_${id}.weights.fp16")
set(whdr "${SHADER_OUT_DIR}/wnfg_${id}_weights.h")
# Re-run the conversion whenever the weight file or the converter script changes.
add_custom_command(
OUTPUT "${whdr}"
COMMAND "${CMAKE_COMMAND}"
-DINPUT_FILE=${winput}
-DOUTPUT_FILE=${whdr}
-DVAR_NAME=wnfg_${id}_weights
-P "${BIN2C_BYTES_SCRIPT}"
DEPENDS "${winput}" "${BIN2C_BYTES_SCRIPT}"
COMMENT "Embedding weights wnfg_${id}.weights.fp16 -> wnfg_${id}_weights.h"
VERBATIM
)
list(APPEND SHADER_HEADERS "${whdr}")
endforeach()

# ----------------------------------------------------------------------------
# Winlator native library (X-server, AHB, Vulkan compositor, helpers)
# ----------------------------------------------------------------------------
add_custom_target(winlator_shaders DEPENDS ${SHADER_HEADERS})

add_library(winlator SHARED
winlator/drawable.c
Expand Down
36 changes: 36 additions & 0 deletions app/src/main/cpp/winlator/vk/bin2c_bytes.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# bin2c_bytes.cmake -- embed a binary file in a C header as a byte array.
#
# Usage (script mode):
#   cmake -DINPUT_FILE=<binary> -DOUTPUT_FILE=<header> -DVAR_NAME=<identifier>
#         -P bin2c_bytes.cmake
#
# The generated header defines:
#   static const uint8_t <VAR_NAME>[]      the input bytes, 16 per row
#   static const size_t  <VAR_NAME>_size   sizeof(<VAR_NAME>)
if(NOT INPUT_FILE OR NOT OUTPUT_FILE OR NOT VAR_NAME)
  message(FATAL_ERROR "bin2c_bytes.cmake requires INPUT_FILE, OUTPUT_FILE, VAR_NAME")
endif()

# file(READ ... HEX) returns two lowercase hex digits per input byte.
file(READ "${INPUT_FILE}" hex_data HEX)
string(LENGTH "${hex_data}" hex_len)

# Formatting is done with two linear regex passes instead of a per-byte
# while() loop: expanding "${hex_data}" once per byte copies the whole string
# on every iteration, which is quadratic in the file size.

# Separate the whole 16-byte rows (32 hex digits each) from a shorter last row.
math(EXPR tail_len "${hex_len} % 32")
math(EXPR full_len "${hex_len} - ${tail_len}")
string(SUBSTRING "${hex_data}" 0 ${full_len} full_hex)
string(SUBSTRING "${hex_data}" ${full_len} ${tail_len} tail_hex)

# Regex matching exactly one row: 16 consecutive hex pairs.
set(hex_byte "[0-9a-f][0-9a-f]")
set(row_pattern "")
foreach(n RANGE 1 16)
  string(APPEND row_pattern "${hex_byte}")
endforeach()

# Pass 1: place every full row on its own line, then add the partial row.
string(REGEX REPLACE "(${row_pattern})" " \\1\n" bytes "${full_hex}")
if(tail_len GREATER 0)
  string(APPEND bytes " ${tail_hex}\n")
endif()

# Pass 2: turn each hex pair into a "0xNN, " initializer. Pass 1 only inserted
# spaces and newlines, so pairs are still aligned within each row.
string(REGEX REPLACE "(${hex_byte})" "0x\\1, " bytes "${bytes}")

file(WRITE "${OUTPUT_FILE}"
"#pragma once
#include <stdint.h>
#include <stddef.h>

static const uint8_t ${VAR_NAME}[] = {
${bytes}};
static const size_t ${VAR_NAME}_size = sizeof(${VAR_NAME});
")
104 changes: 104 additions & 0 deletions app/src/main/cpp/winlator/vk/shaders/cnn_conv.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#version 450
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require

// 3x3 convolution layer over 4-channel "tiles" (one RGBA texel per array layer).
// Each invocation computes all output tiles for one output pixel.

layout(local_size_x = 16, local_size_y = 16, local_size_z = 1) in;

// Input feature tiles: layer ci holds input tile ci (read when the stem flag is clear).
layout(set = 0, binding = 0) uniform sampler2DArray uSrc;
// Output feature tiles: layer co receives output tile co.
layout(set = 0, binding = 1, rgba8) uniform writeonly image2DArray uDst;
// Single-channel input read from .r when the stem flag is set.
layout(set = 0, binding = 3) uniform sampler2D uLuma;

// Packed fp16 parameters; main() and convMat() index it in float16 elements starting at pc.wBase.
layout(set = 0, binding = 8, std430) readonly buffer Weights { float16_t W[]; };

// Push constants. t and mvScale are not read by this shader.
// flags bits (decoded in main): 1 = stem, 2 = stride2, 4 = resid, 8 = relu, 16 = stem2x.
layout(push_constant) uniform PC {
ivec2 size;
float t;
float mvScale;
uint wBase;
int cinT;
int coutT;
int flags;
} pc;

// 3x3 tap offsets, row by row (y outer, x inner); the tap index k selects the weight block.
const ivec2 TAP[9] = ivec2[](
ivec2(-1,-1), ivec2(0,-1), ivec2(1,-1),
ivec2(-1, 0), ivec2(0, 0), ivec2(1, 0),
ivec2(-1, 1), ivec2(0, 1), ivec2(1, 1));

// Size of the per-invocation accumulator array in main().
// NOTE(review): nothing in this shader checks pc.coutT <= MAX_T -- confirm the host guarantees it.
const int MAX_T = 4;

// Fetches the 4x4 weight block for tap k, input tile ci and output tile co.
// Blocks are 16 consecutive fp16 values ordered [k][ci][co] from pc.wBase.
// Element (4*r + c) of a block becomes M[c][r], so each group of four
// consecutive values forms one row of the returned matrix.
mat4 convMat(int k, int ci, int co) {
    int blockIndex = (k * pc.cinT + ci) * pc.coutT + co;
    uint o = pc.wBase + uint(blockIndex * 16);

    // mat4(16 scalars) fills column by column.
    return mat4(
        float(W[o +  0u]), float(W[o +  4u]), float(W[o +  8u]), float(W[o + 12u]),
        float(W[o +  1u]), float(W[o +  5u]), float(W[o +  9u]), float(W[o + 13u]),
        float(W[o +  2u]), float(W[o +  6u]), float(W[o + 10u]), float(W[o + 14u]),
        float(W[o +  3u]), float(W[o +  7u]), float(W[o + 11u]), float(W[o + 15u]));
}

// Computes one output pixel: accumulates a 3x3 convolution over the input,
// applies a per-tile affine transform, and stores each output tile to uDst.
void main() {
ivec2 p = ivec2(gl_GlobalInvocationID.xy);
// Invocations outside the pc.size output extent produce nothing.
if (any(greaterThanEqual(p, pc.size))) return;

// Mode bits from pc.flags.
bool stem = (pc.flags & 1) != 0;
bool stride2 = (pc.flags & 2) != 0;
bool resid = (pc.flags & 4) != 0;
bool doRelu = (pc.flags & 8) != 0;
bool stem2x = (pc.flags & 16) != 0;

// Centre of the 3x3 window in source coordinates: doubled for stride2 / stem2x.
ivec2 sp = (stride2 || stem2x) ? (p * 2) : p;

// Largest valid source coordinate, used to clamp taps to the edge:
// the luma texture for stem, the uSrc texture for stride2, otherwise the output extent.
ivec2 hi = stem ? (textureSize(uLuma, 0) - ivec2(1))
: stride2 ? (textureSize(uSrc, 0).xy - ivec2(1))
: (pc.size - ivec2(1));

// One accumulator per output tile.
// NOTE(review): acc has MAX_T entries; pc.coutT > MAX_T would index past it.
vec4 acc[MAX_T];
for (int co = 0; co < pc.coutT; ++co) acc[co] = vec4(0.0);

// The affine parameters follow the 9 * cinT * coutT weight blocks of 16 values each.
uint affBase = pc.wBase + uint(9 * pc.cinT * pc.coutT * 16);

for (int k = 0; k < 9; ++k) {
// Taps that fall outside the source are clamped to the nearest edge texel.
ivec2 q = clamp(sp + TAP[k], ivec2(0), hi);

if (stem) {
// Stem: the only input is the normalized luma value in component 0,
// and only the ci = 0 weight blocks are used.

float luma = texelFetch(uLuma, q, 0).r;
vec4 x = vec4((luma - 0.208008) * 1.496094 + 0.769531, 0.0, 0.0, 0.0);
for (int co = 0; co < pc.coutT; ++co)
acc[co] += convMat(k, 0, co) * x;
} else {
// Regular layer: sum over every input tile (array layer) of uSrc.

for (int ci = 0; ci < pc.cinT; ++ci) {
vec4 x = texelFetch(uSrc, ivec3(q, ci), 0);
for (int co = 0; co < pc.coutT; ++co)
acc[co] += convMat(k, ci, co) * x;
}
}
}

// Residual: adds input tile 0 at the output coordinate to output tile 0,
// before the affine transform below.
// NOTE(review): only acc[0] receives the residual -- confirm that is intended when coutT > 1.
if (resid) {
acc[0] += texelFetch(uSrc, ivec3(p, 0), 0);
}

for (int co = 0; co < pc.coutT; ++co) {
// Affine parameters are stored as three consecutive groups of coutT vec4s:
// group 0 = bias, group 1 = scale, group 2 = offset.
vec4 bias = vec4(float(W[affBase + uint((0 * pc.coutT + co) * 4 + 0)]),
float(W[affBase + uint((0 * pc.coutT + co) * 4 + 1)]),
float(W[affBase + uint((0 * pc.coutT + co) * 4 + 2)]),
float(W[affBase + uint((0 * pc.coutT + co) * 4 + 3)]));
vec4 scale = vec4(float(W[affBase + uint((1 * pc.coutT + co) * 4 + 0)]),
float(W[affBase + uint((1 * pc.coutT + co) * 4 + 1)]),
float(W[affBase + uint((1 * pc.coutT + co) * 4 + 2)]),
float(W[affBase + uint((1 * pc.coutT + co) * 4 + 3)]));
vec4 offset = vec4(float(W[affBase + uint((2 * pc.coutT + co) * 4 + 0)]),
float(W[affBase + uint((2 * pc.coutT + co) * 4 + 1)]),
float(W[affBase + uint((2 * pc.coutT + co) * 4 + 2)]),
float(W[affBase + uint((2 * pc.coutT + co) * 4 + 3)]));

vec4 v = (acc[co] - bias) * scale + offset;
if (doRelu) v = max(v, vec4(0.0));
// The stored value is always clamped to [0, 1], with or without the relu flag.
imageStore(uDst, ivec3(p, co), clamp(v, 0.0, 1.0));
}
}
151 changes: 151 additions & 0 deletions app/src/main/cpp/winlator/vk/shaders/cnn_conv_2pass.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#version 450
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require

// Two chained 3x3 convolutions in one dispatch: the first pass (or a plain copy
// of the inputs) fills an 18x18 tile in shared memory, the second pass reads it.

layout(local_size_x = 16, local_size_y = 16, local_size_z = 1) in;

// Tile counts (4 channels per tile) for the input, the intermediate and the output.
// NOTE(review): MAX_T below sizes the arrays indexed by MID_T and COUT_T;
// specializing either above 2 would index out of bounds.
layout(constant_id = 0) const int CIN_T = 2;
layout(constant_id = 1) const int MID_T = 2;
layout(constant_id = 2) const int COUT_T = 2;

// Shared tile edge: the 16x16 workgroup plus a 1-pixel border on every side.
const int TILE = 18;
const int MAX_T = 2;

// Input tiles 0 and 1.
layout(set = 0, binding = 32) uniform sampler2D uIn0;
layout(set = 0, binding = 33) uniform sampler2D uIn1;

// Output tiles 0 and 1.
layout(set = 0, binding = 48, rgba8) uniform writeonly image2D uOut0;
layout(set = 0, binding = 49, rgba8) uniform writeonly image2D uOut1;

// Packed fp16 parameters, indexed in float16 elements starting at pc.wBase.
layout(set = 0, binding = 8, std430) readonly buffer Weights { float16_t W[]; };

// Push constants. This shader reads only size, wBase and flags;
// the tile counts come from the specialization constants above.
layout(push_constant) uniform PC {
ivec2 size;
float t;
float mvScale;
uint wBase;
int cinT;
int coutT;
int flags;
} pc;

// 3x3 tap offsets with x as the outer index: k = (dx + 1) * 3 + (dy + 1).
const ivec2 TAP_XOUTER[9] = ivec2[](
ivec2(-1,-1), ivec2(-1, 0), ivec2(-1, 1),
ivec2( 0,-1), ivec2( 0, 0), ivec2( 0, 1),
ivec2( 1,-1), ivec2( 1, 0), ivec2( 1, 1));

// Intermediate tile shared by the workgroup, indexed [tile][x][y].
shared f16vec4 sMid[MAX_T][TILE][TILE];

// Fetches the 4x4 weight block for tap k, input tile ci and output tile co
// from a conv weight region starting at blockBase, laid out [k][ci][co] with
// 16 fp16 values per block. Element (4*r + c) of a block becomes M[c][r].
mat4 convMatAt(uint blockBase, int k, int ci, int co, int cinT, int coutT) {
    int blockIndex = (k * cinT + ci) * coutT + co;
    uint o = blockBase + uint(blockIndex * 16);

    // mat4(16 scalars) fills column by column.
    return mat4(
        float(W[o +  0u]), float(W[o +  4u]), float(W[o +  8u]), float(W[o + 12u]),
        float(W[o +  1u]), float(W[o +  5u]), float(W[o +  9u]), float(W[o + 13u]),
        float(W[o +  2u]), float(W[o +  6u]), float(W[o + 10u]), float(W[o + 14u]),
        float(W[o +  3u]), float(W[o +  7u]), float(W[o + 11u]), float(W[o + 15u]));
}

// Reads the affine parameters of output tile co from the region at affBase.
// The region holds three consecutive groups of coutT vec4s (4 fp16 values
// each): bias first, then scale, then offset.
void affineAt(uint affBase, int co, int coutT,
              out vec4 bias, out vec4 scale, out vec4 offset) {
    uint groupStride = uint(coutT * 4);
    uint biasAt   = affBase + uint(co * 4);
    uint scaleAt  = biasAt + groupStride;
    uint offsetAt = scaleAt + groupStride;

    bias = vec4(float(W[biasAt + 0u]), float(W[biasAt + 1u]),
                float(W[biasAt + 2u]), float(W[biasAt + 3u]));
    scale = vec4(float(W[scaleAt + 0u]), float(W[scaleAt + 1u]),
                 float(W[scaleAt + 2u]), float(W[scaleAt + 3u]));
    offset = vec4(float(W[offsetAt + 0u]), float(W[offsetAt + 1u]),
                  float(W[offsetAt + 2u]), float(W[offsetAt + 3u]));
}

// Phase 1: the workgroup cooperatively fills the 18x18 shared tile sMid, either
// with the result of a first 3x3 convolution or with a copy of the inputs.
// Phase 2: each invocation runs a 3x3 convolution over sMid for its own pixel
// and stores up to two output tiles.
void main() {
// Flag bit 1: run the first convolution (otherwise sMid is a copy of the inputs).
// Flag bit 2: clamp the first convolution's result to [0, 1].
bool PASS1_CONV = (pc.flags & 1) != 0;
bool PASS1_CLAMP = (pc.flags & 2) != 0;

// base is the image coordinate of shared-tile entry (0, 0): the workgroup
// origin shifted by the 1-pixel border. p is this invocation's output pixel.
ivec2 base = ivec2(gl_WorkGroupID.xy) * 16 - ivec2(1);
ivec2 lid = ivec2(gl_LocalInvocationID.xy);
ivec2 p = base + ivec2(1) + lid;
ivec2 hi = pc.size - ivec2(1);

// Parameter layout from pc.wBase: pass-1 conv blocks, pass-1 affine,
// pass-2 conv blocks, pass-2 affine.
uint p1ConvBase = pc.wBase;
uint p1AffBase = p1ConvBase + uint(9 * CIN_T * MID_T * 16);

// Without the first convolution the pass-2 parameters start directly at pc.wBase.
uint p2ConvBase = PASS1_CONV ? (p1AffBase + uint(3 * MID_T * 4)) : pc.wBase;
uint p2AffBase = p2ConvBase + uint(9 * MID_T * COUT_T * 16);

// 256 invocations cover the TILE*TILE entries by stepping 256 at a time.
uint lindex = gl_LocalInvocationIndex;
for (uint idx = lindex; idx < uint(TILE*TILE); idx += 256u) {
int tx = int(idx % uint(TILE));
int ty = int(idx / uint(TILE));
ivec2 sp = base + ivec2(tx, ty);

if (PASS1_CONV) {
// Tile entries whose centre lies outside the image are stored as zero,
// so the second convolution sees zeros beyond the image edge.

bool centerOOB = any(lessThan(sp, ivec2(0))) || any(greaterThan(sp, hi));
vec4 acc[MAX_T];
for (int co = 0; co < MID_T; ++co) acc[co] = vec4(0.0);
if (!centerOOB) {
for (int k = 0; k < 9; ++k) {
ivec2 q = sp + TAP_XOUTER[k];
bool inb = all(greaterThanEqual(q, ivec2(0))) && all(lessThanEqual(q, hi));

// Taps outside the image contribute zero.
vec4 x0 = inb ? texelFetch(uIn0, q, 0) : vec4(0.0);
vec4 x1 = inb ? texelFetch(uIn1, q, 0) : vec4(0.0);
for (int co = 0; co < MID_T; ++co) {
acc[co] += convMatAt(p1ConvBase, k, 0, co, CIN_T, MID_T) * x0;
if (CIN_T > 1)
acc[co] += convMatAt(p1ConvBase, k, 1, co, CIN_T, MID_T) * x1;
}
}
}
for (int co = 0; co < MID_T; ++co) {
vec4 v;
if (centerOOB) {
v = vec4(0.0);
} else {
vec4 bias, scale, offset;
affineAt(p1AffBase, co, MID_T, bias, scale, offset);
v = (acc[co] - bias) * scale + offset;
if (PASS1_CLAMP) v = clamp(v, 0.0, 1.0);
}
sMid[co][tx][ty] = f16vec4(v);
}
} else {
// No first convolution: copy the inputs, clamping coordinates to the image edge.

ivec2 q = clamp(sp, ivec2(0), hi);
sMid[0][tx][ty] = f16vec4(texelFetch(uIn0, q, 0));
if (MID_T > 1)
sMid[1][tx][ty] = f16vec4(texelFetch(uIn1, q, 0));
}
}

// Every invocation reaches this barrier; the out-of-image early return comes
// after it, so those invocations still take part in filling sMid.
barrier();

if (any(greaterThanEqual(p, pc.size))) return;

// Position of this invocation's pixel inside the shared tile.
ivec2 c = lid + ivec2(1);

vec4 acc[MAX_T];
for (int co = 0; co < COUT_T; ++co) acc[co] = vec4(0.0);

// Second 3x3 convolution, reading taps from the shared tile.
for (int k = 0; k < 9; ++k) {
ivec2 li = c + TAP_XOUTER[k];
for (int ci = 0; ci < MID_T; ++ci) {
vec4 x = vec4(sMid[ci][li.x][li.y]);
for (int co = 0; co < COUT_T; ++co)
acc[co] += convMatAt(p2ConvBase, k, ci, co, MID_T, COUT_T) * x;
}
}

// Affine transform per output tile; only tiles 0 and 1 have an image to store to.
for (int co = 0; co < COUT_T; ++co) {
vec4 bias, scale, offset;
affineAt(p2AffBase, co, COUT_T, bias, scale, offset);
vec4 v = (acc[co] - bias) * scale + offset;
if (co == 0) imageStore(uOut0, p, v);
else if (co == 1) imageStore(uOut1, p, v);
}
}
Loading