From cbff49d1bd665057be7adf894ea267f0e533ef8a Mon Sep 17 00:00:00 2001
From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com>
Date: Wed, 18 Mar 2026 19:40:34 -0700
Subject: [PATCH 01/10] implement sharpyuv

---
 lib/extras/enc/jpegli.cc     |   3 +
 lib/extras/enc/jpegli.h      |   1 +
 lib/jpegli/encode.cc         |  13 +-
 lib/jpegli/encode.h          |   3 +
 lib/jpegli/encode_internal.h |   1 +
 lib/jpegli/sharp_yuv.cc      | 380 +++++++++++++++++++++++++++++++++++
 lib/jpegli/sharp_yuv.h       |  21 ++
 lib/jxl_lists.cmake          |   2 +
 tools/cjpegli.cc             |   4 +
 9 files changed, 427 insertions(+), 1 deletion(-)
 create mode 100644 lib/jpegli/sharp_yuv.cc
 create mode 100644 lib/jpegli/sharp_yuv.h

diff --git a/lib/extras/enc/jpegli.cc b/lib/extras/enc/jpegli.cc
index 14a7a398..cb2384e8 100644
--- a/lib/extras/enc/jpegli.cc
+++ b/lib/extras/enc/jpegli.cc
@@ -461,6 +461,9 @@ Status EncodeJpeg(const PackedPixelFile& ppf, const JpegSettings& jpeg_settings,
     }
     jpegli_enable_adaptive_quantization(
         &cinfo, TO_JXL_BOOL(jpeg_settings.use_adaptive_quantization));
+    if (jpeg_settings.use_sharpyuv) {
+      jpegli_set_sharp_yuv(&cinfo, TRUE);
+    }
     if (jpeg_settings.psnr_target > 0.0) {
       jpegli_set_psnr(&cinfo, jpeg_settings.psnr_target,
                       jpeg_settings.search_tolerance,
diff --git a/lib/extras/enc/jpegli.h b/lib/extras/enc/jpegli.h
index 7d841eb5..88db8074 100644
--- a/lib/extras/enc/jpegli.h
+++ b/lib/extras/enc/jpegli.h
@@ -24,6 +24,7 @@ class PackedPixelFile;
 
 struct JpegSettings {
   bool xyb = false;
+  bool use_sharpyuv = false;
   size_t target_size = 0;
   float quality = 0.0f;
   float distance = 1.f;
diff --git a/lib/jpegli/encode.cc b/lib/jpegli/encode.cc
index 4d9b0cae..f59da379 100644
--- a/lib/jpegli/encode.cc
+++ b/lib/jpegli/encode.cc
@@ -31,6 +31,7 @@
 #include "lib/jpegli/quant.h"
 #include "lib/jpegli/simd.h"
 #include "lib/jpegli/types.h"
+#include "lib/jpegli/sharp_yuv.h"
 
 namespace jpegli {
 
@@ -600,7 +601,11 @@ void ProcessiMCURow(j_compress_ptr cinfo) {
   JPEGLI_CHECK(cinfo->master->next_iMCU_row < cinfo->total_iMCU_rows);
   if (!cinfo->raw_data_in) {
     ApplyInputSmoothing(cinfo);
-    DownsampleInputBuffer(cinfo);
+    if (cinfo->master->use_sharpyuv && cinfo->jpeg_color_space == JCS_YCbCr) {
+      ApplySharpYuvDownsample(cinfo);
+    } else {
+      DownsampleInputBuffer(cinfo);
+    }
   }
   ComputeAdaptiveQuantField(cinfo);
   if (IsStreamingSupported(cinfo)) {
@@ -693,6 +698,7 @@ void jpegli_CreateCompress(j_compress_ptr cinfo, int version,
   cinfo->master->cicp_transfer_function = 2;  // unknown transfer function code
   cinfo->master->use_std_tables = false;
   cinfo->master->use_adaptive_quantization = true;
+  cinfo->master->use_sharpyuv = false;
   cinfo->master->progressive_level = jpegli::kDefaultProgressiveLevel;
   cinfo->master->data_type = JPEGLI_TYPE_UINT8;
   cinfo->master->endianness = JPEGLI_NATIVE_ENDIAN;
@@ -927,6 +933,11 @@ void jpegli_enable_adaptive_quantization(j_compress_ptr cinfo, boolean value) {
   cinfo->master->use_adaptive_quantization = FROM_JXL_BOOL(value);
 }
 
+void jpegli_set_sharp_yuv(j_compress_ptr cinfo, boolean enable) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->use_sharpyuv = FROM_JXL_BOOL(enable);
+}
+
 void jpegli_simple_progression(j_compress_ptr cinfo) {
   CheckState(cinfo, jpegli::kEncStart);
   jpegli_set_progressive_level(cinfo, 2);
diff --git a/lib/jpegli/encode.h b/lib/jpegli/encode.h
index 5f620b98..364894b1 100644
--- a/lib/jpegli/encode.h
+++ b/lib/jpegli/encode.h
@@ -145,6 +145,9 @@ void jpegli_set_input_format(j_compress_ptr cinfo, JpegliDataType data_type,
 // Enabled by default.
 void jpegli_enable_adaptive_quantization(j_compress_ptr cinfo, boolean value);
 
+// Enables linear-light sharp YUV downsampling.
+void jpegli_set_sharp_yuv(j_compress_ptr cinfo, boolean enable);
+
 // Sets the default progression parameters, where level 0 is sequential, and
 // greater level value means more progression steps. Default is 2.
 void jpegli_set_progressive_level(j_compress_ptr cinfo, int level);
diff --git a/lib/jpegli/encode_internal.h b/lib/jpegli/encode_internal.h
index 3f5d1a76..05f40dbf 100644
--- a/lib/jpegli/encode_internal.h
+++ b/lib/jpegli/encode_internal.h
@@ -75,6 +75,7 @@ struct jpeg_comp_master {
   uint8_t cicp_transfer_function;
   bool use_std_tables;
   bool use_adaptive_quantization;
+  bool use_sharpyuv;
   int progressive_level;
   size_t xsize_blocks;
   size_t ysize_blocks;
diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc
new file mode 100644
index 00000000..dacfb26b
--- /dev/null
+++ b/lib/jpegli/sharp_yuv.cc
@@ -0,0 +1,380 @@
+// Copyright (c) the JPEG XL Project Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file or at
+// https://developers.google.com/open-source/licenses/bsd
+
+#include "lib/jpegli/sharp_yuv.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <vector>
+
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/simd.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/sharp_yuv.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::LoadU;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Min;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Set;
+using hwy::HWY_NAMESPACE::StoreU;
+using hwy::HWY_NAMESPACE::Sub;
+
+using D = HWY_CAPPED(float, 8);
+constexpr D d;
+
+inline float GammaToLinear(float v) {
+  float n = std::max(0.0f, std::min(255.0f, v)) * (1.0f / 255.0f);
+  if (n <= 0.04045f) {
+    return n / 12.92f;
+  } else {
+    return std::pow((n + 0.055f) / 1.055f, 2.4f);
+  }
+}
+
+inline float LinearToGamma(float v) {
+  if (v <= 0.0031308f) {
+    return v * 12.92f * 255.0f;
+  } else {
+    return (1.055f * std::pow(std::max(0.0f, v), 1.0f / 2.4f) - 0.055f) * 255.0f;
+  }
+}
+
+inline void YCbCrToRGB_Scalar(float y, float cb, float cr, float* r, float* g,
+                              float* b) {
+  cb -= 128.0f;
+  cr -= 128.0f;
+  *r = y + cr * 1.402f;
+  *g = y + cb * (-0.114f * 1.772f / 0.587f) + cr * (-0.299f * 1.402f / 0.587f);
+  *b = y + cb * 1.772f;
+}
+
+inline void RGBToYCbCr_Scalar(float r, float g, float b, float* y, float* cb,
+                              float* cr) {
+  const float kR = 0.299f;
+  const float kG = 0.587f;
+  const float kB = 0.114f;
+  const float kAmpR = 0.701f;
+  const float kAmpB = 0.886f;
+  const float kDiffR = kAmpR + kR;
+  const float kDiffB = kAmpB + kB;
+  const float kNormR = 1.0f / (kAmpR + kG + kB);
+  const float kNormB = 1.0f / (kR + kG + kAmpB);
+
+  float r_base = r * kR;
+  float g_base = g * kG;
+  float b_base = b * kB;
+  float y_base = r_base + g_base + b_base;
+  *y = y_base;
+  *cb = (b * kDiffB - y_base) * kNormB + 128.0f;
+  *cr = (r * kDiffR - y_base) * kNormR + 128.0f;
+}
+
+inline float RGBToLuma_Scalar(float r, float g, float b) {
+  return 0.299f * r + 0.587f * g + 0.114f * b;
+}
+
+void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  const size_t y0 = m->next_iMCU_row * iMCU_height;
+  const size_t y1 = y0 + iMCU_height;
+  const size_t xsize_padded = m->xsize_blocks * DCTSIZE;
+
+  // Verify that Cb and Cr are 4:2:0 subsampled.
+  if (cinfo->num_components < 3) return;
+  const int h_factor = cinfo->max_h_samp_factor / cinfo->comp_info[1].h_samp_factor;
+  const int v_factor = cinfo->max_v_samp_factor / cinfo->comp_info[1].v_samp_factor;
+  if(h_factor != 2 || v_factor != 2) {
+    // Fallback logic for unsupported standard subsampling bounds
+    for (int c = 1; c < cinfo->num_components; ++c) {
+      jpeg_component_info* comp = &cinfo->comp_info[c];
+      const int hf = cinfo->max_h_samp_factor / comp->h_samp_factor;
+      const int vf = cinfo->max_v_samp_factor / comp->v_samp_factor;
+      if (hf == 1 && vf == 1) continue;
+      const size_t y_out0 = y0 / vf;
+      float* rows_in[MAX_SAMP_FACTOR];
+      auto& input = *m->smooth_input[c];
+      auto& output = *m->raw_data[c];
+      for (size_t y_in = y0, y_out = y_out0; y_in < y1; y_in += vf, ++y_out) {
+        for (int iy = 0; iy < vf; ++iy) {
+          rows_in[iy] = input.Row(y_in + iy);
+        }
+        (*m->downsample_method[c])(rows_in, xsize_padded, output.Row(y_out));
+      }
+    }
+    return;
+  }
+
+  const size_t down_width = xsize_padded / 2;
+  const size_t down_height = iMCU_height / 2;
+  
+  std::vector<float> target_y(xsize_padded * iMCU_height);
+  std::vector<float> best_y(xsize_padded * iMCU_height);
+
+  std::vector<float> target_cb(down_width * down_height);
+  std::vector<float> target_cr(down_width * down_height);
+  std::vector<float> best_cb(down_width * down_height);
+  std::vector<float> best_cr(down_width * down_height);
+
+  // 1. Initial High-Res state and 2x2 Box Downscale
+  for (size_t y = 0; y < down_height; ++y) {
+    const float* row_y0 = m->smooth_input[0]->Row(y0 + y * 2);
+    const float* row_y1 = m->smooth_input[0]->Row(y0 + y * 2 + 1);
+    const float* row_cb0 = m->smooth_input[1]->Row(y0 + y * 2);
+    const float* row_cb1 = m->smooth_input[1]->Row(y0 + y * 2 + 1);
+    const float* row_cr0 = m->smooth_input[2]->Row(y0 + y * 2);
+    const float* row_cr1 = m->smooth_input[2]->Row(y0 + y * 2 + 1);
+
+    for (size_t x = 0; x < down_width; ++x) {
+      float lin_r_sum = 0, lin_g_sum = 0, lin_b_sum = 0;
+      for (int iy = 0; iy < 2; ++iy) {
+        for (int ix = 0; ix < 2; ++ix) {
+          float py = (iy == 0 ? row_y0 : row_y1)[x * 2 + ix];
+          float pcb = (iy == 0 ? row_cb0 : row_cb1)[x * 2 + ix];
+          float pcr = (iy == 0 ? row_cr0 : row_cr1)[x * 2 + ix];
+
+          float r, g, b;
+          YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b);
+          
+          float lin_r = GammaToLinear(r);
+          float lin_g = GammaToLinear(g);
+          float lin_b = GammaToLinear(b);
+          
+          lin_r_sum += lin_r;
+          lin_g_sum += lin_g;
+          lin_b_sum += lin_b;
+          
+          // Libwebp-like logic: isolate intrinsic target Luma to preserve 
+          // sharp color edge brightness metrics
+          float lin_y = RGBToLuma_Scalar(lin_r, lin_g, lin_b);
+          size_t hy = y * 2 + iy;
+          size_t hx = x * 2 + ix;
+          target_y[hy * xsize_padded + hx] = LinearToGamma(lin_y);
+          best_y[hy * xsize_padded + hx] = py; // Initial Y is the raw unaltered Y
+        }
+      }
+      lin_r_sum *= 0.25f;
+      lin_g_sum *= 0.25f;
+      lin_b_sum *= 0.25f;
+
+      float avg_r = LinearToGamma(lin_r_sum);
+      float avg_g = LinearToGamma(lin_g_sum);
+      float avg_b = LinearToGamma(lin_b_sum);
+
+      float avg_y, avg_cb, avg_cr;
+      RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr);
+
+      target_cb[y * down_width + x] = avg_cb;
+      target_cr[y * down_width + x] = avg_cr;
+      best_cb[y * down_width + x] = avg_cb;
+      best_cr[y * down_width + x] = avg_cr;
+    }
+  }
+
+  // 2. Iterative Back-Projection (4-pass Y/Cb/Cr coupled stabilization)
+  float prev_diff_y_sum = 1e30f;
+  const float diff_y_threshold = 3.0f * xsize_padded * iMCU_height;
+
+  for (int iter = 0; iter < 4; ++iter) {
+    std::vector<float> err_y(xsize_padded * iMCU_height);
+    std::vector<float> err_cb(down_width * down_height);
+    std::vector<float> err_cr(down_width * down_height);
+    float diff_y_sum = 0.0f;
+
+    for (size_t y = 0; y < down_height; ++y) {
+      size_t y_prev = (y == 0) ? 0 : y - 1;
+      size_t y_next = (y + 1 == down_height) ? y : y + 1;
+
+      const float* prev_cb = &best_cb[y_prev * down_width];
+      const float* cur_cb = &best_cb[y * down_width];
+      const float* next_cb = &best_cb[y_next * down_width];
+
+      const float* prev_cr = &best_cr[y_prev * down_width];
+      const float* cur_cr = &best_cr[y * down_width];
+      const float* next_cr = &best_cr[y_next * down_width];
+
+      for (size_t x = 0; x < down_width; ++x) {
+        size_t x_prev = (x == 0) ? 0 : x - 1;
+        size_t x_next = (x + 1 == down_width) ? x : x + 1;
+
+        auto upsample_bilerp = [](float center, float t, float b, float l,
+                                  float r, float tl, float tr, float bl,
+                                  float br, float res[4]) {
+          res[0] = (center * 9.0f + t * 3.0f + l * 3.0f + tl) / 16.0f;
+          res[1] = (center * 9.0f + t * 3.0f + r * 3.0f + tr) / 16.0f;
+          res[2] = (center * 9.0f + b * 3.0f + l * 3.0f + bl) / 16.0f;
+          res[3] = (center * 9.0f + b * 3.0f + r * 3.0f + br) / 16.0f;
+        };
+
+        float up_cb[4];
+        upsample_bilerp(cur_cb[x], prev_cb[x], next_cb[x], cur_cb[x_prev],
+                        cur_cb[x_next], prev_cb[x_prev], prev_cb[x_next],
+                        next_cb[x_prev], next_cb[x_next], up_cb);
+
+        float up_cr[4];
+        upsample_bilerp(cur_cr[x], prev_cr[x], next_cr[x], cur_cr[x_prev],
+                        cur_cr[x_next], prev_cr[x_prev], prev_cr[x_next],
+                        next_cr[x_prev], next_cr[x_next], up_cr);
+
+        float lin_r_sum = 0, lin_g_sum = 0, lin_b_sum = 0;
+        for (int iy = 0; iy < 2; ++iy) {
+          for (int ix = 0; ix < 2; ++ix) {
+            int i = iy * 2 + ix;
+            size_t hx = x * 2 + ix;
+            size_t hy = y * 2 + iy;
+            float py = best_y[hy * xsize_padded + hx];
+            
+            float r, g, b;
+            YCbCrToRGB_Scalar(py, up_cb[i], up_cr[i], &r, &g, &b);
+            
+            float lin_r = GammaToLinear(r);
+            float lin_g = GammaToLinear(g);
+            float lin_b = GammaToLinear(b);
+            
+            // Record Y error based on simulated decoded linear light
+            float lin_y = RGBToLuma_Scalar(lin_r, lin_g, lin_b);
+            float dec_gamma_y = LinearToGamma(lin_y);
+            float error_y = target_y[hy * xsize_padded + hx] - dec_gamma_y;
+            err_y[hy * xsize_padded + hx] = error_y;
+            diff_y_sum += std::abs(error_y);
+            
+            lin_r_sum += lin_r;
+            lin_g_sum += lin_g;
+            lin_b_sum += lin_b;
+          }
+        }
+        lin_r_sum *= 0.25f;
+        lin_g_sum *= 0.25f;
+        lin_b_sum *= 0.25f;
+
+        float avg_r = LinearToGamma(lin_r_sum);
+        float avg_g = LinearToGamma(lin_g_sum);
+        float avg_b = LinearToGamma(lin_b_sum);
+
+        float avg_y, avg_cb, avg_cr;
+        RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr);
+
+        err_cb[y * down_width + x] = target_cb[y * down_width + x] - avg_cb;
+        err_cr[y * down_width + x] = target_cr[y * down_width + x] - avg_cr;
+      }
+    }
+    
+    const size_t N = hwy::HWY_NAMESPACE::Lanes(d);
+    
+    // Apply High-Res Y Error with clipping
+    for (size_t y = 0; y < iMCU_height; ++y) {
+      size_t x = 0;
+      float* y_ptr = &best_y[y * xsize_padded];
+      const float* ey_ptr = &err_y[y * xsize_padded];
+      auto v_zero = Set(d, 0.0f);
+      auto v_max = Set(d, 255.0f);
+      for (; x + N <= xsize_padded; x += N) {
+         auto y_vec = LoadU(d, y_ptr + x);
+         auto e_y_vec = LoadU(d, ey_ptr + x);
+         auto res = Add(y_vec, e_y_vec);
+         res = Min(Max(res, v_zero), v_max);
+         StoreU(res, d, y_ptr + x);
+      }
+      for (; x < xsize_padded; ++x) {
+         y_ptr[x] = std::max(0.0f, std::min(255.0f, y_ptr[x] + ey_ptr[x]));
+      }
+    }
+    
+    // Apply Cb/Cr Error
+    for (size_t y = 0; y < down_height; ++y) {
+      size_t x = 0;
+      float* cb_ptr = &best_cb[y * down_width];
+      float* cr_ptr = &best_cr[y * down_width];
+      const float* ecb_ptr = &err_cb[y * down_width];
+      const float* ecr_ptr = &err_cr[y * down_width];
+
+      for (; x + N <= down_width; x += N) {
+         auto cb_vec = LoadU(d, cb_ptr + x);
+         auto cr_vec = LoadU(d, cr_ptr + x);
+         auto e_cb_vec = LoadU(d, ecb_ptr + x);
+         auto e_cr_vec = LoadU(d, ecr_ptr + x);
+         StoreU(Add(cb_vec, e_cb_vec), d, cb_ptr + x);
+         StoreU(Add(cr_vec, e_cr_vec), d, cr_ptr + x);
+      }
+      for (; x < down_width; ++x) {
+         cb_ptr[x] += ecb_ptr[x];
+         cr_ptr[x] += ecr_ptr[x];
+      }
+    }
+
+    // test libwebp exit condition
+    if (iter > 0) {
+      if (diff_y_sum < diff_y_threshold) break;
+      if (diff_y_sum > prev_diff_y_sum) break;
+    }
+    prev_diff_y_sum = diff_y_sum;
+  }
+
+  // 3. Write output to buffers
+  const size_t N = hwy::HWY_NAMESPACE::Lanes(d);
+  
+  // Write Luma
+  for (size_t y = 0; y < iMCU_height; ++y) {
+    float* row_out_y = m->raw_data[0]->Row(y0 + y);
+    float* src_y = &best_y[y * xsize_padded];
+    size_t x = 0;
+    for (; x + N <= xsize_padded; x += N) {
+       StoreU(LoadU(d, src_y + x), d, row_out_y + x);
+    }
+    for (; x < xsize_padded; ++x) {
+       row_out_y[x] = src_y[x];
+    }
+  }
+  
+  // Write Chroma
+  const size_t y_out0 = y0 / 2;
+  for (size_t y_out = 0; y_out < down_height; ++y_out) {
+    float* row_out_cb = m->raw_data[1]->Row(y_out0 + y_out);
+    float* row_out_cr = m->raw_data[2]->Row(y_out0 + y_out);
+    float* src_cb = &best_cb[y_out * down_width];
+    float* src_cr = &best_cr[y_out * down_width];
+
+    size_t x = 0;
+    for (; x + N <= down_width; x += N) {
+       StoreU(LoadU(d, src_cb + x), d, row_out_cb + x);
+       StoreU(LoadU(d, src_cr + x), d, row_out_cr + x);
+    }
+    for (; x < down_width; ++x) {
+       row_out_cb[x] = src_cb[x];
+       row_out_cr[x] = src_cr[x];
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(ApplySharpYuvDownsampleImpl);
+
+void ApplySharpYuvDownsample(j_compress_ptr cinfo) {
+  HWY_DYNAMIC_DISPATCH(ApplySharpYuvDownsampleImpl)(cinfo);
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/lib/jpegli/sharp_yuv.h b/lib/jpegli/sharp_yuv.h
new file mode 100644
index 00000000..a1d73522
--- /dev/null
+++ b/lib/jpegli/sharp_yuv.h
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file or at
+// https://developers.google.com/open-source/licenses/bsd
+
+#ifndef LIB_JPEGLI_SHARP_YUV_H_
+#define LIB_JPEGLI_SHARP_YUV_H_
+
+#include "lib/jpegli/encode_internal.h"
+
+namespace jpegli {
+
+// Applies a sharp 2x2 area downscale in linear RGB space instead of
+// gamma compressed YCbCr space. This prevents bleeding/blur artifacts
+// and preserves color boundaries much better.
+void ApplySharpYuvDownsample(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_SHARP_YUV_H_
diff --git a/lib/jxl_lists.cmake b/lib/jxl_lists.cmake
index f5fa51d9..83394772 100644
--- a/lib/jxl_lists.cmake
+++ b/lib/jxl_lists.cmake
@@ -198,6 +198,8 @@ set(JPEGXL_INTERNAL_JPEGLI_SOURCES
   jpegli/quant.h
   jpegli/render.cc
   jpegli/render.h
+  jpegli/sharp_yuv.cc
+  jpegli/sharp_yuv.h
   jpegli/simd.cc
   jpegli/simd.h
   jpegli/source_manager.cc
diff --git a/tools/cjpegli.cc b/tools/cjpegli.cc
index 7bcd598a..35008edc 100644
--- a/tools/cjpegli.cc
+++ b/tools/cjpegli.cc
@@ -77,6 +77,10 @@ struct Args {
     cmdline->AddOptionFlag('\0', "xyb", "Convert to XYB colorspace",
                            &settings.xyb, &SetBooleanTrue, 1);
 
+    cmdline->AddOptionFlag('\0', "sharpyuv",
+                           "Use true linear-light sharp YUV downsampling for 4:2:0",
+                           &settings.use_sharpyuv, &SetBooleanTrue, 1);
+
     cmdline->AddOptionFlag(
         '\0', "std_quant",
         "Use quantization tables based on Annex K of the JPEG standard.",

From 7ea3a0046e854a11efba85654ebd592dbb8ec2bf Mon Sep 17 00:00:00 2001
From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com>
Date: Wed, 18 Mar 2026 19:48:17 -0700
Subject: [PATCH 02/10] SIMD optimizations

---
 lib/jpegli/sharp_yuv.cc | 388 ++++++++++++++++++++++------------------
 1 file changed, 215 insertions(+), 173 deletions(-)

diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc
index dacfb26b..135d82b6 100644
--- a/lib/jpegli/sharp_yuv.cc
+++ b/lib/jpegli/sharp_yuv.cc
@@ -19,6 +19,7 @@
 #define HWY_TARGET_INCLUDE "lib/jpegli/sharp_yuv.cc"
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
+#include <hwy/contrib/math/math-inl.h>
 
 HWY_BEFORE_NAMESPACE();
 namespace jpegli {
@@ -34,6 +35,8 @@ using hwy::HWY_NAMESPACE::MulAdd;
 using hwy::HWY_NAMESPACE::Set;
 using hwy::HWY_NAMESPACE::StoreU;
 using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::Le;
 
 using D = HWY_CAPPED(float, 8);
 constexpr D d;
@@ -85,8 +88,45 @@ inline void RGBToYCbCr_Scalar(float r, float g, float b, float* y, float* cb,
   *cr = (r * kDiffR - y_base) * kNormR + 128.0f;
 }
 
-inline float RGBToLuma_Scalar(float r, float g, float b) {
-  return 0.299f * r + 0.587f * g + 0.114f * b;
+template <class D, class V>
+HWY_INLINE V GammaToLinear_SIMD(D d, V v) {
+  auto v_zero = Set(d, 0.0f);
+  auto v_max = Set(d, 255.0f);
+  auto n = Mul(Min(Max(v, v_zero), v_max), Set(d, 1.0f / 255.0f));
+  auto is_low = Le(n, Set(d, 0.04045f));
+  auto low_val = Div(n, Set(d, 12.92f));
+  auto base = Div(Add(n, Set(d, 0.055f)), Set(d, 1.055f));
+  base = Max(base, Set(d, 1e-6f));
+  auto log_base = hwy::HWY_NAMESPACE::Log2(d, base);
+  auto high_val = hwy::HWY_NAMESPACE::Exp2(d, Mul(log_base, Set(d, 2.4f)));
+  return IfThenElse(is_low, low_val, high_val);
+}
+
+template <class D, class V>
+HWY_INLINE V LinearToGamma_SIMD(D d, V v) {
+  v = Max(v, Set(d, 0.0f));
+  auto is_low = Le(v, Set(d, 0.0031308f));
+  auto low_val = Mul(v, Set(d, 12.92f * 255.0f));
+  auto v_safe = Max(v, Set(d, 1e-6f));
+  auto log_v = hwy::HWY_NAMESPACE::Log2(d, v_safe);
+  auto p = hwy::HWY_NAMESPACE::Exp2(d, Mul(log_v, Set(d, 1.0f / 2.4f)));
+  auto high_val = Mul(Sub(Mul(p, Set(d, 1.055f)), Set(d, 0.055f)), Set(d, 255.0f));
+  return IfThenElse(is_low, low_val, high_val);
+}
+
+template <class D, class V>
+HWY_INLINE void YCbCrToRGB_SIMD(D d, V y, V cb, V cr, V* r, V* g, V* b) {
+  auto v128 = Set(d, 128.0f);
+  cb = Sub(cb, v128);
+  cr = Sub(cr, v128);
+  *r = Add(y, Mul(cr, Set(d, 1.402f)));
+  *g = Add(y, Add(Mul(cb, Set(d, -0.114f * 1.772f / 0.587f)), Mul(cr, Set(d, -0.299f * 1.402f / 0.587f))));
+  *b = Add(y, Mul(cb, Set(d, 1.772f)));
+}
+
+template <class D, class V>
+HWY_INLINE V TempLuma_SIMD(D d, V r, V g, V b) {
+  return Add(Mul(Set(d, 0.299f), r), Add(Mul(Set(d, 0.587f), g), Mul(Set(d, 0.114f), b)));
 }
 
 void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
@@ -96,12 +136,10 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
   const size_t y1 = y0 + iMCU_height;
   const size_t xsize_padded = m->xsize_blocks * DCTSIZE;
 
-  // Verify that Cb and Cr are 4:2:0 subsampled.
   if (cinfo->num_components < 3) return;
   const int h_factor = cinfo->max_h_samp_factor / cinfo->comp_info[1].h_samp_factor;
   const int v_factor = cinfo->max_v_samp_factor / cinfo->comp_info[1].v_samp_factor;
   if(h_factor != 2 || v_factor != 2) {
-    // Fallback logic for unsupported standard subsampling bounds
     for (int c = 1; c < cinfo->num_components; ++c) {
       jpeg_component_info* comp = &cinfo->comp_info[c];
       const int hf = cinfo->max_h_samp_factor / comp->h_samp_factor;
@@ -123,233 +161,237 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
 
   const size_t down_width = xsize_padded / 2;
   const size_t down_height = iMCU_height / 2;
+  const size_t high_size = xsize_padded * iMCU_height;
+  const size_t low_size = down_width * down_height;
   
-  std::vector<float> target_y(xsize_padded * iMCU_height);
-  std::vector<float> best_y(xsize_padded * iMCU_height);
+  std::vector<float> target_y(high_size);
+  std::vector<float> best_y(high_size);
+  std::vector<float> tmp_lin_r(high_size);
+  std::vector<float> tmp_lin_g(high_size);
+  std::vector<float> tmp_lin_b(high_size);
 
-  std::vector<float> target_cb(down_width * down_height);
-  std::vector<float> target_cr(down_width * down_height);
-  std::vector<float> best_cb(down_width * down_height);
-  std::vector<float> best_cr(down_width * down_height);
+  std::vector<float> target_cb(low_size);
+  std::vector<float> target_cr(low_size);
+  std::vector<float> best_cb(low_size);
+  std::vector<float> best_cr(low_size);
 
-  // 1. Initial High-Res state and 2x2 Box Downscale
-  for (size_t y = 0; y < down_height; ++y) {
-    const float* row_y0 = m->smooth_input[0]->Row(y0 + y * 2);
-    const float* row_y1 = m->smooth_input[0]->Row(y0 + y * 2 + 1);
-    const float* row_cb0 = m->smooth_input[1]->Row(y0 + y * 2);
-    const float* row_cb1 = m->smooth_input[1]->Row(y0 + y * 2 + 1);
-    const float* row_cr0 = m->smooth_input[2]->Row(y0 + y * 2);
-    const float* row_cr1 = m->smooth_input[2]->Row(y0 + y * 2 + 1);
+  std::vector<float> up_cb(high_size);
+  std::vector<float> up_cr(high_size);
+
+  const size_t N = hwy::HWY_NAMESPACE::Lanes(d);
+
+  // 1. Initial High-Res state -> SIMD Loop
+  for (size_t y = 0; y < iMCU_height; ++y) {
+    const float* row_y = m->smooth_input[0]->Row(y0 + y);
+    const float* row_cb = m->smooth_input[1]->Row(y0 + y);
+    const float* row_cr = m->smooth_input[2]->Row(y0 + y);
+    size_t x = 0;
+    for (; x + N <= xsize_padded; x += N) {
+      auto py = LoadU(d, row_y + x);
+      auto pcb = LoadU(d, row_cb + x);
+      auto pcr = LoadU(d, row_cr + x);
+      
+      auto r = Set(d,0.0f), g = Set(d,0.0f), b = Set(d,0.0f);
+      YCbCrToRGB_SIMD(d, py, pcb, pcr, &r, &g, &b);
+      
+      auto lin_r = GammaToLinear_SIMD(d, r);
+      auto lin_g = GammaToLinear_SIMD(d, g);
+      auto lin_b = GammaToLinear_SIMD(d, b);
+      
+      auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b);
+      auto t_y = LinearToGamma_SIMD(d, lin_y);
+      
+      size_t idx = y * xsize_padded + x;
+      StoreU(t_y, d, &target_y[idx]);
+      StoreU(py, d, &best_y[idx]);
+      StoreU(lin_r, d, &tmp_lin_r[idx]);
+      StoreU(lin_g, d, &tmp_lin_g[idx]);
+      StoreU(lin_b, d, &tmp_lin_b[idx]);
+    }
+    for (; x < xsize_padded; ++x) {
+      float py = row_y[x], pcb = row_cb[x], pcr = row_cr[x];
+      float r, g, b; YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b);
+      float lr = GammaToLinear(r), lg = GammaToLinear(g), lb = GammaToLinear(b);
+      float ly = 0.299 * lr + 0.587 * lg + 0.114 * lb;
+      size_t idx = y * xsize_padded + x;
+      target_y[idx] = LinearToGamma(ly);
+      best_y[idx] = py;
+      tmp_lin_r[idx] = lr; tmp_lin_g[idx] = lg; tmp_lin_b[idx] = lb;
+    }
+  }
 
+  // 1b. Box downscale to target_cb, target_cr -> Scalar loop 
+  // (Downscaled loops represent 1/4 the iterations, scalar overhead is negligible)
+  for (size_t y = 0; y < down_height; ++y) {
     for (size_t x = 0; x < down_width; ++x) {
-      float lin_r_sum = 0, lin_g_sum = 0, lin_b_sum = 0;
+      float lr_sum = 0, lg_sum = 0, lb_sum = 0;
       for (int iy = 0; iy < 2; ++iy) {
         for (int ix = 0; ix < 2; ++ix) {
-          float py = (iy == 0 ? row_y0 : row_y1)[x * 2 + ix];
-          float pcb = (iy == 0 ? row_cb0 : row_cb1)[x * 2 + ix];
-          float pcr = (iy == 0 ? row_cr0 : row_cr1)[x * 2 + ix];
-
-          float r, g, b;
-          YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b);
-          
-          float lin_r = GammaToLinear(r);
-          float lin_g = GammaToLinear(g);
-          float lin_b = GammaToLinear(b);
-          
-          lin_r_sum += lin_r;
-          lin_g_sum += lin_g;
-          lin_b_sum += lin_b;
-          
-          // Libwebp-like logic: isolate intrinsic target Luma to preserve 
-          // sharp color edge brightness metrics
-          float lin_y = RGBToLuma_Scalar(lin_r, lin_g, lin_b);
-          size_t hy = y * 2 + iy;
-          size_t hx = x * 2 + ix;
-          target_y[hy * xsize_padded + hx] = LinearToGamma(lin_y);
-          best_y[hy * xsize_padded + hx] = py; // Initial Y is the raw unaltered Y
+          size_t idx = (y * 2 + iy) * xsize_padded + (x * 2 + ix);
+          lr_sum += tmp_lin_r[idx];
+          lg_sum += tmp_lin_g[idx];
+          lb_sum += tmp_lin_b[idx];
         }
       }
-      lin_r_sum *= 0.25f;
-      lin_g_sum *= 0.25f;
-      lin_b_sum *= 0.25f;
-
-      float avg_r = LinearToGamma(lin_r_sum);
-      float avg_g = LinearToGamma(lin_g_sum);
-      float avg_b = LinearToGamma(lin_b_sum);
-
+      float avg_r = LinearToGamma(lr_sum * 0.25f);
+      float avg_g = LinearToGamma(lg_sum * 0.25f);
+      float avg_b = LinearToGamma(lb_sum * 0.25f);
       float avg_y, avg_cb, avg_cr;
       RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr);
-
-      target_cb[y * down_width + x] = avg_cb;
-      target_cr[y * down_width + x] = avg_cr;
-      best_cb[y * down_width + x] = avg_cb;
-      best_cr[y * down_width + x] = avg_cr;
+      size_t di = y * down_width + x;
+      target_cb[di] = avg_cb; target_cr[di] = avg_cr;
+      best_cb[di] = avg_cb;   best_cr[di] = avg_cr;
     }
   }
 
-  // 2. Iterative Back-Projection (4-pass Y/Cb/Cr coupled stabilization)
   float prev_diff_y_sum = 1e30f;
-  const float diff_y_threshold = 3.0f * xsize_padded * iMCU_height;
+  const float diff_y_threshold = 3.0f * high_size;
 
   for (int iter = 0; iter < 4; ++iter) {
-    std::vector<float> err_y(xsize_padded * iMCU_height);
-    std::vector<float> err_cb(down_width * down_height);
-    std::vector<float> err_cr(down_width * down_height);
+    std::vector<float> err_y(high_size);
+    std::vector<float> err_cb(low_size);
+    std::vector<float> err_cr(low_size);
     float diff_y_sum = 0.0f;
 
+    // Bilerp Up to Full Res (Scalar optimized loop)
     for (size_t y = 0; y < down_height; ++y) {
       size_t y_prev = (y == 0) ? 0 : y - 1;
       size_t y_next = (y + 1 == down_height) ? y : y + 1;
-
-      const float* prev_cb = &best_cb[y_prev * down_width];
-      const float* cur_cb = &best_cb[y * down_width];
-      const float* next_cb = &best_cb[y_next * down_width];
-
-      const float* prev_cr = &best_cr[y_prev * down_width];
-      const float* cur_cr = &best_cr[y * down_width];
-      const float* next_cr = &best_cr[y_next * down_width];
-
       for (size_t x = 0; x < down_width; ++x) {
         size_t x_prev = (x == 0) ? 0 : x - 1;
         size_t x_next = (x + 1 == down_width) ? x : x + 1;
-
-        auto upsample_bilerp = [](float center, float t, float b, float l,
-                                  float r, float tl, float tr, float bl,
-                                  float br, float res[4]) {
-          res[0] = (center * 9.0f + t * 3.0f + l * 3.0f + tl) / 16.0f;
-          res[1] = (center * 9.0f + t * 3.0f + r * 3.0f + tr) / 16.0f;
-          res[2] = (center * 9.0f + b * 3.0f + l * 3.0f + bl) / 16.0f;
-          res[3] = (center * 9.0f + b * 3.0f + r * 3.0f + br) / 16.0f;
+        
+        auto get_avg = [](float c, float t, float b, float l, float r, 
+                          float tl, float tr, float bl, float br, float res[4]) {
+          res[0] = (c * 9.0f + t * 3.0f + l * 3.0f + tl) / 16.0f;
+          res[1] = (c * 9.0f + t * 3.0f + r * 3.0f + tr) / 16.0f;
+          res[2] = (c * 9.0f + b * 3.0f + l * 3.0f + bl) / 16.0f;
+          res[3] = (c * 9.0f + b * 3.0f + r * 3.0f + br) / 16.0f;
         };
-
-        float up_cb[4];
-        upsample_bilerp(cur_cb[x], prev_cb[x], next_cb[x], cur_cb[x_prev],
-                        cur_cb[x_next], prev_cb[x_prev], prev_cb[x_next],
-                        next_cb[x_prev], next_cb[x_next], up_cb);
-
-        float up_cr[4];
-        upsample_bilerp(cur_cr[x], prev_cr[x], next_cr[x], cur_cr[x_prev],
-                        cur_cr[x_next], prev_cr[x_prev], prev_cr[x_next],
-                        next_cr[x_prev], next_cr[x_next], up_cr);
-
-        float lin_r_sum = 0, lin_g_sum = 0, lin_b_sum = 0;
-        for (int iy = 0; iy < 2; ++iy) {
-          for (int ix = 0; ix < 2; ++ix) {
-            int i = iy * 2 + ix;
-            size_t hx = x * 2 + ix;
-            size_t hy = y * 2 + iy;
-            float py = best_y[hy * xsize_padded + hx];
-            
-            float r, g, b;
-            YCbCrToRGB_Scalar(py, up_cb[i], up_cr[i], &r, &g, &b);
-            
-            float lin_r = GammaToLinear(r);
-            float lin_g = GammaToLinear(g);
-            float lin_b = GammaToLinear(b);
-            
-            // Record Y error based on simulated decoded linear light
-            float lin_y = RGBToLuma_Scalar(lin_r, lin_g, lin_b);
-            float dec_gamma_y = LinearToGamma(lin_y);
-            float error_y = target_y[hy * xsize_padded + hx] - dec_gamma_y;
-            err_y[hy * xsize_padded + hx] = error_y;
-            diff_y_sum += std::abs(error_y);
-            
-            lin_r_sum += lin_r;
-            lin_g_sum += lin_g;
-            lin_b_sum += lin_b;
+        
+        float u_cb[4], u_cr[4];
+        float ccb = best_cb[y*down_width+x], ccr = best_cr[y*down_width+x];
+        get_avg(ccb, best_cb[y_prev*down_width+x], best_cb[y_next*down_width+x],
+                best_cb[y*down_width+x_prev], best_cb[y*down_width+x_next],
+                best_cb[y_prev*down_width+x_prev], best_cb[y_prev*down_width+x_next],
+                best_cb[y_next*down_width+x_prev], best_cb[y_next*down_width+x_next], u_cb);
+        get_avg(ccr, best_cr[y_prev*down_width+x], best_cr[y_next*down_width+x],
+                best_cr[y*down_width+x_prev], best_cr[y*down_width+x_next],
+                best_cr[y_prev*down_width+x_prev], best_cr[y_prev*down_width+x_next],
+                best_cr[y_next*down_width+x_prev], best_cr[y_next*down_width+x_next], u_cr);
+
+        for(int iy=0; iy<2; ++iy) {
+          for(int ix=0; ix<2; ++ix) {
+            size_t idx = (y * 2 + iy) * xsize_padded + (x * 2 + ix);
+            up_cb[idx] = u_cb[iy*2+ix];
+            up_cr[idx] = u_cr[iy*2+ix];
           }
         }
-        lin_r_sum *= 0.25f;
-        lin_g_sum *= 0.25f;
-        lin_b_sum *= 0.25f;
-
-        float avg_r = LinearToGamma(lin_r_sum);
-        float avg_g = LinearToGamma(lin_g_sum);
-        float avg_b = LinearToGamma(lin_b_sum);
-
-        float avg_y, avg_cb, avg_cr;
-        RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr);
-
-        err_cb[y * down_width + x] = target_cb[y * down_width + x] - avg_cb;
-        err_cr[y * down_width + x] = target_cr[y * down_width + x] - avg_cr;
       }
     }
-    
-    const size_t N = hwy::HWY_NAMESPACE::Lanes(d);
-    
-    // Apply High-Res Y Error with clipping
+
+    // SIMD Evaluate Error 
     for (size_t y = 0; y < iMCU_height; ++y) {
       size_t x = 0;
-      float* y_ptr = &best_y[y * xsize_padded];
-      const float* ey_ptr = &err_y[y * xsize_padded];
-      auto v_zero = Set(d, 0.0f);
-      auto v_max = Set(d, 255.0f);
       for (; x + N <= xsize_padded; x += N) {
-         auto y_vec = LoadU(d, y_ptr + x);
-         auto e_y_vec = LoadU(d, ey_ptr + x);
-         auto res = Add(y_vec, e_y_vec);
-         res = Min(Max(res, v_zero), v_max);
-         StoreU(res, d, y_ptr + x);
+        size_t idx = y * xsize_padded + x;
+        auto py = LoadU(d, &best_y[idx]);
+        auto ucb = LoadU(d, &up_cb[idx]);
+        auto ucr = LoadU(d, &up_cr[idx]);
+
+        auto r = Set(d,0.0f), g = Set(d,0.0f), b = Set(d,0.0f);
+        YCbCrToRGB_SIMD(d, py, ucb, ucr, &r, &g, &b);
+        
+        auto lin_r = GammaToLinear_SIMD(d, r);
+        auto lin_g = GammaToLinear_SIMD(d, g);
+        auto lin_b = GammaToLinear_SIMD(d, b);
+
+        auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b);
+        auto dec_y = LinearToGamma_SIMD(d, lin_y);
+        
+        auto e_y = Sub(LoadU(d, &target_y[idx]), dec_y);
+        StoreU(e_y, d, &err_y[idx]);
+        StoreU(lin_r, d, &tmp_lin_r[idx]);
+        StoreU(lin_g, d, &tmp_lin_g[idx]);
+        StoreU(lin_b, d, &tmp_lin_b[idx]);
       }
       for (; x < xsize_padded; ++x) {
-         y_ptr[x] = std::max(0.0f, std::min(255.0f, y_ptr[x] + ey_ptr[x]));
+        size_t idx = y * xsize_padded + x;
+        float py = best_y[idx], ucb = up_cb[idx], ucr = up_cr[idx];
+        float r, g, b; YCbCrToRGB_Scalar(py, ucb, ucr, &r, &g, &b);
+        float lr = GammaToLinear(r), lg = GammaToLinear(g), lb = GammaToLinear(b);
+        float ly = 0.299 * lr + 0.587 * lg + 0.114 * lb;
+        err_y[idx] = target_y[idx] - LinearToGamma(ly);
+        tmp_lin_r[idx] = lr; tmp_lin_g[idx] = lg; tmp_lin_b[idx] = lb;
       }
     }
     
-    // Apply Cb/Cr Error
+    for (size_t i = 0; i < high_size; ++i) diff_y_sum += std::abs(err_y[i]);
+
+    // Box downscale decoupled backprojection Cb/Cr evaluation
     for (size_t y = 0; y < down_height; ++y) {
-      size_t x = 0;
-      float* cb_ptr = &best_cb[y * down_width];
-      float* cr_ptr = &best_cr[y * down_width];
-      const float* ecb_ptr = &err_cb[y * down_width];
-      const float* ecr_ptr = &err_cr[y * down_width];
-
-      for (; x + N <= down_width; x += N) {
-         auto cb_vec = LoadU(d, cb_ptr + x);
-         auto cr_vec = LoadU(d, cr_ptr + x);
-         auto e_cb_vec = LoadU(d, ecb_ptr + x);
-         auto e_cr_vec = LoadU(d, ecr_ptr + x);
-         StoreU(Add(cb_vec, e_cb_vec), d, cb_ptr + x);
-         StoreU(Add(cr_vec, e_cr_vec), d, cr_ptr + x);
-      }
-      for (; x < down_width; ++x) {
-         cb_ptr[x] += ecb_ptr[x];
-         cr_ptr[x] += ecr_ptr[x];
+      for (size_t x = 0; x < down_width; ++x) {
+        float lr_sum = 0, lg_sum = 0, lb_sum = 0;
+        for (int iy = 0; iy < 2; ++iy) {
+          for (int ix = 0; ix < 2; ++ix) {
+            size_t idx = (y * 2 + iy) * xsize_padded + (x * 2 + ix);
+            lr_sum += tmp_lin_r[idx];
+            lg_sum += tmp_lin_g[idx];
+            lb_sum += tmp_lin_b[idx];
+          }
+        }
+        float avg_r = LinearToGamma(lr_sum * 0.25f);
+        float avg_g = LinearToGamma(lg_sum * 0.25f);
+        float avg_b = LinearToGamma(lb_sum * 0.25f);
+        float avg_y, avg_cb, avg_cr;
+        RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr);
+        size_t di = y * down_width + x;
+        err_cb[di] = target_cb[di] - avg_cb;
+        err_cr[di] = target_cr[di] - avg_cr;
       }
     }
-
-    // test libwebp exit condition
-    if (iter > 0) {
-      if (diff_y_sum < diff_y_threshold) break;
-      if (diff_y_sum > prev_diff_y_sum) break;
+    
+    // Apply Y Error
+    for (size_t i = 0; i + N <= high_size; i += N) {
+       auto y_vec = LoadU(d, &best_y[i]);
+       auto e_y_vec = LoadU(d, &err_y[i]);
+       auto res = Add(y_vec, e_y_vec);
+       res = Min(Max(res, Set(d, 0.0f)), Set(d, 255.0f));
+       StoreU(res, d, &best_y[i]);
+    }
+    for (size_t i = high_size - (high_size % N); i < high_size; ++i) {
+       best_y[i] = std::max(0.0f, std::min(255.0f, best_y[i] + err_y[i]));
+    }
+    
+    // Apply Cb/Cr Error
+    for (size_t i = 0; i + N <= low_size; i += N) {
+       StoreU(Add(LoadU(d, &best_cb[i]), LoadU(d, &err_cb[i])), d, &best_cb[i]);
+       StoreU(Add(LoadU(d, &best_cr[i]), LoadU(d, &err_cr[i])), d, &best_cr[i]);
+    }
+    for (size_t i = low_size - (low_size % N); i < low_size; ++i) {
+       best_cb[i] += err_cb[i];
+       best_cr[i] += err_cr[i];
     }
+
+    if (iter > 0 && (diff_y_sum < diff_y_threshold || diff_y_sum > prev_diff_y_sum)) break;
     prev_diff_y_sum = diff_y_sum;
   }
 
   // 3. Write output to buffers
-  const size_t N = hwy::HWY_NAMESPACE::Lanes(d);
-  
-  // Write Luma
   for (size_t y = 0; y < iMCU_height; ++y) {
     float* row_out_y = m->raw_data[0]->Row(y0 + y);
     float* src_y = &best_y[y * xsize_padded];
     size_t x = 0;
-    for (; x + N <= xsize_padded; x += N) {
-       StoreU(LoadU(d, src_y + x), d, row_out_y + x);
-    }
-    for (; x < xsize_padded; ++x) {
-       row_out_y[x] = src_y[x];
-    }
+    for (; x + N <= xsize_padded; x += N) StoreU(LoadU(d, src_y + x), d, row_out_y + x);
+    for (; x < xsize_padded; ++x) row_out_y[x] = src_y[x];
   }
   
-  // Write Chroma
   const size_t y_out0 = y0 / 2;
   for (size_t y_out = 0; y_out < down_height; ++y_out) {
     float* row_out_cb = m->raw_data[1]->Row(y_out0 + y_out);
     float* row_out_cr = m->raw_data[2]->Row(y_out0 + y_out);
     float* src_cb = &best_cb[y_out * down_width];
     float* src_cr = &best_cr[y_out * down_width];
-
     size_t x = 0;
     for (; x + N <= down_width; x += N) {
        StoreU(LoadU(d, src_cb + x), d, row_out_cb + x);

From a4514f110a4931d5c935052ad5528a38e073bfa5 Mon Sep 17 00:00:00 2001
From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com>
Date: Sat, 21 Mar 2026 01:40:09 -0700
Subject: [PATCH 03/10] speed up clean up

---
 lib/jpegli/sharp_yuv.cc | 643 ++++++++++++++++++++++++++++++----------
 1 file changed, 489 insertions(+), 154 deletions(-)

diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc
index 135d82b6..bc84316c 100644
--- a/lib/jpegli/sharp_yuv.cc
+++ b/lib/jpegli/sharp_yuv.cc
@@ -9,6 +9,8 @@
 #include <algorithm>
 #include <cmath>
 #include <cstddef>
+#include <cstdint>
+#include <cstring>
 #include <vector>
 
 #include "lib/jpegli/common.h"
@@ -19,28 +21,43 @@
 #define HWY_TARGET_INCLUDE "lib/jpegli/sharp_yuv.cc"
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
-#include <hwy/contrib/math/math-inl.h>
 
 HWY_BEFORE_NAMESPACE();
 namespace jpegli {
 namespace HWY_NAMESPACE {
 
+using hwy::HWY_NAMESPACE::Abs;
 using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::BitCast;
+using hwy::HWY_NAMESPACE::ConvertTo;
 using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Floor;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::Le;
+using hwy::HWY_NAMESPACE::LoadDup128;
 using hwy::HWY_NAMESPACE::LoadU;
 using hwy::HWY_NAMESPACE::Max;
 using hwy::HWY_NAMESPACE::Min;
 using hwy::HWY_NAMESPACE::Mul;
 using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ReduceSum;
 using hwy::HWY_NAMESPACE::Set;
+using hwy::HWY_NAMESPACE::ShiftLeft;
+using hwy::HWY_NAMESPACE::ShiftRight;
 using hwy::HWY_NAMESPACE::StoreU;
 using hwy::HWY_NAMESPACE::Sub;
-using hwy::HWY_NAMESPACE::IfThenElse;
-using hwy::HWY_NAMESPACE::Le;
+using hwy::HWY_NAMESPACE::Zero;
 
 using D = HWY_CAPPED(float, 8);
 constexpr D d;
 
+// ---------------------------------------------------------------------------
+// Scalar helpers (used for tail elements only)
+// ---------------------------------------------------------------------------
+
 inline float GammaToLinear(float v) {
   float n = std::max(0.0f, std::min(255.0f, v)) * (1.0f / 255.0f);
   if (n <= 0.04045f) {
@@ -54,7 +71,8 @@ inline float LinearToGamma(float v) {
   if (v <= 0.0031308f) {
     return v * 12.92f * 255.0f;
   } else {
-    return (1.055f * std::pow(std::max(0.0f, v), 1.0f / 2.4f) - 0.055f) * 255.0f;
+    return (1.055f * std::pow(std::max(0.0f, v), 1.0f / 2.4f) - 0.055f) *
+           255.0f;
   }
 }
 
@@ -63,7 +81,8 @@ inline void YCbCrToRGB_Scalar(float y, float cb, float cr, float* r, float* g,
   cb -= 128.0f;
   cr -= 128.0f;
   *r = y + cr * 1.402f;
-  *g = y + cb * (-0.114f * 1.772f / 0.587f) + cr * (-0.299f * 1.402f / 0.587f);
+  *g = y + cb * (-0.114f * 1.772f / 0.587f) +
+       cr * (-0.299f * 1.402f / 0.587f);
   *b = y + cb * 1.772f;
 }
 
@@ -88,17 +107,87 @@ inline void RGBToYCbCr_Scalar(float r, float g, float b, float* y, float* cb,
   *cr = (r * kDiffR - y_base) * kNormR + 128.0f;
 }
 
+// ---------------------------------------------------------------------------
+// Fast log2/exp2 approximations (same approach as adaptive_quantization.cc)
+// ---------------------------------------------------------------------------
+
+// Evaluates a rational polynomial p(x)/q(x) via Horner's scheme.
+template <size_t NP, size_t NQ, class DF, class V, typename T>
+HWY_INLINE V EvalRationalPoly(const DF df, const V x, const T (&p)[NP],
+                              const T (&q)[NQ]) {
+  constexpr size_t kDegP = NP / 4 - 1;
+  constexpr size_t kDegQ = NQ / 4 - 1;
+  auto yp = LoadDup128(df, &p[kDegP * 4]);
+  auto yq = LoadDup128(df, &q[kDegQ * 4]);
+  HWY_FENCE;
+  if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(df, p + ((kDegP - 1) * 4)));
+  if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(df, q + ((kDegQ - 1) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(df, p + ((kDegP - 2) * 4)));
+  if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(df, q + ((kDegQ - 2) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(df, p + ((kDegP - 3) * 4)));
+  if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(df, q + ((kDegQ - 3) * 4)));
+  return Div(yp, yq);
+}
+
+// Fast base-2 logarithm. L1 error ~3.9E-6.
+template <class DF, class V>
+HWY_INLINE V FastLog2f(const DF df, V x) {
+  HWY_ALIGN const float p[4 * 3] = {HWY_REP4(-1.8503833400518310E-06f),
+                                     HWY_REP4(1.4287160470083755E+00f),
+                                     HWY_REP4(7.4245873327820566E-01f)};
+  HWY_ALIGN const float q[4 * 3] = {HWY_REP4(9.9032814277590719E-01f),
+                                     HWY_REP4(1.0096718572241148E+00f),
+                                     HWY_REP4(1.7409343003366853E-01f)};
+  const Rebind<int32_t, DF> di;
+  const auto x_bits = BitCast(di, x);
+  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));
+  const auto exp_shifted = ShiftRight<23>(exp_bits);
+  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
+  const auto exp_val = ConvertTo(df, exp_shifted);
+  return Add(EvalRationalPoly(df, Sub(mantissa, Set(df, 1.0f)), p, q), exp_val);
+}
+
+// Fast 2^x. Max relative error ~3e-7.
+template <class DF, class V>
+HWY_INLINE V FastPow2f(const DF df, V x) {
+  const Rebind<int32_t, DF> di;
+  auto floorx = Floor(x);
+  auto exp =
+      BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127))));
+  auto frac = Sub(x, floorx);
+  auto num = Add(frac, Set(df, 1.01749063e+01f));
+  num = MulAdd(num, frac, Set(df, 4.88687798e+01f));
+  num = MulAdd(num, frac, Set(df, 9.85506591e+01f));
+  num = Mul(num, exp);
+  auto den = MulAdd(frac, Set(df, 2.10242958e-01f), Set(df, -2.22328856e-02f));
+  den = MulAdd(den, frac, Set(df, -1.94414990e+01f));
+  den = MulAdd(den, frac, Set(df, 9.85506633e+01f));
+  return Div(num, den);
+}
+
+// Fast pow(x, p) = 2^(p * log2(x)) for x > 0.
+template <class DF, class V>
+HWY_INLINE V FastPowf(const DF df, V x, float power) {
+  return FastPow2f(df, Mul(FastLog2f(df, x), Set(df, power)));
+}
+
+// ---------------------------------------------------------------------------
+// Highway SIMD gamma helpers using fast polynomial approximations
+// ---------------------------------------------------------------------------
+
 template <class D, class V>
 HWY_INLINE V GammaToLinear_SIMD(D d, V v) {
   auto v_zero = Set(d, 0.0f);
   auto v_max = Set(d, 255.0f);
   auto n = Mul(Min(Max(v, v_zero), v_max), Set(d, 1.0f / 255.0f));
   auto is_low = Le(n, Set(d, 0.04045f));
-  auto low_val = Div(n, Set(d, 12.92f));
-  auto base = Div(Add(n, Set(d, 0.055f)), Set(d, 1.055f));
+  auto low_val = Mul(n, Set(d, 1.0f / 12.92f));
+  // High path: ((n + 0.055) / 1.055) ^ 2.4
+  auto base = Mul(Add(n, Set(d, 0.055f)), Set(d, 1.0f / 1.055f));
   base = Max(base, Set(d, 1e-6f));
-  auto log_base = hwy::HWY_NAMESPACE::Log2(d, base);
-  auto high_val = hwy::HWY_NAMESPACE::Exp2(d, Mul(log_base, Set(d, 2.4f)));
+  auto high_val = FastPowf(d, base, 2.4f);
   return IfThenElse(is_low, low_val, high_val);
 }
 
@@ -108,9 +197,9 @@ HWY_INLINE V LinearToGamma_SIMD(D d, V v) {
   auto is_low = Le(v, Set(d, 0.0031308f));
   auto low_val = Mul(v, Set(d, 12.92f * 255.0f));
   auto v_safe = Max(v, Set(d, 1e-6f));
-  auto log_v = hwy::HWY_NAMESPACE::Log2(d, v_safe);
-  auto p = hwy::HWY_NAMESPACE::Exp2(d, Mul(log_v, Set(d, 1.0f / 2.4f)));
-  auto high_val = Mul(Sub(Mul(p, Set(d, 1.055f)), Set(d, 0.055f)), Set(d, 255.0f));
+  auto p = FastPowf(d, v_safe, 1.0f / 2.4f);
+  auto high_val =
+      Mul(Sub(Mul(p, Set(d, 1.055f)), Set(d, 0.055f)), Set(d, 255.0f));
   return IfThenElse(is_low, low_val, high_val);
 }
 
@@ -119,16 +208,240 @@ HWY_INLINE void YCbCrToRGB_SIMD(D d, V y, V cb, V cr, V* r, V* g, V* b) {
   auto v128 = Set(d, 128.0f);
   cb = Sub(cb, v128);
   cr = Sub(cr, v128);
-  *r = Add(y, Mul(cr, Set(d, 1.402f)));
-  *g = Add(y, Add(Mul(cb, Set(d, -0.114f * 1.772f / 0.587f)), Mul(cr, Set(d, -0.299f * 1.402f / 0.587f))));
-  *b = Add(y, Mul(cb, Set(d, 1.772f)));
+  *r = MulAdd(cr, Set(d, 1.402f), y);
+  *g = MulAdd(cb, Set(d, -0.114f * 1.772f / 0.587f),
+              MulAdd(cr, Set(d, -0.299f * 1.402f / 0.587f), y));
+  *b = MulAdd(cb, Set(d, 1.772f), y);
 }
 
 template <class D, class V>
 HWY_INLINE V TempLuma_SIMD(D d, V r, V g, V b) {
-  return Add(Mul(Set(d, 0.299f), r), Add(Mul(Set(d, 0.587f), g), Mul(Set(d, 0.114f), b)));
+  return MulAdd(Set(d, 0.299f), r,
+                MulAdd(Set(d, 0.587f), g, Mul(Set(d, 0.114f), b)));
+}
+
+// Vectorized RGBToYCbCr: computes Y, Cb, Cr from gamma-space R, G, B vectors.
+template <class D, class V>
+HWY_INLINE void RGBToYCbCr_SIMD(D d, V r, V g, V b, V* y, V* cb, V* cr) {
+  const auto kR = Set(d, 0.299f);
+  const auto kG = Set(d, 0.587f);
+  const auto kB = Set(d, 0.114f);
+  const auto kDiffR = Set(d, 0.701f + 0.299f);
+  const auto kDiffB = Set(d, 0.886f + 0.114f);
+  const auto kNormR = Set(d, 1.0f / (0.701f + 0.587f + 0.114f));
+  const auto kNormB = Set(d, 1.0f / (0.299f + 0.587f + 0.886f));
+  const auto v128 = Set(d, 128.0f);
+
+  auto y_base = MulAdd(kR, r, MulAdd(kG, g, Mul(kB, b)));
+  *y = y_base;
+  // cb = (b * kDiffB - y_base) * kNormB + 128
+  *cb = MulAdd(Sub(Mul(b, kDiffB), y_base), kNormB, v128);
+  // cr = (r * kDiffR - y_base) * kNormR + 128
+  *cr = MulAdd(Sub(Mul(r, kDiffR), y_base), kNormR, v128);
 }
 
+// ---------------------------------------------------------------------------
+// SIMD box-average downsample of linear RGB -> YCbCr chroma
+// Processes each row of down_width pixels. For each output pixel, averages
+// a 2x2 block of linear RGB, converts back to gamma, then to YCbCr.
+// ---------------------------------------------------------------------------
+HWY_INLINE void BoxAverageDownsampleRow(const float* lin_r0,
+                                        const float* lin_g0,
+                                        const float* lin_b0,
+                                        const float* lin_r1,
+                                        const float* lin_g1,
+                                        const float* lin_b1,
+                                        size_t down_width, size_t xsize_padded,
+                                        float* out_cb, float* out_cr) {
+  const size_t N = hwy::HWY_NAMESPACE::Lanes(d);
+  const auto quarter = Set(d, 0.25f);
+  size_t x = 0;
+
+  // SIMD path: process N output pixels at a time.
+  // Each output pixel needs 2 input pixels horizontally, so we need 2*N inputs.
+  for (; x + N <= down_width; x += N) {
+    // Gather even and odd columns from two rows.
+    // Row 0: indices 2*x, 2*x+1, 2*x+2, 2*x+3, ...
+    // Row 1: same pattern.
+    // We accumulate the 4 samples for each output pixel.
+    auto sum_r = Zero(d);
+    auto sum_g = Zero(d);
+    auto sum_b = Zero(d);
+
+    for (int iy = 0; iy < 2; ++iy) {
+      const float* src_r = (iy == 0) ? lin_r0 : lin_r1;
+      const float* src_g = (iy == 0) ? lin_g0 : lin_g1;
+      const float* src_b = (iy == 0) ? lin_b0 : lin_b1;
+      for (int ix = 0; ix < 2; ++ix) {
+        // Load non-contiguous: elements at positions 2*x+ix, 2*(x+1)+ix, ...
+        // Unfortunately these are strided, so we load pairs and deinterleave
+        // is cleaner. For simplicity and correctness, use scalar gather here
+        // since the inner 2x2 loop is just 4 iterations.
+        // Actually, let's use a temporary buffer approach for the gather.
+        HWY_ALIGN float tmp_r[HWY_MAX_LANES_D(D)];
+        HWY_ALIGN float tmp_g[HWY_MAX_LANES_D(D)];
+        HWY_ALIGN float tmp_b[HWY_MAX_LANES_D(D)];
+        for (size_t k = 0; k < N; ++k) {
+          size_t si = (x + k) * 2 + ix;
+          tmp_r[k] = src_r[si];
+          tmp_g[k] = src_g[si];
+          tmp_b[k] = src_b[si];
+        }
+        sum_r = Add(sum_r, LoadU(d, tmp_r));
+        sum_g = Add(sum_g, LoadU(d, tmp_g));
+        sum_b = Add(sum_b, LoadU(d, tmp_b));
+      }
+    }
+
+    auto avg_r = LinearToGamma_SIMD(d, Mul(sum_r, quarter));
+    auto avg_g = LinearToGamma_SIMD(d, Mul(sum_g, quarter));
+    auto avg_b = LinearToGamma_SIMD(d, Mul(sum_b, quarter));
+
+    decltype(avg_r) vy, vcb, vcr;
+    RGBToYCbCr_SIMD(d, avg_r, avg_g, avg_b, &vy, &vcb, &vcr);
+
+    StoreU(vcb, d, out_cb + x);
+    StoreU(vcr, d, out_cr + x);
+  }
+
+  // Scalar tail
+  for (; x < down_width; ++x) {
+    float lr_sum = 0, lg_sum = 0, lb_sum = 0;
+    for (int iy = 0; iy < 2; ++iy) {
+      const float* src_r = (iy == 0) ? lin_r0 : lin_r1;
+      const float* src_g = (iy == 0) ? lin_g0 : lin_g1;
+      const float* src_b = (iy == 0) ? lin_b0 : lin_b1;
+      for (int ix = 0; ix < 2; ++ix) {
+        size_t si = x * 2 + ix;
+        lr_sum += src_r[si];
+        lg_sum += src_g[si];
+        lb_sum += src_b[si];
+      }
+    }
+    float avg_r = LinearToGamma(lr_sum * 0.25f);
+    float avg_g = LinearToGamma(lg_sum * 0.25f);
+    float avg_b = LinearToGamma(lb_sum * 0.25f);
+    float avg_y, avg_cb, avg_cr;
+    RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr);
+    out_cb[x] = avg_cb;
+    out_cr[x] = avg_cr;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// SIMD tent-filter upsample of one chroma row
+// For each x in [0, down_width), computes 4 upsampled values using the
+// 9/3/3/1 tent filter, and writes them to the 4 corresponding full-res
+// positions.
+// ---------------------------------------------------------------------------
+HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row,
+                                const float* next_row, size_t down_width,
+                                size_t xsize_padded, size_t y,
+                                float* out_full) {
+  const size_t N = hwy::HWY_NAMESPACE::Lanes(d);
+  const auto k9 = Set(d, 9.0f);
+  const auto k3 = Set(d, 3.0f);
+  const auto kInv16 = Set(d, 1.0f / 16.0f);
+  size_t x = 0;
+
+  for (; x + N <= down_width; x += N) {
+    auto vc = LoadU(d, cur_row + x);
+
+    // Load left neighbors (clamp at boundary)
+    HWY_ALIGN float tmp_l[HWY_MAX_LANES_D(D)];
+    HWY_ALIGN float tmp_tl[HWY_MAX_LANES_D(D)];
+    HWY_ALIGN float tmp_bl[HWY_MAX_LANES_D(D)];
+    for (size_t k = 0; k < N; ++k) {
+      size_t xi = x + k;
+      size_t xl = (xi == 0) ? 0 : xi - 1;
+      tmp_l[k] = cur_row[xl];
+      tmp_tl[k] = prev_row[xl];
+      tmp_bl[k] = next_row[xl];
+    }
+    auto vl = LoadU(d, tmp_l);
+    auto vtl = LoadU(d, tmp_tl);
+    auto vbl = LoadU(d, tmp_bl);
+
+    // Load right neighbors (clamp at boundary)
+    HWY_ALIGN float tmp_r[HWY_MAX_LANES_D(D)];
+    HWY_ALIGN float tmp_tr[HWY_MAX_LANES_D(D)];
+    HWY_ALIGN float tmp_br[HWY_MAX_LANES_D(D)];
+    for (size_t k = 0; k < N; ++k) {
+      size_t xi = x + k;
+      size_t xr = (xi + 1 == down_width) ? xi : xi + 1;
+      tmp_r[k] = cur_row[xr];
+      tmp_tr[k] = prev_row[xr];
+      tmp_br[k] = next_row[xr];
+    }
+    auto vr = LoadU(d, tmp_r);
+    auto vtr = LoadU(d, tmp_tr);
+    auto vbr = LoadU(d, tmp_br);
+
+    auto vt = LoadU(d, prev_row + x);
+    auto vb = LoadU(d, next_row + x);
+
+    // c9 = c * 9
+    auto c9 = Mul(vc, k9);
+
+    // Top-left quadrant: (c*9 + t*3 + l*3 + tl) / 16
+    auto val_tl = Mul(MulAdd(k3, Add(vt, vl), Add(c9, vtl)), kInv16);
+    // Top-right quadrant: (c*9 + t*3 + r*3 + tr) / 16
+    auto val_tr = Mul(MulAdd(k3, Add(vt, vr), Add(c9, vtr)), kInv16);
+    // Bottom-left quadrant: (c*9 + b*3 + l*3 + bl) / 16
+    auto val_bl = Mul(MulAdd(k3, Add(vb, vl), Add(c9, vbl)), kInv16);
+    // Bottom-right quadrant: (c*9 + b*3 + r*3 + br) / 16
+    auto val_br = Mul(MulAdd(k3, Add(vb, vr), Add(c9, vbr)), kInv16);
+
+    // Scatter to full-res positions
+    HWY_ALIGN float res_tl[HWY_MAX_LANES_D(D)];
+    HWY_ALIGN float res_tr[HWY_MAX_LANES_D(D)];
+    HWY_ALIGN float res_bl[HWY_MAX_LANES_D(D)];
+    HWY_ALIGN float res_br[HWY_MAX_LANES_D(D)];
+    StoreU(val_tl, d, res_tl);
+    StoreU(val_tr, d, res_tr);
+    StoreU(val_bl, d, res_bl);
+    StoreU(val_br, d, res_br);
+    for (size_t k = 0; k < N; ++k) {
+      size_t xi = x + k;
+      out_full[(y * 2 + 0) * xsize_padded + (xi * 2 + 0)] = res_tl[k];
+      out_full[(y * 2 + 0) * xsize_padded + (xi * 2 + 1)] = res_tr[k];
+      out_full[(y * 2 + 1) * xsize_padded + (xi * 2 + 0)] = res_bl[k];
+      out_full[(y * 2 + 1) * xsize_padded + (xi * 2 + 1)] = res_br[k];
+    }
+  }
+
+  // Scalar tail
+  for (; x < down_width; ++x) {
+    size_t x_prev = (x == 0) ? 0 : x - 1;
+    size_t x_next = (x + 1 == down_width) ? x : x + 1;
+
+    float cc = cur_row[x];
+    float ct = prev_row[x];
+    float cb = next_row[x];
+    float cl = cur_row[x_prev];
+    float cr = cur_row[x_next];
+    float ctl = prev_row[x_prev];
+    float ctr = prev_row[x_next];
+    float cbl = next_row[x_prev];
+    float cbr = next_row[x_next];
+
+    float c9 = cc * 9.0f;
+    float inv16 = 1.0f / 16.0f;
+    out_full[(y * 2 + 0) * xsize_padded + (x * 2 + 0)] =
+        (c9 + ct * 3.0f + cl * 3.0f + ctl) * inv16;
+    out_full[(y * 2 + 0) * xsize_padded + (x * 2 + 1)] =
+        (c9 + ct * 3.0f + cr * 3.0f + ctr) * inv16;
+    out_full[(y * 2 + 1) * xsize_padded + (x * 2 + 0)] =
+        (c9 + cb * 3.0f + cl * 3.0f + cbl) * inv16;
+    out_full[(y * 2 + 1) * xsize_padded + (x * 2 + 1)] =
+        (c9 + cb * 3.0f + cr * 3.0f + cbr) * inv16;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Main SharpYUV implementation
+// ---------------------------------------------------------------------------
+
 void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
   jpeg_comp_master* m = cinfo->master;
   const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
@@ -137,9 +450,11 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
   const size_t xsize_padded = m->xsize_blocks * DCTSIZE;
 
   if (cinfo->num_components < 3) return;
-  const int h_factor = cinfo->max_h_samp_factor / cinfo->comp_info[1].h_samp_factor;
-  const int v_factor = cinfo->max_v_samp_factor / cinfo->comp_info[1].v_samp_factor;
-  if(h_factor != 2 || v_factor != 2) {
+  const int h_factor =
+      cinfo->max_h_samp_factor / cinfo->comp_info[1].h_samp_factor;
+  const int v_factor =
+      cinfo->max_v_samp_factor / cinfo->comp_info[1].v_samp_factor;
+  if (h_factor != 2 || v_factor != 2) {
     for (int c = 1; c < cinfo->num_components; ++c) {
       jpeg_component_info* comp = &cinfo->comp_info[c];
       const int hf = cinfo->max_h_samp_factor / comp->h_samp_factor;
@@ -149,7 +464,8 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
       float* rows_in[MAX_SAMP_FACTOR];
       auto& input = *m->smooth_input[c];
       auto& output = *m->raw_data[c];
-      for (size_t y_in = y0, y_out = y_out0; y_in < y1; y_in += vf, ++y_out) {
+      for (size_t y_in = y0, y_out = y_out0; y_in < y1;
+           y_in += vf, ++y_out) {
         for (int iy = 0; iy < vf; ++iy) {
           rows_in[iy] = input.Row(y_in + iy);
         }
@@ -163,7 +479,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
   const size_t down_height = iMCU_height / 2;
   const size_t high_size = xsize_padded * iMCU_height;
   const size_t low_size = down_width * down_height;
-  
+
   std::vector<float> target_y(high_size);
   std::vector<float> best_y(high_size);
   std::vector<float> tmp_lin_r(high_size);
@@ -180,7 +496,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
 
   const size_t N = hwy::HWY_NAMESPACE::Lanes(d);
 
-  // 1. Initial High-Res state -> SIMD Loop
+  // 1. Decode input YCbCr -> linear RGB, compute per-pixel target luma (SIMD)
   for (size_t y = 0; y < iMCU_height; ++y) {
     const float* row_y = m->smooth_input[0]->Row(y0 + y);
     const float* row_cb = m->smooth_input[1]->Row(y0 + y);
@@ -190,17 +506,17 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
       auto py = LoadU(d, row_y + x);
       auto pcb = LoadU(d, row_cb + x);
       auto pcr = LoadU(d, row_cr + x);
-      
-      auto r = Set(d,0.0f), g = Set(d,0.0f), b = Set(d,0.0f);
+
+      decltype(py) r, g, b;
       YCbCrToRGB_SIMD(d, py, pcb, pcr, &r, &g, &b);
-      
+
       auto lin_r = GammaToLinear_SIMD(d, r);
       auto lin_g = GammaToLinear_SIMD(d, g);
       auto lin_b = GammaToLinear_SIMD(d, b);
-      
+
       auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b);
       auto t_y = LinearToGamma_SIMD(d, lin_y);
-      
+
       size_t idx = y * xsize_padded + x;
       StoreU(t_y, d, &target_y[idx]);
       StoreU(py, d, &best_y[idx]);
@@ -209,88 +525,78 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
       StoreU(lin_b, d, &tmp_lin_b[idx]);
     }
     for (; x < xsize_padded; ++x) {
-      float py = row_y[x], pcb = row_cb[x], pcr = row_cr[x];
-      float r, g, b; YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b);
-      float lr = GammaToLinear(r), lg = GammaToLinear(g), lb = GammaToLinear(b);
-      float ly = 0.299 * lr + 0.587 * lg + 0.114 * lb;
+      float py = row_y[x];
+      float pcb = row_cb[x];
+      float pcr = row_cr[x];
+      float r, g, b;
+      YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b);
+      float lr = GammaToLinear(r);
+      float lg = GammaToLinear(g);
+      float lb = GammaToLinear(b);
+      float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb;
       size_t idx = y * xsize_padded + x;
       target_y[idx] = LinearToGamma(ly);
       best_y[idx] = py;
-      tmp_lin_r[idx] = lr; tmp_lin_g[idx] = lg; tmp_lin_b[idx] = lb;
+      tmp_lin_r[idx] = lr;
+      tmp_lin_g[idx] = lg;
+      tmp_lin_b[idx] = lb;
     }
   }
 
-  // 1b. Box downscale to target_cb, target_cr -> Scalar loop 
-  // (Downscaled loops represent 1/4 the iterations, scalar overhead is negligible)
+  // 2. Box-average downsample of linear RGB -> target Cb/Cr (SIMD)
   for (size_t y = 0; y < down_height; ++y) {
-    for (size_t x = 0; x < down_width; ++x) {
-      float lr_sum = 0, lg_sum = 0, lb_sum = 0;
-      for (int iy = 0; iy < 2; ++iy) {
-        for (int ix = 0; ix < 2; ++ix) {
-          size_t idx = (y * 2 + iy) * xsize_padded + (x * 2 + ix);
-          lr_sum += tmp_lin_r[idx];
-          lg_sum += tmp_lin_g[idx];
-          lb_sum += tmp_lin_b[idx];
-        }
-      }
-      float avg_r = LinearToGamma(lr_sum * 0.25f);
-      float avg_g = LinearToGamma(lg_sum * 0.25f);
-      float avg_b = LinearToGamma(lb_sum * 0.25f);
-      float avg_y, avg_cb, avg_cr;
-      RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr);
-      size_t di = y * down_width + x;
-      target_cb[di] = avg_cb; target_cr[di] = avg_cr;
-      best_cb[di] = avg_cb;   best_cr[di] = avg_cr;
-    }
+    const float* r0 = &tmp_lin_r[(y * 2 + 0) * xsize_padded];
+    const float* g0 = &tmp_lin_g[(y * 2 + 0) * xsize_padded];
+    const float* b0 = &tmp_lin_b[(y * 2 + 0) * xsize_padded];
+    const float* r1 = &tmp_lin_r[(y * 2 + 1) * xsize_padded];
+    const float* g1 = &tmp_lin_g[(y * 2 + 1) * xsize_padded];
+    const float* b1 = &tmp_lin_b[(y * 2 + 1) * xsize_padded];
+    float* dcb = &target_cb[y * down_width];
+    float* dcr = &target_cr[y * down_width];
+
+    BoxAverageDownsampleRow(r0, g0, b0, r1, g1, b1, down_width, xsize_padded,
+                            dcb, dcr);
+
+    // Also initialize best_cb/cr = target_cb/cr
+    memcpy(&best_cb[y * down_width], dcb, down_width * sizeof(float));
+    memcpy(&best_cr[y * down_width], dcr, down_width * sizeof(float));
   }
 
   float prev_diff_y_sum = 1e30f;
   const float diff_y_threshold = 3.0f * high_size;
 
+  // Pre-allocate error buffers outside the iteration loop to avoid repeated
+  // heap allocation/deallocation.
+  std::vector<float> err_y(high_size);
+  std::vector<float> err_cb(low_size);
+  std::vector<float> err_cr(low_size);
+
   for (int iter = 0; iter < 4; ++iter) {
-    std::vector<float> err_y(high_size);
-    std::vector<float> err_cb(low_size);
-    std::vector<float> err_cr(low_size);
+    memset(err_y.data(), 0, high_size * sizeof(float));
+    memset(err_cb.data(), 0, low_size * sizeof(float));
+    memset(err_cr.data(), 0, low_size * sizeof(float));
     float diff_y_sum = 0.0f;
 
-    // Bilerp Up to Full Res (Scalar optimized loop)
+    // 3a. Upsample best_cb/cr to full-res using a 2D tent (9/3/3/1) filter
+    //     (SIMD inner loop with scalar boundary handling)
     for (size_t y = 0; y < down_height; ++y) {
       size_t y_prev = (y == 0) ? 0 : y - 1;
       size_t y_next = (y + 1 == down_height) ? y : y + 1;
-      for (size_t x = 0; x < down_width; ++x) {
-        size_t x_prev = (x == 0) ? 0 : x - 1;
-        size_t x_next = (x + 1 == down_width) ? x : x + 1;
-        
-        auto get_avg = [](float c, float t, float b, float l, float r, 
-                          float tl, float tr, float bl, float br, float res[4]) {
-          res[0] = (c * 9.0f + t * 3.0f + l * 3.0f + tl) / 16.0f;
-          res[1] = (c * 9.0f + t * 3.0f + r * 3.0f + tr) / 16.0f;
-          res[2] = (c * 9.0f + b * 3.0f + l * 3.0f + bl) / 16.0f;
-          res[3] = (c * 9.0f + b * 3.0f + r * 3.0f + br) / 16.0f;
-        };
-        
-        float u_cb[4], u_cr[4];
-        float ccb = best_cb[y*down_width+x], ccr = best_cr[y*down_width+x];
-        get_avg(ccb, best_cb[y_prev*down_width+x], best_cb[y_next*down_width+x],
-                best_cb[y*down_width+x_prev], best_cb[y*down_width+x_next],
-                best_cb[y_prev*down_width+x_prev], best_cb[y_prev*down_width+x_next],
-                best_cb[y_next*down_width+x_prev], best_cb[y_next*down_width+x_next], u_cb);
-        get_avg(ccr, best_cr[y_prev*down_width+x], best_cr[y_next*down_width+x],
-                best_cr[y*down_width+x_prev], best_cr[y*down_width+x_next],
-                best_cr[y_prev*down_width+x_prev], best_cr[y_prev*down_width+x_next],
-                best_cr[y_next*down_width+x_prev], best_cr[y_next*down_width+x_next], u_cr);
-
-        for(int iy=0; iy<2; ++iy) {
-          for(int ix=0; ix<2; ++ix) {
-            size_t idx = (y * 2 + iy) * xsize_padded + (x * 2 + ix);
-            up_cb[idx] = u_cb[iy*2+ix];
-            up_cr[idx] = u_cr[iy*2+ix];
-          }
-        }
-      }
+
+      const float* cb_prev = &best_cb[y_prev * down_width];
+      const float* cb_cur = &best_cb[y * down_width];
+      const float* cb_next = &best_cb[y_next * down_width];
+      TentUpsampleRow(cb_prev, cb_cur, cb_next, down_width, xsize_padded, y,
+                      up_cb.data());
+
+      const float* cr_prev = &best_cr[y_prev * down_width];
+      const float* cr_cur = &best_cr[y * down_width];
+      const float* cr_next = &best_cr[y_next * down_width];
+      TentUpsampleRow(cr_prev, cr_cur, cr_next, down_width, xsize_padded, y,
+                      up_cr.data());
     }
 
-    // SIMD Evaluate Error 
+    // 3b. Evaluate YCbCr reconstruction error at full resolution (SIMD)
     for (size_t y = 0; y < iMCU_height; ++y) {
       size_t x = 0;
       for (; x + N <= xsize_padded; x += N) {
@@ -299,16 +605,16 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
         auto ucb = LoadU(d, &up_cb[idx]);
         auto ucr = LoadU(d, &up_cr[idx]);
 
-        auto r = Set(d,0.0f), g = Set(d,0.0f), b = Set(d,0.0f);
+        decltype(py) r, g, b;
         YCbCrToRGB_SIMD(d, py, ucb, ucr, &r, &g, &b);
-        
+
         auto lin_r = GammaToLinear_SIMD(d, r);
         auto lin_g = GammaToLinear_SIMD(d, g);
         auto lin_b = GammaToLinear_SIMD(d, b);
 
         auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b);
         auto dec_y = LinearToGamma_SIMD(d, lin_y);
-        
+
         auto e_y = Sub(LoadU(d, &target_y[idx]), dec_y);
         StoreU(e_y, d, &err_y[idx]);
         StoreU(lin_r, d, &tmp_lin_r[idx]);
@@ -317,90 +623,119 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
       }
       for (; x < xsize_padded; ++x) {
         size_t idx = y * xsize_padded + x;
-        float py = best_y[idx], ucb = up_cb[idx], ucr = up_cr[idx];
-        float r, g, b; YCbCrToRGB_Scalar(py, ucb, ucr, &r, &g, &b);
-        float lr = GammaToLinear(r), lg = GammaToLinear(g), lb = GammaToLinear(b);
-        float ly = 0.299 * lr + 0.587 * lg + 0.114 * lb;
+        float py = best_y[idx];
+        float ucb = up_cb[idx];
+        float ucr = up_cr[idx];
+        float r, g, b;
+        YCbCrToRGB_Scalar(py, ucb, ucr, &r, &g, &b);
+        float lr = GammaToLinear(r);
+        float lg = GammaToLinear(g);
+        float lb = GammaToLinear(b);
+        float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb;
         err_y[idx] = target_y[idx] - LinearToGamma(ly);
-        tmp_lin_r[idx] = lr; tmp_lin_g[idx] = lg; tmp_lin_b[idx] = lb;
+        tmp_lin_r[idx] = lr;
+        tmp_lin_g[idx] = lg;
+        tmp_lin_b[idx] = lb;
       }
     }
-    
-    for (size_t i = 0; i < high_size; ++i) diff_y_sum += std::abs(err_y[i]);
 
-    // Box downscale decoupled backprojection Cb/Cr evaluation
+    // Accumulate |err_y|
+    auto acc_y = Zero(d);
+    size_t i_y = 0;
+    for (; i_y + N <= high_size; i_y += N) {
+      acc_y = Add(acc_y, Abs(LoadU(d, &err_y[i_y])));
+    }
+    diff_y_sum = ReduceSum(d, acc_y);
+    for (; i_y < high_size; ++i_y) {
+      diff_y_sum += std::abs(err_y[i_y]);
+    }
+
+    // 3c. Box-average downsample of reconstructed RGB -> compute Cb/Cr error
+    //     (SIMD via BoxAverageDownsampleRow)
     for (size_t y = 0; y < down_height; ++y) {
-      for (size_t x = 0; x < down_width; ++x) {
-        float lr_sum = 0, lg_sum = 0, lb_sum = 0;
-        for (int iy = 0; iy < 2; ++iy) {
-          for (int ix = 0; ix < 2; ++ix) {
-            size_t idx = (y * 2 + iy) * xsize_padded + (x * 2 + ix);
-            lr_sum += tmp_lin_r[idx];
-            lg_sum += tmp_lin_g[idx];
-            lb_sum += tmp_lin_b[idx];
-          }
-        }
-        float avg_r = LinearToGamma(lr_sum * 0.25f);
-        float avg_g = LinearToGamma(lg_sum * 0.25f);
-        float avg_b = LinearToGamma(lb_sum * 0.25f);
-        float avg_y, avg_cb, avg_cr;
-        RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr);
-        size_t di = y * down_width + x;
-        err_cb[di] = target_cb[di] - avg_cb;
-        err_cr[di] = target_cr[di] - avg_cr;
+      const float* r0 = &tmp_lin_r[(y * 2 + 0) * xsize_padded];
+      const float* g0 = &tmp_lin_g[(y * 2 + 0) * xsize_padded];
+      const float* b0 = &tmp_lin_b[(y * 2 + 0) * xsize_padded];
+      const float* r1 = &tmp_lin_r[(y * 2 + 1) * xsize_padded];
+      const float* g1 = &tmp_lin_g[(y * 2 + 1) * xsize_padded];
+      const float* b1 = &tmp_lin_b[(y * 2 + 1) * xsize_padded];
+
+      float* row_err_cb = &err_cb[y * down_width];
+      float* row_err_cr = &err_cr[y * down_width];
+      float* row_tgt_cb = &target_cb[y * down_width];
+      float* row_tgt_cr = &target_cr[y * down_width];
+
+      // BoxAverageDownsampleRow outputs Cb/Cr directly, so we use err
+      // buffers as temp storage for the reconstructed Cb/Cr.
+      BoxAverageDownsampleRow(r0, g0, b0, r1, g1, b1, down_width,
+                              xsize_padded, row_err_cb, row_err_cr);
+
+      // Compute error = target - reconstructed
+      size_t x = 0;
+      for (; x + N <= down_width; x += N) {
+        auto tgt_cb = LoadU(d, row_tgt_cb + x);
+        auto rec_cb = LoadU(d, row_err_cb + x);
+        StoreU(Sub(tgt_cb, rec_cb), d, row_err_cb + x);
+
+        auto tgt_cr = LoadU(d, row_tgt_cr + x);
+        auto rec_cr = LoadU(d, row_err_cr + x);
+        StoreU(Sub(tgt_cr, rec_cr), d, row_err_cr + x);
+      }
+      for (; x < down_width; ++x) {
+        row_err_cb[x] = row_tgt_cb[x] - row_err_cb[x];
+        row_err_cr[x] = row_tgt_cr[x] - row_err_cr[x];
       }
     }
-    
-    // Apply Y Error
-    for (size_t i = 0; i + N <= high_size; i += N) {
-       auto y_vec = LoadU(d, &best_y[i]);
-       auto e_y_vec = LoadU(d, &err_y[i]);
-       auto res = Add(y_vec, e_y_vec);
-       res = Min(Max(res, Set(d, 0.0f)), Set(d, 255.0f));
-       StoreU(res, d, &best_y[i]);
+
+    // 3d. Apply Y error correction with clamping (SIMD + scalar tail)
+    const auto vmin = Set(d, 0.0f);
+    const auto vmax = Set(d, 255.0f);
+    size_t i_y_corr = 0;
+    for (; i_y_corr + N <= high_size; i_y_corr += N) {
+      auto y_vec = LoadU(d, &best_y[i_y_corr]);
+      auto e_y_vec = LoadU(d, &err_y[i_y_corr]);
+      auto res = Min(Max(Add(y_vec, e_y_vec), vmin), vmax);
+      StoreU(res, d, &best_y[i_y_corr]);
     }
-    for (size_t i = high_size - (high_size % N); i < high_size; ++i) {
-       best_y[i] = std::max(0.0f, std::min(255.0f, best_y[i] + err_y[i]));
+    for (; i_y_corr < high_size; ++i_y_corr) {
+      best_y[i_y_corr] = std::max(0.0f, std::min(255.0f, best_y[i_y_corr] + err_y[i_y_corr]));
     }
-    
-    // Apply Cb/Cr Error
-    for (size_t i = 0; i + N <= low_size; i += N) {
-       StoreU(Add(LoadU(d, &best_cb[i]), LoadU(d, &err_cb[i])), d, &best_cb[i]);
-       StoreU(Add(LoadU(d, &best_cr[i]), LoadU(d, &err_cr[i])), d, &best_cr[i]);
+
+    // 3e. Apply Cb/Cr error correction (SIMD + scalar tail)
+    //     Unlike Y, Cb/Cr are NOT clamped during iterations to allow error diffusion.
+    size_t i_c_corr = 0;
+    for (; i_c_corr + N <= low_size; i_c_corr += N) {
+      auto cb_vec = Add(LoadU(d, &best_cb[i_c_corr]), LoadU(d, &err_cb[i_c_corr]));
+      auto cr_vec = Add(LoadU(d, &best_cr[i_c_corr]), LoadU(d, &err_cr[i_c_corr]));
+      StoreU(cb_vec, d, &best_cb[i_c_corr]);
+      StoreU(cr_vec, d, &best_cr[i_c_corr]);
     }
-    for (size_t i = low_size - (low_size % N); i < low_size; ++i) {
-       best_cb[i] += err_cb[i];
-       best_cr[i] += err_cr[i];
+    for (; i_c_corr < low_size; ++i_c_corr) {
+      best_cb[i_c_corr] += err_cb[i_c_corr];
+      best_cr[i_c_corr] += err_cr[i_c_corr];
     }
 
-    if (iter > 0 && (diff_y_sum < diff_y_threshold || diff_y_sum > prev_diff_y_sum)) break;
+    if (iter > 0 &&
+        (diff_y_sum < diff_y_threshold || diff_y_sum > prev_diff_y_sum))
+      break;
     prev_diff_y_sum = diff_y_sum;
   }
 
-  // 3. Write output to buffers
+  // 4. Write best_y and best_cb/cr to raw_data output buffers (memcpy)
   for (size_t y = 0; y < iMCU_height; ++y) {
     float* row_out_y = m->raw_data[0]->Row(y0 + y);
-    float* src_y = &best_y[y * xsize_padded];
-    size_t x = 0;
-    for (; x + N <= xsize_padded; x += N) StoreU(LoadU(d, src_y + x), d, row_out_y + x);
-    for (; x < xsize_padded; ++x) row_out_y[x] = src_y[x];
+    memcpy(row_out_y, &best_y[y * xsize_padded],
+           xsize_padded * sizeof(float));
   }
-  
+
   const size_t y_out0 = y0 / 2;
   for (size_t y_out = 0; y_out < down_height; ++y_out) {
     float* row_out_cb = m->raw_data[1]->Row(y_out0 + y_out);
     float* row_out_cr = m->raw_data[2]->Row(y_out0 + y_out);
-    float* src_cb = &best_cb[y_out * down_width];
-    float* src_cr = &best_cr[y_out * down_width];
-    size_t x = 0;
-    for (; x + N <= down_width; x += N) {
-       StoreU(LoadU(d, src_cb + x), d, row_out_cb + x);
-       StoreU(LoadU(d, src_cr + x), d, row_out_cr + x);
-    }
-    for (; x < down_width; ++x) {
-       row_out_cb[x] = src_cb[x];
-       row_out_cr[x] = src_cr[x];
-    }
+    memcpy(row_out_cb, &best_cb[y_out * down_width],
+           down_width * sizeof(float));
+    memcpy(row_out_cr, &best_cr[y_out * down_width],
+           down_width * sizeof(float));
   }
 }
 

From 34364360b528037f3e98575ddb8794efac9eb01e Mon Sep 17 00:00:00 2001
From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com>
Date: Mon, 23 Mar 2026 08:28:39 -0700
Subject: [PATCH 04/10] remove fluff

---
 lib/jpegli/sharp_yuv.cc | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc
index bc84316c..080c99fb 100644
--- a/lib/jpegli/sharp_yuv.cc
+++ b/lib/jpegli/sharp_yuv.cc
@@ -496,7 +496,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
 
   const size_t N = hwy::HWY_NAMESPACE::Lanes(d);
 
-  // 1. Decode input YCbCr -> linear RGB, compute per-pixel target luma (SIMD)
+  // Decode input YCbCr to linear RGB and compute target luma.
   for (size_t y = 0; y < iMCU_height; ++y) {
     const float* row_y = m->smooth_input[0]->Row(y0 + y);
     const float* row_cb = m->smooth_input[1]->Row(y0 + y);
@@ -543,7 +543,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
     }
   }
 
-  // 2. Box-average downsample of linear RGB -> target Cb/Cr (SIMD)
+  // Box-average downsample of linear RGB to target Cb/Cr.
   for (size_t y = 0; y < down_height; ++y) {
     const float* r0 = &tmp_lin_r[(y * 2 + 0) * xsize_padded];
     const float* g0 = &tmp_lin_g[(y * 2 + 0) * xsize_padded];
@@ -577,8 +577,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
     memset(err_cr.data(), 0, low_size * sizeof(float));
     float diff_y_sum = 0.0f;
 
-    // 3a. Upsample best_cb/cr to full-res using a 2D tent (9/3/3/1) filter
-    //     (SIMD inner loop with scalar boundary handling)
+    // Upsample best_cb/cr to full-res using a 2D tent filter.
     for (size_t y = 0; y < down_height; ++y) {
       size_t y_prev = (y == 0) ? 0 : y - 1;
       size_t y_next = (y + 1 == down_height) ? y : y + 1;
@@ -596,7 +595,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
                       up_cr.data());
     }
 
-    // 3b. Evaluate YCbCr reconstruction error at full resolution (SIMD)
+    // Evaluate YCbCr reconstruction error at full resolution.
     for (size_t y = 0; y < iMCU_height; ++y) {
       size_t x = 0;
       for (; x + N <= xsize_padded; x += N) {
@@ -650,8 +649,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
       diff_y_sum += std::abs(err_y[i_y]);
     }
 
-    // 3c. Box-average downsample of reconstructed RGB -> compute Cb/Cr error
-    //     (SIMD via BoxAverageDownsampleRow)
+    // Box-average downsample reconstructed RGB to compute Cb/Cr error.
     for (size_t y = 0; y < down_height; ++y) {
       const float* r0 = &tmp_lin_r[(y * 2 + 0) * xsize_padded];
       const float* g0 = &tmp_lin_g[(y * 2 + 0) * xsize_padded];
@@ -687,7 +685,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
       }
     }
 
-    // 3d. Apply Y error correction with clamping (SIMD + scalar tail)
+    // Apply Y error correction with clamping.
     const auto vmin = Set(d, 0.0f);
     const auto vmax = Set(d, 255.0f);
     size_t i_y_corr = 0;
@@ -701,8 +699,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
       best_y[i_y_corr] = std::max(0.0f, std::min(255.0f, best_y[i_y_corr] + err_y[i_y_corr]));
     }
 
-    // 3e. Apply Cb/Cr error correction (SIMD + scalar tail)
-    //     Unlike Y, Cb/Cr are NOT clamped during iterations to allow error diffusion.
+    // Apply Cb/Cr error correction.
     size_t i_c_corr = 0;
     for (; i_c_corr + N <= low_size; i_c_corr += N) {
       auto cb_vec = Add(LoadU(d, &best_cb[i_c_corr]), LoadU(d, &err_cb[i_c_corr]));
@@ -721,7 +718,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
     prev_diff_y_sum = diff_y_sum;
   }
 
-  // 4. Write best_y and best_cb/cr to raw_data output buffers (memcpy)
+  // Write best_y and best_cb/cr to raw_data output buffers.
   for (size_t y = 0; y < iMCU_height; ++y) {
     float* row_out_y = m->raw_data[0]->Row(y0 + y);
     memcpy(row_out_y, &best_y[y * xsize_padded],

From bdf86c8e638a6232f8acd32903d7d5c81aaf435b Mon Sep 17 00:00:00 2001
From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com>
Date: Sat, 4 Apr 2026 02:05:32 -0700
Subject: [PATCH 05/10] Final performance pass

Pulls RGB data from initial decode and works from there, the AI hallucination machine was doing unneeded color conversions (RGB -> YUV -> RGB) which was costing performance.
---
 lib/jpegli/common_internal.h |   1 +
 lib/jpegli/encode.cc         |  29 +++
 lib/jpegli/encode_internal.h |   2 +
 lib/jpegli/sharp_yuv.cc      | 472 ++++++++++++++---------------------
 4 files changed, 225 insertions(+), 279 deletions(-)

diff --git a/lib/jpegli/common_internal.h b/lib/jpegli/common_internal.h
index e41daa3d..80074b31 100644
--- a/lib/jpegli/common_internal.h
+++ b/lib/jpegli/common_internal.h
@@ -112,6 +112,7 @@ class RowBuffer {
   size_t xsize() const { return xsize_; };
   size_t ysize() const { return ysize_; };
   size_t stride() const { return stride_; }
+  bool HasData() const { return xsize_ > 0; }
 
   void PadRow(size_t y, size_t from, int border) {
     float* row = Row(y);
diff --git a/lib/jpegli/encode.cc b/lib/jpegli/encode.cc
index f59da379..2ae318d3 100644
--- a/lib/jpegli/encode.cc
+++ b/lib/jpegli/encode.cc
@@ -431,6 +431,12 @@ void AllocateBuffers(j_compress_ptr cinfo) {
     for (int c = 0; c < num_all_components; ++c) {
       m->input_buffer[c].Allocate(cinfo, ysize_full, xsize_full);
     }
+    if (m->use_sharpyuv && cinfo->in_color_space == JCS_RGB &&
+        cinfo->jpeg_color_space == JCS_YCbCr) {
+      for (int c = 0; c < 3; ++c) {
+        m->input_rgb[c].Allocate(cinfo, ysize_full, xsize_full);
+      }
+    }
   }
   for (int c = 0; c < cinfo->num_components; ++c) {
     jpeg_component_info* comp = &cinfo->comp_info[c];
@@ -491,6 +497,14 @@ void AllocateBuffers(j_compress_ptr cinfo) {
     memset(m->zero_bias_mul[c], 0, DCTSIZE2 * sizeof(float));
     memset(m->zero_bias_offset[c], 0, DCTSIZE2 * sizeof(float));
   }
+  if (m->use_sharpyuv) {
+    for (int i = 0; i < 7; ++i) {
+      m->sharpyuv_workspace[i].Allocate(cinfo, iMCU_height, xsize_full);
+    }
+    for (int i = 7; i < 13; ++i) {
+      m->sharpyuv_workspace[i].Allocate(cinfo, iMCU_height / 2, xsize_full / 2);
+    }
+  }
 }
 
 void InitProgressMonitor(j_compress_ptr cinfo) {
@@ -692,6 +706,7 @@ void jpegli_CreateCompress(j_compress_ptr cinfo, int version,
   memset(cinfo->arith_ac_K, 0, sizeof(cinfo->arith_ac_K));
   cinfo->write_Adobe_marker = FALSE;
   cinfo->master = jpegli::Allocate<jpeg_comp_master>(cinfo, 1);
+  memset(cinfo->master->input_rgb, 0, sizeof(cinfo->master->input_rgb));
   jpegli::InitializeCompressParams(cinfo);
   cinfo->master->force_baseline = true;
   cinfo->master->xyb_mode = false;
@@ -1160,8 +1175,22 @@ JDIMENSION jpegli_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
     cinfo->next_scanline += input_lag;
   }
   float* rows[jpegli::kMaxComponents];
+  const bool snapshot_rgb = m->use_sharpyuv &&
+                            cinfo->in_color_space == JCS_RGB &&
+                            cinfo->jpeg_color_space == JCS_YCbCr;
   for (size_t i = input_lag; i < num_lines; ++i) {
     jpegli::ReadInputRow(cinfo, scanlines[i], rows);
+    if (snapshot_rgb) {
+      size_t row_idx = m->next_input_row - 1;
+      size_t img_w = cinfo->image_width;
+      size_t pad_w = m->xsize_blocks * DCTSIZE;
+      for (int c = 0; c < 3; ++c) {
+        float* rgb_row = m->input_rgb[c].Row(row_idx);
+        memcpy(rgb_row, rows[c], img_w * sizeof(float));
+        float last = rgb_row[img_w - 1];
+        for (size_t x = img_w; x <= pad_w; ++x) rgb_row[x] = last;
+      }
+    }
     (*m->color_transform)(rows, cinfo->image_width);
     jpegli::PadInputBuffer(cinfo, rows);
     jpegli::ProcessiMCURows(cinfo);
diff --git a/lib/jpegli/encode_internal.h b/lib/jpegli/encode_internal.h
index 05f40dbf..b260b6e0 100644
--- a/lib/jpegli/encode_internal.h
+++ b/lib/jpegli/encode_internal.h
@@ -68,6 +68,7 @@ struct ScanTokenInfo {
 
 struct jpeg_comp_master {
   jpegli::RowBuffer<float> input_buffer[jpegli::kMaxComponents];
+  jpegli::RowBuffer<float> input_rgb[3];
   jpegli::RowBuffer<float>* smooth_input[jpegli::kMaxComponents];
   jpegli::RowBuffer<float>* raw_data[jpegli::kMaxComponents];
   bool force_baseline;
@@ -139,6 +140,7 @@ struct jpeg_comp_master {
   float psnr_tolerance;
   float min_distance;
   float max_distance;
+  jpegli::RowBuffer<float> sharpyuv_workspace[13];
 };
 
 #endif  // LIB_JPEGLI_ENCODE_INTERNAL_H_
diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc
index 080c99fb..154d2370 100644
--- a/lib/jpegli/sharp_yuv.cc
+++ b/lib/jpegli/sharp_yuv.cc
@@ -11,7 +11,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
-#include <vector>
+
 
 #include "lib/jpegli/common.h"
 #include "lib/jpegli/encode_internal.h"
@@ -35,7 +35,9 @@ using hwy::HWY_NAMESPACE::Div;
 using hwy::HWY_NAMESPACE::Floor;
 using hwy::HWY_NAMESPACE::IfThenElse;
 using hwy::HWY_NAMESPACE::Le;
+using hwy::HWY_NAMESPACE::Lanes;
 using hwy::HWY_NAMESPACE::LoadDup128;
+using hwy::HWY_NAMESPACE::LoadInterleaved2;
 using hwy::HWY_NAMESPACE::LoadU;
 using hwy::HWY_NAMESPACE::Max;
 using hwy::HWY_NAMESPACE::Min;
@@ -47,8 +49,10 @@ using hwy::HWY_NAMESPACE::ReduceSum;
 using hwy::HWY_NAMESPACE::Set;
 using hwy::HWY_NAMESPACE::ShiftLeft;
 using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::StoreInterleaved2;
 using hwy::HWY_NAMESPACE::StoreU;
 using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::Vec;
 using hwy::HWY_NAMESPACE::Zero;
 
 using D = HWY_CAPPED(float, 8);
@@ -257,47 +261,25 @@ HWY_INLINE void BoxAverageDownsampleRow(const float* lin_r0,
   const auto quarter = Set(d, 0.25f);
   size_t x = 0;
 
-  // SIMD path: process N output pixels at a time.
-  // Each output pixel needs 2 input pixels horizontally, so we need 2*N inputs.
   for (; x + N <= down_width; x += N) {
-    // Gather even and odd columns from two rows.
-    // Row 0: indices 2*x, 2*x+1, 2*x+2, 2*x+3, ...
-    // Row 1: same pattern.
-    // We accumulate the 4 samples for each output pixel.
-    auto sum_r = Zero(d);
-    auto sum_g = Zero(d);
-    auto sum_b = Zero(d);
-
-    for (int iy = 0; iy < 2; ++iy) {
-      const float* src_r = (iy == 0) ? lin_r0 : lin_r1;
-      const float* src_g = (iy == 0) ? lin_g0 : lin_g1;
-      const float* src_b = (iy == 0) ? lin_b0 : lin_b1;
-      for (int ix = 0; ix < 2; ++ix) {
-        // Load non-contiguous: elements at positions 2*x+ix, 2*(x+1)+ix, ...
-        // Unfortunately these are strided, so we load pairs and deinterleave
-        // is cleaner. For simplicity and correctness, use scalar gather here
-        // since the inner 2x2 loop is just 4 iterations.
-        // Actually, let's use a temporary buffer approach for the gather.
-        HWY_ALIGN float tmp_r[HWY_MAX_LANES_D(D)];
-        HWY_ALIGN float tmp_g[HWY_MAX_LANES_D(D)];
-        HWY_ALIGN float tmp_b[HWY_MAX_LANES_D(D)];
-        for (size_t k = 0; k < N; ++k) {
-          size_t si = (x + k) * 2 + ix;
-          tmp_r[k] = src_r[si];
-          tmp_g[k] = src_g[si];
-          tmp_b[k] = src_b[si];
-        }
-        sum_r = Add(sum_r, LoadU(d, tmp_r));
-        sum_g = Add(sum_g, LoadU(d, tmp_g));
-        sum_b = Add(sum_b, LoadU(d, tmp_b));
-      }
-    }
+    Vec<D> r0e, r0o, g0e, g0o, b0e, b0o;
+    Vec<D> r1e, r1o, g1e, g1o, b1e, b1o;
+    LoadInterleaved2(d, lin_r0 + 2 * x, r0e, r0o);
+    LoadInterleaved2(d, lin_g0 + 2 * x, g0e, g0o);
+    LoadInterleaved2(d, lin_b0 + 2 * x, b0e, b0o);
+    LoadInterleaved2(d, lin_r1 + 2 * x, r1e, r1o);
+    LoadInterleaved2(d, lin_g1 + 2 * x, g1e, g1o);
+    LoadInterleaved2(d, lin_b1 + 2 * x, b1e, b1o);
+
+    auto sum_r = Add(Add(r0e, r0o), Add(r1e, r1o));
+    auto sum_g = Add(Add(g0e, g0o), Add(g1e, g1o));
+    auto sum_b = Add(Add(b0e, b0o), Add(b1e, b1o));
 
     auto avg_r = LinearToGamma_SIMD(d, Mul(sum_r, quarter));
     auto avg_g = LinearToGamma_SIMD(d, Mul(sum_g, quarter));
     auto avg_b = LinearToGamma_SIMD(d, Mul(sum_b, quarter));
 
-    decltype(avg_r) vy, vcb, vcr;
+    Vec<D> vy, vcb, vcr;
     RGBToYCbCr_SIMD(d, avg_r, avg_g, avg_b, &vy, &vcb, &vcr);
 
     StoreU(vcb, d, out_cb + x);
@@ -336,8 +318,7 @@ HWY_INLINE void BoxAverageDownsampleRow(const float* lin_r0,
 // ---------------------------------------------------------------------------
 HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row,
                                 const float* next_row, size_t down_width,
-                                size_t xsize_padded, size_t y,
-                                float* out_full) {
+                                float* out0, float* out1) {
   const size_t N = hwy::HWY_NAMESPACE::Lanes(d);
   const auto k9 = Set(d, 9.0f);
   const auto k3 = Set(d, 3.0f);
@@ -346,68 +327,26 @@ HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row,
 
   for (; x + N <= down_width; x += N) {
     auto vc = LoadU(d, cur_row + x);
-
-    // Load left neighbors (clamp at boundary)
-    HWY_ALIGN float tmp_l[HWY_MAX_LANES_D(D)];
-    HWY_ALIGN float tmp_tl[HWY_MAX_LANES_D(D)];
-    HWY_ALIGN float tmp_bl[HWY_MAX_LANES_D(D)];
-    for (size_t k = 0; k < N; ++k) {
-      size_t xi = x + k;
-      size_t xl = (xi == 0) ? 0 : xi - 1;
-      tmp_l[k] = cur_row[xl];
-      tmp_tl[k] = prev_row[xl];
-      tmp_bl[k] = next_row[xl];
-    }
-    auto vl = LoadU(d, tmp_l);
-    auto vtl = LoadU(d, tmp_tl);
-    auto vbl = LoadU(d, tmp_bl);
-
-    // Load right neighbors (clamp at boundary)
-    HWY_ALIGN float tmp_r[HWY_MAX_LANES_D(D)];
-    HWY_ALIGN float tmp_tr[HWY_MAX_LANES_D(D)];
-    HWY_ALIGN float tmp_br[HWY_MAX_LANES_D(D)];
-    for (size_t k = 0; k < N; ++k) {
-      size_t xi = x + k;
-      size_t xr = (xi + 1 == down_width) ? xi : xi + 1;
-      tmp_r[k] = cur_row[xr];
-      tmp_tr[k] = prev_row[xr];
-      tmp_br[k] = next_row[xr];
-    }
-    auto vr = LoadU(d, tmp_r);
-    auto vtr = LoadU(d, tmp_tr);
-    auto vbr = LoadU(d, tmp_br);
-
     auto vt = LoadU(d, prev_row + x);
     auto vb = LoadU(d, next_row + x);
 
-    // c9 = c * 9
+    auto vl = LoadU(d, cur_row + x - 1);
+    auto vtl = LoadU(d, prev_row + x - 1);
+    auto vbl = LoadU(d, next_row + x - 1);
+
+    auto vr = LoadU(d, cur_row + x + 1);
+    auto vtr = LoadU(d, prev_row + x + 1);
+    auto vbr = LoadU(d, next_row + x + 1);
+
     auto c9 = Mul(vc, k9);
 
-    // Top-left quadrant: (c*9 + t*3 + l*3 + tl) / 16
     auto val_tl = Mul(MulAdd(k3, Add(vt, vl), Add(c9, vtl)), kInv16);
-    // Top-right quadrant: (c*9 + t*3 + r*3 + tr) / 16
     auto val_tr = Mul(MulAdd(k3, Add(vt, vr), Add(c9, vtr)), kInv16);
-    // Bottom-left quadrant: (c*9 + b*3 + l*3 + bl) / 16
     auto val_bl = Mul(MulAdd(k3, Add(vb, vl), Add(c9, vbl)), kInv16);
-    // Bottom-right quadrant: (c*9 + b*3 + r*3 + br) / 16
     auto val_br = Mul(MulAdd(k3, Add(vb, vr), Add(c9, vbr)), kInv16);
 
-    // Scatter to full-res positions
-    HWY_ALIGN float res_tl[HWY_MAX_LANES_D(D)];
-    HWY_ALIGN float res_tr[HWY_MAX_LANES_D(D)];
-    HWY_ALIGN float res_bl[HWY_MAX_LANES_D(D)];
-    HWY_ALIGN float res_br[HWY_MAX_LANES_D(D)];
-    StoreU(val_tl, d, res_tl);
-    StoreU(val_tr, d, res_tr);
-    StoreU(val_bl, d, res_bl);
-    StoreU(val_br, d, res_br);
-    for (size_t k = 0; k < N; ++k) {
-      size_t xi = x + k;
-      out_full[(y * 2 + 0) * xsize_padded + (xi * 2 + 0)] = res_tl[k];
-      out_full[(y * 2 + 0) * xsize_padded + (xi * 2 + 1)] = res_tr[k];
-      out_full[(y * 2 + 1) * xsize_padded + (xi * 2 + 0)] = res_bl[k];
-      out_full[(y * 2 + 1) * xsize_padded + (xi * 2 + 1)] = res_br[k];
-    }
+    StoreInterleaved2(val_tl, val_tr, d, out0 + 2 * x);
+    StoreInterleaved2(val_bl, val_br, d, out1 + 2 * x);
   }
 
   // Scalar tail
@@ -427,14 +366,10 @@ HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row,
 
     float c9 = cc * 9.0f;
     float inv16 = 1.0f / 16.0f;
-    out_full[(y * 2 + 0) * xsize_padded + (x * 2 + 0)] =
-        (c9 + ct * 3.0f + cl * 3.0f + ctl) * inv16;
-    out_full[(y * 2 + 0) * xsize_padded + (x * 2 + 1)] =
-        (c9 + ct * 3.0f + cr * 3.0f + ctr) * inv16;
-    out_full[(y * 2 + 1) * xsize_padded + (x * 2 + 0)] =
-        (c9 + cb * 3.0f + cl * 3.0f + cbl) * inv16;
-    out_full[(y * 2 + 1) * xsize_padded + (x * 2 + 1)] =
-        (c9 + cb * 3.0f + cr * 3.0f + cbr) * inv16;
+    out0[x * 2 + 0] = (c9 + ct * 3.0f + cl * 3.0f + ctl) * inv16;
+    out0[(x * 2 + 1)] = (c9 + ct * 3.0f + cr * 3.0f + ctr) * inv16;
+    out1[x * 2 + 0] = (c9 + cb * 3.0f + cl * 3.0f + cbl) * inv16;
+    out1[(x * 2 + 1)] = (c9 + cb * 3.0f + cr * 3.0f + cbr) * inv16;
   }
 }
 
@@ -478,37 +413,53 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
   const size_t down_width = xsize_padded / 2;
   const size_t down_height = iMCU_height / 2;
   const size_t high_size = xsize_padded * iMCU_height;
-  const size_t low_size = down_width * down_height;
-
-  std::vector<float> target_y(high_size);
-  std::vector<float> best_y(high_size);
-  std::vector<float> tmp_lin_r(high_size);
-  std::vector<float> tmp_lin_g(high_size);
-  std::vector<float> tmp_lin_b(high_size);
-
-  std::vector<float> target_cb(low_size);
-  std::vector<float> target_cr(low_size);
-  std::vector<float> best_cb(low_size);
-  std::vector<float> best_cr(low_size);
 
-  std::vector<float> up_cb(high_size);
-  std::vector<float> up_cr(high_size);
+  // Hoisted workspace buffers from master struct.
+  jpegli::RowBuffer<float>* ws = m->sharpyuv_workspace;
+  auto target_y_ws = &ws[0];
+  auto best_y_ws = &ws[1];
+  auto tmp_lin_r_ws = &ws[2];
+  auto tmp_lin_g_ws = &ws[3];
+  auto tmp_lin_b_ws = &ws[4];
+  auto up_cb_ws = &ws[5];
+  auto up_cr_ws = &ws[6];
+  auto target_cb_ws = &ws[7];
+  auto target_cr_ws = &ws[8];
+  auto best_cb_ws = &ws[9];
+  auto best_cr_ws = &ws[10];
+  auto err_cb_ws = &ws[11];
+  auto err_cr_ws = &ws[12];
 
   const size_t N = hwy::HWY_NAMESPACE::Lanes(d);
 
-  // Decode input YCbCr to linear RGB and compute target luma.
+  const bool have_rgb_input = m->input_rgb[0].HasData();
+  // Read pre-transform sRGB directly (if available) or convert from YCbCr.
   for (size_t y = 0; y < iMCU_height; ++y) {
     const float* row_y = m->smooth_input[0]->Row(y0 + y);
-    const float* row_cb = m->smooth_input[1]->Row(y0 + y);
-    const float* row_cr = m->smooth_input[2]->Row(y0 + y);
+    const float* row_cb = have_rgb_input ? nullptr : m->smooth_input[1]->Row(y0 + y);
+    const float* row_cr = have_rgb_input ? nullptr : m->smooth_input[2]->Row(y0 + y);
+    const float* row_r = have_rgb_input ? m->input_rgb[0].Row(y0 + y) : nullptr;
+    const float* row_g = have_rgb_input ? m->input_rgb[1].Row(y0 + y) : nullptr;
+    const float* row_b = have_rgb_input ? m->input_rgb[2].Row(y0 + y) : nullptr;
+    float* ty_out = target_y_ws->Row(y);
+    float* by_out = best_y_ws->Row(y);
+    float* lr_out = tmp_lin_r_ws->Row(y);
+    float* lg_out = tmp_lin_g_ws->Row(y);
+    float* lb_out = tmp_lin_b_ws->Row(y);
+
     size_t x = 0;
     for (; x + N <= xsize_padded; x += N) {
       auto py = LoadU(d, row_y + x);
-      auto pcb = LoadU(d, row_cb + x);
-      auto pcr = LoadU(d, row_cr + x);
-
       decltype(py) r, g, b;
-      YCbCrToRGB_SIMD(d, py, pcb, pcr, &r, &g, &b);
+      if (have_rgb_input) {
+        r = LoadU(d, row_r + x);
+        g = LoadU(d, row_g + x);
+        b = LoadU(d, row_b + x);
+      } else {
+        auto pcb = LoadU(d, row_cb + x);
+        auto pcr = LoadU(d, row_cr + x);
+        YCbCrToRGB_SIMD(d, py, pcb, pcr, &r, &g, &b);
+      }
 
       auto lin_r = GammaToLinear_SIMD(d, r);
       auto lin_g = GammaToLinear_SIMD(d, g);
@@ -517,200 +468,166 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
       auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b);
       auto t_y = LinearToGamma_SIMD(d, lin_y);
 
-      size_t idx = y * xsize_padded + x;
-      StoreU(t_y, d, &target_y[idx]);
-      StoreU(py, d, &best_y[idx]);
-      StoreU(lin_r, d, &tmp_lin_r[idx]);
-      StoreU(lin_g, d, &tmp_lin_g[idx]);
-      StoreU(lin_b, d, &tmp_lin_b[idx]);
+      StoreU(t_y, d, ty_out + x);
+      StoreU(py, d, by_out + x);
+      StoreU(lin_r, d, lr_out + x);
+      StoreU(lin_g, d, lg_out + x);
+      StoreU(lin_b, d, lb_out + x);
     }
     for (; x < xsize_padded; ++x) {
       float py = row_y[x];
-      float pcb = row_cb[x];
-      float pcr = row_cr[x];
       float r, g, b;
-      YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b);
+      if (have_rgb_input) {
+        r = row_r[x];
+        g = row_g[x];
+        b = row_b[x];
+      } else {
+        float pcb = row_cb[x];
+        float pcr = row_cr[x];
+        YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b);
+      }
       float lr = GammaToLinear(r);
       float lg = GammaToLinear(g);
       float lb = GammaToLinear(b);
       float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb;
-      size_t idx = y * xsize_padded + x;
-      target_y[idx] = LinearToGamma(ly);
-      best_y[idx] = py;
-      tmp_lin_r[idx] = lr;
-      tmp_lin_g[idx] = lg;
-      tmp_lin_b[idx] = lb;
+      ty_out[x] = LinearToGamma(ly);
+      by_out[x] = py;
+      lr_out[x] = lr;
+      lg_out[x] = lg;
+      lb_out[x] = lb;
     }
   }
 
   // Box-average downsample of linear RGB to target Cb/Cr.
   for (size_t y = 0; y < down_height; ++y) {
-    const float* r0 = &tmp_lin_r[(y * 2 + 0) * xsize_padded];
-    const float* g0 = &tmp_lin_g[(y * 2 + 0) * xsize_padded];
-    const float* b0 = &tmp_lin_b[(y * 2 + 0) * xsize_padded];
-    const float* r1 = &tmp_lin_r[(y * 2 + 1) * xsize_padded];
-    const float* g1 = &tmp_lin_g[(y * 2 + 1) * xsize_padded];
-    const float* b1 = &tmp_lin_b[(y * 2 + 1) * xsize_padded];
-    float* dcb = &target_cb[y * down_width];
-    float* dcr = &target_cr[y * down_width];
+    const float* r0 = tmp_lin_r_ws->Row(y * 2 + 0);
+    const float* g0 = tmp_lin_g_ws->Row(y * 2 + 0);
+    const float* b0 = tmp_lin_b_ws->Row(y * 2 + 0);
+    const float* r1 = tmp_lin_r_ws->Row(y * 2 + 1);
+    const float* g1 = tmp_lin_g_ws->Row(y * 2 + 1);
+    const float* b1 = tmp_lin_b_ws->Row(y * 2 + 1);
+    float* dcb = target_cb_ws->Row(y);
+    float* dcr = target_cr_ws->Row(y);
 
     BoxAverageDownsampleRow(r0, g0, b0, r1, g1, b1, down_width, xsize_padded,
                             dcb, dcr);
 
     // Also initialize best_cb/cr = target_cb/cr
-    memcpy(&best_cb[y * down_width], dcb, down_width * sizeof(float));
-    memcpy(&best_cr[y * down_width], dcr, down_width * sizeof(float));
+    memcpy(best_cb_ws->Row(y), dcb, down_width * sizeof(float));
+    memcpy(best_cr_ws->Row(y), dcr, down_width * sizeof(float));
+  }
+
+  for (size_t y = 0; y < down_height; ++y) {
+    best_cb_ws->PadRow(y, down_width, 1);
+    best_cr_ws->PadRow(y, down_width, 1);
   }
 
+  const auto vmin = Set(d, 0.0f);
+  const auto vmax = Set(d, 255.0f);
   float prev_diff_y_sum = 1e30f;
   const float diff_y_threshold = 3.0f * high_size;
 
-  // Pre-allocate error buffers outside the iteration loop to avoid repeated
-  // heap allocation/deallocation.
-  std::vector<float> err_y(high_size);
-  std::vector<float> err_cb(low_size);
-  std::vector<float> err_cr(low_size);
-
   for (int iter = 0; iter < 4; ++iter) {
-    memset(err_y.data(), 0, high_size * sizeof(float));
-    memset(err_cb.data(), 0, low_size * sizeof(float));
-    memset(err_cr.data(), 0, low_size * sizeof(float));
     float diff_y_sum = 0.0f;
+    auto acc_y = Zero(d);
 
-    // Upsample best_cb/cr to full-res using a 2D tent filter.
+    // Padding for upsampling (all rows for this iteration)
+    for (size_t y = 0; y < down_height; ++y) {
+      best_cb_ws->PadRow(y, down_width, 1);
+      best_cr_ws->PadRow(y, down_width, 1);
+    }
+
+    // Pass 1: Upsample current best_cb/cr
     for (size_t y = 0; y < down_height; ++y) {
       size_t y_prev = (y == 0) ? 0 : y - 1;
       size_t y_next = (y + 1 == down_height) ? y : y + 1;
-
-      const float* cb_prev = &best_cb[y_prev * down_width];
-      const float* cb_cur = &best_cb[y * down_width];
-      const float* cb_next = &best_cb[y_next * down_width];
-      TentUpsampleRow(cb_prev, cb_cur, cb_next, down_width, xsize_padded, y,
-                      up_cb.data());
-
-      const float* cr_prev = &best_cr[y_prev * down_width];
-      const float* cr_cur = &best_cr[y * down_width];
-      const float* cr_next = &best_cr[y_next * down_width];
-      TentUpsampleRow(cr_prev, cr_cur, cr_next, down_width, xsize_padded, y,
-                      up_cr.data());
+      TentUpsampleRow(best_cb_ws->Row(y_prev), best_cb_ws->Row(y),
+                      best_cb_ws->Row(y_next), down_width,
+                      up_cb_ws->Row(y * 2), up_cb_ws->Row(y * 2 + 1));
+      TentUpsampleRow(best_cr_ws->Row(y_prev), best_cr_ws->Row(y),
+                      best_cr_ws->Row(y_next), down_width,
+                      up_cr_ws->Row(y * 2), up_cr_ws->Row(y * 2 + 1));
     }
 
-    // Evaluate YCbCr reconstruction error at full resolution.
-    for (size_t y = 0; y < iMCU_height; ++y) {
-      size_t x = 0;
-      for (; x + N <= xsize_padded; x += N) {
-        size_t idx = y * xsize_padded + x;
-        auto py = LoadU(d, &best_y[idx]);
-        auto ucb = LoadU(d, &up_cb[idx]);
-        auto ucr = LoadU(d, &up_cr[idx]);
-
-        decltype(py) r, g, b;
-        YCbCrToRGB_SIMD(d, py, ucb, ucr, &r, &g, &b);
-
-        auto lin_r = GammaToLinear_SIMD(d, r);
-        auto lin_g = GammaToLinear_SIMD(d, g);
-        auto lin_b = GammaToLinear_SIMD(d, b);
-
-        auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b);
-        auto dec_y = LinearToGamma_SIMD(d, lin_y);
-
-        auto e_y = Sub(LoadU(d, &target_y[idx]), dec_y);
-        StoreU(e_y, d, &err_y[idx]);
-        StoreU(lin_r, d, &tmp_lin_r[idx]);
-        StoreU(lin_g, d, &tmp_lin_g[idx]);
-        StoreU(lin_b, d, &tmp_lin_b[idx]);
-      }
-      for (; x < xsize_padded; ++x) {
-        size_t idx = y * xsize_padded + x;
-        float py = best_y[idx];
-        float ucb = up_cb[idx];
-        float ucr = up_cr[idx];
-        float r, g, b;
-        YCbCrToRGB_Scalar(py, ucb, ucr, &r, &g, &b);
-        float lr = GammaToLinear(r);
-        float lg = GammaToLinear(g);
-        float lb = GammaToLinear(b);
-        float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb;
-        err_y[idx] = target_y[idx] - LinearToGamma(ly);
-        tmp_lin_r[idx] = lr;
-        tmp_lin_g[idx] = lg;
-        tmp_lin_b[idx] = lb;
+    // Pass 2: Evaluate Y and Prepare for Chroma update
+    for (size_t y = 0; y < down_height; ++y) {
+      for (int iy = 0; iy < 2; ++iy) {
+        size_t yy = y * 2 + iy;
+        float* row_best_y = best_y_ws->Row(yy);
+        const float* row_up_cb = up_cb_ws->Row(yy);
+        const float* row_up_cr = up_cr_ws->Row(yy);
+        const float* row_target_y = target_y_ws->Row(yy);
+        float* row_lin_r = tmp_lin_r_ws->Row(yy);
+        float* row_lin_g = tmp_lin_g_ws->Row(yy);
+        float* row_lin_b = tmp_lin_b_ws->Row(yy);
+
+        size_t x = 0;
+        for (; x + N <= xsize_padded; x += N) {
+          auto py = LoadU(d, row_best_y + x);
+          auto ucb = LoadU(d, row_up_cb + x);
+          auto ucr = LoadU(d, row_up_cr + x);
+          Vec<D> r, g, b;
+          YCbCrToRGB_SIMD(d, py, ucb, ucr, &r, &g, &b);
+          auto lin_r = GammaToLinear_SIMD(d, r);
+          auto lin_g = GammaToLinear_SIMD(d, g);
+          auto lin_b = GammaToLinear_SIMD(d, b);
+          auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b);
+          auto dec_y = LinearToGamma_SIMD(d, lin_y);
+          auto ey = Sub(LoadU(d, row_target_y + x), dec_y);
+          acc_y = Add(acc_y, Abs(ey));
+
+          StoreU(Min(Max(Add(py, ey), vmin), vmax), d, row_best_y + x);
+          StoreU(lin_r, d, row_lin_r + x);
+          StoreU(lin_g, d, row_lin_g + x);
+          StoreU(lin_b, d, row_lin_b + x);
+        }
+        for (; x < xsize_padded; ++x) {
+          float py = row_best_y[x];
+          float r, g, b;
+          YCbCrToRGB_Scalar(py, row_up_cb[x], row_up_cr[x], &r, &g, &b);
+          float lr = GammaToLinear(r);
+          float lg = GammaToLinear(g);
+          float lb = GammaToLinear(b);
+          float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb;
+          float ey = row_target_y[x] - LinearToGamma(ly);
+          diff_y_sum += std::abs(ey);
+          row_best_y[x] = std::max(0.0f, std::min(255.0f, py + ey));
+          row_lin_r[x] = lr;
+          row_lin_g[x] = lg;
+          row_lin_b[x] = lb;
+        }
       }
-    }
 
-    // Accumulate |err_y|
-    auto acc_y = Zero(d);
-    size_t i_y = 0;
-    for (; i_y + N <= high_size; i_y += N) {
-      acc_y = Add(acc_y, Abs(LoadU(d, &err_y[i_y])));
-    }
-    diff_y_sum = ReduceSum(d, acc_y);
-    for (; i_y < high_size; ++i_y) {
-      diff_y_sum += std::abs(err_y[i_y]);
-    }
+      float* row_best_cb = best_cb_ws->Row(y);
+      float* row_best_cr = best_cr_ws->Row(y);
+      float* row_err_cb = err_cb_ws->Row(y);
+      float* row_err_cr = err_cr_ws->Row(y);
+      float* row_tgt_cb = target_cb_ws->Row(y);
+      float* row_tgt_cr = target_cr_ws->Row(y);
+
+      BoxAverageDownsampleRow(
+          tmp_lin_r_ws->Row(y * 2 + 0), tmp_lin_g_ws->Row(y * 2 + 0),
+          tmp_lin_b_ws->Row(y * 2 + 0), tmp_lin_r_ws->Row(y * 2 + 1),
+          tmp_lin_g_ws->Row(y * 2 + 1), tmp_lin_b_ws->Row(y * 2 + 1), down_width,
+          xsize_padded, row_err_cb, row_err_cr);
 
-    // Box-average downsample reconstructed RGB to compute Cb/Cr error.
-    for (size_t y = 0; y < down_height; ++y) {
-      const float* r0 = &tmp_lin_r[(y * 2 + 0) * xsize_padded];
-      const float* g0 = &tmp_lin_g[(y * 2 + 0) * xsize_padded];
-      const float* b0 = &tmp_lin_b[(y * 2 + 0) * xsize_padded];
-      const float* r1 = &tmp_lin_r[(y * 2 + 1) * xsize_padded];
-      const float* g1 = &tmp_lin_g[(y * 2 + 1) * xsize_padded];
-      const float* b1 = &tmp_lin_b[(y * 2 + 1) * xsize_padded];
-
-      float* row_err_cb = &err_cb[y * down_width];
-      float* row_err_cr = &err_cr[y * down_width];
-      float* row_tgt_cb = &target_cb[y * down_width];
-      float* row_tgt_cr = &target_cr[y * down_width];
-
-      // BoxAverageDownsampleRow outputs Cb/Cr directly, so we use err
-      // buffers as temp storage for the reconstructed Cb/Cr.
-      BoxAverageDownsampleRow(r0, g0, b0, r1, g1, b1, down_width,
-                              xsize_padded, row_err_cb, row_err_cr);
-
-      // Compute error = target - reconstructed
       size_t x = 0;
       for (; x + N <= down_width; x += N) {
-        auto tgt_cb = LoadU(d, row_tgt_cb + x);
-        auto rec_cb = LoadU(d, row_err_cb + x);
-        StoreU(Sub(tgt_cb, rec_cb), d, row_err_cb + x);
-
-        auto tgt_cr = LoadU(d, row_tgt_cr + x);
-        auto rec_cr = LoadU(d, row_err_cr + x);
-        StoreU(Sub(tgt_cr, rec_cr), d, row_err_cr + x);
+        auto bcb = LoadU(d, row_best_cb + x);
+        auto bcr = LoadU(d, row_best_cr + x);
+        auto tcb = LoadU(d, row_tgt_cb + x);
+        auto tcr = LoadU(d, row_tgt_cr + x);
+        auto dcb = LoadU(d, row_err_cb + x);
+        auto dcr = LoadU(d, row_err_cr + x);
+        StoreU(Add(bcb, Sub(tcb, dcb)), d, row_best_cb + x);
+        StoreU(Add(bcr, Sub(tcr, dcr)), d, row_best_cr + x);
       }
       for (; x < down_width; ++x) {
-        row_err_cb[x] = row_tgt_cb[x] - row_err_cb[x];
-        row_err_cr[x] = row_tgt_cr[x] - row_err_cr[x];
+        row_best_cb[x] += row_tgt_cb[x] - row_err_cb[x];
+        row_best_cr[x] += row_tgt_cr[x] - row_err_cr[x];
       }
     }
-
-    // Apply Y error correction with clamping.
-    const auto vmin = Set(d, 0.0f);
-    const auto vmax = Set(d, 255.0f);
-    size_t i_y_corr = 0;
-    for (; i_y_corr + N <= high_size; i_y_corr += N) {
-      auto y_vec = LoadU(d, &best_y[i_y_corr]);
-      auto e_y_vec = LoadU(d, &err_y[i_y_corr]);
-      auto res = Min(Max(Add(y_vec, e_y_vec), vmin), vmax);
-      StoreU(res, d, &best_y[i_y_corr]);
-    }
-    for (; i_y_corr < high_size; ++i_y_corr) {
-      best_y[i_y_corr] = std::max(0.0f, std::min(255.0f, best_y[i_y_corr] + err_y[i_y_corr]));
-    }
-
-    // Apply Cb/Cr error correction.
-    size_t i_c_corr = 0;
-    for (; i_c_corr + N <= low_size; i_c_corr += N) {
-      auto cb_vec = Add(LoadU(d, &best_cb[i_c_corr]), LoadU(d, &err_cb[i_c_corr]));
-      auto cr_vec = Add(LoadU(d, &best_cr[i_c_corr]), LoadU(d, &err_cr[i_c_corr]));
-      StoreU(cb_vec, d, &best_cb[i_c_corr]);
-      StoreU(cr_vec, d, &best_cr[i_c_corr]);
-    }
-    for (; i_c_corr < low_size; ++i_c_corr) {
-      best_cb[i_c_corr] += err_cb[i_c_corr];
-      best_cr[i_c_corr] += err_cr[i_c_corr];
-    }
+    diff_y_sum += ReduceSum(d, acc_y);
 
     if (iter > 0 &&
         (diff_y_sum < diff_y_threshold || diff_y_sum > prev_diff_y_sum))
@@ -721,18 +638,15 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
   // Write best_y and best_cb/cr to raw_data output buffers.
   for (size_t y = 0; y < iMCU_height; ++y) {
     float* row_out_y = m->raw_data[0]->Row(y0 + y);
-    memcpy(row_out_y, &best_y[y * xsize_padded],
-           xsize_padded * sizeof(float));
+    memcpy(row_out_y, best_y_ws->Row(y), xsize_padded * sizeof(float));
   }
 
   const size_t y_out0 = y0 / 2;
   for (size_t y_out = 0; y_out < down_height; ++y_out) {
     float* row_out_cb = m->raw_data[1]->Row(y_out0 + y_out);
     float* row_out_cr = m->raw_data[2]->Row(y_out0 + y_out);
-    memcpy(row_out_cb, &best_cb[y_out * down_width],
-           down_width * sizeof(float));
-    memcpy(row_out_cr, &best_cr[y_out * down_width],
-           down_width * sizeof(float));
+    memcpy(row_out_cb, best_cb_ws->Row(y_out), down_width * sizeof(float));
+    memcpy(row_out_cr, best_cr_ws->Row(y_out), down_width * sizeof(float));
   }
 }
 

From 126cc4c303c11fc61dd1b4324556c37ebce1447a Mon Sep 17 00:00:00 2001
From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com>
Date: Sun, 5 Apr 2026 16:34:48 -0700
Subject: [PATCH 06/10] fix boundary artifacts and remove AI fluff

---
 lib/jpegli/sharp_yuv.cc | 189 ++++++++++++++++++++++------------------
 1 file changed, 102 insertions(+), 87 deletions(-)

diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc
index 154d2370..d040c70d 100644
--- a/lib/jpegli/sharp_yuv.cc
+++ b/lib/jpegli/sharp_yuv.cc
@@ -12,10 +12,8 @@
 #include <cstdint>
 #include <cstring>
 
-
 #include "lib/jpegli/common.h"
 #include "lib/jpegli/encode_internal.h"
-#include "lib/jpegli/simd.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "lib/jpegli/sharp_yuv.cc"
@@ -34,8 +32,8 @@ using hwy::HWY_NAMESPACE::ConvertTo;
 using hwy::HWY_NAMESPACE::Div;
 using hwy::HWY_NAMESPACE::Floor;
 using hwy::HWY_NAMESPACE::IfThenElse;
-using hwy::HWY_NAMESPACE::Le;
 using hwy::HWY_NAMESPACE::Lanes;
+using hwy::HWY_NAMESPACE::Le;
 using hwy::HWY_NAMESPACE::LoadDup128;
 using hwy::HWY_NAMESPACE::LoadInterleaved2;
 using hwy::HWY_NAMESPACE::LoadU;
@@ -85,8 +83,7 @@ inline void YCbCrToRGB_Scalar(float y, float cb, float cr, float* r, float* g,
   cb -= 128.0f;
   cr -= 128.0f;
   *r = y + cr * 1.402f;
-  *g = y + cb * (-0.114f * 1.772f / 0.587f) +
-       cr * (-0.299f * 1.402f / 0.587f);
+  *g = y + cb * (-0.114f * 1.772f / 0.587f) + cr * (-0.299f * 1.402f / 0.587f);
   *b = y + cb * 1.772f;
 }
 
@@ -139,11 +136,11 @@ HWY_INLINE V EvalRationalPoly(const DF df, const V x, const T (&p)[NP],
 template <class DF, class V>
 HWY_INLINE V FastLog2f(const DF df, V x) {
   HWY_ALIGN const float p[4 * 3] = {HWY_REP4(-1.8503833400518310E-06f),
-                                     HWY_REP4(1.4287160470083755E+00f),
-                                     HWY_REP4(7.4245873327820566E-01f)};
+                                    HWY_REP4(1.4287160470083755E+00f),
+                                    HWY_REP4(7.4245873327820566E-01f)};
   HWY_ALIGN const float q[4 * 3] = {HWY_REP4(9.9032814277590719E-01f),
-                                     HWY_REP4(1.0096718572241148E+00f),
-                                     HWY_REP4(1.7409343003366853E-01f)};
+                                    HWY_REP4(1.0096718572241148E+00f),
+                                    HWY_REP4(1.7409343003366853E-01f)};
   const Rebind<int32_t, DF> di;
   const auto x_bits = BitCast(di, x);
   const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));
@@ -220,6 +217,8 @@ HWY_INLINE void YCbCrToRGB_SIMD(D d, V y, V cb, V cr, V* r, V* g, V* b) {
 
 template <class D, class V>
 HWY_INLINE V TempLuma_SIMD(D d, V r, V g, V b) {
+  // using bt 709 values (0.2126f, 0.7152f, 0.0722f) may be more perceptually
+  // accurate, but it needs more visual testing.
   return MulAdd(Set(d, 0.299f), r,
                 MulAdd(Set(d, 0.587f), g, Mul(Set(d, 0.114f), b)));
 }
@@ -244,26 +243,29 @@ HWY_INLINE void RGBToYCbCr_SIMD(D d, V r, V g, V b, V* y, V* cb, V* cr) {
   *cr = MulAdd(Sub(Mul(r, kDiffR), y_base), kNormR, v128);
 }
 
-// ---------------------------------------------------------------------------
 // SIMD box-average downsample of linear RGB -> YCbCr chroma
-// Processes each row of down_width pixels. For each output pixel, averages
-// a 2x2 block of linear RGB, converts back to gamma, then to YCbCr.
-// ---------------------------------------------------------------------------
-HWY_INLINE void BoxAverageDownsampleRow(const float* lin_r0,
-                                        const float* lin_g0,
-                                        const float* lin_b0,
-                                        const float* lin_r1,
-                                        const float* lin_g1,
-                                        const float* lin_b1,
-                                        size_t down_width, size_t xsize_padded,
-                                        float* out_cb, float* out_cr) {
+HWY_INLINE void BoxDownsampleRow(const float* lin_r0, const float* lin_g0,
+                                 const float* lin_b0, const float* lin_r1,
+                                 const float* lin_g1, const float* lin_b1,
+                                 size_t down_width, size_t xsize_padded,
+                                 float* out_cb, float* out_cr) {
   const size_t N = hwy::HWY_NAMESPACE::Lanes(d);
   const auto quarter = Set(d, 0.25f);
   size_t x = 0;
 
   for (; x + N <= down_width; x += N) {
-    Vec<D> r0e, r0o, g0e, g0o, b0e, b0o;
-    Vec<D> r1e, r1o, g1e, g1o, b1e, b1o;
+    Vec<D> r0e;
+    Vec<D> r0o;
+    Vec<D> g0e;
+    Vec<D> g0o;
+    Vec<D> b0e;
+    Vec<D> b0o;
+    Vec<D> r1e;
+    Vec<D> r1o;
+    Vec<D> g1e;
+    Vec<D> g1o;
+    Vec<D> b1e;
+    Vec<D> b1o;
     LoadInterleaved2(d, lin_r0 + 2 * x, r0e, r0o);
     LoadInterleaved2(d, lin_g0 + 2 * x, g0e, g0o);
     LoadInterleaved2(d, lin_b0 + 2 * x, b0e, b0o);
@@ -279,7 +281,9 @@ HWY_INLINE void BoxAverageDownsampleRow(const float* lin_r0,
     auto avg_g = LinearToGamma_SIMD(d, Mul(sum_g, quarter));
     auto avg_b = LinearToGamma_SIMD(d, Mul(sum_b, quarter));
 
-    Vec<D> vy, vcb, vcr;
+    Vec<D> vy;
+    Vec<D> vcb;
+    Vec<D> vcr;
     RGBToYCbCr_SIMD(d, avg_r, avg_g, avg_b, &vy, &vcb, &vcr);
 
     StoreU(vcb, d, out_cb + x);
@@ -288,7 +292,9 @@ HWY_INLINE void BoxAverageDownsampleRow(const float* lin_r0,
 
   // Scalar tail
   for (; x < down_width; ++x) {
-    float lr_sum = 0, lg_sum = 0, lb_sum = 0;
+    float lr_sum = 0;
+    float lg_sum = 0;
+    float lb_sum = 0;
     for (int iy = 0; iy < 2; ++iy) {
       const float* src_r = (iy == 0) ? lin_r0 : lin_r1;
       const float* src_g = (iy == 0) ? lin_g0 : lin_g1;
@@ -303,19 +309,16 @@ HWY_INLINE void BoxAverageDownsampleRow(const float* lin_r0,
     float avg_r = LinearToGamma(lr_sum * 0.25f);
     float avg_g = LinearToGamma(lg_sum * 0.25f);
     float avg_b = LinearToGamma(lb_sum * 0.25f);
-    float avg_y, avg_cb, avg_cr;
+    float avg_y;
+    float avg_cb;
+    float avg_cr;
     RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr);
     out_cb[x] = avg_cb;
     out_cr[x] = avg_cr;
   }
 }
 
-// ---------------------------------------------------------------------------
 // SIMD tent-filter upsample of one chroma row
-// For each x in [0, down_width), computes 4 upsampled values using the
-// 9/3/3/1 tent filter, and writes them to the 4 corresponding full-res
-// positions.
-// ---------------------------------------------------------------------------
 HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row,
                                 const float* next_row, size_t down_width,
                                 float* out0, float* out1) {
@@ -373,10 +376,6 @@ HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row,
   }
 }
 
-// ---------------------------------------------------------------------------
-// Main SharpYUV implementation
-// ---------------------------------------------------------------------------
-
 void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
   jpeg_comp_master* m = cinfo->master;
   const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
@@ -399,8 +398,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
       float* rows_in[MAX_SAMP_FACTOR];
       auto& input = *m->smooth_input[c];
       auto& output = *m->raw_data[c];
-      for (size_t y_in = y0, y_out = y_out0; y_in < y1;
-           y_in += vf, ++y_out) {
+      for (size_t y_in = y0, y_out = y_out0; y_in < y1; y_in += vf, ++y_out) {
         for (int iy = 0; iy < vf; ++iy) {
           rows_in[iy] = input.Row(y_in + iy);
         }
@@ -412,23 +410,28 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
 
   const size_t down_width = xsize_padded / 2;
   const size_t down_height = iMCU_height / 2;
+  const size_t actual_height =
+      (y0 >= cinfo->image_height)
+          ? 1
+          : std::min(y1, static_cast<size_t>(cinfo->image_height)) - y0;
+  const size_t actual_down_height = (actual_height + 1) / 2;
   const size_t high_size = xsize_padded * iMCU_height;
 
   // Hoisted workspace buffers from master struct.
   jpegli::RowBuffer<float>* ws = m->sharpyuv_workspace;
-  auto target_y_ws = &ws[0];
-  auto best_y_ws = &ws[1];
-  auto tmp_lin_r_ws = &ws[2];
-  auto tmp_lin_g_ws = &ws[3];
-  auto tmp_lin_b_ws = &ws[4];
-  auto up_cb_ws = &ws[5];
-  auto up_cr_ws = &ws[6];
-  auto target_cb_ws = &ws[7];
-  auto target_cr_ws = &ws[8];
-  auto best_cb_ws = &ws[9];
-  auto best_cr_ws = &ws[10];
-  auto err_cb_ws = &ws[11];
-  auto err_cr_ws = &ws[12];
+  auto* target_y_ws = &ws[0];
+  auto* best_y_ws = &ws[1];
+  auto* lin_r_ws = &ws[2];
+  auto* lin_g_ws = &ws[3];
+  auto* lin_b_ws = &ws[4];
+  auto* up_cb_ws = &ws[5];
+  auto* up_cr_ws = &ws[6];
+  auto* target_cb_ws = &ws[7];
+  auto* target_cr_ws = &ws[8];
+  auto* best_cb_ws = &ws[9];
+  auto* best_cr_ws = &ws[10];
+  auto* err_cb_ws = &ws[11];
+  auto* err_cr_ws = &ws[12];
 
   const size_t N = hwy::HWY_NAMESPACE::Lanes(d);
 
@@ -436,21 +439,25 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
   // Read pre-transform sRGB directly (if available) or convert from YCbCr.
   for (size_t y = 0; y < iMCU_height; ++y) {
     const float* row_y = m->smooth_input[0]->Row(y0 + y);
-    const float* row_cb = have_rgb_input ? nullptr : m->smooth_input[1]->Row(y0 + y);
-    const float* row_cr = have_rgb_input ? nullptr : m->smooth_input[2]->Row(y0 + y);
+    const float* row_cb =
+        have_rgb_input ? nullptr : m->smooth_input[1]->Row(y0 + y);
+    const float* row_cr =
+        have_rgb_input ? nullptr : m->smooth_input[2]->Row(y0 + y);
     const float* row_r = have_rgb_input ? m->input_rgb[0].Row(y0 + y) : nullptr;
     const float* row_g = have_rgb_input ? m->input_rgb[1].Row(y0 + y) : nullptr;
     const float* row_b = have_rgb_input ? m->input_rgb[2].Row(y0 + y) : nullptr;
     float* ty_out = target_y_ws->Row(y);
     float* by_out = best_y_ws->Row(y);
-    float* lr_out = tmp_lin_r_ws->Row(y);
-    float* lg_out = tmp_lin_g_ws->Row(y);
-    float* lb_out = tmp_lin_b_ws->Row(y);
+    float* lr_out = lin_r_ws->Row(y);
+    float* lg_out = lin_g_ws->Row(y);
+    float* lb_out = lin_b_ws->Row(y);
 
     size_t x = 0;
     for (; x + N <= xsize_padded; x += N) {
       auto py = LoadU(d, row_y + x);
-      decltype(py) r, g, b;
+      decltype(py) r;
+      decltype(py) g;
+      decltype(py) b;
       if (have_rgb_input) {
         r = LoadU(d, row_r + x);
         g = LoadU(d, row_g + x);
@@ -476,7 +483,9 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
     }
     for (; x < xsize_padded; ++x) {
       float py = row_y[x];
-      float r, g, b;
+      float r;
+      float g;
+      float b;
       if (have_rgb_input) {
         r = row_r[x];
         g = row_g[x];
@@ -489,6 +498,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
       float lr = GammaToLinear(r);
       float lg = GammaToLinear(g);
       float lb = GammaToLinear(b);
+      // see TempLuma_SIMD for bt 709 coefficients
       float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb;
       ty_out[x] = LinearToGamma(ly);
       by_out[x] = py;
@@ -499,25 +509,25 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
   }
 
   // Box-average downsample of linear RGB to target Cb/Cr.
-  for (size_t y = 0; y < down_height; ++y) {
-    const float* r0 = tmp_lin_r_ws->Row(y * 2 + 0);
-    const float* g0 = tmp_lin_g_ws->Row(y * 2 + 0);
-    const float* b0 = tmp_lin_b_ws->Row(y * 2 + 0);
-    const float* r1 = tmp_lin_r_ws->Row(y * 2 + 1);
-    const float* g1 = tmp_lin_g_ws->Row(y * 2 + 1);
-    const float* b1 = tmp_lin_b_ws->Row(y * 2 + 1);
+  for (size_t y = 0; y < actual_down_height; ++y) {
+    const float* r0 = lin_r_ws->Row(y * 2 + 0);
+    const float* g0 = lin_g_ws->Row(y * 2 + 0);
+    const float* b0 = lin_b_ws->Row(y * 2 + 0);
+    const float* r1 = lin_r_ws->Row(y * 2 + 1);
+    const float* g1 = lin_g_ws->Row(y * 2 + 1);
+    const float* b1 = lin_b_ws->Row(y * 2 + 1);
     float* dcb = target_cb_ws->Row(y);
     float* dcr = target_cr_ws->Row(y);
 
-    BoxAverageDownsampleRow(r0, g0, b0, r1, g1, b1, down_width, xsize_padded,
-                            dcb, dcr);
+    BoxDownsampleRow(r0, g0, b0, r1, g1, b1, down_width, xsize_padded, dcb,
+                     dcr);
 
     // Also initialize best_cb/cr = target_cb/cr
     memcpy(best_cb_ws->Row(y), dcb, down_width * sizeof(float));
     memcpy(best_cr_ws->Row(y), dcr, down_width * sizeof(float));
   }
 
-  for (size_t y = 0; y < down_height; ++y) {
+  for (size_t y = 0; y < actual_down_height; ++y) {
     best_cb_ws->PadRow(y, down_width, 1);
     best_cr_ws->PadRow(y, down_width, 1);
   }
@@ -532,41 +542,43 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
     auto acc_y = Zero(d);
 
     // Padding for upsampling (all rows for this iteration)
-    for (size_t y = 0; y < down_height; ++y) {
+    for (size_t y = 0; y < actual_down_height; ++y) {
       best_cb_ws->PadRow(y, down_width, 1);
       best_cr_ws->PadRow(y, down_width, 1);
     }
 
     // Pass 1: Upsample current best_cb/cr
-    for (size_t y = 0; y < down_height; ++y) {
+    for (size_t y = 0; y < actual_down_height; ++y) {
       size_t y_prev = (y == 0) ? 0 : y - 1;
-      size_t y_next = (y + 1 == down_height) ? y : y + 1;
+      size_t y_next = (y + 1 == actual_down_height) ? y : y + 1;
       TentUpsampleRow(best_cb_ws->Row(y_prev), best_cb_ws->Row(y),
-                      best_cb_ws->Row(y_next), down_width,
-                      up_cb_ws->Row(y * 2), up_cb_ws->Row(y * 2 + 1));
+                      best_cb_ws->Row(y_next), down_width, up_cb_ws->Row(y * 2),
+                      up_cb_ws->Row(y * 2 + 1));
       TentUpsampleRow(best_cr_ws->Row(y_prev), best_cr_ws->Row(y),
-                      best_cr_ws->Row(y_next), down_width,
-                      up_cr_ws->Row(y * 2), up_cr_ws->Row(y * 2 + 1));
+                      best_cr_ws->Row(y_next), down_width, up_cr_ws->Row(y * 2),
+                      up_cr_ws->Row(y * 2 + 1));
     }
 
     // Pass 2: Evaluate Y and Prepare for Chroma update
-    for (size_t y = 0; y < down_height; ++y) {
+    for (size_t y = 0; y < actual_down_height; ++y) {
       for (int iy = 0; iy < 2; ++iy) {
         size_t yy = y * 2 + iy;
         float* row_best_y = best_y_ws->Row(yy);
         const float* row_up_cb = up_cb_ws->Row(yy);
         const float* row_up_cr = up_cr_ws->Row(yy);
         const float* row_target_y = target_y_ws->Row(yy);
-        float* row_lin_r = tmp_lin_r_ws->Row(yy);
-        float* row_lin_g = tmp_lin_g_ws->Row(yy);
-        float* row_lin_b = tmp_lin_b_ws->Row(yy);
+        float* row_lin_r = lin_r_ws->Row(yy);
+        float* row_lin_g = lin_g_ws->Row(yy);
+        float* row_lin_b = lin_b_ws->Row(yy);
 
         size_t x = 0;
         for (; x + N <= xsize_padded; x += N) {
           auto py = LoadU(d, row_best_y + x);
           auto ucb = LoadU(d, row_up_cb + x);
           auto ucr = LoadU(d, row_up_cr + x);
-          Vec<D> r, g, b;
+          Vec<D> r;
+          Vec<D> g;
+          Vec<D> b;
           YCbCrToRGB_SIMD(d, py, ucb, ucr, &r, &g, &b);
           auto lin_r = GammaToLinear_SIMD(d, r);
           auto lin_g = GammaToLinear_SIMD(d, g);
@@ -583,11 +595,14 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
         }
         for (; x < xsize_padded; ++x) {
           float py = row_best_y[x];
-          float r, g, b;
+          float r;
+          float g;
+          float b;
           YCbCrToRGB_Scalar(py, row_up_cb[x], row_up_cr[x], &r, &g, &b);
           float lr = GammaToLinear(r);
           float lg = GammaToLinear(g);
           float lb = GammaToLinear(b);
+          // see TempLuma_SIMD for bt 709 coefficients
           float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb;
           float ey = row_target_y[x] - LinearToGamma(ly);
           diff_y_sum += std::abs(ey);
@@ -605,11 +620,10 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
       float* row_tgt_cb = target_cb_ws->Row(y);
       float* row_tgt_cr = target_cr_ws->Row(y);
 
-      BoxAverageDownsampleRow(
-          tmp_lin_r_ws->Row(y * 2 + 0), tmp_lin_g_ws->Row(y * 2 + 0),
-          tmp_lin_b_ws->Row(y * 2 + 0), tmp_lin_r_ws->Row(y * 2 + 1),
-          tmp_lin_g_ws->Row(y * 2 + 1), tmp_lin_b_ws->Row(y * 2 + 1), down_width,
-          xsize_padded, row_err_cb, row_err_cr);
+      BoxDownsampleRow(lin_r_ws->Row(y * 2 + 0), lin_g_ws->Row(y * 2 + 0),
+                       lin_b_ws->Row(y * 2 + 0), lin_r_ws->Row(y * 2 + 1),
+                       lin_g_ws->Row(y * 2 + 1), lin_b_ws->Row(y * 2 + 1),
+                       down_width, xsize_padded, row_err_cb, row_err_cr);
 
       size_t x = 0;
       for (; x + N <= down_width; x += N) {
@@ -643,10 +657,11 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
 
   const size_t y_out0 = y0 / 2;
   for (size_t y_out = 0; y_out < down_height; ++y_out) {
+    size_t copy_y = std::min(y_out, actual_down_height - 1);
     float* row_out_cb = m->raw_data[1]->Row(y_out0 + y_out);
     float* row_out_cr = m->raw_data[2]->Row(y_out0 + y_out);
-    memcpy(row_out_cb, best_cb_ws->Row(y_out), down_width * sizeof(float));
-    memcpy(row_out_cr, best_cr_ws->Row(y_out), down_width * sizeof(float));
+    memcpy(row_out_cb, best_cb_ws->Row(copy_y), down_width * sizeof(float));
+    memcpy(row_out_cr, best_cr_ws->Row(copy_y), down_width * sizeof(float));
   }
 }
 

From 7f0931fb9c06d917c799f046dce83604d8572556 Mon Sep 17 00:00:00 2001
From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com>
Date: Sat, 11 Apr 2026 19:44:22 -0700
Subject: [PATCH 07/10] +1 MP/s

---
 lib/jpegli/sharp_yuv.cc | 62 +++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 33 deletions(-)

diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc
index d040c70d..850542a6 100644
--- a/lib/jpegli/sharp_yuv.cc
+++ b/lib/jpegli/sharp_yuv.cc
@@ -31,6 +31,7 @@ using hwy::HWY_NAMESPACE::BitCast;
 using hwy::HWY_NAMESPACE::ConvertTo;
 using hwy::HWY_NAMESPACE::Div;
 using hwy::HWY_NAMESPACE::Floor;
+using hwy::HWY_NAMESPACE::GetLane;
 using hwy::HWY_NAMESPACE::IfThenElse;
 using hwy::HWY_NAMESPACE::Lanes;
 using hwy::HWY_NAMESPACE::Le;
@@ -60,23 +61,7 @@ constexpr D d;
 // Scalar helpers (used for tail elements only)
 // ---------------------------------------------------------------------------
 
-inline float GammaToLinear(float v) {
-  float n = std::max(0.0f, std::min(255.0f, v)) * (1.0f / 255.0f);
-  if (n <= 0.04045f) {
-    return n / 12.92f;
-  } else {
-    return std::pow((n + 0.055f) / 1.055f, 2.4f);
-  }
-}
 
-inline float LinearToGamma(float v) {
-  if (v <= 0.0031308f) {
-    return v * 12.92f * 255.0f;
-  } else {
-    return (1.055f * std::pow(std::max(0.0f, v), 1.0f / 2.4f) - 0.055f) *
-           255.0f;
-  }
-}
 
 inline void YCbCrToRGB_Scalar(float y, float cb, float cr, float* r, float* g,
                               float* b) {
@@ -94,8 +79,6 @@ inline void RGBToYCbCr_Scalar(float r, float g, float b, float* y, float* cb,
   const float kB = 0.114f;
   const float kAmpR = 0.701f;
   const float kAmpB = 0.886f;
-  const float kDiffR = kAmpR + kR;
-  const float kDiffB = kAmpB + kB;
   const float kNormR = 1.0f / (kAmpR + kG + kB);
   const float kNormB = 1.0f / (kR + kG + kAmpB);
 
@@ -104,8 +87,8 @@ inline void RGBToYCbCr_Scalar(float r, float g, float b, float* y, float* cb,
   float b_base = b * kB;
   float y_base = r_base + g_base + b_base;
   *y = y_base;
-  *cb = (b * kDiffB - y_base) * kNormB + 128.0f;
-  *cr = (r * kDiffR - y_base) * kNormR + 128.0f;
+  *cb = (b - y_base) * kNormB + 128.0f;
+  *cr = (r - y_base) * kNormR + 128.0f;
 }
 
 // ---------------------------------------------------------------------------
@@ -204,23 +187,38 @@ HWY_INLINE V LinearToGamma_SIMD(D d, V v) {
   return IfThenElse(is_low, low_val, high_val);
 }
 
+inline float GammaToLinear(float v) {
+  HWY_CAPPED(float, 1) d1;
+  return GetLane(GammaToLinear_SIMD(d1, Set(d1, v)));
+}
+
+inline float LinearToGamma(float v) {
+  HWY_CAPPED(float, 1) d1;
+  return GetLane(LinearToGamma_SIMD(d1, Set(d1, v)));
+}
+
 template <class D, class V>
 HWY_INLINE void YCbCrToRGB_SIMD(D d, V y, V cb, V cr, V* r, V* g, V* b) {
-  auto v128 = Set(d, 128.0f);
+  const auto v128 = Set(d, 128.0f);
   cb = Sub(cb, v128);
   cr = Sub(cr, v128);
-  *r = MulAdd(cr, Set(d, 1.402f), y);
-  *g = MulAdd(cb, Set(d, -0.114f * 1.772f / 0.587f),
-              MulAdd(cr, Set(d, -0.299f * 1.402f / 0.587f), y));
-  *b = MulAdd(cb, Set(d, 1.772f), y);
+  const auto kCrR = Set(d, 1.402f);
+  const auto kCbG = Set(d, -0.114f * 1.772f / 0.587f);
+  const auto kCrG = Set(d, -0.299f * 1.402f / 0.587f);
+  const auto kCbB = Set(d, 1.772f);
+  *r = MulAdd(cr, kCrR, y);
+  *g = MulAdd(cb, kCbG, MulAdd(cr, kCrG, y));
+  *b = MulAdd(cb, kCbB, y);
 }
 
 template <class D, class V>
 HWY_INLINE V TempLuma_SIMD(D d, V r, V g, V b) {
   // using bt 709 values (0.2126f, 0.7152f, 0.0722f) may be more perceptually
   // accurate, but it needs more visual testing.
-  return MulAdd(Set(d, 0.299f), r,
-                MulAdd(Set(d, 0.587f), g, Mul(Set(d, 0.114f), b)));
+  const auto kR = Set(d, 0.299f);
+  const auto kG = Set(d, 0.587f);
+  const auto kB = Set(d, 0.114f);
+  return MulAdd(kR, r, MulAdd(kG, g, Mul(kB, b)));
 }
 
 // Vectorized RGBToYCbCr: computes Y, Cb, Cr from gamma-space R, G, B vectors.
@@ -229,18 +227,16 @@ HWY_INLINE void RGBToYCbCr_SIMD(D d, V r, V g, V b, V* y, V* cb, V* cr) {
   const auto kR = Set(d, 0.299f);
   const auto kG = Set(d, 0.587f);
   const auto kB = Set(d, 0.114f);
-  const auto kDiffR = Set(d, 0.701f + 0.299f);
-  const auto kDiffB = Set(d, 0.886f + 0.114f);
   const auto kNormR = Set(d, 1.0f / (0.701f + 0.587f + 0.114f));
   const auto kNormB = Set(d, 1.0f / (0.299f + 0.587f + 0.886f));
   const auto v128 = Set(d, 128.0f);
 
   auto y_base = MulAdd(kR, r, MulAdd(kG, g, Mul(kB, b)));
   *y = y_base;
-  // cb = (b * kDiffB - y_base) * kNormB + 128
-  *cb = MulAdd(Sub(Mul(b, kDiffB), y_base), kNormB, v128);
-  // cr = (r * kDiffR - y_base) * kNormR + 128
-  *cr = MulAdd(Sub(Mul(r, kDiffR), y_base), kNormR, v128);
+  // cb = (b - y_base) * kNormB + 128
+  *cb = MulAdd(Sub(b, y_base), kNormB, v128);
+  // cr = (r - y_base) * kNormR + 128
+  *cr = MulAdd(Sub(r, y_base), kNormR, v128);
 }
 
 // SIMD box-average downsample of linear RGB -> YCbCr chroma

From dbe53bd0d9407176a2d97d58640ea0287994e476 Mon Sep 17 00:00:00 2001
From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com>
Date: Fri, 24 Apr 2026 03:42:50 -0700
Subject: [PATCH 08/10] minor stuff

---
 tools/cjpegli.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/cjpegli.cc b/tools/cjpegli.cc
index 35008edc..e48b4fa1 100644
--- a/tools/cjpegli.cc
+++ b/tools/cjpegli.cc
@@ -77,7 +77,7 @@ struct Args {
     cmdline->AddOptionFlag('\0', "xyb", "Convert to XYB colorspace",
                            &settings.xyb, &SetBooleanTrue, 1);
 
-    cmdline->AddOptionFlag('\0', "sharpyuv",
+    cmdline->AddOptionFlag('\0', "sharp_yuv",
                            "Use true linear-light sharp YUV downsampling for 4:2:0",
                            &settings.use_sharpyuv, &SetBooleanTrue, 1);
 

From 8bcfe96858b27030554e1fb130ae2f67564b1f51 Mon Sep 17 00:00:00 2001
From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com>
Date: Sun, 26 Apr 2026 02:52:36 -0700
Subject: [PATCH 09/10] remove dead code & fix comments

another small speed up of +2 mp/s
---
 lib/jpegli/sharp_yuv.cc | 163 +++-------------------------------------
 lib/jpegli/sharp_yuv.h  |   7 +-
 2 files changed, 14 insertions(+), 156 deletions(-)

diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc
index 850542a6..694bde38 100644
--- a/lib/jpegli/sharp_yuv.cc
+++ b/lib/jpegli/sharp_yuv.cc
@@ -7,7 +7,6 @@
 #include "lib/jpegli/sharp_yuv.h"
 
 #include <algorithm>
-#include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
@@ -26,12 +25,10 @@ namespace HWY_NAMESPACE {
 
 using hwy::HWY_NAMESPACE::Abs;
 using hwy::HWY_NAMESPACE::Add;
-using hwy::HWY_NAMESPACE::And;
 using hwy::HWY_NAMESPACE::BitCast;
 using hwy::HWY_NAMESPACE::ConvertTo;
 using hwy::HWY_NAMESPACE::Div;
 using hwy::HWY_NAMESPACE::Floor;
-using hwy::HWY_NAMESPACE::GetLane;
 using hwy::HWY_NAMESPACE::IfThenElse;
 using hwy::HWY_NAMESPACE::Lanes;
 using hwy::HWY_NAMESPACE::Le;
@@ -42,7 +39,6 @@ using hwy::HWY_NAMESPACE::Max;
 using hwy::HWY_NAMESPACE::Min;
 using hwy::HWY_NAMESPACE::Mul;
 using hwy::HWY_NAMESPACE::MulAdd;
-using hwy::HWY_NAMESPACE::Or;
 using hwy::HWY_NAMESPACE::Rebind;
 using hwy::HWY_NAMESPACE::ReduceSum;
 using hwy::HWY_NAMESPACE::Set;
@@ -57,45 +53,7 @@ using hwy::HWY_NAMESPACE::Zero;
 using D = HWY_CAPPED(float, 8);
 constexpr D d;
 
-// ---------------------------------------------------------------------------
-// Scalar helpers (used for tail elements only)
-// ---------------------------------------------------------------------------
-
-
-
-inline void YCbCrToRGB_Scalar(float y, float cb, float cr, float* r, float* g,
-                              float* b) {
-  cb -= 128.0f;
-  cr -= 128.0f;
-  *r = y + cr * 1.402f;
-  *g = y + cb * (-0.114f * 1.772f / 0.587f) + cr * (-0.299f * 1.402f / 0.587f);
-  *b = y + cb * 1.772f;
-}
-
-inline void RGBToYCbCr_Scalar(float r, float g, float b, float* y, float* cb,
-                              float* cr) {
-  const float kR = 0.299f;
-  const float kG = 0.587f;
-  const float kB = 0.114f;
-  const float kAmpR = 0.701f;
-  const float kAmpB = 0.886f;
-  const float kNormR = 1.0f / (kAmpR + kG + kB);
-  const float kNormB = 1.0f / (kR + kG + kAmpB);
-
-  float r_base = r * kR;
-  float g_base = g * kG;
-  float b_base = b * kB;
-  float y_base = r_base + g_base + b_base;
-  *y = y_base;
-  *cb = (b - y_base) * kNormB + 128.0f;
-  *cr = (r - y_base) * kNormR + 128.0f;
-}
-
-// ---------------------------------------------------------------------------
 // Fast log2/exp2 approximations (same approach as adaptive_quantization.cc)
-// ---------------------------------------------------------------------------
-
-// Evaluates a rational polynomial p(x)/q(x) via Horner's scheme.
 template <size_t NP, size_t NQ, class DF, class V, typename T>
 HWY_INLINE V EvalRationalPoly(const DF df, const V x, const T (&p)[NP],
                               const T (&q)[NQ]) {
@@ -157,10 +115,7 @@ HWY_INLINE V FastPowf(const DF df, V x, float power) {
   return FastPow2f(df, Mul(FastLog2f(df, x), Set(df, power)));
 }
 
-// ---------------------------------------------------------------------------
 // Highway SIMD gamma helpers using fast polynomial approximations
-// ---------------------------------------------------------------------------
-
 template <class D, class V>
 HWY_INLINE V GammaToLinear_SIMD(D d, V v) {
   auto v_zero = Set(d, 0.0f);
@@ -187,16 +142,6 @@ HWY_INLINE V LinearToGamma_SIMD(D d, V v) {
   return IfThenElse(is_low, low_val, high_val);
 }
 
-inline float GammaToLinear(float v) {
-  HWY_CAPPED(float, 1) d1;
-  return GetLane(GammaToLinear_SIMD(d1, Set(d1, v)));
-}
-
-inline float LinearToGamma(float v) {
-  HWY_CAPPED(float, 1) d1;
-  return GetLane(LinearToGamma_SIMD(d1, Set(d1, v)));
-}
-
 template <class D, class V>
 HWY_INLINE void YCbCrToRGB_SIMD(D d, V y, V cb, V cr, V* r, V* g, V* b) {
   const auto v128 = Set(d, 128.0f);
@@ -239,7 +184,8 @@ HWY_INLINE void RGBToYCbCr_SIMD(D d, V r, V g, V b, V* y, V* cb, V* cr) {
   *cr = MulAdd(Sub(r, y_base), kNormR, v128);
 }
 
-// SIMD box-average downsample of linear RGB -> YCbCr chroma
+// Box-averages a 2x2 block of linear-light RGB pixels and converts to
+// gamma-space YCbCr, writing only the Cb and Cr chroma values.
 HWY_INLINE void BoxDownsampleRow(const float* lin_r0, const float* lin_g0,
                                  const float* lin_b0, const float* lin_r1,
                                  const float* lin_g1, const float* lin_b1,
@@ -286,35 +232,11 @@ HWY_INLINE void BoxDownsampleRow(const float* lin_r0, const float* lin_g0,
     StoreU(vcr, d, out_cr + x);
   }
 
-  // Scalar tail
-  for (; x < down_width; ++x) {
-    float lr_sum = 0;
-    float lg_sum = 0;
-    float lb_sum = 0;
-    for (int iy = 0; iy < 2; ++iy) {
-      const float* src_r = (iy == 0) ? lin_r0 : lin_r1;
-      const float* src_g = (iy == 0) ? lin_g0 : lin_g1;
-      const float* src_b = (iy == 0) ? lin_b0 : lin_b1;
-      for (int ix = 0; ix < 2; ++ix) {
-        size_t si = x * 2 + ix;
-        lr_sum += src_r[si];
-        lg_sum += src_g[si];
-        lb_sum += src_b[si];
-      }
-    }
-    float avg_r = LinearToGamma(lr_sum * 0.25f);
-    float avg_g = LinearToGamma(lg_sum * 0.25f);
-    float avg_b = LinearToGamma(lb_sum * 0.25f);
-    float avg_y;
-    float avg_cb;
-    float avg_cr;
-    RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr);
-    out_cb[x] = avg_cb;
-    out_cr[x] = avg_cr;
-  }
+
 }
 
-// SIMD tent-filter upsample of one chroma row
+// 2x tent-filter upsample of one downsampled chroma row into two
+// full-resolution output rows (out0 = even output row, out1 = odd output row).
 HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row,
                                 const float* next_row, size_t down_width,
                                 float* out0, float* out1) {
@@ -348,28 +270,7 @@ HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row,
     StoreInterleaved2(val_bl, val_br, d, out1 + 2 * x);
   }
 
-  // Scalar tail
-  for (; x < down_width; ++x) {
-    size_t x_prev = (x == 0) ? 0 : x - 1;
-    size_t x_next = (x + 1 == down_width) ? x : x + 1;
-
-    float cc = cur_row[x];
-    float ct = prev_row[x];
-    float cb = next_row[x];
-    float cl = cur_row[x_prev];
-    float cr = cur_row[x_next];
-    float ctl = prev_row[x_prev];
-    float ctr = prev_row[x_next];
-    float cbl = next_row[x_prev];
-    float cbr = next_row[x_next];
-
-    float c9 = cc * 9.0f;
-    float inv16 = 1.0f / 16.0f;
-    out0[x * 2 + 0] = (c9 + ct * 3.0f + cl * 3.0f + ctl) * inv16;
-    out0[(x * 2 + 1)] = (c9 + ct * 3.0f + cr * 3.0f + ctr) * inv16;
-    out1[x * 2 + 0] = (c9 + cb * 3.0f + cl * 3.0f + cbl) * inv16;
-    out1[(x * 2 + 1)] = (c9 + cb * 3.0f + cr * 3.0f + cbr) * inv16;
-  }
+
 }
 
 void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
@@ -413,7 +314,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
   const size_t actual_down_height = (actual_height + 1) / 2;
   const size_t high_size = xsize_padded * iMCU_height;
 
-  // Hoisted workspace buffers from master struct.
+  // Aliases to workspace buffers in master struct.
   jpegli::RowBuffer<float>* ws = m->sharpyuv_workspace;
   auto* target_y_ws = &ws[0];
   auto* best_y_ws = &ws[1];
@@ -477,31 +378,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
       StoreU(lin_g, d, lg_out + x);
       StoreU(lin_b, d, lb_out + x);
     }
-    for (; x < xsize_padded; ++x) {
-      float py = row_y[x];
-      float r;
-      float g;
-      float b;
-      if (have_rgb_input) {
-        r = row_r[x];
-        g = row_g[x];
-        b = row_b[x];
-      } else {
-        float pcb = row_cb[x];
-        float pcr = row_cr[x];
-        YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b);
-      }
-      float lr = GammaToLinear(r);
-      float lg = GammaToLinear(g);
-      float lb = GammaToLinear(b);
-      // see TempLuma_SIMD for bt 709 coefficients
-      float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb;
-      ty_out[x] = LinearToGamma(ly);
-      by_out[x] = py;
-      lr_out[x] = lr;
-      lg_out[x] = lg;
-      lb_out[x] = lb;
-    }
+
   }
 
   // Box-average downsample of linear RGB to target Cb/Cr.
@@ -589,24 +466,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
           StoreU(lin_g, d, row_lin_g + x);
           StoreU(lin_b, d, row_lin_b + x);
         }
-        for (; x < xsize_padded; ++x) {
-          float py = row_best_y[x];
-          float r;
-          float g;
-          float b;
-          YCbCrToRGB_Scalar(py, row_up_cb[x], row_up_cr[x], &r, &g, &b);
-          float lr = GammaToLinear(r);
-          float lg = GammaToLinear(g);
-          float lb = GammaToLinear(b);
-          // see TempLuma_SIMD for bt 709 coefficients
-          float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb;
-          float ey = row_target_y[x] - LinearToGamma(ly);
-          diff_y_sum += std::abs(ey);
-          row_best_y[x] = std::max(0.0f, std::min(255.0f, py + ey));
-          row_lin_r[x] = lr;
-          row_lin_g[x] = lg;
-          row_lin_b[x] = lb;
-        }
+
       }
 
       float* row_best_cb = best_cb_ws->Row(y);
@@ -632,10 +492,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
         StoreU(Add(bcb, Sub(tcb, dcb)), d, row_best_cb + x);
         StoreU(Add(bcr, Sub(tcr, dcr)), d, row_best_cr + x);
       }
-      for (; x < down_width; ++x) {
-        row_best_cb[x] += row_tgt_cb[x] - row_err_cb[x];
-        row_best_cr[x] += row_tgt_cr[x] - row_err_cr[x];
-      }
+
     }
     diff_y_sum += ReduceSum(d, acc_y);
 
diff --git a/lib/jpegli/sharp_yuv.h b/lib/jpegli/sharp_yuv.h
index a1d73522..703c6c48 100644
--- a/lib/jpegli/sharp_yuv.h
+++ b/lib/jpegli/sharp_yuv.h
@@ -11,9 +11,10 @@
 
 namespace jpegli {
 
-// Applies a sharp 2x2 area downscale in linear RGB space instead of
-// gamma compressed YCbCr space. This prevents bleeding/blur artifacts
-// and preserves color boundaries much better.
+// Applies an iterative sharp chroma downsampling for 4:2:0 subsampling.
+// The algorithm downsamples Cb/Cr in linear RGB space and iteratively refines
+// the chroma values to minimize luma error after upsample, preserving color
+// boundaries better than a simple box filter.
 void ApplySharpYuvDownsample(j_compress_ptr cinfo);
 
 }  // namespace jpegli

From 2d1cdb23da5905d260e31479afb36b915b0f40e4 Mon Sep 17 00:00:00 2001
From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com>
Date: Thu, 14 May 2026 03:16:33 -0700
Subject: [PATCH 10/10] more dead code

---
 lib/jpegli/sharp_yuv.cc | 33 ++++++++-------------------------
 1 file changed, 8 insertions(+), 25 deletions(-)

diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc
index 694bde38..e85cade4 100644
--- a/lib/jpegli/sharp_yuv.cc
+++ b/lib/jpegli/sharp_yuv.cc
@@ -8,7 +8,6 @@
 
 #include <algorithm>
 #include <cstddef>
-#include <cstdint>
 #include <cstring>
 
 #include "lib/jpegli/common.h"
@@ -30,7 +29,6 @@ using hwy::HWY_NAMESPACE::ConvertTo;
 using hwy::HWY_NAMESPACE::Div;
 using hwy::HWY_NAMESPACE::Floor;
 using hwy::HWY_NAMESPACE::IfThenElse;
-using hwy::HWY_NAMESPACE::Lanes;
 using hwy::HWY_NAMESPACE::Le;
 using hwy::HWY_NAMESPACE::LoadDup128;
 using hwy::HWY_NAMESPACE::LoadInterleaved2;
@@ -166,9 +164,9 @@ HWY_INLINE V TempLuma_SIMD(D d, V r, V g, V b) {
   return MulAdd(kR, r, MulAdd(kG, g, Mul(kB, b)));
 }
 
-// Vectorized RGBToYCbCr: computes Y, Cb, Cr from gamma-space R, G, B vectors.
+// Vectorized RGBToCbCr: computes Cb, Cr from gamma-space R, G, B vectors.
 template <class D, class V>
-HWY_INLINE void RGBToYCbCr_SIMD(D d, V r, V g, V b, V* y, V* cb, V* cr) {
+HWY_INLINE void RGBToCbCr_SIMD(D d, V r, V g, V b, V* cb, V* cr) {
   const auto kR = Set(d, 0.299f);
   const auto kG = Set(d, 0.587f);
   const auto kB = Set(d, 0.114f);
@@ -177,7 +175,6 @@ HWY_INLINE void RGBToYCbCr_SIMD(D d, V r, V g, V b, V* y, V* cb, V* cr) {
   const auto v128 = Set(d, 128.0f);
 
   auto y_base = MulAdd(kR, r, MulAdd(kG, g, Mul(kB, b)));
-  *y = y_base;
   // cb = (b - y_base) * kNormB + 128
   *cb = MulAdd(Sub(b, y_base), kNormB, v128);
   // cr = (r - y_base) * kNormR + 128
@@ -185,12 +182,12 @@ HWY_INLINE void RGBToYCbCr_SIMD(D d, V r, V g, V b, V* y, V* cb, V* cr) {
 }
 
 // Box-averages a 2x2 block of linear-light RGB pixels and converts to
-// gamma-space YCbCr, writing only the Cb and Cr chroma values.
+// gamma-space Cb and Cr chroma values.
 HWY_INLINE void BoxDownsampleRow(const float* lin_r0, const float* lin_g0,
                                  const float* lin_b0, const float* lin_r1,
                                  const float* lin_g1, const float* lin_b1,
-                                 size_t down_width, size_t xsize_padded,
-                                 float* out_cb, float* out_cr) {
+                                 size_t down_width, float* out_cb,
+                                 float* out_cr) {
   const size_t N = hwy::HWY_NAMESPACE::Lanes(d);
   const auto quarter = Set(d, 0.25f);
   size_t x = 0;
@@ -223,16 +220,13 @@ HWY_INLINE void BoxDownsampleRow(const float* lin_r0, const float* lin_g0,
     auto avg_g = LinearToGamma_SIMD(d, Mul(sum_g, quarter));
     auto avg_b = LinearToGamma_SIMD(d, Mul(sum_b, quarter));
 
-    Vec<D> vy;
     Vec<D> vcb;
     Vec<D> vcr;
-    RGBToYCbCr_SIMD(d, avg_r, avg_g, avg_b, &vy, &vcb, &vcr);
+    RGBToCbCr_SIMD(d, avg_r, avg_g, avg_b, &vcb, &vcr);
 
     StoreU(vcb, d, out_cb + x);
     StoreU(vcr, d, out_cr + x);
   }
-
-
 }
 
 // 2x tent-filter upsample of one downsampled chroma row into two
@@ -269,8 +263,6 @@ HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row,
     StoreInterleaved2(val_tl, val_tr, d, out0 + 2 * x);
     StoreInterleaved2(val_bl, val_br, d, out1 + 2 * x);
   }
-
-
 }
 
 void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
@@ -378,7 +370,6 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
       StoreU(lin_g, d, lg_out + x);
       StoreU(lin_b, d, lb_out + x);
     }
-
   }
 
   // Box-average downsample of linear RGB to target Cb/Cr.
@@ -392,19 +383,13 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
     float* dcb = target_cb_ws->Row(y);
     float* dcr = target_cr_ws->Row(y);
 
-    BoxDownsampleRow(r0, g0, b0, r1, g1, b1, down_width, xsize_padded, dcb,
-                     dcr);
+    BoxDownsampleRow(r0, g0, b0, r1, g1, b1, down_width, dcb, dcr);
 
     // Also initialize best_cb/cr = target_cb/cr
     memcpy(best_cb_ws->Row(y), dcb, down_width * sizeof(float));
     memcpy(best_cr_ws->Row(y), dcr, down_width * sizeof(float));
   }
 
-  for (size_t y = 0; y < actual_down_height; ++y) {
-    best_cb_ws->PadRow(y, down_width, 1);
-    best_cr_ws->PadRow(y, down_width, 1);
-  }
-
   const auto vmin = Set(d, 0.0f);
   const auto vmax = Set(d, 255.0f);
   float prev_diff_y_sum = 1e30f;
@@ -466,7 +451,6 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
           StoreU(lin_g, d, row_lin_g + x);
           StoreU(lin_b, d, row_lin_b + x);
         }
-
       }
 
       float* row_best_cb = best_cb_ws->Row(y);
@@ -479,7 +463,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
       BoxDownsampleRow(lin_r_ws->Row(y * 2 + 0), lin_g_ws->Row(y * 2 + 0),
                        lin_b_ws->Row(y * 2 + 0), lin_r_ws->Row(y * 2 + 1),
                        lin_g_ws->Row(y * 2 + 1), lin_b_ws->Row(y * 2 + 1),
-                       down_width, xsize_padded, row_err_cb, row_err_cr);
+                       down_width, row_err_cb, row_err_cr);
 
       size_t x = 0;
       for (; x + N <= down_width; x += N) {
@@ -492,7 +476,6 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) {
         StoreU(Add(bcb, Sub(tcb, dcb)), d, row_best_cb + x);
         StoreU(Add(bcr, Sub(tcr, dcr)), d, row_best_cr + x);
       }
-
     }
     diff_y_sum += ReduceSum(d, acc_y);