From cbff49d1bd665057be7adf894ea267f0e533ef8a Mon Sep 17 00:00:00 2001 From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com> Date: Wed, 18 Mar 2026 19:40:34 -0700 Subject: [PATCH 01/10] implement sharpyuv --- lib/extras/enc/jpegli.cc | 3 + lib/extras/enc/jpegli.h | 1 + lib/jpegli/encode.cc | 13 +- lib/jpegli/encode.h | 3 + lib/jpegli/encode_internal.h | 1 + lib/jpegli/sharp_yuv.cc | 380 +++++++++++++++++++++++++++++++++++ lib/jpegli/sharp_yuv.h | 21 ++ lib/jxl_lists.cmake | 2 + tools/cjpegli.cc | 4 + 9 files changed, 427 insertions(+), 1 deletion(-) create mode 100644 lib/jpegli/sharp_yuv.cc create mode 100644 lib/jpegli/sharp_yuv.h diff --git a/lib/extras/enc/jpegli.cc b/lib/extras/enc/jpegli.cc index 14a7a398..cb2384e8 100644 --- a/lib/extras/enc/jpegli.cc +++ b/lib/extras/enc/jpegli.cc @@ -461,6 +461,9 @@ Status EncodeJpeg(const PackedPixelFile& ppf, const JpegSettings& jpeg_settings, } jpegli_enable_adaptive_quantization( &cinfo, TO_JXL_BOOL(jpeg_settings.use_adaptive_quantization)); + if (jpeg_settings.use_sharpyuv) { + jpegli_set_sharp_yuv(&cinfo, TRUE); + } if (jpeg_settings.psnr_target > 0.0) { jpegli_set_psnr(&cinfo, jpeg_settings.psnr_target, jpeg_settings.search_tolerance, diff --git a/lib/extras/enc/jpegli.h b/lib/extras/enc/jpegli.h index 7d841eb5..88db8074 100644 --- a/lib/extras/enc/jpegli.h +++ b/lib/extras/enc/jpegli.h @@ -24,6 +24,7 @@ class PackedPixelFile; struct JpegSettings { bool xyb = false; + bool use_sharpyuv = false; size_t target_size = 0; float quality = 0.0f; float distance = 1.f; diff --git a/lib/jpegli/encode.cc b/lib/jpegli/encode.cc index 4d9b0cae..f59da379 100644 --- a/lib/jpegli/encode.cc +++ b/lib/jpegli/encode.cc @@ -31,6 +31,7 @@ #include "lib/jpegli/quant.h" #include "lib/jpegli/simd.h" #include "lib/jpegli/types.h" +#include "lib/jpegli/sharp_yuv.h" namespace jpegli { @@ -600,7 +601,11 @@ void ProcessiMCURow(j_compress_ptr cinfo) { JPEGLI_CHECK(cinfo->master->next_iMCU_row < cinfo->total_iMCU_rows); if (!cinfo->raw_data_in) { ApplyInputSmoothing(cinfo); - DownsampleInputBuffer(cinfo); + if (cinfo->master->use_sharpyuv && cinfo->jpeg_color_space == JCS_YCbCr) { + ApplySharpYuvDownsample(cinfo); + } else { + DownsampleInputBuffer(cinfo); + } } ComputeAdaptiveQuantField(cinfo); if (IsStreamingSupported(cinfo)) { @@ -693,6 +698,7 @@ void jpegli_CreateCompress(j_compress_ptr cinfo, int version, cinfo->master->cicp_transfer_function = 2; // unknown transfer function code cinfo->master->use_std_tables = false; cinfo->master->use_adaptive_quantization = true; + cinfo->master->use_sharpyuv = false; cinfo->master->progressive_level = jpegli::kDefaultProgressiveLevel; cinfo->master->data_type = JPEGLI_TYPE_UINT8; cinfo->master->endianness = JPEGLI_NATIVE_ENDIAN; @@ -927,6 +933,11 @@ void jpegli_enable_adaptive_quantization(j_compress_ptr cinfo, boolean value) { cinfo->master->use_adaptive_quantization = FROM_JXL_BOOL(value); } +void jpegli_set_sharp_yuv(j_compress_ptr cinfo, boolean enable) { + CheckState(cinfo, jpegli::kEncStart); + cinfo->master->use_sharpyuv = FROM_JXL_BOOL(enable); +} + void jpegli_simple_progression(j_compress_ptr cinfo) { CheckState(cinfo, jpegli::kEncStart); jpegli_set_progressive_level(cinfo, 2); diff --git a/lib/jpegli/encode.h b/lib/jpegli/encode.h index 5f620b98..364894b1 100644 --- a/lib/jpegli/encode.h +++ b/lib/jpegli/encode.h @@ -145,6 +145,9 @@ void jpegli_set_input_format(j_compress_ptr cinfo, JpegliDataType data_type, // Enabled by default. void jpegli_enable_adaptive_quantization(j_compress_ptr cinfo, boolean value); +// Enables linear-light sharp YUV downsampling. +void jpegli_set_sharp_yuv(j_compress_ptr cinfo, boolean enable); + // Sets the default progression parameters, where level 0 is sequential, and // greater level value means more progression steps. Default is 2. void jpegli_set_progressive_level(j_compress_ptr cinfo, int level); diff --git a/lib/jpegli/encode_internal.h b/lib/jpegli/encode_internal.h index 3f5d1a76..05f40dbf 100644 --- a/lib/jpegli/encode_internal.h +++ b/lib/jpegli/encode_internal.h @@ -75,6 +75,7 @@ struct jpeg_comp_master { uint8_t cicp_transfer_function; bool use_std_tables; bool use_adaptive_quantization; + bool use_sharpyuv; int progressive_level; size_t xsize_blocks; size_t ysize_blocks; diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc new file mode 100644 index 00000000..dacfb26b --- /dev/null +++ b/lib/jpegli/sharp_yuv.cc @@ -0,0 +1,380 @@ +// Copyright (c) the JPEG XL Project Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +#include "lib/jpegli/sharp_yuv.h" + +#include +#include +#include +#include + +#include "lib/jpegli/common.h" +#include "lib/jpegli/encode_internal.h" +#include "lib/jpegli/simd.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jpegli/sharp_yuv.cc" +#include +#include + +HWY_BEFORE_NAMESPACE(); +namespace jpegli { +namespace HWY_NAMESPACE { + +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::Div; +using hwy::HWY_NAMESPACE::LoadU; +using hwy::HWY_NAMESPACE::Max; +using hwy::HWY_NAMESPACE::Min; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::Set; +using hwy::HWY_NAMESPACE::StoreU; +using hwy::HWY_NAMESPACE::Sub; + +using D = HWY_CAPPED(float, 8); +constexpr D d; + +inline float GammaToLinear(float v) { + float n = std::max(0.0f, std::min(255.0f, v)) * (1.0f / 255.0f); + if (n <= 0.04045f) { + return n / 12.92f; + } else { + return std::pow((n + 0.055f) / 1.055f, 2.4f); + } +} + +inline float LinearToGamma(float v) { + if (v <= 0.0031308f) { + return v * 12.92f * 255.0f; + } else { + return (1.055f * std::pow(std::max(0.0f, v), 1.0f / 2.4f) - 0.055f) * 255.0f; + } +} + +inline void YCbCrToRGB_Scalar(float y, float cb, float cr, float* r, float* g, + float* b) { + cb -= 128.0f; + cr -= 128.0f; + *r = y + cr * 1.402f; + *g = y + cb * (-0.114f * 1.772f / 0.587f) + cr * (-0.299f * 1.402f / 0.587f); + *b = y + cb * 1.772f; +} + +inline void RGBToYCbCr_Scalar(float r, float g, float b, float* y, float* cb, + float* cr) { + const float kR = 0.299f; + const float kG = 0.587f; + const float kB = 0.114f; + const float kAmpR = 0.701f; + const float kAmpB = 0.886f; + const float kDiffR = kAmpR + kR; + const float kDiffB = kAmpB + kB; + const float kNormR = 1.0f / (kAmpR + kG + kB); + const float kNormB = 1.0f / (kR + kG + kAmpB); + + float r_base = r * kR; + float g_base = g * kG; + float b_base = b * kB; + float y_base = r_base + g_base + b_base; + *y = y_base; + *cb = (b * kDiffB - y_base) * kNormB + 128.0f; + *cr = (r * kDiffR - y_base) * kNormR + 128.0f; +} + +inline float RGBToLuma_Scalar(float r, float g, float b) { + return 0.299f * r + 0.587f * g + 0.114f * b; +} + +void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { + jpeg_comp_master* m = cinfo->master; + const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor; + const size_t y0 = m->next_iMCU_row * iMCU_height; + const size_t y1 = y0 + iMCU_height; + const size_t xsize_padded = m->xsize_blocks * DCTSIZE; + + // Verify that Cb and Cr are 4:2:0 subsampled. + if (cinfo->num_components < 3) return; + const int h_factor = cinfo->max_h_samp_factor / cinfo->comp_info[1].h_samp_factor; + const int v_factor = cinfo->max_v_samp_factor / cinfo->comp_info[1].v_samp_factor; + if(h_factor != 2 || v_factor != 2) { + // Fallback logic for unsupported standard subsampling bounds + for (int c = 1; c < cinfo->num_components; ++c) { + jpeg_component_info* comp = &cinfo->comp_info[c]; + const int hf = cinfo->max_h_samp_factor / comp->h_samp_factor; + const int vf = cinfo->max_v_samp_factor / comp->v_samp_factor; + if (hf == 1 && vf == 1) continue; + const size_t y_out0 = y0 / vf; + float* rows_in[MAX_SAMP_FACTOR]; + auto& input = *m->smooth_input[c]; + auto& output = *m->raw_data[c]; + for (size_t y_in = y0, y_out = y_out0; y_in < y1; y_in += vf, ++y_out) { + for (int iy = 0; iy < vf; ++iy) { + rows_in[iy] = input.Row(y_in + iy); + } + (*m->downsample_method[c])(rows_in, xsize_padded, output.Row(y_out)); + } + } + return; + } + + const size_t down_width = xsize_padded / 2; + const size_t down_height = iMCU_height / 2; + + std::vector target_y(xsize_padded * iMCU_height); + std::vector best_y(xsize_padded * iMCU_height); + + std::vector target_cb(down_width * down_height); + std::vector target_cr(down_width * down_height); + std::vector best_cb(down_width * down_height); + std::vector best_cr(down_width * down_height); + + // 1. Initial High-Res state and 2x2 Box Downscale + for (size_t y = 0; y < down_height; ++y) { + const float* row_y0 = m->smooth_input[0]->Row(y0 + y * 2); + const float* row_y1 = m->smooth_input[0]->Row(y0 + y * 2 + 1); + const float* row_cb0 = m->smooth_input[1]->Row(y0 + y * 2); + const float* row_cb1 = m->smooth_input[1]->Row(y0 + y * 2 + 1); + const float* row_cr0 = m->smooth_input[2]->Row(y0 + y * 2); + const float* row_cr1 = m->smooth_input[2]->Row(y0 + y * 2 + 1); + + for (size_t x = 0; x < down_width; ++x) { + float lin_r_sum = 0, lin_g_sum = 0, lin_b_sum = 0; + for (int iy = 0; iy < 2; ++iy) { + for (int ix = 0; ix < 2; ++ix) { + float py = (iy == 0 ? row_y0 : row_y1)[x * 2 + ix]; + float pcb = (iy == 0 ? row_cb0 : row_cb1)[x * 2 + ix]; + float pcr = (iy == 0 ? row_cr0 : row_cr1)[x * 2 + ix]; + + float r, g, b; + YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b); + + float lin_r = GammaToLinear(r); + float lin_g = GammaToLinear(g); + float lin_b = GammaToLinear(b); + + lin_r_sum += lin_r; + lin_g_sum += lin_g; + lin_b_sum += lin_b; + + // Libwebp-like logic: isolate intrinsic target Luma to preserve + // sharp color edge brightness metrics + float lin_y = RGBToLuma_Scalar(lin_r, lin_g, lin_b); + size_t hy = y * 2 + iy; + size_t hx = x * 2 + ix; + target_y[hy * xsize_padded + hx] = LinearToGamma(lin_y); + best_y[hy * xsize_padded + hx] = py; // Initial Y is the raw unaltered Y + } + } + lin_r_sum *= 0.25f; + lin_g_sum *= 0.25f; + lin_b_sum *= 0.25f; + + float avg_r = LinearToGamma(lin_r_sum); + float avg_g = LinearToGamma(lin_g_sum); + float avg_b = LinearToGamma(lin_b_sum); + + float avg_y, avg_cb, avg_cr; + RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr); + + target_cb[y * down_width + x] = avg_cb; + target_cr[y * down_width + x] = avg_cr; + best_cb[y * down_width + x] = avg_cb; + best_cr[y * down_width + x] = avg_cr; + } + } + + // 2. Iterative Back-Projection (4-pass Y/Cb/Cr coupled stabilization) + float prev_diff_y_sum = 1e30f; + const float diff_y_threshold = 3.0f * xsize_padded * iMCU_height; + + for (int iter = 0; iter < 4; ++iter) { + std::vector err_y(xsize_padded * iMCU_height); + std::vector err_cb(down_width * down_height); + std::vector err_cr(down_width * down_height); + float diff_y_sum = 0.0f; + + for (size_t y = 0; y < down_height; ++y) { + size_t y_prev = (y == 0) ? 0 : y - 1; + size_t y_next = (y + 1 == down_height) ? y : y + 1; + + const float* prev_cb = &best_cb[y_prev * down_width]; + const float* cur_cb = &best_cb[y * down_width]; + const float* next_cb = &best_cb[y_next * down_width]; + + const float* prev_cr = &best_cr[y_prev * down_width]; + const float* cur_cr = &best_cr[y * down_width]; + const float* next_cr = &best_cr[y_next * down_width]; + + for (size_t x = 0; x < down_width; ++x) { + size_t x_prev = (x == 0) ? 0 : x - 1; + size_t x_next = (x + 1 == down_width) ? x : x + 1; + + auto upsample_bilerp = [](float center, float t, float b, float l, + float r, float tl, float tr, float bl, + float br, float res[4]) { + res[0] = (center * 9.0f + t * 3.0f + l * 3.0f + tl) / 16.0f; + res[1] = (center * 9.0f + t * 3.0f + r * 3.0f + tr) / 16.0f; + res[2] = (center * 9.0f + b * 3.0f + l * 3.0f + bl) / 16.0f; + res[3] = (center * 9.0f + b * 3.0f + r * 3.0f + br) / 16.0f; + }; + + float up_cb[4]; + upsample_bilerp(cur_cb[x], prev_cb[x], next_cb[x], cur_cb[x_prev], + cur_cb[x_next], prev_cb[x_prev], prev_cb[x_next], + next_cb[x_prev], next_cb[x_next], up_cb); + + float up_cr[4]; + upsample_bilerp(cur_cr[x], prev_cr[x], next_cr[x], cur_cr[x_prev], + cur_cr[x_next], prev_cr[x_prev], prev_cr[x_next], + next_cr[x_prev], next_cr[x_next], up_cr); + + float lin_r_sum = 0, lin_g_sum = 0, lin_b_sum = 0; + for (int iy = 0; iy < 2; ++iy) { + for (int ix = 0; ix < 2; ++ix) { + int i = iy * 2 + ix; + size_t hx = x * 2 + ix; + size_t hy = y * 2 + iy; + float py = best_y[hy * xsize_padded + hx]; + + float r, g, b; + YCbCrToRGB_Scalar(py, up_cb[i], up_cr[i], &r, &g, &b); + + float lin_r = GammaToLinear(r); + float lin_g = GammaToLinear(g); + float lin_b = GammaToLinear(b); + + // Record Y error based on simulated decoded linear light + float lin_y = RGBToLuma_Scalar(lin_r, lin_g, lin_b); + float dec_gamma_y = LinearToGamma(lin_y); + float error_y = target_y[hy * xsize_padded + hx] - dec_gamma_y; + err_y[hy * xsize_padded + hx] = error_y; + diff_y_sum += std::abs(error_y); + + lin_r_sum += lin_r; + lin_g_sum += lin_g; + lin_b_sum += lin_b; + } + } + lin_r_sum *= 0.25f; + lin_g_sum *= 0.25f; + lin_b_sum *= 0.25f; + + float avg_r = LinearToGamma(lin_r_sum); + float avg_g = LinearToGamma(lin_g_sum); + float avg_b = LinearToGamma(lin_b_sum); + + float avg_y, avg_cb, avg_cr; + RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr); + + err_cb[y * down_width + x] = target_cb[y * down_width + x] - avg_cb; + err_cr[y * down_width + x] = target_cr[y * down_width + x] - avg_cr; + } + } + + const size_t N = hwy::HWY_NAMESPACE::Lanes(d); + + // Apply High-Res Y Error with clipping + for (size_t y = 0; y < iMCU_height; ++y) { + size_t x = 0; + float* y_ptr = &best_y[y * xsize_padded]; + const float* ey_ptr = &err_y[y * xsize_padded]; + auto v_zero = Set(d, 0.0f); + auto v_max = Set(d, 255.0f); + for (; x + N <= xsize_padded; x += N) { + auto y_vec = LoadU(d, y_ptr + x); + auto e_y_vec = LoadU(d, ey_ptr + x); + auto res = Add(y_vec, e_y_vec); + res = Min(Max(res, v_zero), v_max); + StoreU(res, d, y_ptr + x); + } + for (; x < xsize_padded; ++x) { + y_ptr[x] = std::max(0.0f, std::min(255.0f, y_ptr[x] + ey_ptr[x])); + } + } + + // Apply Cb/Cr Error + for (size_t y = 0; y < down_height; ++y) { + size_t x = 0; + float* cb_ptr = &best_cb[y * down_width]; + float* cr_ptr = &best_cr[y * down_width]; + const float* ecb_ptr = &err_cb[y * down_width]; + const float* ecr_ptr = &err_cr[y * down_width]; + + for (; x + N <= down_width; x += N) { + auto cb_vec = LoadU(d, cb_ptr + x); + auto cr_vec = LoadU(d, cr_ptr + x); + auto e_cb_vec = LoadU(d, ecb_ptr + x); + auto e_cr_vec = LoadU(d, ecr_ptr + x); + StoreU(Add(cb_vec, e_cb_vec), d, cb_ptr + x); + StoreU(Add(cr_vec, e_cr_vec), d, cr_ptr + x); + } + for (; x < down_width; ++x) { + cb_ptr[x] += ecb_ptr[x]; + cr_ptr[x] += ecr_ptr[x]; + } + } + + // test libwebp exit condition + if (iter > 0) { + if (diff_y_sum < diff_y_threshold) break; + if (diff_y_sum > prev_diff_y_sum) break; + } + prev_diff_y_sum = diff_y_sum; + } + + // 3. Write output to buffers + const size_t N = hwy::HWY_NAMESPACE::Lanes(d); + + // Write Luma + for (size_t y = 0; y < iMCU_height; ++y) { + float* row_out_y = m->raw_data[0]->Row(y0 + y); + float* src_y = &best_y[y * xsize_padded]; + size_t x = 0; + for (; x + N <= xsize_padded; x += N) { + StoreU(LoadU(d, src_y + x), d, row_out_y + x); + } + for (; x < xsize_padded; ++x) { + row_out_y[x] = src_y[x]; + } + } + + // Write Chroma + const size_t y_out0 = y0 / 2; + for (size_t y_out = 0; y_out < down_height; ++y_out) { + float* row_out_cb = m->raw_data[1]->Row(y_out0 + y_out); + float* row_out_cr = m->raw_data[2]->Row(y_out0 + y_out); + float* src_cb = &best_cb[y_out * down_width]; + float* src_cr = &best_cr[y_out * down_width]; + + size_t x = 0; + for (; x + N <= down_width; x += N) { + StoreU(LoadU(d, src_cb + x), d, row_out_cb + x); + StoreU(LoadU(d, src_cr + x), d, row_out_cr + x); + } + for (; x < down_width; ++x) { + row_out_cb[x] = src_cb[x]; + row_out_cr[x] = src_cr[x]; + } + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jpegli +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jpegli { + +HWY_EXPORT(ApplySharpYuvDownsampleImpl); + +void ApplySharpYuvDownsample(j_compress_ptr cinfo) { + HWY_DYNAMIC_DISPATCH(ApplySharpYuvDownsampleImpl)(cinfo); +} + +} // namespace jpegli +#endif // HWY_ONCE diff --git a/lib/jpegli/sharp_yuv.h b/lib/jpegli/sharp_yuv.h new file mode 100644 index 00000000..a1d73522 --- /dev/null +++ b/lib/jpegli/sharp_yuv.h @@ -0,0 +1,21 @@ +// Copyright (c) the JPEG XL Project Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +#ifndef LIB_JPEGLI_SHARP_YUV_H_ +#define LIB_JPEGLI_SHARP_YUV_H_ + +#include "lib/jpegli/encode_internal.h" + +namespace jpegli { + +// Applies a sharp 2x2 area downscale in linear RGB space instead of +// gamma compressed YCbCr space. This prevents bleeding/blur artifacts +// and preserves color boundaries much better. +void ApplySharpYuvDownsample(j_compress_ptr cinfo); + +} // namespace jpegli + +#endif // LIB_JPEGLI_SHARP_YUV_H_ diff --git a/lib/jxl_lists.cmake b/lib/jxl_lists.cmake index f5fa51d9..83394772 100644 --- a/lib/jxl_lists.cmake +++ b/lib/jxl_lists.cmake @@ -198,6 +198,8 @@ set(JPEGXL_INTERNAL_JPEGLI_SOURCES jpegli/quant.h jpegli/render.cc jpegli/render.h + jpegli/sharp_yuv.cc + jpegli/sharp_yuv.h jpegli/simd.cc jpegli/simd.h jpegli/source_manager.cc diff --git a/tools/cjpegli.cc b/tools/cjpegli.cc index 7bcd598a..35008edc 100644 --- a/tools/cjpegli.cc +++ b/tools/cjpegli.cc @@ -77,6 +77,10 @@ struct Args { cmdline->AddOptionFlag('\0', "xyb", "Convert to XYB colorspace", &settings.xyb, &SetBooleanTrue, 1); + cmdline->AddOptionFlag('\0', "sharpyuv", + "Use true linear-light sharp YUV downsampling for 4:2:0", + &settings.use_sharpyuv, &SetBooleanTrue, 1); + cmdline->AddOptionFlag( '\0', "std_quant", "Use quantization tables based on Annex K of the JPEG standard.", From 7ea3a0046e854a11efba85654ebd592dbb8ec2bf Mon Sep 17 00:00:00 2001 From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com> Date: Wed, 18 Mar 2026 19:48:17 -0700 Subject: [PATCH 02/10] SIMD optimizations --- lib/jpegli/sharp_yuv.cc | 388 ++++++++++++++++++++++------------------ 1 file changed, 215 insertions(+), 173 deletions(-) diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc index dacfb26b..135d82b6 100644 --- a/lib/jpegli/sharp_yuv.cc +++ b/lib/jpegli/sharp_yuv.cc @@ -19,6 +19,7 @@ #define HWY_TARGET_INCLUDE "lib/jpegli/sharp_yuv.cc" #include #include +#include HWY_BEFORE_NAMESPACE(); namespace jpegli { @@ -34,6 +35,8 @@ using hwy::HWY_NAMESPACE::MulAdd; using hwy::HWY_NAMESPACE::Set; using hwy::HWY_NAMESPACE::StoreU; using hwy::HWY_NAMESPACE::Sub; +using hwy::HWY_NAMESPACE::IfThenElse; +using hwy::HWY_NAMESPACE::Le; using D = HWY_CAPPED(float, 8); constexpr D d; @@ -85,8 +88,45 @@ inline void RGBToYCbCr_Scalar(float r, float g, float b, float* y, float* cb, *cr = (r * kDiffR - y_base) * kNormR + 128.0f; } -inline float RGBToLuma_Scalar(float r, float g, float b) { - return 0.299f * r + 0.587f * g + 0.114f * b; +template +HWY_INLINE V GammaToLinear_SIMD(D d, V v) { + auto v_zero = Set(d, 0.0f); + auto v_max = Set(d, 255.0f); + auto n = Mul(Min(Max(v, v_zero), v_max), Set(d, 1.0f / 255.0f)); + auto is_low = Le(n, Set(d, 0.04045f)); + auto low_val = Div(n, Set(d, 12.92f)); + auto base = Div(Add(n, Set(d, 0.055f)), Set(d, 1.055f)); + base = Max(base, Set(d, 1e-6f)); + auto log_base = hwy::HWY_NAMESPACE::Log2(d, base); + auto high_val = hwy::HWY_NAMESPACE::Exp2(d, Mul(log_base, Set(d, 2.4f))); + return IfThenElse(is_low, low_val, high_val); +} + +template +HWY_INLINE V LinearToGamma_SIMD(D d, V v) { + v = Max(v, Set(d, 0.0f)); + auto is_low = Le(v, Set(d, 0.0031308f)); + auto low_val = Mul(v, Set(d, 12.92f * 255.0f)); + auto v_safe = Max(v, Set(d, 1e-6f)); + auto log_v = hwy::HWY_NAMESPACE::Log2(d, v_safe); + auto p = hwy::HWY_NAMESPACE::Exp2(d, Mul(log_v, Set(d, 1.0f / 2.4f))); + auto high_val = Mul(Sub(Mul(p, Set(d, 1.055f)), Set(d, 0.055f)), Set(d, 255.0f)); + return IfThenElse(is_low, low_val, high_val); +} + +template +HWY_INLINE void YCbCrToRGB_SIMD(D d, V y, V cb, V cr, V* r, V* g, V* b) { + auto v128 = Set(d, 128.0f); + cb = Sub(cb, v128); + cr = Sub(cr, v128); + *r = Add(y, Mul(cr, Set(d, 1.402f))); + *g = Add(y, Add(Mul(cb, Set(d, -0.114f * 1.772f / 0.587f)), Mul(cr, Set(d, -0.299f * 1.402f / 0.587f)))); + *b = Add(y, Mul(cb, Set(d, 1.772f))); +} + +template +HWY_INLINE V TempLuma_SIMD(D d, V r, V g, V b) { + return Add(Mul(Set(d, 0.299f), r), Add(Mul(Set(d, 0.587f), g), Mul(Set(d, 0.114f), b))); } void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { @@ -96,12 +136,10 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { const size_t y1 = y0 + iMCU_height; const size_t xsize_padded = m->xsize_blocks * DCTSIZE; - // Verify that Cb and Cr are 4:2:0 subsampled. if (cinfo->num_components < 3) return; const int h_factor = cinfo->max_h_samp_factor / cinfo->comp_info[1].h_samp_factor; const int v_factor = cinfo->max_v_samp_factor / cinfo->comp_info[1].v_samp_factor; if(h_factor != 2 || v_factor != 2) { - // Fallback logic for unsupported standard subsampling bounds for (int c = 1; c < cinfo->num_components; ++c) { jpeg_component_info* comp = &cinfo->comp_info[c]; const int hf = cinfo->max_h_samp_factor / comp->h_samp_factor; @@ -123,233 +161,237 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { const size_t down_width = xsize_padded / 2; const size_t down_height = iMCU_height / 2; + const size_t high_size = xsize_padded * iMCU_height; + const size_t low_size = down_width * down_height; - std::vector target_y(xsize_padded * iMCU_height); - std::vector best_y(xsize_padded * iMCU_height); + std::vector target_y(high_size); + std::vector best_y(high_size); + std::vector tmp_lin_r(high_size); + std::vector tmp_lin_g(high_size); + std::vector tmp_lin_b(high_size); - std::vector target_cb(down_width * down_height); - std::vector target_cr(down_width * down_height); - std::vector best_cb(down_width * down_height); - std::vector best_cr(down_width * down_height); + std::vector target_cb(low_size); + std::vector target_cr(low_size); + std::vector best_cb(low_size); + std::vector best_cr(low_size); - // 1. Initial High-Res state and 2x2 Box Downscale - for (size_t y = 0; y < down_height; ++y) { - const float* row_y0 = m->smooth_input[0]->Row(y0 + y * 2); - const float* row_y1 = m->smooth_input[0]->Row(y0 + y * 2 + 1); - const float* row_cb0 = m->smooth_input[1]->Row(y0 + y * 2); - const float* row_cb1 = m->smooth_input[1]->Row(y0 + y * 2 + 1); - const float* row_cr0 = m->smooth_input[2]->Row(y0 + y * 2); - const float* row_cr1 = m->smooth_input[2]->Row(y0 + y * 2 + 1); + std::vector up_cb(high_size); + std::vector up_cr(high_size); + + const size_t N = hwy::HWY_NAMESPACE::Lanes(d); + + // 1. Initial High-Res state -> SIMD Loop + for (size_t y = 0; y < iMCU_height; ++y) { + const float* row_y = m->smooth_input[0]->Row(y0 + y); + const float* row_cb = m->smooth_input[1]->Row(y0 + y); + const float* row_cr = m->smooth_input[2]->Row(y0 + y); + size_t x = 0; + for (; x + N <= xsize_padded; x += N) { + auto py = LoadU(d, row_y + x); + auto pcb = LoadU(d, row_cb + x); + auto pcr = LoadU(d, row_cr + x); + + auto r = Set(d,0.0f), g = Set(d,0.0f), b = Set(d,0.0f); + YCbCrToRGB_SIMD(d, py, pcb, pcr, &r, &g, &b); + + auto lin_r = GammaToLinear_SIMD(d, r); + auto lin_g = GammaToLinear_SIMD(d, g); + auto lin_b = GammaToLinear_SIMD(d, b); + + auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b); + auto t_y = LinearToGamma_SIMD(d, lin_y); + + size_t idx = y * xsize_padded + x; + StoreU(t_y, d, &target_y[idx]); + StoreU(py, d, &best_y[idx]); + StoreU(lin_r, d, &tmp_lin_r[idx]); + StoreU(lin_g, d, &tmp_lin_g[idx]); + StoreU(lin_b, d, &tmp_lin_b[idx]); + } + for (; x < xsize_padded; ++x) { + float py = row_y[x], pcb = row_cb[x], pcr = row_cr[x]; + float r, g, b; YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b); + float lr = GammaToLinear(r), lg = GammaToLinear(g), lb = GammaToLinear(b); + float ly = 0.299 * lr + 0.587 * lg + 0.114 * lb; + size_t idx = y * xsize_padded + x; + target_y[idx] = LinearToGamma(ly); + best_y[idx] = py; + tmp_lin_r[idx] = lr; tmp_lin_g[idx] = lg; tmp_lin_b[idx] = lb; + } + } + // 1b. Box downscale to target_cb, target_cr -> Scalar loop + // (Downscaled loops represent 1/4 the iterations, scalar overhead is negligible) + for (size_t y = 0; y < down_height; ++y) { for (size_t x = 0; x < down_width; ++x) { - float lin_r_sum = 0, lin_g_sum = 0, lin_b_sum = 0; + float lr_sum = 0, lg_sum = 0, lb_sum = 0; for (int iy = 0; iy < 2; ++iy) { for (int ix = 0; ix < 2; ++ix) { - float py = (iy == 0 ? row_y0 : row_y1)[x * 2 + ix]; - float pcb = (iy == 0 ? row_cb0 : row_cb1)[x * 2 + ix]; - float pcr = (iy == 0 ? row_cr0 : row_cr1)[x * 2 + ix]; - - float r, g, b; - YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b); - - float lin_r = GammaToLinear(r); - float lin_g = GammaToLinear(g); - float lin_b = GammaToLinear(b); - - lin_r_sum += lin_r; - lin_g_sum += lin_g; - lin_b_sum += lin_b; - - // Libwebp-like logic: isolate intrinsic target Luma to preserve - // sharp color edge brightness metrics - float lin_y = RGBToLuma_Scalar(lin_r, lin_g, lin_b); - size_t hy = y * 2 + iy; - size_t hx = x * 2 + ix; - target_y[hy * xsize_padded + hx] = LinearToGamma(lin_y); - best_y[hy * xsize_padded + hx] = py; // Initial Y is the raw unaltered Y + size_t idx = (y * 2 + iy) * xsize_padded + (x * 2 + ix); + lr_sum += tmp_lin_r[idx]; + lg_sum += tmp_lin_g[idx]; + lb_sum += tmp_lin_b[idx]; } } - lin_r_sum *= 0.25f; - lin_g_sum *= 0.25f; - lin_b_sum *= 0.25f; - - float avg_r = LinearToGamma(lin_r_sum); - float avg_g = LinearToGamma(lin_g_sum); - float avg_b = LinearToGamma(lin_b_sum); - + float avg_r = LinearToGamma(lr_sum * 0.25f); + float avg_g = LinearToGamma(lg_sum * 0.25f); + float avg_b = LinearToGamma(lb_sum * 0.25f); float avg_y, avg_cb, avg_cr; RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr); - - target_cb[y * down_width + x] = avg_cb; - target_cr[y * down_width + x] = avg_cr; - best_cb[y * down_width + x] = avg_cb; - best_cr[y * down_width + x] = avg_cr; + size_t di = y * down_width + x; + target_cb[di] = avg_cb; target_cr[di] = avg_cr; + best_cb[di] = avg_cb; best_cr[di] = avg_cr; } } - // 2. Iterative Back-Projection (4-pass Y/Cb/Cr coupled stabilization) float prev_diff_y_sum = 1e30f; - const float diff_y_threshold = 3.0f * xsize_padded * iMCU_height; + const float diff_y_threshold = 3.0f * high_size; for (int iter = 0; iter < 4; ++iter) { - std::vector err_y(xsize_padded * iMCU_height); - std::vector err_cb(down_width * down_height); - std::vector err_cr(down_width * down_height); + std::vector err_y(high_size); + std::vector err_cb(low_size); + std::vector err_cr(low_size); float diff_y_sum = 0.0f; + // Bilerp Up to Full Res (Scalar optimized loop) for (size_t y = 0; y < down_height; ++y) { size_t y_prev = (y == 0) ? 0 : y - 1; size_t y_next = (y + 1 == down_height) ? y : y + 1; - - const float* prev_cb = &best_cb[y_prev * down_width]; - const float* cur_cb = &best_cb[y * down_width]; - const float* next_cb = &best_cb[y_next * down_width]; - - const float* prev_cr = &best_cr[y_prev * down_width]; - const float* cur_cr = &best_cr[y * down_width]; - const float* next_cr = &best_cr[y_next * down_width]; - for (size_t x = 0; x < down_width; ++x) { size_t x_prev = (x == 0) ? 0 : x - 1; size_t x_next = (x + 1 == down_width) ? x : x + 1; - - auto upsample_bilerp = [](float center, float t, float b, float l, - float r, float tl, float tr, float bl, - float br, float res[4]) { - res[0] = (center * 9.0f + t * 3.0f + l * 3.0f + tl) / 16.0f; - res[1] = (center * 9.0f + t * 3.0f + r * 3.0f + tr) / 16.0f; - res[2] = (center * 9.0f + b * 3.0f + l * 3.0f + bl) / 16.0f; - res[3] = (center * 9.0f + b * 3.0f + r * 3.0f + br) / 16.0f; + + auto get_avg = [](float c, float t, float b, float l, float r, + float tl, float tr, float bl, float br, float res[4]) { + res[0] = (c * 9.0f + t * 3.0f + l * 3.0f + tl) / 16.0f; + res[1] = (c * 9.0f + t * 3.0f + r * 3.0f + tr) / 16.0f; + res[2] = (c * 9.0f + b * 3.0f + l * 3.0f + bl) / 16.0f; + res[3] = (c * 9.0f + b * 3.0f + r * 3.0f + br) / 16.0f; }; - - float up_cb[4]; - upsample_bilerp(cur_cb[x], prev_cb[x], next_cb[x], cur_cb[x_prev], - cur_cb[x_next], prev_cb[x_prev], prev_cb[x_next], - next_cb[x_prev], next_cb[x_next], up_cb); - - float up_cr[4]; - upsample_bilerp(cur_cr[x], prev_cr[x], next_cr[x], cur_cr[x_prev], - cur_cr[x_next], prev_cr[x_prev], prev_cr[x_next], - next_cr[x_prev], next_cr[x_next], up_cr); - - float lin_r_sum = 0, lin_g_sum = 0, lin_b_sum = 0; - for (int iy = 0; iy < 2; ++iy) { - for (int ix = 0; ix < 2; ++ix) { - int i = iy * 2 + ix; - size_t hx = x * 2 + ix; - size_t hy = y * 2 + iy; - float py = best_y[hy * xsize_padded + hx]; - - float r, g, b; - YCbCrToRGB_Scalar(py, up_cb[i], up_cr[i], &r, &g, &b); - - float lin_r = GammaToLinear(r); - float lin_g = GammaToLinear(g); - float lin_b = GammaToLinear(b); - - // Record Y error based on simulated decoded linear light - float lin_y = RGBToLuma_Scalar(lin_r, lin_g, lin_b); - float dec_gamma_y = LinearToGamma(lin_y); - float error_y = target_y[hy * xsize_padded + hx] - dec_gamma_y; - err_y[hy * xsize_padded + hx] = error_y; - diff_y_sum += std::abs(error_y); - - lin_r_sum += lin_r; - lin_g_sum += lin_g; - lin_b_sum += lin_b; + + float u_cb[4], u_cr[4]; + float ccb = best_cb[y*down_width+x], ccr = best_cr[y*down_width+x]; + get_avg(ccb, best_cb[y_prev*down_width+x], best_cb[y_next*down_width+x], + best_cb[y*down_width+x_prev], best_cb[y*down_width+x_next], + best_cb[y_prev*down_width+x_prev], best_cb[y_prev*down_width+x_next], + best_cb[y_next*down_width+x_prev], best_cb[y_next*down_width+x_next], u_cb); + get_avg(ccr, best_cr[y_prev*down_width+x], best_cr[y_next*down_width+x], + best_cr[y*down_width+x_prev], best_cr[y*down_width+x_next], + best_cr[y_prev*down_width+x_prev], best_cr[y_prev*down_width+x_next], + best_cr[y_next*down_width+x_prev], best_cr[y_next*down_width+x_next], u_cr); + + for(int iy=0; iy<2; ++iy) { + for(int ix=0; ix<2; ++ix) { + size_t idx = (y * 2 + iy) * xsize_padded + (x * 2 + ix); + up_cb[idx] = u_cb[iy*2+ix]; + up_cr[idx] = u_cr[iy*2+ix]; } } - lin_r_sum *= 0.25f; - lin_g_sum *= 0.25f; - lin_b_sum *= 0.25f; - - float avg_r = LinearToGamma(lin_r_sum); - float avg_g = LinearToGamma(lin_g_sum); - float avg_b = LinearToGamma(lin_b_sum); - - float avg_y, avg_cb, avg_cr; - RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr); - - err_cb[y * down_width + x] = target_cb[y * down_width + x] - avg_cb; - err_cr[y * down_width + x] = target_cr[y * down_width + x] - avg_cr; } } - - const size_t N = hwy::HWY_NAMESPACE::Lanes(d); - - // Apply High-Res Y Error with clipping + + // SIMD Evaluate Error for (size_t y = 0; y < iMCU_height; ++y) { size_t x = 0; - float* y_ptr = &best_y[y * xsize_padded]; - const float* ey_ptr = &err_y[y * xsize_padded]; - auto v_zero = Set(d, 0.0f); - auto v_max = Set(d, 255.0f); for (; x + N <= xsize_padded; x += N) { - auto y_vec = LoadU(d, y_ptr + x); - auto e_y_vec = LoadU(d, ey_ptr + x); - auto res = Add(y_vec, e_y_vec); - res = Min(Max(res, v_zero), v_max); - StoreU(res, d, y_ptr + x); + size_t idx = y * xsize_padded + x; + auto py = LoadU(d, &best_y[idx]); + auto ucb = LoadU(d, &up_cb[idx]); + auto ucr = LoadU(d, &up_cr[idx]); + + auto r = Set(d,0.0f), g = Set(d,0.0f), b = Set(d,0.0f); + YCbCrToRGB_SIMD(d, py, ucb, ucr, &r, &g, &b); + + auto lin_r = GammaToLinear_SIMD(d, r); + auto lin_g = GammaToLinear_SIMD(d, g); + auto lin_b = GammaToLinear_SIMD(d, b); + + auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b); + auto dec_y = LinearToGamma_SIMD(d, lin_y); + + auto e_y = Sub(LoadU(d, &target_y[idx]), dec_y); + StoreU(e_y, d, &err_y[idx]); + StoreU(lin_r, d, &tmp_lin_r[idx]); + StoreU(lin_g, d, &tmp_lin_g[idx]); + StoreU(lin_b, d, &tmp_lin_b[idx]); } for (; x < xsize_padded; ++x) { - y_ptr[x] = std::max(0.0f, std::min(255.0f, y_ptr[x] + ey_ptr[x])); + size_t idx = y * xsize_padded + x; + float py = best_y[idx], ucb = up_cb[idx], ucr = up_cr[idx]; + float r, g, b; YCbCrToRGB_Scalar(py, ucb, ucr, &r, &g, &b); + float lr = GammaToLinear(r), lg = GammaToLinear(g), lb = GammaToLinear(b); + float ly = 0.299 * lr + 0.587 * lg + 0.114 * lb; + err_y[idx] = target_y[idx] - LinearToGamma(ly); + tmp_lin_r[idx] = lr; tmp_lin_g[idx] = lg; tmp_lin_b[idx] = lb; } } - // Apply Cb/Cr Error + for (size_t i = 0; i < high_size; ++i) diff_y_sum += std::abs(err_y[i]); + + // Box downscale decoupled backprojection Cb/Cr evaluation for (size_t y = 0; y < down_height; ++y) { - size_t x = 0; - float* cb_ptr = &best_cb[y * down_width]; - float* cr_ptr = &best_cr[y * down_width]; - const float* ecb_ptr = &err_cb[y * down_width]; - const float* ecr_ptr = &err_cr[y * down_width]; - - for (; x + N <= down_width; x += N) { - auto cb_vec = LoadU(d, cb_ptr + x); - auto cr_vec = LoadU(d, cr_ptr + x); - auto e_cb_vec = LoadU(d, ecb_ptr + x); - auto e_cr_vec = LoadU(d, ecr_ptr + x); - StoreU(Add(cb_vec, e_cb_vec), d, cb_ptr + x); - StoreU(Add(cr_vec, e_cr_vec), d, cr_ptr + x); - } - for (; x < down_width; ++x) { - cb_ptr[x] += ecb_ptr[x]; - cr_ptr[x] += ecr_ptr[x]; + for (size_t x = 0; x < down_width; ++x) { + float lr_sum = 0, lg_sum = 0, lb_sum = 0; + for (int iy = 0; iy < 2; ++iy) { + for (int ix = 0; ix < 2; ++ix) { + size_t idx = (y * 2 + iy) * xsize_padded + (x * 2 + ix); + lr_sum += tmp_lin_r[idx]; + lg_sum += tmp_lin_g[idx]; + lb_sum += tmp_lin_b[idx]; + } + } + float avg_r = LinearToGamma(lr_sum * 0.25f); + float avg_g = LinearToGamma(lg_sum * 0.25f); + float avg_b = LinearToGamma(lb_sum * 0.25f); + float avg_y, avg_cb, avg_cr; + RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr); + size_t di = y * down_width + x; + err_cb[di] = target_cb[di] - avg_cb; + err_cr[di] = target_cr[di] - avg_cr; } } - - // test libwebp exit condition - if (iter > 0) { - if (diff_y_sum < diff_y_threshold) break; - if (diff_y_sum > prev_diff_y_sum) break; + + // Apply Y Error + for (size_t i = 0; i + N <= high_size; i += N) { + auto y_vec = LoadU(d, &best_y[i]); + auto e_y_vec = LoadU(d, &err_y[i]); + auto res = Add(y_vec, e_y_vec); + res = Min(Max(res, Set(d, 0.0f)), Set(d, 255.0f)); + StoreU(res, d, &best_y[i]); + } + for (size_t i = high_size - (high_size % N); i < high_size; ++i) { + best_y[i] = std::max(0.0f, std::min(255.0f, best_y[i] + err_y[i])); + } + + // Apply Cb/Cr Error + for (size_t i = 0; i + N <= low_size; i += N) { + StoreU(Add(LoadU(d, &best_cb[i]), LoadU(d, &err_cb[i])), d, &best_cb[i]); + StoreU(Add(LoadU(d, &best_cr[i]), LoadU(d, &err_cr[i])), d, &best_cr[i]); + } + for (size_t i = low_size - (low_size % N); i < low_size; ++i) { + best_cb[i] += err_cb[i]; + best_cr[i] += err_cr[i]; } + + if (iter > 0 && (diff_y_sum < diff_y_threshold || diff_y_sum > prev_diff_y_sum)) break; prev_diff_y_sum = diff_y_sum; } // 3. Write output to buffers - const size_t N = hwy::HWY_NAMESPACE::Lanes(d); - - // Write Luma for (size_t y = 0; y < iMCU_height; ++y) { float* row_out_y = m->raw_data[0]->Row(y0 + y); float* src_y = &best_y[y * xsize_padded]; size_t x = 0; - for (; x + N <= xsize_padded; x += N) { - StoreU(LoadU(d, src_y + x), d, row_out_y + x); - } - for (; x < xsize_padded; ++x) { - row_out_y[x] = src_y[x]; - } + for (; x + N <= xsize_padded; x += N) StoreU(LoadU(d, src_y + x), d, row_out_y + x); + for (; x < xsize_padded; ++x) row_out_y[x] = src_y[x]; } - // Write Chroma const size_t y_out0 = y0 / 2; for (size_t y_out = 0; y_out < down_height; ++y_out) { float* row_out_cb = m->raw_data[1]->Row(y_out0 + y_out); float* row_out_cr = m->raw_data[2]->Row(y_out0 + y_out); float* src_cb = &best_cb[y_out * down_width]; float* src_cr = &best_cr[y_out * down_width]; - size_t x = 0; for (; x + N <= down_width; x += N) { StoreU(LoadU(d, src_cb + x), d, row_out_cb + x); From a4514f110a4931d5c935052ad5528a38e073bfa5 Mon Sep 17 00:00:00 2001 From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com> Date: Sat, 21 Mar 2026 01:40:09 -0700 Subject: [PATCH 03/10] speed up clean up --- lib/jpegli/sharp_yuv.cc | 643 ++++++++++++++++++++++++++++++---------- 1 file changed, 489 insertions(+), 154 deletions(-) diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc index 135d82b6..bc84316c 100644 --- a/lib/jpegli/sharp_yuv.cc +++ b/lib/jpegli/sharp_yuv.cc @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include "lib/jpegli/common.h" @@ -19,28 +21,43 @@ #define HWY_TARGET_INCLUDE "lib/jpegli/sharp_yuv.cc" #include #include -#include HWY_BEFORE_NAMESPACE(); namespace jpegli { namespace HWY_NAMESPACE { +using hwy::HWY_NAMESPACE::Abs; using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::And; +using hwy::HWY_NAMESPACE::BitCast; +using hwy::HWY_NAMESPACE::ConvertTo; using hwy::HWY_NAMESPACE::Div; +using hwy::HWY_NAMESPACE::Floor; +using hwy::HWY_NAMESPACE::IfThenElse; +using hwy::HWY_NAMESPACE::Le; +using hwy::HWY_NAMESPACE::LoadDup128; using hwy::HWY_NAMESPACE::LoadU; using hwy::HWY_NAMESPACE::Max; using hwy::HWY_NAMESPACE::Min; using hwy::HWY_NAMESPACE::Mul; using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::Or; +using hwy::HWY_NAMESPACE::Rebind; +using hwy::HWY_NAMESPACE::ReduceSum; using hwy::HWY_NAMESPACE::Set; +using hwy::HWY_NAMESPACE::ShiftLeft; +using hwy::HWY_NAMESPACE::ShiftRight; using hwy::HWY_NAMESPACE::StoreU; using hwy::HWY_NAMESPACE::Sub; -using hwy::HWY_NAMESPACE::IfThenElse; -using hwy::HWY_NAMESPACE::Le; +using hwy::HWY_NAMESPACE::Zero; using D = HWY_CAPPED(float, 8); constexpr D d; +// --------------------------------------------------------------------------- +// Scalar helpers (used for tail elements only) +// --------------------------------------------------------------------------- + inline float GammaToLinear(float v) { float n = std::max(0.0f, std::min(255.0f, v)) * (1.0f / 255.0f); if (n <= 0.04045f) { @@ -54,7 +71,8 @@ inline float LinearToGamma(float v) { if (v <= 0.0031308f) { return v * 12.92f * 255.0f; } else { - return (1.055f * std::pow(std::max(0.0f, v), 1.0f / 2.4f) - 0.055f) * 255.0f; + return (1.055f * std::pow(std::max(0.0f, v), 1.0f / 2.4f) - 0.055f) * + 255.0f; } } @@ -63,7 +81,8 @@ inline void YCbCrToRGB_Scalar(float y, float cb, float cr, float* r, float* g, cb -= 128.0f; cr -= 128.0f; *r = y + cr * 1.402f; - *g = y + cb * (-0.114f * 1.772f / 0.587f) + cr * (-0.299f * 1.402f / 0.587f); + *g = y + cb * (-0.114f * 1.772f / 0.587f) + + cr * (-0.299f * 1.402f / 0.587f); *b = y + cb * 1.772f; } @@ -88,17 +107,87 @@ inline void RGBToYCbCr_Scalar(float r, float g, float b, float* y, float* cb, *cr = (r * kDiffR - y_base) * kNormR + 128.0f; } +// --------------------------------------------------------------------------- +// Fast log2/exp2 approximations (same approach as adaptive_quantization.cc) +// --------------------------------------------------------------------------- + +// Evaluates a rational polynomial p(x)/q(x) via Horner's scheme. +template +HWY_INLINE V EvalRationalPoly(const DF df, const V x, const T (&p)[NP], + const T (&q)[NQ]) { + constexpr size_t kDegP = NP / 4 - 1; + constexpr size_t kDegQ = NQ / 4 - 1; + auto yp = LoadDup128(df, &p[kDegP * 4]); + auto yq = LoadDup128(df, &q[kDegQ * 4]); + HWY_FENCE; + if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(df, p + ((kDegP - 1) * 4))); + if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(df, q + ((kDegQ - 1) * 4))); + HWY_FENCE; + if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(df, p + ((kDegP - 2) * 4))); + if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(df, q + ((kDegQ - 2) * 4))); + HWY_FENCE; + if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(df, p + ((kDegP - 3) * 4))); + if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(df, q + ((kDegQ - 3) * 4))); + return Div(yp, yq); +} + +// Fast base-2 logarithm. L1 error ~3.9E-6. +template +HWY_INLINE V FastLog2f(const DF df, V x) { + HWY_ALIGN const float p[4 * 3] = {HWY_REP4(-1.8503833400518310E-06f), + HWY_REP4(1.4287160470083755E+00f), + HWY_REP4(7.4245873327820566E-01f)}; + HWY_ALIGN const float q[4 * 3] = {HWY_REP4(9.9032814277590719E-01f), + HWY_REP4(1.0096718572241148E+00f), + HWY_REP4(1.7409343003366853E-01f)}; + const Rebind di; + const auto x_bits = BitCast(di, x); + const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab)); + const auto exp_shifted = ShiftRight<23>(exp_bits); + const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted))); + const auto exp_val = ConvertTo(df, exp_shifted); + return Add(EvalRationalPoly(df, Sub(mantissa, Set(df, 1.0f)), p, q), exp_val); +} + +// Fast 2^x. Max relative error ~3e-7. +template +HWY_INLINE V FastPow2f(const DF df, V x) { + const Rebind di; + auto floorx = Floor(x); + auto exp = + BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127)))); + auto frac = Sub(x, floorx); + auto num = Add(frac, Set(df, 1.01749063e+01f)); + num = MulAdd(num, frac, Set(df, 4.88687798e+01f)); + num = MulAdd(num, frac, Set(df, 9.85506591e+01f)); + num = Mul(num, exp); + auto den = MulAdd(frac, Set(df, 2.10242958e-01f), Set(df, -2.22328856e-02f)); + den = MulAdd(den, frac, Set(df, -1.94414990e+01f)); + den = MulAdd(den, frac, Set(df, 9.85506633e+01f)); + return Div(num, den); +} + +// Fast pow(x, p) = 2^(p * log2(x)) for x > 0. +template +HWY_INLINE V FastPowf(const DF df, V x, float power) { + return FastPow2f(df, Mul(FastLog2f(df, x), Set(df, power))); +} + +// --------------------------------------------------------------------------- +// Highway SIMD gamma helpers using fast polynomial approximations +// --------------------------------------------------------------------------- + template HWY_INLINE V GammaToLinear_SIMD(D d, V v) { auto v_zero = Set(d, 0.0f); auto v_max = Set(d, 255.0f); auto n = Mul(Min(Max(v, v_zero), v_max), Set(d, 1.0f / 255.0f)); auto is_low = Le(n, Set(d, 0.04045f)); - auto low_val = Div(n, Set(d, 12.92f)); - auto base = Div(Add(n, Set(d, 0.055f)), Set(d, 1.055f)); + auto low_val = Mul(n, Set(d, 1.0f / 12.92f)); + // High path: ((n + 0.055) / 1.055) ^ 2.4 + auto base = Mul(Add(n, Set(d, 0.055f)), Set(d, 1.0f / 1.055f)); base = Max(base, Set(d, 1e-6f)); - auto log_base = hwy::HWY_NAMESPACE::Log2(d, base); - auto high_val = hwy::HWY_NAMESPACE::Exp2(d, Mul(log_base, Set(d, 2.4f))); + auto high_val = FastPowf(d, base, 2.4f); return IfThenElse(is_low, low_val, high_val); } @@ -108,9 +197,9 @@ HWY_INLINE V LinearToGamma_SIMD(D d, V v) { auto is_low = Le(v, Set(d, 0.0031308f)); auto low_val = Mul(v, Set(d, 12.92f * 255.0f)); auto v_safe = Max(v, Set(d, 1e-6f)); - auto log_v = hwy::HWY_NAMESPACE::Log2(d, v_safe); - auto p = hwy::HWY_NAMESPACE::Exp2(d, Mul(log_v, Set(d, 1.0f / 2.4f))); - auto high_val = Mul(Sub(Mul(p, Set(d, 1.055f)), Set(d, 0.055f)), Set(d, 255.0f)); + auto p = FastPowf(d, v_safe, 1.0f / 2.4f); + auto high_val = + Mul(Sub(Mul(p, Set(d, 1.055f)), Set(d, 0.055f)), Set(d, 255.0f)); return IfThenElse(is_low, low_val, high_val); } @@ -119,16 +208,240 @@ HWY_INLINE void YCbCrToRGB_SIMD(D d, V y, V cb, V cr, V* r, V* g, V* b) { auto v128 = Set(d, 128.0f); cb = Sub(cb, v128); cr = Sub(cr, v128); - *r = Add(y, Mul(cr, Set(d, 1.402f))); - *g = Add(y, Add(Mul(cb, Set(d, -0.114f * 1.772f / 0.587f)), Mul(cr, Set(d, -0.299f * 1.402f / 0.587f)))); - *b = Add(y, Mul(cb, Set(d, 1.772f))); + *r = MulAdd(cr, Set(d, 1.402f), y); + *g = MulAdd(cb, Set(d, -0.114f * 1.772f / 0.587f), + MulAdd(cr, Set(d, -0.299f * 1.402f / 0.587f), y)); + *b = MulAdd(cb, Set(d, 1.772f), y); } template HWY_INLINE V TempLuma_SIMD(D d, V r, V g, V b) { - return Add(Mul(Set(d, 0.299f), r), Add(Mul(Set(d, 0.587f), g), Mul(Set(d, 0.114f), b))); + return MulAdd(Set(d, 0.299f), r, + MulAdd(Set(d, 0.587f), g, Mul(Set(d, 0.114f), b))); +} + +// Vectorized RGBToYCbCr: computes Y, Cb, Cr from gamma-space R, G, B vectors. +template +HWY_INLINE void RGBToYCbCr_SIMD(D d, V r, V g, V b, V* y, V* cb, V* cr) { + const auto kR = Set(d, 0.299f); + const auto kG = Set(d, 0.587f); + const auto kB = Set(d, 0.114f); + const auto kDiffR = Set(d, 0.701f + 0.299f); + const auto kDiffB = Set(d, 0.886f + 0.114f); + const auto kNormR = Set(d, 1.0f / (0.701f + 0.587f + 0.114f)); + const auto kNormB = Set(d, 1.0f / (0.299f + 0.587f + 0.886f)); + const auto v128 = Set(d, 128.0f); + + auto y_base = MulAdd(kR, r, MulAdd(kG, g, Mul(kB, b))); + *y = y_base; + // cb = (b * kDiffB - y_base) * kNormB + 128 + *cb = MulAdd(Sub(Mul(b, kDiffB), y_base), kNormB, v128); + // cr = (r * kDiffR - y_base) * kNormR + 128 + *cr = MulAdd(Sub(Mul(r, kDiffR), y_base), kNormR, v128); } +// --------------------------------------------------------------------------- +// SIMD box-average downsample of linear RGB -> YCbCr chroma +// Processes each row of down_width pixels. For each output pixel, averages +// a 2x2 block of linear RGB, converts back to gamma, then to YCbCr. +// --------------------------------------------------------------------------- +HWY_INLINE void BoxAverageDownsampleRow(const float* lin_r0, + const float* lin_g0, + const float* lin_b0, + const float* lin_r1, + const float* lin_g1, + const float* lin_b1, + size_t down_width, size_t xsize_padded, + float* out_cb, float* out_cr) { + const size_t N = hwy::HWY_NAMESPACE::Lanes(d); + const auto quarter = Set(d, 0.25f); + size_t x = 0; + + // SIMD path: process N output pixels at a time. + // Each output pixel needs 2 input pixels horizontally, so we need 2*N inputs. + for (; x + N <= down_width; x += N) { + // Gather even and odd columns from two rows. + // Row 0: indices 2*x, 2*x+1, 2*x+2, 2*x+3, ... + // Row 1: same pattern. + // We accumulate the 4 samples for each output pixel. + auto sum_r = Zero(d); + auto sum_g = Zero(d); + auto sum_b = Zero(d); + + for (int iy = 0; iy < 2; ++iy) { + const float* src_r = (iy == 0) ? lin_r0 : lin_r1; + const float* src_g = (iy == 0) ? lin_g0 : lin_g1; + const float* src_b = (iy == 0) ? lin_b0 : lin_b1; + for (int ix = 0; ix < 2; ++ix) { + // Load non-contiguous: elements at positions 2*x+ix, 2*(x+1)+ix, ... + // Unfortunately these are strided, so we load pairs and deinterleave + // is cleaner. For simplicity and correctness, use scalar gather here + // since the inner 2x2 loop is just 4 iterations. + // Actually, let's use a temporary buffer approach for the gather. + HWY_ALIGN float tmp_r[HWY_MAX_LANES_D(D)]; + HWY_ALIGN float tmp_g[HWY_MAX_LANES_D(D)]; + HWY_ALIGN float tmp_b[HWY_MAX_LANES_D(D)]; + for (size_t k = 0; k < N; ++k) { + size_t si = (x + k) * 2 + ix; + tmp_r[k] = src_r[si]; + tmp_g[k] = src_g[si]; + tmp_b[k] = src_b[si]; + } + sum_r = Add(sum_r, LoadU(d, tmp_r)); + sum_g = Add(sum_g, LoadU(d, tmp_g)); + sum_b = Add(sum_b, LoadU(d, tmp_b)); + } + } + + auto avg_r = LinearToGamma_SIMD(d, Mul(sum_r, quarter)); + auto avg_g = LinearToGamma_SIMD(d, Mul(sum_g, quarter)); + auto avg_b = LinearToGamma_SIMD(d, Mul(sum_b, quarter)); + + decltype(avg_r) vy, vcb, vcr; + RGBToYCbCr_SIMD(d, avg_r, avg_g, avg_b, &vy, &vcb, &vcr); + + StoreU(vcb, d, out_cb + x); + StoreU(vcr, d, out_cr + x); + } + + // Scalar tail + for (; x < down_width; ++x) { + float lr_sum = 0, lg_sum = 0, lb_sum = 0; + for (int iy = 0; iy < 2; ++iy) { + const float* src_r = (iy == 0) ? lin_r0 : lin_r1; + const float* src_g = (iy == 0) ? lin_g0 : lin_g1; + const float* src_b = (iy == 0) ? lin_b0 : lin_b1; + for (int ix = 0; ix < 2; ++ix) { + size_t si = x * 2 + ix; + lr_sum += src_r[si]; + lg_sum += src_g[si]; + lb_sum += src_b[si]; + } + } + float avg_r = LinearToGamma(lr_sum * 0.25f); + float avg_g = LinearToGamma(lg_sum * 0.25f); + float avg_b = LinearToGamma(lb_sum * 0.25f); + float avg_y, avg_cb, avg_cr; + RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr); + out_cb[x] = avg_cb; + out_cr[x] = avg_cr; + } +} + +// --------------------------------------------------------------------------- +// SIMD tent-filter upsample of one chroma row +// For each x in [0, down_width), computes 4 upsampled values using the +// 9/3/3/1 tent filter, and writes them to the 4 corresponding full-res +// positions. +// --------------------------------------------------------------------------- +HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row, + const float* next_row, size_t down_width, + size_t xsize_padded, size_t y, + float* out_full) { + const size_t N = hwy::HWY_NAMESPACE::Lanes(d); + const auto k9 = Set(d, 9.0f); + const auto k3 = Set(d, 3.0f); + const auto kInv16 = Set(d, 1.0f / 16.0f); + size_t x = 0; + + for (; x + N <= down_width; x += N) { + auto vc = LoadU(d, cur_row + x); + + // Load left neighbors (clamp at boundary) + HWY_ALIGN float tmp_l[HWY_MAX_LANES_D(D)]; + HWY_ALIGN float tmp_tl[HWY_MAX_LANES_D(D)]; + HWY_ALIGN float tmp_bl[HWY_MAX_LANES_D(D)]; + for (size_t k = 0; k < N; ++k) { + size_t xi = x + k; + size_t xl = (xi == 0) ? 0 : xi - 1; + tmp_l[k] = cur_row[xl]; + tmp_tl[k] = prev_row[xl]; + tmp_bl[k] = next_row[xl]; + } + auto vl = LoadU(d, tmp_l); + auto vtl = LoadU(d, tmp_tl); + auto vbl = LoadU(d, tmp_bl); + + // Load right neighbors (clamp at boundary) + HWY_ALIGN float tmp_r[HWY_MAX_LANES_D(D)]; + HWY_ALIGN float tmp_tr[HWY_MAX_LANES_D(D)]; + HWY_ALIGN float tmp_br[HWY_MAX_LANES_D(D)]; + for (size_t k = 0; k < N; ++k) { + size_t xi = x + k; + size_t xr = (xi + 1 == down_width) ? xi : xi + 1; + tmp_r[k] = cur_row[xr]; + tmp_tr[k] = prev_row[xr]; + tmp_br[k] = next_row[xr]; + } + auto vr = LoadU(d, tmp_r); + auto vtr = LoadU(d, tmp_tr); + auto vbr = LoadU(d, tmp_br); + + auto vt = LoadU(d, prev_row + x); + auto vb = LoadU(d, next_row + x); + + // c9 = c * 9 + auto c9 = Mul(vc, k9); + + // Top-left quadrant: (c*9 + t*3 + l*3 + tl) / 16 + auto val_tl = Mul(MulAdd(k3, Add(vt, vl), Add(c9, vtl)), kInv16); + // Top-right quadrant: (c*9 + t*3 + r*3 + tr) / 16 + auto val_tr = Mul(MulAdd(k3, Add(vt, vr), Add(c9, vtr)), kInv16); + // Bottom-left quadrant: (c*9 + b*3 + l*3 + bl) / 16 + auto val_bl = Mul(MulAdd(k3, Add(vb, vl), Add(c9, vbl)), kInv16); + // Bottom-right quadrant: (c*9 + b*3 + r*3 + br) / 16 + auto val_br = Mul(MulAdd(k3, Add(vb, vr), Add(c9, vbr)), kInv16); + + // Scatter to full-res positions + HWY_ALIGN float res_tl[HWY_MAX_LANES_D(D)]; + HWY_ALIGN float res_tr[HWY_MAX_LANES_D(D)]; + HWY_ALIGN float res_bl[HWY_MAX_LANES_D(D)]; + HWY_ALIGN float res_br[HWY_MAX_LANES_D(D)]; + StoreU(val_tl, d, res_tl); + StoreU(val_tr, d, res_tr); + StoreU(val_bl, d, res_bl); + StoreU(val_br, d, res_br); + for (size_t k = 0; k < N; ++k) { + size_t xi = x + k; + out_full[(y * 2 + 0) * xsize_padded + (xi * 2 + 0)] = res_tl[k]; + out_full[(y * 2 + 0) * xsize_padded + (xi * 2 + 1)] = res_tr[k]; + out_full[(y * 2 + 1) * xsize_padded + (xi * 2 + 0)] = res_bl[k]; + out_full[(y * 2 + 1) * xsize_padded + (xi * 2 + 1)] = res_br[k]; + } + } + + // Scalar tail + for (; x < down_width; ++x) { + size_t x_prev = (x == 0) ? 0 : x - 1; + size_t x_next = (x + 1 == down_width) ? x : x + 1; + + float cc = cur_row[x]; + float ct = prev_row[x]; + float cb = next_row[x]; + float cl = cur_row[x_prev]; + float cr = cur_row[x_next]; + float ctl = prev_row[x_prev]; + float ctr = prev_row[x_next]; + float cbl = next_row[x_prev]; + float cbr = next_row[x_next]; + + float c9 = cc * 9.0f; + float inv16 = 1.0f / 16.0f; + out_full[(y * 2 + 0) * xsize_padded + (x * 2 + 0)] = + (c9 + ct * 3.0f + cl * 3.0f + ctl) * inv16; + out_full[(y * 2 + 0) * xsize_padded + (x * 2 + 1)] = + (c9 + ct * 3.0f + cr * 3.0f + ctr) * inv16; + out_full[(y * 2 + 1) * xsize_padded + (x * 2 + 0)] = + (c9 + cb * 3.0f + cl * 3.0f + cbl) * inv16; + out_full[(y * 2 + 1) * xsize_padded + (x * 2 + 1)] = + (c9 + cb * 3.0f + cr * 3.0f + cbr) * inv16; + } +} + +// --------------------------------------------------------------------------- +// Main SharpYUV implementation +// --------------------------------------------------------------------------- + void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { jpeg_comp_master* m = cinfo->master; const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor; @@ -137,9 +450,11 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { const size_t xsize_padded = m->xsize_blocks * DCTSIZE; if (cinfo->num_components < 3) return; - const int h_factor = cinfo->max_h_samp_factor / cinfo->comp_info[1].h_samp_factor; - const int v_factor = cinfo->max_v_samp_factor / cinfo->comp_info[1].v_samp_factor; - if(h_factor != 2 || v_factor != 2) { + const int h_factor = + cinfo->max_h_samp_factor / cinfo->comp_info[1].h_samp_factor; + const int v_factor = + cinfo->max_v_samp_factor / cinfo->comp_info[1].v_samp_factor; + if (h_factor != 2 || v_factor != 2) { for (int c = 1; c < cinfo->num_components; ++c) { jpeg_component_info* comp = &cinfo->comp_info[c]; const int hf = cinfo->max_h_samp_factor / comp->h_samp_factor; @@ -149,7 +464,8 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { float* rows_in[MAX_SAMP_FACTOR]; auto& input = *m->smooth_input[c]; auto& output = *m->raw_data[c]; - for (size_t y_in = y0, y_out = y_out0; y_in < y1; y_in += vf, ++y_out) { + for (size_t y_in = y0, y_out = y_out0; y_in < y1; + y_in += vf, ++y_out) { for (int iy = 0; iy < vf; ++iy) { rows_in[iy] = input.Row(y_in + iy); } @@ -163,7 +479,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { const size_t down_height = iMCU_height / 2; const size_t high_size = xsize_padded * iMCU_height; const size_t low_size = down_width * down_height; - + std::vector target_y(high_size); std::vector best_y(high_size); std::vector tmp_lin_r(high_size); @@ -180,7 +496,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { const size_t N = hwy::HWY_NAMESPACE::Lanes(d); - // 1. Initial High-Res state -> SIMD Loop + // 1. Decode input YCbCr -> linear RGB, compute per-pixel target luma (SIMD) for (size_t y = 0; y < iMCU_height; ++y) { const float* row_y = m->smooth_input[0]->Row(y0 + y); const float* row_cb = m->smooth_input[1]->Row(y0 + y); @@ -190,17 +506,17 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { auto py = LoadU(d, row_y + x); auto pcb = LoadU(d, row_cb + x); auto pcr = LoadU(d, row_cr + x); - - auto r = Set(d,0.0f), g = Set(d,0.0f), b = Set(d,0.0f); + + decltype(py) r, g, b; YCbCrToRGB_SIMD(d, py, pcb, pcr, &r, &g, &b); - + auto lin_r = GammaToLinear_SIMD(d, r); auto lin_g = GammaToLinear_SIMD(d, g); auto lin_b = GammaToLinear_SIMD(d, b); - + auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b); auto t_y = LinearToGamma_SIMD(d, lin_y); - + size_t idx = y * xsize_padded + x; StoreU(t_y, d, &target_y[idx]); StoreU(py, d, &best_y[idx]); @@ -209,88 +525,78 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { StoreU(lin_b, d, &tmp_lin_b[idx]); } for (; x < xsize_padded; ++x) { - float py = row_y[x], pcb = row_cb[x], pcr = row_cr[x]; - float r, g, b; YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b); - float lr = GammaToLinear(r), lg = GammaToLinear(g), lb = GammaToLinear(b); - float ly = 0.299 * lr + 0.587 * lg + 0.114 * lb; + float py = row_y[x]; + float pcb = row_cb[x]; + float pcr = row_cr[x]; + float r, g, b; + YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b); + float lr = GammaToLinear(r); + float lg = GammaToLinear(g); + float lb = GammaToLinear(b); + float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb; size_t idx = y * xsize_padded + x; target_y[idx] = LinearToGamma(ly); best_y[idx] = py; - tmp_lin_r[idx] = lr; tmp_lin_g[idx] = lg; tmp_lin_b[idx] = lb; + tmp_lin_r[idx] = lr; + tmp_lin_g[idx] = lg; + tmp_lin_b[idx] = lb; } } - // 1b. Box downscale to target_cb, target_cr -> Scalar loop - // (Downscaled loops represent 1/4 the iterations, scalar overhead is negligible) + // 2. Box-average downsample of linear RGB -> target Cb/Cr (SIMD) for (size_t y = 0; y < down_height; ++y) { - for (size_t x = 0; x < down_width; ++x) { - float lr_sum = 0, lg_sum = 0, lb_sum = 0; - for (int iy = 0; iy < 2; ++iy) { - for (int ix = 0; ix < 2; ++ix) { - size_t idx = (y * 2 + iy) * xsize_padded + (x * 2 + ix); - lr_sum += tmp_lin_r[idx]; - lg_sum += tmp_lin_g[idx]; - lb_sum += tmp_lin_b[idx]; - } - } - float avg_r = LinearToGamma(lr_sum * 0.25f); - float avg_g = LinearToGamma(lg_sum * 0.25f); - float avg_b = LinearToGamma(lb_sum * 0.25f); - float avg_y, avg_cb, avg_cr; - RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr); - size_t di = y * down_width + x; - target_cb[di] = avg_cb; target_cr[di] = avg_cr; - best_cb[di] = avg_cb; best_cr[di] = avg_cr; - } + const float* r0 = &tmp_lin_r[(y * 2 + 0) * xsize_padded]; + const float* g0 = &tmp_lin_g[(y * 2 + 0) * xsize_padded]; + const float* b0 = &tmp_lin_b[(y * 2 + 0) * xsize_padded]; + const float* r1 = &tmp_lin_r[(y * 2 + 1) * xsize_padded]; + const float* g1 = &tmp_lin_g[(y * 2 + 1) * xsize_padded]; + const float* b1 = &tmp_lin_b[(y * 2 + 1) * xsize_padded]; + float* dcb = &target_cb[y * down_width]; + float* dcr = &target_cr[y * down_width]; + + BoxAverageDownsampleRow(r0, g0, b0, r1, g1, b1, down_width, xsize_padded, + dcb, dcr); + + // Also initialize best_cb/cr = target_cb/cr + memcpy(&best_cb[y * down_width], dcb, down_width * sizeof(float)); + memcpy(&best_cr[y * down_width], dcr, down_width * sizeof(float)); } float prev_diff_y_sum = 1e30f; const float diff_y_threshold = 3.0f * high_size; + // Pre-allocate error buffers outside the iteration loop to avoid repeated + // heap allocation/deallocation. + std::vector err_y(high_size); + std::vector err_cb(low_size); + std::vector err_cr(low_size); + for (int iter = 0; iter < 4; ++iter) { - std::vector err_y(high_size); - std::vector err_cb(low_size); - std::vector err_cr(low_size); + memset(err_y.data(), 0, high_size * sizeof(float)); + memset(err_cb.data(), 0, low_size * sizeof(float)); + memset(err_cr.data(), 0, low_size * sizeof(float)); float diff_y_sum = 0.0f; - // Bilerp Up to Full Res (Scalar optimized loop) + // 3a. Upsample best_cb/cr to full-res using a 2D tent (9/3/3/1) filter + // (SIMD inner loop with scalar boundary handling) for (size_t y = 0; y < down_height; ++y) { size_t y_prev = (y == 0) ? 0 : y - 1; size_t y_next = (y + 1 == down_height) ? y : y + 1; - for (size_t x = 0; x < down_width; ++x) { - size_t x_prev = (x == 0) ? 0 : x - 1; - size_t x_next = (x + 1 == down_width) ? x : x + 1; - - auto get_avg = [](float c, float t, float b, float l, float r, - float tl, float tr, float bl, float br, float res[4]) { - res[0] = (c * 9.0f + t * 3.0f + l * 3.0f + tl) / 16.0f; - res[1] = (c * 9.0f + t * 3.0f + r * 3.0f + tr) / 16.0f; - res[2] = (c * 9.0f + b * 3.0f + l * 3.0f + bl) / 16.0f; - res[3] = (c * 9.0f + b * 3.0f + r * 3.0f + br) / 16.0f; - }; - - float u_cb[4], u_cr[4]; - float ccb = best_cb[y*down_width+x], ccr = best_cr[y*down_width+x]; - get_avg(ccb, best_cb[y_prev*down_width+x], best_cb[y_next*down_width+x], - best_cb[y*down_width+x_prev], best_cb[y*down_width+x_next], - best_cb[y_prev*down_width+x_prev], best_cb[y_prev*down_width+x_next], - best_cb[y_next*down_width+x_prev], best_cb[y_next*down_width+x_next], u_cb); - get_avg(ccr, best_cr[y_prev*down_width+x], best_cr[y_next*down_width+x], - best_cr[y*down_width+x_prev], best_cr[y*down_width+x_next], - best_cr[y_prev*down_width+x_prev], best_cr[y_prev*down_width+x_next], - best_cr[y_next*down_width+x_prev], best_cr[y_next*down_width+x_next], u_cr); - - for(int iy=0; iy<2; ++iy) { - for(int ix=0; ix<2; ++ix) { - size_t idx = (y * 2 + iy) * xsize_padded + (x * 2 + ix); - up_cb[idx] = u_cb[iy*2+ix]; - up_cr[idx] = u_cr[iy*2+ix]; - } - } - } + + const float* cb_prev = &best_cb[y_prev * down_width]; + const float* cb_cur = &best_cb[y * down_width]; + const float* cb_next = &best_cb[y_next * down_width]; + TentUpsampleRow(cb_prev, cb_cur, cb_next, down_width, xsize_padded, y, + up_cb.data()); + + const float* cr_prev = &best_cr[y_prev * down_width]; + const float* cr_cur = &best_cr[y * down_width]; + const float* cr_next = &best_cr[y_next * down_width]; + TentUpsampleRow(cr_prev, cr_cur, cr_next, down_width, xsize_padded, y, + up_cr.data()); } - // SIMD Evaluate Error + // 3b. Evaluate YCbCr reconstruction error at full resolution (SIMD) for (size_t y = 0; y < iMCU_height; ++y) { size_t x = 0; for (; x + N <= xsize_padded; x += N) { @@ -299,16 +605,16 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { auto ucb = LoadU(d, &up_cb[idx]); auto ucr = LoadU(d, &up_cr[idx]); - auto r = Set(d,0.0f), g = Set(d,0.0f), b = Set(d,0.0f); + decltype(py) r, g, b; YCbCrToRGB_SIMD(d, py, ucb, ucr, &r, &g, &b); - + auto lin_r = GammaToLinear_SIMD(d, r); auto lin_g = GammaToLinear_SIMD(d, g); auto lin_b = GammaToLinear_SIMD(d, b); auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b); auto dec_y = LinearToGamma_SIMD(d, lin_y); - + auto e_y = Sub(LoadU(d, &target_y[idx]), dec_y); StoreU(e_y, d, &err_y[idx]); StoreU(lin_r, d, &tmp_lin_r[idx]); @@ -317,90 +623,119 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { } for (; x < xsize_padded; ++x) { size_t idx = y * xsize_padded + x; - float py = best_y[idx], ucb = up_cb[idx], ucr = up_cr[idx]; - float r, g, b; YCbCrToRGB_Scalar(py, ucb, ucr, &r, &g, &b); - float lr = GammaToLinear(r), lg = GammaToLinear(g), lb = GammaToLinear(b); - float ly = 0.299 * lr + 0.587 * lg + 0.114 * lb; + float py = best_y[idx]; + float ucb = up_cb[idx]; + float ucr = up_cr[idx]; + float r, g, b; + YCbCrToRGB_Scalar(py, ucb, ucr, &r, &g, &b); + float lr = GammaToLinear(r); + float lg = GammaToLinear(g); + float lb = GammaToLinear(b); + float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb; err_y[idx] = target_y[idx] - LinearToGamma(ly); - tmp_lin_r[idx] = lr; tmp_lin_g[idx] = lg; tmp_lin_b[idx] = lb; + tmp_lin_r[idx] = lr; + tmp_lin_g[idx] = lg; + tmp_lin_b[idx] = lb; } } - - for (size_t i = 0; i < high_size; ++i) diff_y_sum += std::abs(err_y[i]); - // Box downscale decoupled backprojection Cb/Cr evaluation + // Accumulate |err_y| + auto acc_y = Zero(d); + size_t i_y = 0; + for (; i_y + N <= high_size; i_y += N) { + acc_y = Add(acc_y, Abs(LoadU(d, &err_y[i_y]))); + } + diff_y_sum = ReduceSum(d, acc_y); + for (; i_y < high_size; ++i_y) { + diff_y_sum += std::abs(err_y[i_y]); + } + + // 3c. Box-average downsample of reconstructed RGB -> compute Cb/Cr error + // (SIMD via BoxAverageDownsampleRow) for (size_t y = 0; y < down_height; ++y) { - for (size_t x = 0; x < down_width; ++x) { - float lr_sum = 0, lg_sum = 0, lb_sum = 0; - for (int iy = 0; iy < 2; ++iy) { - for (int ix = 0; ix < 2; ++ix) { - size_t idx = (y * 2 + iy) * xsize_padded + (x * 2 + ix); - lr_sum += tmp_lin_r[idx]; - lg_sum += tmp_lin_g[idx]; - lb_sum += tmp_lin_b[idx]; - } - } - float avg_r = LinearToGamma(lr_sum * 0.25f); - float avg_g = LinearToGamma(lg_sum * 0.25f); - float avg_b = LinearToGamma(lb_sum * 0.25f); - float avg_y, avg_cb, avg_cr; - RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr); - size_t di = y * down_width + x; - err_cb[di] = target_cb[di] - avg_cb; - err_cr[di] = target_cr[di] - avg_cr; + const float* r0 = &tmp_lin_r[(y * 2 + 0) * xsize_padded]; + const float* g0 = &tmp_lin_g[(y * 2 + 0) * xsize_padded]; + const float* b0 = &tmp_lin_b[(y * 2 + 0) * xsize_padded]; + const float* r1 = &tmp_lin_r[(y * 2 + 1) * xsize_padded]; + const float* g1 = &tmp_lin_g[(y * 2 + 1) * xsize_padded]; + const float* b1 = &tmp_lin_b[(y * 2 + 1) * xsize_padded]; + + float* row_err_cb = &err_cb[y * down_width]; + float* row_err_cr = &err_cr[y * down_width]; + float* row_tgt_cb = &target_cb[y * down_width]; + float* row_tgt_cr = &target_cr[y * down_width]; + + // BoxAverageDownsampleRow outputs Cb/Cr directly, so we use err + // buffers as temp storage for the reconstructed Cb/Cr. + BoxAverageDownsampleRow(r0, g0, b0, r1, g1, b1, down_width, + xsize_padded, row_err_cb, row_err_cr); + + // Compute error = target - reconstructed + size_t x = 0; + for (; x + N <= down_width; x += N) { + auto tgt_cb = LoadU(d, row_tgt_cb + x); + auto rec_cb = LoadU(d, row_err_cb + x); + StoreU(Sub(tgt_cb, rec_cb), d, row_err_cb + x); + + auto tgt_cr = LoadU(d, row_tgt_cr + x); + auto rec_cr = LoadU(d, row_err_cr + x); + StoreU(Sub(tgt_cr, rec_cr), d, row_err_cr + x); + } + for (; x < down_width; ++x) { + row_err_cb[x] = row_tgt_cb[x] - row_err_cb[x]; + row_err_cr[x] = row_tgt_cr[x] - row_err_cr[x]; } } - - // Apply Y Error - for (size_t i = 0; i + N <= high_size; i += N) { - auto y_vec = LoadU(d, &best_y[i]); - auto e_y_vec = LoadU(d, &err_y[i]); - auto res = Add(y_vec, e_y_vec); - res = Min(Max(res, Set(d, 0.0f)), Set(d, 255.0f)); - StoreU(res, d, &best_y[i]); + + // 3d. Apply Y error correction with clamping (SIMD + scalar tail) + const auto vmin = Set(d, 0.0f); + const auto vmax = Set(d, 255.0f); + size_t i_y_corr = 0; + for (; i_y_corr + N <= high_size; i_y_corr += N) { + auto y_vec = LoadU(d, &best_y[i_y_corr]); + auto e_y_vec = LoadU(d, &err_y[i_y_corr]); + auto res = Min(Max(Add(y_vec, e_y_vec), vmin), vmax); + StoreU(res, d, &best_y[i_y_corr]); } - for (size_t i = high_size - (high_size % N); i < high_size; ++i) { - best_y[i] = std::max(0.0f, std::min(255.0f, best_y[i] + err_y[i])); + for (; i_y_corr < high_size; ++i_y_corr) { + best_y[i_y_corr] = std::max(0.0f, std::min(255.0f, best_y[i_y_corr] + err_y[i_y_corr])); } - - // Apply Cb/Cr Error - for (size_t i = 0; i + N <= low_size; i += N) { - StoreU(Add(LoadU(d, &best_cb[i]), LoadU(d, &err_cb[i])), d, &best_cb[i]); - StoreU(Add(LoadU(d, &best_cr[i]), LoadU(d, &err_cr[i])), d, &best_cr[i]); + + // 3e. Apply Cb/Cr error correction (SIMD + scalar tail) + // Unlike Y, Cb/Cr are NOT clamped during iterations to allow error diffusion. + size_t i_c_corr = 0; + for (; i_c_corr + N <= low_size; i_c_corr += N) { + auto cb_vec = Add(LoadU(d, &best_cb[i_c_corr]), LoadU(d, &err_cb[i_c_corr])); + auto cr_vec = Add(LoadU(d, &best_cr[i_c_corr]), LoadU(d, &err_cr[i_c_corr])); + StoreU(cb_vec, d, &best_cb[i_c_corr]); + StoreU(cr_vec, d, &best_cr[i_c_corr]); } - for (size_t i = low_size - (low_size % N); i < low_size; ++i) { - best_cb[i] += err_cb[i]; - best_cr[i] += err_cr[i]; + for (; i_c_corr < low_size; ++i_c_corr) { + best_cb[i_c_corr] += err_cb[i_c_corr]; + best_cr[i_c_corr] += err_cr[i_c_corr]; } - if (iter > 0 && (diff_y_sum < diff_y_threshold || diff_y_sum > prev_diff_y_sum)) break; + if (iter > 0 && + (diff_y_sum < diff_y_threshold || diff_y_sum > prev_diff_y_sum)) + break; prev_diff_y_sum = diff_y_sum; } - // 3. Write output to buffers + // 4. Write best_y and best_cb/cr to raw_data output buffers (memcpy) for (size_t y = 0; y < iMCU_height; ++y) { float* row_out_y = m->raw_data[0]->Row(y0 + y); - float* src_y = &best_y[y * xsize_padded]; - size_t x = 0; - for (; x + N <= xsize_padded; x += N) StoreU(LoadU(d, src_y + x), d, row_out_y + x); - for (; x < xsize_padded; ++x) row_out_y[x] = src_y[x]; + memcpy(row_out_y, &best_y[y * xsize_padded], + xsize_padded * sizeof(float)); } - + const size_t y_out0 = y0 / 2; for (size_t y_out = 0; y_out < down_height; ++y_out) { float* row_out_cb = m->raw_data[1]->Row(y_out0 + y_out); float* row_out_cr = m->raw_data[2]->Row(y_out0 + y_out); - float* src_cb = &best_cb[y_out * down_width]; - float* src_cr = &best_cr[y_out * down_width]; - size_t x = 0; - for (; x + N <= down_width; x += N) { - StoreU(LoadU(d, src_cb + x), d, row_out_cb + x); - StoreU(LoadU(d, src_cr + x), d, row_out_cr + x); - } - for (; x < down_width; ++x) { - row_out_cb[x] = src_cb[x]; - row_out_cr[x] = src_cr[x]; - } + memcpy(row_out_cb, &best_cb[y_out * down_width], + down_width * sizeof(float)); + memcpy(row_out_cr, &best_cr[y_out * down_width], + down_width * sizeof(float)); } } From 34364360b528037f3e98575ddb8794efac9eb01e Mon Sep 17 00:00:00 2001 From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com> Date: Mon, 23 Mar 2026 08:28:39 -0700 Subject: [PATCH 04/10] remove fluff --- lib/jpegli/sharp_yuv.cc | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc index bc84316c..080c99fb 100644 --- a/lib/jpegli/sharp_yuv.cc +++ b/lib/jpegli/sharp_yuv.cc @@ -496,7 +496,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { const size_t N = hwy::HWY_NAMESPACE::Lanes(d); - // 1. Decode input YCbCr -> linear RGB, compute per-pixel target luma (SIMD) + // Decode input YCbCr to linear RGB and compute target luma. for (size_t y = 0; y < iMCU_height; ++y) { const float* row_y = m->smooth_input[0]->Row(y0 + y); const float* row_cb = m->smooth_input[1]->Row(y0 + y); @@ -543,7 +543,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { } } - // 2. Box-average downsample of linear RGB -> target Cb/Cr (SIMD) + // Box-average downsample of linear RGB to target Cb/Cr. for (size_t y = 0; y < down_height; ++y) { const float* r0 = &tmp_lin_r[(y * 2 + 0) * xsize_padded]; const float* g0 = &tmp_lin_g[(y * 2 + 0) * xsize_padded]; @@ -577,8 +577,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { memset(err_cr.data(), 0, low_size * sizeof(float)); float diff_y_sum = 0.0f; - // 3a. Upsample best_cb/cr to full-res using a 2D tent (9/3/3/1) filter - // (SIMD inner loop with scalar boundary handling) + // Upsample best_cb/cr to full-res using a 2D tent filter. for (size_t y = 0; y < down_height; ++y) { size_t y_prev = (y == 0) ? 0 : y - 1; size_t y_next = (y + 1 == down_height) ? y : y + 1; @@ -596,7 +595,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { up_cr.data()); } - // 3b. Evaluate YCbCr reconstruction error at full resolution (SIMD) + // Evaluate YCbCr reconstruction error at full resolution. for (size_t y = 0; y < iMCU_height; ++y) { size_t x = 0; for (; x + N <= xsize_padded; x += N) { @@ -650,8 +649,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { diff_y_sum += std::abs(err_y[i_y]); } - // 3c. Box-average downsample of reconstructed RGB -> compute Cb/Cr error - // (SIMD via BoxAverageDownsampleRow) + // Box-average downsample reconstructed RGB to compute Cb/Cr error. for (size_t y = 0; y < down_height; ++y) { const float* r0 = &tmp_lin_r[(y * 2 + 0) * xsize_padded]; const float* g0 = &tmp_lin_g[(y * 2 + 0) * xsize_padded]; @@ -687,7 +685,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { } } - // 3d. Apply Y error correction with clamping (SIMD + scalar tail) + // Apply Y error correction with clamping. const auto vmin = Set(d, 0.0f); const auto vmax = Set(d, 255.0f); size_t i_y_corr = 0; @@ -701,8 +699,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { best_y[i_y_corr] = std::max(0.0f, std::min(255.0f, best_y[i_y_corr] + err_y[i_y_corr])); } - // 3e. Apply Cb/Cr error correction (SIMD + scalar tail) - // Unlike Y, Cb/Cr are NOT clamped during iterations to allow error diffusion. + // Apply Cb/Cr error correction. size_t i_c_corr = 0; for (; i_c_corr + N <= low_size; i_c_corr += N) { auto cb_vec = Add(LoadU(d, &best_cb[i_c_corr]), LoadU(d, &err_cb[i_c_corr])); @@ -721,7 +718,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { prev_diff_y_sum = diff_y_sum; } - // 4. Write best_y and best_cb/cr to raw_data output buffers (memcpy) + // Write best_y and best_cb/cr to raw_data output buffers. for (size_t y = 0; y < iMCU_height; ++y) { float* row_out_y = m->raw_data[0]->Row(y0 + y); memcpy(row_out_y, &best_y[y * xsize_padded], From bdf86c8e638a6232f8acd32903d7d5c81aaf435b Mon Sep 17 00:00:00 2001 From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com> Date: Sat, 4 Apr 2026 02:05:32 -0700 Subject: [PATCH 05/10] Final performance pass Pulls RGB data from initial decode and works from there, the AI hallucination machine was doing unneeded color conversions (RGB -> YUV -> RGB) which was costing performance. --- lib/jpegli/common_internal.h | 1 + lib/jpegli/encode.cc | 29 +++ lib/jpegli/encode_internal.h | 2 + lib/jpegli/sharp_yuv.cc | 472 ++++++++++++++--------------------- 4 files changed, 225 insertions(+), 279 deletions(-) diff --git a/lib/jpegli/common_internal.h b/lib/jpegli/common_internal.h index e41daa3d..80074b31 100644 --- a/lib/jpegli/common_internal.h +++ b/lib/jpegli/common_internal.h @@ -112,6 +112,7 @@ class RowBuffer { size_t xsize() const { return xsize_; }; size_t ysize() const { return ysize_; }; size_t stride() const { return stride_; } + bool HasData() const { return xsize_ > 0; } void PadRow(size_t y, size_t from, int border) { float* row = Row(y); diff --git a/lib/jpegli/encode.cc b/lib/jpegli/encode.cc index f59da379..2ae318d3 100644 --- a/lib/jpegli/encode.cc +++ b/lib/jpegli/encode.cc @@ -431,6 +431,12 @@ void AllocateBuffers(j_compress_ptr cinfo) { for (int c = 0; c < num_all_components; ++c) { m->input_buffer[c].Allocate(cinfo, ysize_full, xsize_full); } + if (m->use_sharpyuv && cinfo->in_color_space == JCS_RGB && + cinfo->jpeg_color_space == JCS_YCbCr) { + for (int c = 0; c < 3; ++c) { + m->input_rgb[c].Allocate(cinfo, ysize_full, xsize_full); + } + } } for (int c = 0; c < cinfo->num_components; ++c) { jpeg_component_info* comp = &cinfo->comp_info[c]; @@ -491,6 +497,14 @@ void AllocateBuffers(j_compress_ptr cinfo) { memset(m->zero_bias_mul[c], 0, DCTSIZE2 * sizeof(float)); memset(m->zero_bias_offset[c], 0, DCTSIZE2 * sizeof(float)); } + if (m->use_sharpyuv) { + for (int i = 0; i < 7; ++i) { + m->sharpyuv_workspace[i].Allocate(cinfo, iMCU_height, xsize_full); + } + for (int i = 7; i < 13; ++i) { + m->sharpyuv_workspace[i].Allocate(cinfo, iMCU_height / 2, xsize_full / 2); + } + } } void InitProgressMonitor(j_compress_ptr cinfo) { @@ -692,6 +706,7 @@ void jpegli_CreateCompress(j_compress_ptr cinfo, int version, memset(cinfo->arith_ac_K, 0, sizeof(cinfo->arith_ac_K)); cinfo->write_Adobe_marker = FALSE; cinfo->master = jpegli::Allocate(cinfo, 1); + memset(cinfo->master->input_rgb, 0, sizeof(cinfo->master->input_rgb)); jpegli::InitializeCompressParams(cinfo); cinfo->master->force_baseline = true; cinfo->master->xyb_mode = false; @@ -1160,8 +1175,22 @@ JDIMENSION jpegli_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines, cinfo->next_scanline += input_lag; } float* rows[jpegli::kMaxComponents]; + const bool snapshot_rgb = m->use_sharpyuv && + cinfo->in_color_space == JCS_RGB && + cinfo->jpeg_color_space == JCS_YCbCr; for (size_t i = input_lag; i < num_lines; ++i) { jpegli::ReadInputRow(cinfo, scanlines[i], rows); + if (snapshot_rgb) { + size_t row_idx = m->next_input_row - 1; + size_t img_w = cinfo->image_width; + size_t pad_w = m->xsize_blocks * DCTSIZE; + for (int c = 0; c < 3; ++c) { + float* rgb_row = m->input_rgb[c].Row(row_idx); + memcpy(rgb_row, rows[c], img_w * sizeof(float)); + float last = rgb_row[img_w - 1]; + for (size_t x = img_w; x <= pad_w; ++x) rgb_row[x] = last; + } + } (*m->color_transform)(rows, cinfo->image_width); jpegli::PadInputBuffer(cinfo, rows); jpegli::ProcessiMCURows(cinfo); diff --git a/lib/jpegli/encode_internal.h b/lib/jpegli/encode_internal.h index 05f40dbf..b260b6e0 100644 --- a/lib/jpegli/encode_internal.h +++ b/lib/jpegli/encode_internal.h @@ -68,6 +68,7 @@ struct ScanTokenInfo { struct jpeg_comp_master { jpegli::RowBuffer input_buffer[jpegli::kMaxComponents]; + jpegli::RowBuffer input_rgb[3]; jpegli::RowBuffer* smooth_input[jpegli::kMaxComponents]; jpegli::RowBuffer* raw_data[jpegli::kMaxComponents]; bool force_baseline; @@ -139,6 +140,7 @@ struct jpeg_comp_master { float psnr_tolerance; float min_distance; float max_distance; + jpegli::RowBuffer sharpyuv_workspace[13]; }; #endif // LIB_JPEGLI_ENCODE_INTERNAL_H_ diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc index 080c99fb..154d2370 100644 --- a/lib/jpegli/sharp_yuv.cc +++ b/lib/jpegli/sharp_yuv.cc @@ -11,7 +11,7 @@ #include #include #include -#include + #include "lib/jpegli/common.h" #include "lib/jpegli/encode_internal.h" @@ -35,7 +35,9 @@ using hwy::HWY_NAMESPACE::Div; using hwy::HWY_NAMESPACE::Floor; using hwy::HWY_NAMESPACE::IfThenElse; using hwy::HWY_NAMESPACE::Le; +using hwy::HWY_NAMESPACE::Lanes; using hwy::HWY_NAMESPACE::LoadDup128; +using hwy::HWY_NAMESPACE::LoadInterleaved2; using hwy::HWY_NAMESPACE::LoadU; using hwy::HWY_NAMESPACE::Max; using hwy::HWY_NAMESPACE::Min; @@ -47,8 +49,10 @@ using hwy::HWY_NAMESPACE::ReduceSum; using hwy::HWY_NAMESPACE::Set; using hwy::HWY_NAMESPACE::ShiftLeft; using hwy::HWY_NAMESPACE::ShiftRight; +using hwy::HWY_NAMESPACE::StoreInterleaved2; using hwy::HWY_NAMESPACE::StoreU; using hwy::HWY_NAMESPACE::Sub; +using hwy::HWY_NAMESPACE::Vec; using hwy::HWY_NAMESPACE::Zero; using D = HWY_CAPPED(float, 8); @@ -257,47 +261,25 @@ HWY_INLINE void BoxAverageDownsampleRow(const float* lin_r0, const auto quarter = Set(d, 0.25f); size_t x = 0; - // SIMD path: process N output pixels at a time. - // Each output pixel needs 2 input pixels horizontally, so we need 2*N inputs. for (; x + N <= down_width; x += N) { - // Gather even and odd columns from two rows. - // Row 0: indices 2*x, 2*x+1, 2*x+2, 2*x+3, ... - // Row 1: same pattern. - // We accumulate the 4 samples for each output pixel. - auto sum_r = Zero(d); - auto sum_g = Zero(d); - auto sum_b = Zero(d); - - for (int iy = 0; iy < 2; ++iy) { - const float* src_r = (iy == 0) ? lin_r0 : lin_r1; - const float* src_g = (iy == 0) ? lin_g0 : lin_g1; - const float* src_b = (iy == 0) ? lin_b0 : lin_b1; - for (int ix = 0; ix < 2; ++ix) { - // Load non-contiguous: elements at positions 2*x+ix, 2*(x+1)+ix, ... - // Unfortunately these are strided, so we load pairs and deinterleave - // is cleaner. For simplicity and correctness, use scalar gather here - // since the inner 2x2 loop is just 4 iterations. - // Actually, let's use a temporary buffer approach for the gather. - HWY_ALIGN float tmp_r[HWY_MAX_LANES_D(D)]; - HWY_ALIGN float tmp_g[HWY_MAX_LANES_D(D)]; - HWY_ALIGN float tmp_b[HWY_MAX_LANES_D(D)]; - for (size_t k = 0; k < N; ++k) { - size_t si = (x + k) * 2 + ix; - tmp_r[k] = src_r[si]; - tmp_g[k] = src_g[si]; - tmp_b[k] = src_b[si]; - } - sum_r = Add(sum_r, LoadU(d, tmp_r)); - sum_g = Add(sum_g, LoadU(d, tmp_g)); - sum_b = Add(sum_b, LoadU(d, tmp_b)); - } - } + Vec r0e, r0o, g0e, g0o, b0e, b0o; + Vec r1e, r1o, g1e, g1o, b1e, b1o; + LoadInterleaved2(d, lin_r0 + 2 * x, r0e, r0o); + LoadInterleaved2(d, lin_g0 + 2 * x, g0e, g0o); + LoadInterleaved2(d, lin_b0 + 2 * x, b0e, b0o); + LoadInterleaved2(d, lin_r1 + 2 * x, r1e, r1o); + LoadInterleaved2(d, lin_g1 + 2 * x, g1e, g1o); + LoadInterleaved2(d, lin_b1 + 2 * x, b1e, b1o); + + auto sum_r = Add(Add(r0e, r0o), Add(r1e, r1o)); + auto sum_g = Add(Add(g0e, g0o), Add(g1e, g1o)); + auto sum_b = Add(Add(b0e, b0o), Add(b1e, b1o)); auto avg_r = LinearToGamma_SIMD(d, Mul(sum_r, quarter)); auto avg_g = LinearToGamma_SIMD(d, Mul(sum_g, quarter)); auto avg_b = LinearToGamma_SIMD(d, Mul(sum_b, quarter)); - decltype(avg_r) vy, vcb, vcr; + Vec vy, vcb, vcr; RGBToYCbCr_SIMD(d, avg_r, avg_g, avg_b, &vy, &vcb, &vcr); StoreU(vcb, d, out_cb + x); @@ -336,8 +318,7 @@ HWY_INLINE void BoxAverageDownsampleRow(const float* lin_r0, // --------------------------------------------------------------------------- HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row, const float* next_row, size_t down_width, - size_t xsize_padded, size_t y, - float* out_full) { + float* out0, float* out1) { const size_t N = hwy::HWY_NAMESPACE::Lanes(d); const auto k9 = Set(d, 9.0f); const auto k3 = Set(d, 3.0f); @@ -346,68 +327,26 @@ HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row, for (; x + N <= down_width; x += N) { auto vc = LoadU(d, cur_row + x); - - // Load left neighbors (clamp at boundary) - HWY_ALIGN float tmp_l[HWY_MAX_LANES_D(D)]; - HWY_ALIGN float tmp_tl[HWY_MAX_LANES_D(D)]; - HWY_ALIGN float tmp_bl[HWY_MAX_LANES_D(D)]; - for (size_t k = 0; k < N; ++k) { - size_t xi = x + k; - size_t xl = (xi == 0) ? 0 : xi - 1; - tmp_l[k] = cur_row[xl]; - tmp_tl[k] = prev_row[xl]; - tmp_bl[k] = next_row[xl]; - } - auto vl = LoadU(d, tmp_l); - auto vtl = LoadU(d, tmp_tl); - auto vbl = LoadU(d, tmp_bl); - - // Load right neighbors (clamp at boundary) - HWY_ALIGN float tmp_r[HWY_MAX_LANES_D(D)]; - HWY_ALIGN float tmp_tr[HWY_MAX_LANES_D(D)]; - HWY_ALIGN float tmp_br[HWY_MAX_LANES_D(D)]; - for (size_t k = 0; k < N; ++k) { - size_t xi = x + k; - size_t xr = (xi + 1 == down_width) ? xi : xi + 1; - tmp_r[k] = cur_row[xr]; - tmp_tr[k] = prev_row[xr]; - tmp_br[k] = next_row[xr]; - } - auto vr = LoadU(d, tmp_r); - auto vtr = LoadU(d, tmp_tr); - auto vbr = LoadU(d, tmp_br); - auto vt = LoadU(d, prev_row + x); auto vb = LoadU(d, next_row + x); - // c9 = c * 9 + auto vl = LoadU(d, cur_row + x - 1); + auto vtl = LoadU(d, prev_row + x - 1); + auto vbl = LoadU(d, next_row + x - 1); + + auto vr = LoadU(d, cur_row + x + 1); + auto vtr = LoadU(d, prev_row + x + 1); + auto vbr = LoadU(d, next_row + x + 1); + auto c9 = Mul(vc, k9); - // Top-left quadrant: (c*9 + t*3 + l*3 + tl) / 16 auto val_tl = Mul(MulAdd(k3, Add(vt, vl), Add(c9, vtl)), kInv16); - // Top-right quadrant: (c*9 + t*3 + r*3 + tr) / 16 auto val_tr = Mul(MulAdd(k3, Add(vt, vr), Add(c9, vtr)), kInv16); - // Bottom-left quadrant: (c*9 + b*3 + l*3 + bl) / 16 auto val_bl = Mul(MulAdd(k3, Add(vb, vl), Add(c9, vbl)), kInv16); - // Bottom-right quadrant: (c*9 + b*3 + r*3 + br) / 16 auto val_br = Mul(MulAdd(k3, Add(vb, vr), Add(c9, vbr)), kInv16); - // Scatter to full-res positions - HWY_ALIGN float res_tl[HWY_MAX_LANES_D(D)]; - HWY_ALIGN float res_tr[HWY_MAX_LANES_D(D)]; - HWY_ALIGN float res_bl[HWY_MAX_LANES_D(D)]; - HWY_ALIGN float res_br[HWY_MAX_LANES_D(D)]; - StoreU(val_tl, d, res_tl); - StoreU(val_tr, d, res_tr); - StoreU(val_bl, d, res_bl); - StoreU(val_br, d, res_br); - for (size_t k = 0; k < N; ++k) { - size_t xi = x + k; - out_full[(y * 2 + 0) * xsize_padded + (xi * 2 + 0)] = res_tl[k]; - out_full[(y * 2 + 0) * xsize_padded + (xi * 2 + 1)] = res_tr[k]; - out_full[(y * 2 + 1) * xsize_padded + (xi * 2 + 0)] = res_bl[k]; - out_full[(y * 2 + 1) * xsize_padded + (xi * 2 + 1)] = res_br[k]; - } + StoreInterleaved2(val_tl, val_tr, d, out0 + 2 * x); + StoreInterleaved2(val_bl, val_br, d, out1 + 2 * x); } // Scalar tail @@ -427,14 +366,10 @@ HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row, float c9 = cc * 9.0f; float inv16 = 1.0f / 16.0f; - out_full[(y * 2 + 0) * xsize_padded + (x * 2 + 0)] = - (c9 + ct * 3.0f + cl * 3.0f + ctl) * inv16; - out_full[(y * 2 + 0) * xsize_padded + (x * 2 + 1)] = - (c9 + ct * 3.0f + cr * 3.0f + ctr) * inv16; - out_full[(y * 2 + 1) * xsize_padded + (x * 2 + 0)] = - (c9 + cb * 3.0f + cl * 3.0f + cbl) * inv16; - out_full[(y * 2 + 1) * xsize_padded + (x * 2 + 1)] = - (c9 + cb * 3.0f + cr * 3.0f + cbr) * inv16; + out0[x * 2 + 0] = (c9 + ct * 3.0f + cl * 3.0f + ctl) * inv16; + out0[(x * 2 + 1)] = (c9 + ct * 3.0f + cr * 3.0f + ctr) * inv16; + out1[x * 2 + 0] = (c9 + cb * 3.0f + cl * 3.0f + cbl) * inv16; + out1[(x * 2 + 1)] = (c9 + cb * 3.0f + cr * 3.0f + cbr) * inv16; } } @@ -478,37 +413,53 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { const size_t down_width = xsize_padded / 2; const size_t down_height = iMCU_height / 2; const size_t high_size = xsize_padded * iMCU_height; - const size_t low_size = down_width * down_height; - - std::vector target_y(high_size); - std::vector best_y(high_size); - std::vector tmp_lin_r(high_size); - std::vector tmp_lin_g(high_size); - std::vector tmp_lin_b(high_size); - - std::vector target_cb(low_size); - std::vector target_cr(low_size); - std::vector best_cb(low_size); - std::vector best_cr(low_size); - std::vector up_cb(high_size); - std::vector up_cr(high_size); + // Hoisted workspace buffers from master struct. + jpegli::RowBuffer* ws = m->sharpyuv_workspace; + auto target_y_ws = &ws[0]; + auto best_y_ws = &ws[1]; + auto tmp_lin_r_ws = &ws[2]; + auto tmp_lin_g_ws = &ws[3]; + auto tmp_lin_b_ws = &ws[4]; + auto up_cb_ws = &ws[5]; + auto up_cr_ws = &ws[6]; + auto target_cb_ws = &ws[7]; + auto target_cr_ws = &ws[8]; + auto best_cb_ws = &ws[9]; + auto best_cr_ws = &ws[10]; + auto err_cb_ws = &ws[11]; + auto err_cr_ws = &ws[12]; const size_t N = hwy::HWY_NAMESPACE::Lanes(d); - // Decode input YCbCr to linear RGB and compute target luma. + const bool have_rgb_input = m->input_rgb[0].HasData(); + // Read pre-transform sRGB directly (if available) or convert from YCbCr. for (size_t y = 0; y < iMCU_height; ++y) { const float* row_y = m->smooth_input[0]->Row(y0 + y); - const float* row_cb = m->smooth_input[1]->Row(y0 + y); - const float* row_cr = m->smooth_input[2]->Row(y0 + y); + const float* row_cb = have_rgb_input ? nullptr : m->smooth_input[1]->Row(y0 + y); + const float* row_cr = have_rgb_input ? nullptr : m->smooth_input[2]->Row(y0 + y); + const float* row_r = have_rgb_input ? m->input_rgb[0].Row(y0 + y) : nullptr; + const float* row_g = have_rgb_input ? m->input_rgb[1].Row(y0 + y) : nullptr; + const float* row_b = have_rgb_input ? m->input_rgb[2].Row(y0 + y) : nullptr; + float* ty_out = target_y_ws->Row(y); + float* by_out = best_y_ws->Row(y); + float* lr_out = tmp_lin_r_ws->Row(y); + float* lg_out = tmp_lin_g_ws->Row(y); + float* lb_out = tmp_lin_b_ws->Row(y); + size_t x = 0; for (; x + N <= xsize_padded; x += N) { auto py = LoadU(d, row_y + x); - auto pcb = LoadU(d, row_cb + x); - auto pcr = LoadU(d, row_cr + x); - decltype(py) r, g, b; - YCbCrToRGB_SIMD(d, py, pcb, pcr, &r, &g, &b); + if (have_rgb_input) { + r = LoadU(d, row_r + x); + g = LoadU(d, row_g + x); + b = LoadU(d, row_b + x); + } else { + auto pcb = LoadU(d, row_cb + x); + auto pcr = LoadU(d, row_cr + x); + YCbCrToRGB_SIMD(d, py, pcb, pcr, &r, &g, &b); + } auto lin_r = GammaToLinear_SIMD(d, r); auto lin_g = GammaToLinear_SIMD(d, g); @@ -517,200 +468,166 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b); auto t_y = LinearToGamma_SIMD(d, lin_y); - size_t idx = y * xsize_padded + x; - StoreU(t_y, d, &target_y[idx]); - StoreU(py, d, &best_y[idx]); - StoreU(lin_r, d, &tmp_lin_r[idx]); - StoreU(lin_g, d, &tmp_lin_g[idx]); - StoreU(lin_b, d, &tmp_lin_b[idx]); + StoreU(t_y, d, ty_out + x); + StoreU(py, d, by_out + x); + StoreU(lin_r, d, lr_out + x); + StoreU(lin_g, d, lg_out + x); + StoreU(lin_b, d, lb_out + x); } for (; x < xsize_padded; ++x) { float py = row_y[x]; - float pcb = row_cb[x]; - float pcr = row_cr[x]; float r, g, b; - YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b); + if (have_rgb_input) { + r = row_r[x]; + g = row_g[x]; + b = row_b[x]; + } else { + float pcb = row_cb[x]; + float pcr = row_cr[x]; + YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b); + } float lr = GammaToLinear(r); float lg = GammaToLinear(g); float lb = GammaToLinear(b); float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb; - size_t idx = y * xsize_padded + x; - target_y[idx] = LinearToGamma(ly); - best_y[idx] = py; - tmp_lin_r[idx] = lr; - tmp_lin_g[idx] = lg; - tmp_lin_b[idx] = lb; + ty_out[x] = LinearToGamma(ly); + by_out[x] = py; + lr_out[x] = lr; + lg_out[x] = lg; + lb_out[x] = lb; } } // Box-average downsample of linear RGB to target Cb/Cr. for (size_t y = 0; y < down_height; ++y) { - const float* r0 = &tmp_lin_r[(y * 2 + 0) * xsize_padded]; - const float* g0 = &tmp_lin_g[(y * 2 + 0) * xsize_padded]; - const float* b0 = &tmp_lin_b[(y * 2 + 0) * xsize_padded]; - const float* r1 = &tmp_lin_r[(y * 2 + 1) * xsize_padded]; - const float* g1 = &tmp_lin_g[(y * 2 + 1) * xsize_padded]; - const float* b1 = &tmp_lin_b[(y * 2 + 1) * xsize_padded]; - float* dcb = &target_cb[y * down_width]; - float* dcr = &target_cr[y * down_width]; + const float* r0 = tmp_lin_r_ws->Row(y * 2 + 0); + const float* g0 = tmp_lin_g_ws->Row(y * 2 + 0); + const float* b0 = tmp_lin_b_ws->Row(y * 2 + 0); + const float* r1 = tmp_lin_r_ws->Row(y * 2 + 1); + const float* g1 = tmp_lin_g_ws->Row(y * 2 + 1); + const float* b1 = tmp_lin_b_ws->Row(y * 2 + 1); + float* dcb = target_cb_ws->Row(y); + float* dcr = target_cr_ws->Row(y); BoxAverageDownsampleRow(r0, g0, b0, r1, g1, b1, down_width, xsize_padded, dcb, dcr); // Also initialize best_cb/cr = target_cb/cr - memcpy(&best_cb[y * down_width], dcb, down_width * sizeof(float)); - memcpy(&best_cr[y * down_width], dcr, down_width * sizeof(float)); + memcpy(best_cb_ws->Row(y), dcb, down_width * sizeof(float)); + memcpy(best_cr_ws->Row(y), dcr, down_width * sizeof(float)); + } + + for (size_t y = 0; y < down_height; ++y) { + best_cb_ws->PadRow(y, down_width, 1); + best_cr_ws->PadRow(y, down_width, 1); } + const auto vmin = Set(d, 0.0f); + const auto vmax = Set(d, 255.0f); float prev_diff_y_sum = 1e30f; const float diff_y_threshold = 3.0f * high_size; - // Pre-allocate error buffers outside the iteration loop to avoid repeated - // heap allocation/deallocation. - std::vector err_y(high_size); - std::vector err_cb(low_size); - std::vector err_cr(low_size); - for (int iter = 0; iter < 4; ++iter) { - memset(err_y.data(), 0, high_size * sizeof(float)); - memset(err_cb.data(), 0, low_size * sizeof(float)); - memset(err_cr.data(), 0, low_size * sizeof(float)); float diff_y_sum = 0.0f; + auto acc_y = Zero(d); - // Upsample best_cb/cr to full-res using a 2D tent filter. + // Padding for upsampling (all rows for this iteration) + for (size_t y = 0; y < down_height; ++y) { + best_cb_ws->PadRow(y, down_width, 1); + best_cr_ws->PadRow(y, down_width, 1); + } + + // Pass 1: Upsample current best_cb/cr for (size_t y = 0; y < down_height; ++y) { size_t y_prev = (y == 0) ? 0 : y - 1; size_t y_next = (y + 1 == down_height) ? y : y + 1; - - const float* cb_prev = &best_cb[y_prev * down_width]; - const float* cb_cur = &best_cb[y * down_width]; - const float* cb_next = &best_cb[y_next * down_width]; - TentUpsampleRow(cb_prev, cb_cur, cb_next, down_width, xsize_padded, y, - up_cb.data()); - - const float* cr_prev = &best_cr[y_prev * down_width]; - const float* cr_cur = &best_cr[y * down_width]; - const float* cr_next = &best_cr[y_next * down_width]; - TentUpsampleRow(cr_prev, cr_cur, cr_next, down_width, xsize_padded, y, - up_cr.data()); + TentUpsampleRow(best_cb_ws->Row(y_prev), best_cb_ws->Row(y), + best_cb_ws->Row(y_next), down_width, + up_cb_ws->Row(y * 2), up_cb_ws->Row(y * 2 + 1)); + TentUpsampleRow(best_cr_ws->Row(y_prev), best_cr_ws->Row(y), + best_cr_ws->Row(y_next), down_width, + up_cr_ws->Row(y * 2), up_cr_ws->Row(y * 2 + 1)); } - // Evaluate YCbCr reconstruction error at full resolution. - for (size_t y = 0; y < iMCU_height; ++y) { - size_t x = 0; - for (; x + N <= xsize_padded; x += N) { - size_t idx = y * xsize_padded + x; - auto py = LoadU(d, &best_y[idx]); - auto ucb = LoadU(d, &up_cb[idx]); - auto ucr = LoadU(d, &up_cr[idx]); - - decltype(py) r, g, b; - YCbCrToRGB_SIMD(d, py, ucb, ucr, &r, &g, &b); - - auto lin_r = GammaToLinear_SIMD(d, r); - auto lin_g = GammaToLinear_SIMD(d, g); - auto lin_b = GammaToLinear_SIMD(d, b); - - auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b); - auto dec_y = LinearToGamma_SIMD(d, lin_y); - - auto e_y = Sub(LoadU(d, &target_y[idx]), dec_y); - StoreU(e_y, d, &err_y[idx]); - StoreU(lin_r, d, &tmp_lin_r[idx]); - StoreU(lin_g, d, &tmp_lin_g[idx]); - StoreU(lin_b, d, &tmp_lin_b[idx]); - } - for (; x < xsize_padded; ++x) { - size_t idx = y * xsize_padded + x; - float py = best_y[idx]; - float ucb = up_cb[idx]; - float ucr = up_cr[idx]; - float r, g, b; - YCbCrToRGB_Scalar(py, ucb, ucr, &r, &g, &b); - float lr = GammaToLinear(r); - float lg = GammaToLinear(g); - float lb = GammaToLinear(b); - float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb; - err_y[idx] = target_y[idx] - LinearToGamma(ly); - tmp_lin_r[idx] = lr; - tmp_lin_g[idx] = lg; - tmp_lin_b[idx] = lb; + // Pass 2: Evaluate Y and Prepare for Chroma update + for (size_t y = 0; y < down_height; ++y) { + for (int iy = 0; iy < 2; ++iy) { + size_t yy = y * 2 + iy; + float* row_best_y = best_y_ws->Row(yy); + const float* row_up_cb = up_cb_ws->Row(yy); + const float* row_up_cr = up_cr_ws->Row(yy); + const float* row_target_y = target_y_ws->Row(yy); + float* row_lin_r = tmp_lin_r_ws->Row(yy); + float* row_lin_g = tmp_lin_g_ws->Row(yy); + float* row_lin_b = tmp_lin_b_ws->Row(yy); + + size_t x = 0; + for (; x + N <= xsize_padded; x += N) { + auto py = LoadU(d, row_best_y + x); + auto ucb = LoadU(d, row_up_cb + x); + auto ucr = LoadU(d, row_up_cr + x); + Vec r, g, b; + YCbCrToRGB_SIMD(d, py, ucb, ucr, &r, &g, &b); + auto lin_r = GammaToLinear_SIMD(d, r); + auto lin_g = GammaToLinear_SIMD(d, g); + auto lin_b = GammaToLinear_SIMD(d, b); + auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b); + auto dec_y = LinearToGamma_SIMD(d, lin_y); + auto ey = Sub(LoadU(d, row_target_y + x), dec_y); + acc_y = Add(acc_y, Abs(ey)); + + StoreU(Min(Max(Add(py, ey), vmin), vmax), d, row_best_y + x); + StoreU(lin_r, d, row_lin_r + x); + StoreU(lin_g, d, row_lin_g + x); + StoreU(lin_b, d, row_lin_b + x); + } + for (; x < xsize_padded; ++x) { + float py = row_best_y[x]; + float r, g, b; + YCbCrToRGB_Scalar(py, row_up_cb[x], row_up_cr[x], &r, &g, &b); + float lr = GammaToLinear(r); + float lg = GammaToLinear(g); + float lb = GammaToLinear(b); + float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb; + float ey = row_target_y[x] - LinearToGamma(ly); + diff_y_sum += std::abs(ey); + row_best_y[x] = std::max(0.0f, std::min(255.0f, py + ey)); + row_lin_r[x] = lr; + row_lin_g[x] = lg; + row_lin_b[x] = lb; + } } - } - // Accumulate |err_y| - auto acc_y = Zero(d); - size_t i_y = 0; - for (; i_y + N <= high_size; i_y += N) { - acc_y = Add(acc_y, Abs(LoadU(d, &err_y[i_y]))); - } - diff_y_sum = ReduceSum(d, acc_y); - for (; i_y < high_size; ++i_y) { - diff_y_sum += std::abs(err_y[i_y]); - } + float* row_best_cb = best_cb_ws->Row(y); + float* row_best_cr = best_cr_ws->Row(y); + float* row_err_cb = err_cb_ws->Row(y); + float* row_err_cr = err_cr_ws->Row(y); + float* row_tgt_cb = target_cb_ws->Row(y); + float* row_tgt_cr = target_cr_ws->Row(y); + + BoxAverageDownsampleRow( + tmp_lin_r_ws->Row(y * 2 + 0), tmp_lin_g_ws->Row(y * 2 + 0), + tmp_lin_b_ws->Row(y * 2 + 0), tmp_lin_r_ws->Row(y * 2 + 1), + tmp_lin_g_ws->Row(y * 2 + 1), tmp_lin_b_ws->Row(y * 2 + 1), down_width, + xsize_padded, row_err_cb, row_err_cr); - // Box-average downsample reconstructed RGB to compute Cb/Cr error. - for (size_t y = 0; y < down_height; ++y) { - const float* r0 = &tmp_lin_r[(y * 2 + 0) * xsize_padded]; - const float* g0 = &tmp_lin_g[(y * 2 + 0) * xsize_padded]; - const float* b0 = &tmp_lin_b[(y * 2 + 0) * xsize_padded]; - const float* r1 = &tmp_lin_r[(y * 2 + 1) * xsize_padded]; - const float* g1 = &tmp_lin_g[(y * 2 + 1) * xsize_padded]; - const float* b1 = &tmp_lin_b[(y * 2 + 1) * xsize_padded]; - - float* row_err_cb = &err_cb[y * down_width]; - float* row_err_cr = &err_cr[y * down_width]; - float* row_tgt_cb = &target_cb[y * down_width]; - float* row_tgt_cr = &target_cr[y * down_width]; - - // BoxAverageDownsampleRow outputs Cb/Cr directly, so we use err - // buffers as temp storage for the reconstructed Cb/Cr. - BoxAverageDownsampleRow(r0, g0, b0, r1, g1, b1, down_width, - xsize_padded, row_err_cb, row_err_cr); - - // Compute error = target - reconstructed size_t x = 0; for (; x + N <= down_width; x += N) { - auto tgt_cb = LoadU(d, row_tgt_cb + x); - auto rec_cb = LoadU(d, row_err_cb + x); - StoreU(Sub(tgt_cb, rec_cb), d, row_err_cb + x); - - auto tgt_cr = LoadU(d, row_tgt_cr + x); - auto rec_cr = LoadU(d, row_err_cr + x); - StoreU(Sub(tgt_cr, rec_cr), d, row_err_cr + x); + auto bcb = LoadU(d, row_best_cb + x); + auto bcr = LoadU(d, row_best_cr + x); + auto tcb = LoadU(d, row_tgt_cb + x); + auto tcr = LoadU(d, row_tgt_cr + x); + auto dcb = LoadU(d, row_err_cb + x); + auto dcr = LoadU(d, row_err_cr + x); + StoreU(Add(bcb, Sub(tcb, dcb)), d, row_best_cb + x); + StoreU(Add(bcr, Sub(tcr, dcr)), d, row_best_cr + x); } for (; x < down_width; ++x) { - row_err_cb[x] = row_tgt_cb[x] - row_err_cb[x]; - row_err_cr[x] = row_tgt_cr[x] - row_err_cr[x]; + row_best_cb[x] += row_tgt_cb[x] - row_err_cb[x]; + row_best_cr[x] += row_tgt_cr[x] - row_err_cr[x]; } } - - // Apply Y error correction with clamping. - const auto vmin = Set(d, 0.0f); - const auto vmax = Set(d, 255.0f); - size_t i_y_corr = 0; - for (; i_y_corr + N <= high_size; i_y_corr += N) { - auto y_vec = LoadU(d, &best_y[i_y_corr]); - auto e_y_vec = LoadU(d, &err_y[i_y_corr]); - auto res = Min(Max(Add(y_vec, e_y_vec), vmin), vmax); - StoreU(res, d, &best_y[i_y_corr]); - } - for (; i_y_corr < high_size; ++i_y_corr) { - best_y[i_y_corr] = std::max(0.0f, std::min(255.0f, best_y[i_y_corr] + err_y[i_y_corr])); - } - - // Apply Cb/Cr error correction. - size_t i_c_corr = 0; - for (; i_c_corr + N <= low_size; i_c_corr += N) { - auto cb_vec = Add(LoadU(d, &best_cb[i_c_corr]), LoadU(d, &err_cb[i_c_corr])); - auto cr_vec = Add(LoadU(d, &best_cr[i_c_corr]), LoadU(d, &err_cr[i_c_corr])); - StoreU(cb_vec, d, &best_cb[i_c_corr]); - StoreU(cr_vec, d, &best_cr[i_c_corr]); - } - for (; i_c_corr < low_size; ++i_c_corr) { - best_cb[i_c_corr] += err_cb[i_c_corr]; - best_cr[i_c_corr] += err_cr[i_c_corr]; - } + diff_y_sum += ReduceSum(d, acc_y); if (iter > 0 && (diff_y_sum < diff_y_threshold || diff_y_sum > prev_diff_y_sum)) @@ -721,18 +638,15 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { // Write best_y and best_cb/cr to raw_data output buffers. for (size_t y = 0; y < iMCU_height; ++y) { float* row_out_y = m->raw_data[0]->Row(y0 + y); - memcpy(row_out_y, &best_y[y * xsize_padded], - xsize_padded * sizeof(float)); + memcpy(row_out_y, best_y_ws->Row(y), xsize_padded * sizeof(float)); } const size_t y_out0 = y0 / 2; for (size_t y_out = 0; y_out < down_height; ++y_out) { float* row_out_cb = m->raw_data[1]->Row(y_out0 + y_out); float* row_out_cr = m->raw_data[2]->Row(y_out0 + y_out); - memcpy(row_out_cb, &best_cb[y_out * down_width], - down_width * sizeof(float)); - memcpy(row_out_cr, &best_cr[y_out * down_width], - down_width * sizeof(float)); + memcpy(row_out_cb, best_cb_ws->Row(y_out), down_width * sizeof(float)); + memcpy(row_out_cr, best_cr_ws->Row(y_out), down_width * sizeof(float)); } } From 126cc4c303c11fc61dd1b4324556c37ebce1447a Mon Sep 17 00:00:00 2001 From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com> Date: Sun, 5 Apr 2026 16:34:48 -0700 Subject: [PATCH 06/10] fix boundary artifacts and remove AI fluff --- lib/jpegli/sharp_yuv.cc | 189 ++++++++++++++++++++++------------------ 1 file changed, 102 insertions(+), 87 deletions(-) diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc index 154d2370..d040c70d 100644 --- a/lib/jpegli/sharp_yuv.cc +++ b/lib/jpegli/sharp_yuv.cc @@ -12,10 +12,8 @@ #include #include - #include "lib/jpegli/common.h" #include "lib/jpegli/encode_internal.h" -#include "lib/jpegli/simd.h" #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "lib/jpegli/sharp_yuv.cc" @@ -34,8 +32,8 @@ using hwy::HWY_NAMESPACE::ConvertTo; using hwy::HWY_NAMESPACE::Div; using hwy::HWY_NAMESPACE::Floor; using hwy::HWY_NAMESPACE::IfThenElse; -using hwy::HWY_NAMESPACE::Le; using hwy::HWY_NAMESPACE::Lanes; +using hwy::HWY_NAMESPACE::Le; using hwy::HWY_NAMESPACE::LoadDup128; using hwy::HWY_NAMESPACE::LoadInterleaved2; using hwy::HWY_NAMESPACE::LoadU; @@ -85,8 +83,7 @@ inline void YCbCrToRGB_Scalar(float y, float cb, float cr, float* r, float* g, cb -= 128.0f; cr -= 128.0f; *r = y + cr * 1.402f; - *g = y + cb * (-0.114f * 1.772f / 0.587f) + - cr * (-0.299f * 1.402f / 0.587f); + *g = y + cb * (-0.114f * 1.772f / 0.587f) + cr * (-0.299f * 1.402f / 0.587f); *b = y + cb * 1.772f; } @@ -139,11 +136,11 @@ HWY_INLINE V EvalRationalPoly(const DF df, const V x, const T (&p)[NP], template HWY_INLINE V FastLog2f(const DF df, V x) { HWY_ALIGN const float p[4 * 3] = {HWY_REP4(-1.8503833400518310E-06f), - HWY_REP4(1.4287160470083755E+00f), - HWY_REP4(7.4245873327820566E-01f)}; + HWY_REP4(1.4287160470083755E+00f), + HWY_REP4(7.4245873327820566E-01f)}; HWY_ALIGN const float q[4 * 3] = {HWY_REP4(9.9032814277590719E-01f), - HWY_REP4(1.0096718572241148E+00f), - HWY_REP4(1.7409343003366853E-01f)}; + HWY_REP4(1.0096718572241148E+00f), + HWY_REP4(1.7409343003366853E-01f)}; const Rebind di; const auto x_bits = BitCast(di, x); const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab)); @@ -220,6 +217,8 @@ HWY_INLINE void YCbCrToRGB_SIMD(D d, V y, V cb, V cr, V* r, V* g, V* b) { template HWY_INLINE V TempLuma_SIMD(D d, V r, V g, V b) { + // using bt 709 values (0.2126f, 0.7152f, 0.0722f) may be more perceptually + // accurate, but it needs more visual testing. return MulAdd(Set(d, 0.299f), r, MulAdd(Set(d, 0.587f), g, Mul(Set(d, 0.114f), b))); } @@ -244,26 +243,29 @@ HWY_INLINE void RGBToYCbCr_SIMD(D d, V r, V g, V b, V* y, V* cb, V* cr) { *cr = MulAdd(Sub(Mul(r, kDiffR), y_base), kNormR, v128); } -// --------------------------------------------------------------------------- // SIMD box-average downsample of linear RGB -> YCbCr chroma -// Processes each row of down_width pixels. For each output pixel, averages -// a 2x2 block of linear RGB, converts back to gamma, then to YCbCr. -// --------------------------------------------------------------------------- -HWY_INLINE void BoxAverageDownsampleRow(const float* lin_r0, - const float* lin_g0, - const float* lin_b0, - const float* lin_r1, - const float* lin_g1, - const float* lin_b1, - size_t down_width, size_t xsize_padded, - float* out_cb, float* out_cr) { +HWY_INLINE void BoxDownsampleRow(const float* lin_r0, const float* lin_g0, + const float* lin_b0, const float* lin_r1, + const float* lin_g1, const float* lin_b1, + size_t down_width, size_t xsize_padded, + float* out_cb, float* out_cr) { const size_t N = hwy::HWY_NAMESPACE::Lanes(d); const auto quarter = Set(d, 0.25f); size_t x = 0; for (; x + N <= down_width; x += N) { - Vec r0e, r0o, g0e, g0o, b0e, b0o; - Vec r1e, r1o, g1e, g1o, b1e, b1o; + Vec r0e; + Vec r0o; + Vec g0e; + Vec g0o; + Vec b0e; + Vec b0o; + Vec r1e; + Vec r1o; + Vec g1e; + Vec g1o; + Vec b1e; + Vec b1o; LoadInterleaved2(d, lin_r0 + 2 * x, r0e, r0o); LoadInterleaved2(d, lin_g0 + 2 * x, g0e, g0o); LoadInterleaved2(d, lin_b0 + 2 * x, b0e, b0o); @@ -279,7 +281,9 @@ HWY_INLINE void BoxAverageDownsampleRow(const float* lin_r0, auto avg_g = LinearToGamma_SIMD(d, Mul(sum_g, quarter)); auto avg_b = LinearToGamma_SIMD(d, Mul(sum_b, quarter)); - Vec vy, vcb, vcr; + Vec vy; + Vec vcb; + Vec vcr; RGBToYCbCr_SIMD(d, avg_r, avg_g, avg_b, &vy, &vcb, &vcr); StoreU(vcb, d, out_cb + x); @@ -288,7 +292,9 @@ HWY_INLINE void BoxAverageDownsampleRow(const float* lin_r0, // Scalar tail for (; x < down_width; ++x) { - float lr_sum = 0, lg_sum = 0, lb_sum = 0; + float lr_sum = 0; + float lg_sum = 0; + float lb_sum = 0; for (int iy = 0; iy < 2; ++iy) { const float* src_r = (iy == 0) ? lin_r0 : lin_r1; const float* src_g = (iy == 0) ? lin_g0 : lin_g1; @@ -303,19 +309,16 @@ HWY_INLINE void BoxAverageDownsampleRow(const float* lin_r0, float avg_r = LinearToGamma(lr_sum * 0.25f); float avg_g = LinearToGamma(lg_sum * 0.25f); float avg_b = LinearToGamma(lb_sum * 0.25f); - float avg_y, avg_cb, avg_cr; + float avg_y; + float avg_cb; + float avg_cr; RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr); out_cb[x] = avg_cb; out_cr[x] = avg_cr; } } -// --------------------------------------------------------------------------- // SIMD tent-filter upsample of one chroma row -// For each x in [0, down_width), computes 4 upsampled values using the -// 9/3/3/1 tent filter, and writes them to the 4 corresponding full-res -// positions. -// --------------------------------------------------------------------------- HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row, const float* next_row, size_t down_width, float* out0, float* out1) { @@ -373,10 +376,6 @@ HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row, } } -// --------------------------------------------------------------------------- -// Main SharpYUV implementation -// --------------------------------------------------------------------------- - void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { jpeg_comp_master* m = cinfo->master; const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor; @@ -399,8 +398,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { float* rows_in[MAX_SAMP_FACTOR]; auto& input = *m->smooth_input[c]; auto& output = *m->raw_data[c]; - for (size_t y_in = y0, y_out = y_out0; y_in < y1; - y_in += vf, ++y_out) { + for (size_t y_in = y0, y_out = y_out0; y_in < y1; y_in += vf, ++y_out) { for (int iy = 0; iy < vf; ++iy) { rows_in[iy] = input.Row(y_in + iy); } @@ -412,23 +410,28 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { const size_t down_width = xsize_padded / 2; const size_t down_height = iMCU_height / 2; + const size_t actual_height = + (y0 >= cinfo->image_height) + ? 1 + : std::min(y1, static_cast(cinfo->image_height)) - y0; + const size_t actual_down_height = (actual_height + 1) / 2; const size_t high_size = xsize_padded * iMCU_height; // Hoisted workspace buffers from master struct. jpegli::RowBuffer* ws = m->sharpyuv_workspace; - auto target_y_ws = &ws[0]; - auto best_y_ws = &ws[1]; - auto tmp_lin_r_ws = &ws[2]; - auto tmp_lin_g_ws = &ws[3]; - auto tmp_lin_b_ws = &ws[4]; - auto up_cb_ws = &ws[5]; - auto up_cr_ws = &ws[6]; - auto target_cb_ws = &ws[7]; - auto target_cr_ws = &ws[8]; - auto best_cb_ws = &ws[9]; - auto best_cr_ws = &ws[10]; - auto err_cb_ws = &ws[11]; - auto err_cr_ws = &ws[12]; + auto* target_y_ws = &ws[0]; + auto* best_y_ws = &ws[1]; + auto* lin_r_ws = &ws[2]; + auto* lin_g_ws = &ws[3]; + auto* lin_b_ws = &ws[4]; + auto* up_cb_ws = &ws[5]; + auto* up_cr_ws = &ws[6]; + auto* target_cb_ws = &ws[7]; + auto* target_cr_ws = &ws[8]; + auto* best_cb_ws = &ws[9]; + auto* best_cr_ws = &ws[10]; + auto* err_cb_ws = &ws[11]; + auto* err_cr_ws = &ws[12]; const size_t N = hwy::HWY_NAMESPACE::Lanes(d); @@ -436,21 +439,25 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { // Read pre-transform sRGB directly (if available) or convert from YCbCr. for (size_t y = 0; y < iMCU_height; ++y) { const float* row_y = m->smooth_input[0]->Row(y0 + y); - const float* row_cb = have_rgb_input ? nullptr : m->smooth_input[1]->Row(y0 + y); - const float* row_cr = have_rgb_input ? nullptr : m->smooth_input[2]->Row(y0 + y); + const float* row_cb = + have_rgb_input ? nullptr : m->smooth_input[1]->Row(y0 + y); + const float* row_cr = + have_rgb_input ? nullptr : m->smooth_input[2]->Row(y0 + y); const float* row_r = have_rgb_input ? m->input_rgb[0].Row(y0 + y) : nullptr; const float* row_g = have_rgb_input ? m->input_rgb[1].Row(y0 + y) : nullptr; const float* row_b = have_rgb_input ? m->input_rgb[2].Row(y0 + y) : nullptr; float* ty_out = target_y_ws->Row(y); float* by_out = best_y_ws->Row(y); - float* lr_out = tmp_lin_r_ws->Row(y); - float* lg_out = tmp_lin_g_ws->Row(y); - float* lb_out = tmp_lin_b_ws->Row(y); + float* lr_out = lin_r_ws->Row(y); + float* lg_out = lin_g_ws->Row(y); + float* lb_out = lin_b_ws->Row(y); size_t x = 0; for (; x + N <= xsize_padded; x += N) { auto py = LoadU(d, row_y + x); - decltype(py) r, g, b; + decltype(py) r; + decltype(py) g; + decltype(py) b; if (have_rgb_input) { r = LoadU(d, row_r + x); g = LoadU(d, row_g + x); @@ -476,7 +483,9 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { } for (; x < xsize_padded; ++x) { float py = row_y[x]; - float r, g, b; + float r; + float g; + float b; if (have_rgb_input) { r = row_r[x]; g = row_g[x]; @@ -489,6 +498,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { float lr = GammaToLinear(r); float lg = GammaToLinear(g); float lb = GammaToLinear(b); + // see TempLuma_SIMD for bt 709 coefficients float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb; ty_out[x] = LinearToGamma(ly); by_out[x] = py; @@ -499,25 +509,25 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { } // Box-average downsample of linear RGB to target Cb/Cr. - for (size_t y = 0; y < down_height; ++y) { - const float* r0 = tmp_lin_r_ws->Row(y * 2 + 0); - const float* g0 = tmp_lin_g_ws->Row(y * 2 + 0); - const float* b0 = tmp_lin_b_ws->Row(y * 2 + 0); - const float* r1 = tmp_lin_r_ws->Row(y * 2 + 1); - const float* g1 = tmp_lin_g_ws->Row(y * 2 + 1); - const float* b1 = tmp_lin_b_ws->Row(y * 2 + 1); + for (size_t y = 0; y < actual_down_height; ++y) { + const float* r0 = lin_r_ws->Row(y * 2 + 0); + const float* g0 = lin_g_ws->Row(y * 2 + 0); + const float* b0 = lin_b_ws->Row(y * 2 + 0); + const float* r1 = lin_r_ws->Row(y * 2 + 1); + const float* g1 = lin_g_ws->Row(y * 2 + 1); + const float* b1 = lin_b_ws->Row(y * 2 + 1); float* dcb = target_cb_ws->Row(y); float* dcr = target_cr_ws->Row(y); - BoxAverageDownsampleRow(r0, g0, b0, r1, g1, b1, down_width, xsize_padded, - dcb, dcr); + BoxDownsampleRow(r0, g0, b0, r1, g1, b1, down_width, xsize_padded, dcb, + dcr); // Also initialize best_cb/cr = target_cb/cr memcpy(best_cb_ws->Row(y), dcb, down_width * sizeof(float)); memcpy(best_cr_ws->Row(y), dcr, down_width * sizeof(float)); } - for (size_t y = 0; y < down_height; ++y) { + for (size_t y = 0; y < actual_down_height; ++y) { best_cb_ws->PadRow(y, down_width, 1); best_cr_ws->PadRow(y, down_width, 1); } @@ -532,41 +542,43 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { auto acc_y = Zero(d); // Padding for upsampling (all rows for this iteration) - for (size_t y = 0; y < down_height; ++y) { + for (size_t y = 0; y < actual_down_height; ++y) { best_cb_ws->PadRow(y, down_width, 1); best_cr_ws->PadRow(y, down_width, 1); } // Pass 1: Upsample current best_cb/cr - for (size_t y = 0; y < down_height; ++y) { + for (size_t y = 0; y < actual_down_height; ++y) { size_t y_prev = (y == 0) ? 0 : y - 1; - size_t y_next = (y + 1 == down_height) ? y : y + 1; + size_t y_next = (y + 1 == actual_down_height) ? y : y + 1; TentUpsampleRow(best_cb_ws->Row(y_prev), best_cb_ws->Row(y), - best_cb_ws->Row(y_next), down_width, - up_cb_ws->Row(y * 2), up_cb_ws->Row(y * 2 + 1)); + best_cb_ws->Row(y_next), down_width, up_cb_ws->Row(y * 2), + up_cb_ws->Row(y * 2 + 1)); TentUpsampleRow(best_cr_ws->Row(y_prev), best_cr_ws->Row(y), - best_cr_ws->Row(y_next), down_width, - up_cr_ws->Row(y * 2), up_cr_ws->Row(y * 2 + 1)); + best_cr_ws->Row(y_next), down_width, up_cr_ws->Row(y * 2), + up_cr_ws->Row(y * 2 + 1)); } // Pass 2: Evaluate Y and Prepare for Chroma update - for (size_t y = 0; y < down_height; ++y) { + for (size_t y = 0; y < actual_down_height; ++y) { for (int iy = 0; iy < 2; ++iy) { size_t yy = y * 2 + iy; float* row_best_y = best_y_ws->Row(yy); const float* row_up_cb = up_cb_ws->Row(yy); const float* row_up_cr = up_cr_ws->Row(yy); const float* row_target_y = target_y_ws->Row(yy); - float* row_lin_r = tmp_lin_r_ws->Row(yy); - float* row_lin_g = tmp_lin_g_ws->Row(yy); - float* row_lin_b = tmp_lin_b_ws->Row(yy); + float* row_lin_r = lin_r_ws->Row(yy); + float* row_lin_g = lin_g_ws->Row(yy); + float* row_lin_b = lin_b_ws->Row(yy); size_t x = 0; for (; x + N <= xsize_padded; x += N) { auto py = LoadU(d, row_best_y + x); auto ucb = LoadU(d, row_up_cb + x); auto ucr = LoadU(d, row_up_cr + x); - Vec r, g, b; + Vec r; + Vec g; + Vec b; YCbCrToRGB_SIMD(d, py, ucb, ucr, &r, &g, &b); auto lin_r = GammaToLinear_SIMD(d, r); auto lin_g = GammaToLinear_SIMD(d, g); @@ -583,11 +595,14 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { } for (; x < xsize_padded; ++x) { float py = row_best_y[x]; - float r, g, b; + float r; + float g; + float b; YCbCrToRGB_Scalar(py, row_up_cb[x], row_up_cr[x], &r, &g, &b); float lr = GammaToLinear(r); float lg = GammaToLinear(g); float lb = GammaToLinear(b); + // see TempLuma_SIMD for bt 709 coefficients float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb; float ey = row_target_y[x] - LinearToGamma(ly); diff_y_sum += std::abs(ey); @@ -605,11 +620,10 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { float* row_tgt_cb = target_cb_ws->Row(y); float* row_tgt_cr = target_cr_ws->Row(y); - BoxAverageDownsampleRow( - tmp_lin_r_ws->Row(y * 2 + 0), tmp_lin_g_ws->Row(y * 2 + 0), - tmp_lin_b_ws->Row(y * 2 + 0), tmp_lin_r_ws->Row(y * 2 + 1), - tmp_lin_g_ws->Row(y * 2 + 1), tmp_lin_b_ws->Row(y * 2 + 1), down_width, - xsize_padded, row_err_cb, row_err_cr); + BoxDownsampleRow(lin_r_ws->Row(y * 2 + 0), lin_g_ws->Row(y * 2 + 0), + lin_b_ws->Row(y * 2 + 0), lin_r_ws->Row(y * 2 + 1), + lin_g_ws->Row(y * 2 + 1), lin_b_ws->Row(y * 2 + 1), + down_width, xsize_padded, row_err_cb, row_err_cr); size_t x = 0; for (; x + N <= down_width; x += N) { @@ -643,10 +657,11 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { const size_t y_out0 = y0 / 2; for (size_t y_out = 0; y_out < down_height; ++y_out) { + size_t copy_y = std::min(y_out, actual_down_height - 1); float* row_out_cb = m->raw_data[1]->Row(y_out0 + y_out); float* row_out_cr = m->raw_data[2]->Row(y_out0 + y_out); - memcpy(row_out_cb, best_cb_ws->Row(y_out), down_width * sizeof(float)); - memcpy(row_out_cr, best_cr_ws->Row(y_out), down_width * sizeof(float)); + memcpy(row_out_cb, best_cb_ws->Row(copy_y), down_width * sizeof(float)); + memcpy(row_out_cr, best_cr_ws->Row(copy_y), down_width * sizeof(float)); } } From 7f0931fb9c06d917c799f046dce83604d8572556 Mon Sep 17 00:00:00 2001 From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com> Date: Sat, 11 Apr 2026 19:44:22 -0700 Subject: [PATCH 07/10] +1 MP/s --- lib/jpegli/sharp_yuv.cc | 62 +++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc index d040c70d..850542a6 100644 --- a/lib/jpegli/sharp_yuv.cc +++ b/lib/jpegli/sharp_yuv.cc @@ -31,6 +31,7 @@ using hwy::HWY_NAMESPACE::BitCast; using hwy::HWY_NAMESPACE::ConvertTo; using hwy::HWY_NAMESPACE::Div; using hwy::HWY_NAMESPACE::Floor; +using hwy::HWY_NAMESPACE::GetLane; using hwy::HWY_NAMESPACE::IfThenElse; using hwy::HWY_NAMESPACE::Lanes; using hwy::HWY_NAMESPACE::Le; @@ -60,23 +61,7 @@ constexpr D d; // Scalar helpers (used for tail elements only) // --------------------------------------------------------------------------- -inline float GammaToLinear(float v) { - float n = std::max(0.0f, std::min(255.0f, v)) * (1.0f / 255.0f); - if (n <= 0.04045f) { - return n / 12.92f; - } else { - return std::pow((n + 0.055f) / 1.055f, 2.4f); - } -} -inline float LinearToGamma(float v) { - if (v <= 0.0031308f) { - return v * 12.92f * 255.0f; - } else { - return (1.055f * std::pow(std::max(0.0f, v), 1.0f / 2.4f) - 0.055f) * - 255.0f; - } -} inline void YCbCrToRGB_Scalar(float y, float cb, float cr, float* r, float* g, float* b) { @@ -94,8 +79,6 @@ inline void RGBToYCbCr_Scalar(float r, float g, float b, float* y, float* cb, const float kB = 0.114f; const float kAmpR = 0.701f; const float kAmpB = 0.886f; - const float kDiffR = kAmpR + kR; - const float kDiffB = kAmpB + kB; const float kNormR = 1.0f / (kAmpR + kG + kB); const float kNormB = 1.0f / (kR + kG + kAmpB); @@ -104,8 +87,8 @@ inline void RGBToYCbCr_Scalar(float r, float g, float b, float* y, float* cb, float b_base = b * kB; float y_base = r_base + g_base + b_base; *y = y_base; - *cb = (b * kDiffB - y_base) * kNormB + 128.0f; - *cr = (r * kDiffR - y_base) * kNormR + 128.0f; + *cb = (b - y_base) * kNormB + 128.0f; + *cr = (r - y_base) * kNormR + 128.0f; } // --------------------------------------------------------------------------- @@ -204,23 +187,38 @@ HWY_INLINE V LinearToGamma_SIMD(D d, V v) { return IfThenElse(is_low, low_val, high_val); } +inline float GammaToLinear(float v) { + HWY_CAPPED(float, 1) d1; + return GetLane(GammaToLinear_SIMD(d1, Set(d1, v))); +} + +inline float LinearToGamma(float v) { + HWY_CAPPED(float, 1) d1; + return GetLane(LinearToGamma_SIMD(d1, Set(d1, v))); +} + template HWY_INLINE void YCbCrToRGB_SIMD(D d, V y, V cb, V cr, V* r, V* g, V* b) { - auto v128 = Set(d, 128.0f); + const auto v128 = Set(d, 128.0f); cb = Sub(cb, v128); cr = Sub(cr, v128); - *r = MulAdd(cr, Set(d, 1.402f), y); - *g = MulAdd(cb, Set(d, -0.114f * 1.772f / 0.587f), - MulAdd(cr, Set(d, -0.299f * 1.402f / 0.587f), y)); - *b = MulAdd(cb, Set(d, 1.772f), y); + const auto kCrR = Set(d, 1.402f); + const auto kCbG = Set(d, -0.114f * 1.772f / 0.587f); + const auto kCrG = Set(d, -0.299f * 1.402f / 0.587f); + const auto kCbB = Set(d, 1.772f); + *r = MulAdd(cr, kCrR, y); + *g = MulAdd(cb, kCbG, MulAdd(cr, kCrG, y)); + *b = MulAdd(cb, kCbB, y); } template HWY_INLINE V TempLuma_SIMD(D d, V r, V g, V b) { // using bt 709 values (0.2126f, 0.7152f, 0.0722f) may be more perceptually // accurate, but it needs more visual testing. - return MulAdd(Set(d, 0.299f), r, - MulAdd(Set(d, 0.587f), g, Mul(Set(d, 0.114f), b))); + const auto kR = Set(d, 0.299f); + const auto kG = Set(d, 0.587f); + const auto kB = Set(d, 0.114f); + return MulAdd(kR, r, MulAdd(kG, g, Mul(kB, b))); } // Vectorized RGBToYCbCr: computes Y, Cb, Cr from gamma-space R, G, B vectors. @@ -229,18 +227,16 @@ HWY_INLINE void RGBToYCbCr_SIMD(D d, V r, V g, V b, V* y, V* cb, V* cr) { const auto kR = Set(d, 0.299f); const auto kG = Set(d, 0.587f); const auto kB = Set(d, 0.114f); - const auto kDiffR = Set(d, 0.701f + 0.299f); - const auto kDiffB = Set(d, 0.886f + 0.114f); const auto kNormR = Set(d, 1.0f / (0.701f + 0.587f + 0.114f)); const auto kNormB = Set(d, 1.0f / (0.299f + 0.587f + 0.886f)); const auto v128 = Set(d, 128.0f); auto y_base = MulAdd(kR, r, MulAdd(kG, g, Mul(kB, b))); *y = y_base; - // cb = (b * kDiffB - y_base) * kNormB + 128 - *cb = MulAdd(Sub(Mul(b, kDiffB), y_base), kNormB, v128); - // cr = (r * kDiffR - y_base) * kNormR + 128 - *cr = MulAdd(Sub(Mul(r, kDiffR), y_base), kNormR, v128); + // cb = (b - y_base) * kNormB + 128 + *cb = MulAdd(Sub(b, y_base), kNormB, v128); + // cr = (r - y_base) * kNormR + 128 + *cr = MulAdd(Sub(r, y_base), kNormR, v128); } // SIMD box-average downsample of linear RGB -> YCbCr chroma From dbe53bd0d9407176a2d97d58640ea0287994e476 Mon Sep 17 00:00:00 2001 From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com> Date: Fri, 24 Apr 2026 03:42:50 -0700 Subject: [PATCH 08/10] minor stuff --- tools/cjpegli.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cjpegli.cc b/tools/cjpegli.cc index 35008edc..e48b4fa1 100644 --- a/tools/cjpegli.cc +++ b/tools/cjpegli.cc @@ -77,7 +77,7 @@ struct Args { cmdline->AddOptionFlag('\0', "xyb", "Convert to XYB colorspace", &settings.xyb, &SetBooleanTrue, 1); - cmdline->AddOptionFlag('\0', "sharpyuv", + cmdline->AddOptionFlag('\0', "sharp_yuv", "Use true linear-light sharp YUV downsampling for 4:2:0", &settings.use_sharpyuv, &SetBooleanTrue, 1); From 8bcfe96858b27030554e1fb130ae2f67564b1f51 Mon Sep 17 00:00:00 2001 From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com> Date: Sun, 26 Apr 2026 02:52:36 -0700 Subject: [PATCH 09/10] remove dead code & fix comments another small speed up of +2 mp/s --- lib/jpegli/sharp_yuv.cc | 163 +++------------------------------------- lib/jpegli/sharp_yuv.h | 7 +- 2 files changed, 14 insertions(+), 156 deletions(-) diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc index 850542a6..694bde38 100644 --- a/lib/jpegli/sharp_yuv.cc +++ b/lib/jpegli/sharp_yuv.cc @@ -7,7 +7,6 @@ #include "lib/jpegli/sharp_yuv.h" #include -#include #include #include #include @@ -26,12 +25,10 @@ namespace HWY_NAMESPACE { using hwy::HWY_NAMESPACE::Abs; using hwy::HWY_NAMESPACE::Add; -using hwy::HWY_NAMESPACE::And; using hwy::HWY_NAMESPACE::BitCast; using hwy::HWY_NAMESPACE::ConvertTo; using hwy::HWY_NAMESPACE::Div; using hwy::HWY_NAMESPACE::Floor; -using hwy::HWY_NAMESPACE::GetLane; using hwy::HWY_NAMESPACE::IfThenElse; using hwy::HWY_NAMESPACE::Lanes; using hwy::HWY_NAMESPACE::Le; @@ -42,7 +39,6 @@ using hwy::HWY_NAMESPACE::Max; using hwy::HWY_NAMESPACE::Min; using hwy::HWY_NAMESPACE::Mul; using hwy::HWY_NAMESPACE::MulAdd; -using hwy::HWY_NAMESPACE::Or; using hwy::HWY_NAMESPACE::Rebind; using hwy::HWY_NAMESPACE::ReduceSum; using hwy::HWY_NAMESPACE::Set; @@ -57,45 +53,7 @@ using hwy::HWY_NAMESPACE::Zero; using D = HWY_CAPPED(float, 8); constexpr D d; -// --------------------------------------------------------------------------- -// Scalar helpers (used for tail elements only) -// --------------------------------------------------------------------------- - - - -inline void YCbCrToRGB_Scalar(float y, float cb, float cr, float* r, float* g, - float* b) { - cb -= 128.0f; - cr -= 128.0f; - *r = y + cr * 1.402f; - *g = y + cb * (-0.114f * 1.772f / 0.587f) + cr * (-0.299f * 1.402f / 0.587f); - *b = y + cb * 1.772f; -} - -inline void RGBToYCbCr_Scalar(float r, float g, float b, float* y, float* cb, - float* cr) { - const float kR = 0.299f; - const float kG = 0.587f; - const float kB = 0.114f; - const float kAmpR = 0.701f; - const float kAmpB = 0.886f; - const float kNormR = 1.0f / (kAmpR + kG + kB); - const float kNormB = 1.0f / (kR + kG + kAmpB); - - float r_base = r * kR; - float g_base = g * kG; - float b_base = b * kB; - float y_base = r_base + g_base + b_base; - *y = y_base; - *cb = (b - y_base) * kNormB + 128.0f; - *cr = (r - y_base) * kNormR + 128.0f; -} - -// --------------------------------------------------------------------------- // Fast log2/exp2 approximations (same approach as adaptive_quantization.cc) -// --------------------------------------------------------------------------- - -// Evaluates a rational polynomial p(x)/q(x) via Horner's scheme. template HWY_INLINE V EvalRationalPoly(const DF df, const V x, const T (&p)[NP], const T (&q)[NQ]) { @@ -157,10 +115,7 @@ HWY_INLINE V FastPowf(const DF df, V x, float power) { return FastPow2f(df, Mul(FastLog2f(df, x), Set(df, power))); } -// --------------------------------------------------------------------------- // Highway SIMD gamma helpers using fast polynomial approximations -// --------------------------------------------------------------------------- - template HWY_INLINE V GammaToLinear_SIMD(D d, V v) { auto v_zero = Set(d, 0.0f); @@ -187,16 +142,6 @@ HWY_INLINE V LinearToGamma_SIMD(D d, V v) { return IfThenElse(is_low, low_val, high_val); } -inline float GammaToLinear(float v) { - HWY_CAPPED(float, 1) d1; - return GetLane(GammaToLinear_SIMD(d1, Set(d1, v))); -} - -inline float LinearToGamma(float v) { - HWY_CAPPED(float, 1) d1; - return GetLane(LinearToGamma_SIMD(d1, Set(d1, v))); -} - template HWY_INLINE void YCbCrToRGB_SIMD(D d, V y, V cb, V cr, V* r, V* g, V* b) { const auto v128 = Set(d, 128.0f); @@ -239,7 +184,8 @@ HWY_INLINE void RGBToYCbCr_SIMD(D d, V r, V g, V b, V* y, V* cb, V* cr) { *cr = MulAdd(Sub(r, y_base), kNormR, v128); } -// SIMD box-average downsample of linear RGB -> YCbCr chroma +// Box-averages a 2x2 block of linear-light RGB pixels and converts to +// gamma-space YCbCr, writing only the Cb and Cr chroma values. HWY_INLINE void BoxDownsampleRow(const float* lin_r0, const float* lin_g0, const float* lin_b0, const float* lin_r1, const float* lin_g1, const float* lin_b1, @@ -286,35 +232,11 @@ HWY_INLINE void BoxDownsampleRow(const float* lin_r0, const float* lin_g0, StoreU(vcr, d, out_cr + x); } - // Scalar tail - for (; x < down_width; ++x) { - float lr_sum = 0; - float lg_sum = 0; - float lb_sum = 0; - for (int iy = 0; iy < 2; ++iy) { - const float* src_r = (iy == 0) ? lin_r0 : lin_r1; - const float* src_g = (iy == 0) ? lin_g0 : lin_g1; - const float* src_b = (iy == 0) ? lin_b0 : lin_b1; - for (int ix = 0; ix < 2; ++ix) { - size_t si = x * 2 + ix; - lr_sum += src_r[si]; - lg_sum += src_g[si]; - lb_sum += src_b[si]; - } - } - float avg_r = LinearToGamma(lr_sum * 0.25f); - float avg_g = LinearToGamma(lg_sum * 0.25f); - float avg_b = LinearToGamma(lb_sum * 0.25f); - float avg_y; - float avg_cb; - float avg_cr; - RGBToYCbCr_Scalar(avg_r, avg_g, avg_b, &avg_y, &avg_cb, &avg_cr); - out_cb[x] = avg_cb; - out_cr[x] = avg_cr; - } + } -// SIMD tent-filter upsample of one chroma row +// 2x tent-filter upsample of one downsampled chroma row into two +// full-resolution output rows (out0 = even output row, out1 = odd output row). HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row, const float* next_row, size_t down_width, float* out0, float* out1) { @@ -348,28 +270,7 @@ HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row, StoreInterleaved2(val_bl, val_br, d, out1 + 2 * x); } - // Scalar tail - for (; x < down_width; ++x) { - size_t x_prev = (x == 0) ? 0 : x - 1; - size_t x_next = (x + 1 == down_width) ? x : x + 1; - - float cc = cur_row[x]; - float ct = prev_row[x]; - float cb = next_row[x]; - float cl = cur_row[x_prev]; - float cr = cur_row[x_next]; - float ctl = prev_row[x_prev]; - float ctr = prev_row[x_next]; - float cbl = next_row[x_prev]; - float cbr = next_row[x_next]; - - float c9 = cc * 9.0f; - float inv16 = 1.0f / 16.0f; - out0[x * 2 + 0] = (c9 + ct * 3.0f + cl * 3.0f + ctl) * inv16; - out0[(x * 2 + 1)] = (c9 + ct * 3.0f + cr * 3.0f + ctr) * inv16; - out1[x * 2 + 0] = (c9 + cb * 3.0f + cl * 3.0f + cbl) * inv16; - out1[(x * 2 + 1)] = (c9 + cb * 3.0f + cr * 3.0f + cbr) * inv16; - } + } void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { @@ -413,7 +314,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { const size_t actual_down_height = (actual_height + 1) / 2; const size_t high_size = xsize_padded * iMCU_height; - // Hoisted workspace buffers from master struct. + // Aliases to workspace buffers in master struct. jpegli::RowBuffer* ws = m->sharpyuv_workspace; auto* target_y_ws = &ws[0]; auto* best_y_ws = &ws[1]; @@ -477,31 +378,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { StoreU(lin_g, d, lg_out + x); StoreU(lin_b, d, lb_out + x); } - for (; x < xsize_padded; ++x) { - float py = row_y[x]; - float r; - float g; - float b; - if (have_rgb_input) { - r = row_r[x]; - g = row_g[x]; - b = row_b[x]; - } else { - float pcb = row_cb[x]; - float pcr = row_cr[x]; - YCbCrToRGB_Scalar(py, pcb, pcr, &r, &g, &b); - } - float lr = GammaToLinear(r); - float lg = GammaToLinear(g); - float lb = GammaToLinear(b); - // see TempLuma_SIMD for bt 709 coefficients - float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb; - ty_out[x] = LinearToGamma(ly); - by_out[x] = py; - lr_out[x] = lr; - lg_out[x] = lg; - lb_out[x] = lb; - } + } // Box-average downsample of linear RGB to target Cb/Cr. @@ -589,24 +466,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { StoreU(lin_g, d, row_lin_g + x); StoreU(lin_b, d, row_lin_b + x); } - for (; x < xsize_padded; ++x) { - float py = row_best_y[x]; - float r; - float g; - float b; - YCbCrToRGB_Scalar(py, row_up_cb[x], row_up_cr[x], &r, &g, &b); - float lr = GammaToLinear(r); - float lg = GammaToLinear(g); - float lb = GammaToLinear(b); - // see TempLuma_SIMD for bt 709 coefficients - float ly = 0.299f * lr + 0.587f * lg + 0.114f * lb; - float ey = row_target_y[x] - LinearToGamma(ly); - diff_y_sum += std::abs(ey); - row_best_y[x] = std::max(0.0f, std::min(255.0f, py + ey)); - row_lin_r[x] = lr; - row_lin_g[x] = lg; - row_lin_b[x] = lb; - } + } float* row_best_cb = best_cb_ws->Row(y); @@ -632,10 +492,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { StoreU(Add(bcb, Sub(tcb, dcb)), d, row_best_cb + x); StoreU(Add(bcr, Sub(tcr, dcr)), d, row_best_cr + x); } - for (; x < down_width; ++x) { - row_best_cb[x] += row_tgt_cb[x] - row_err_cb[x]; - row_best_cr[x] += row_tgt_cr[x] - row_err_cr[x]; - } + } diff_y_sum += ReduceSum(d, acc_y); diff --git a/lib/jpegli/sharp_yuv.h b/lib/jpegli/sharp_yuv.h index a1d73522..703c6c48 100644 --- a/lib/jpegli/sharp_yuv.h +++ b/lib/jpegli/sharp_yuv.h @@ -11,9 +11,10 @@ namespace jpegli { -// Applies a sharp 2x2 area downscale in linear RGB space instead of -// gamma compressed YCbCr space. This prevents bleeding/blur artifacts -// and preserves color boundaries much better. +// Applies an iterative sharp chroma downsampling for 4:2:0 subsampling. +// The algorithm downsamples Cb/Cr in linear RGB space and iteratively refines +// the chroma values to minimize luma error after upsample, preserving color +// boundaries better than a simple box filter. void ApplySharpYuvDownsample(j_compress_ptr cinfo); } // namespace jpegli From 2d1cdb23da5905d260e31479afb36b915b0f40e4 Mon Sep 17 00:00:00 2001 From: Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com> Date: Thu, 14 May 2026 03:16:33 -0700 Subject: [PATCH 10/10] more dead code --- lib/jpegli/sharp_yuv.cc | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc index 694bde38..e85cade4 100644 --- a/lib/jpegli/sharp_yuv.cc +++ b/lib/jpegli/sharp_yuv.cc @@ -8,7 +8,6 @@ #include #include -#include #include #include "lib/jpegli/common.h" @@ -30,7 +29,6 @@ using hwy::HWY_NAMESPACE::ConvertTo; using hwy::HWY_NAMESPACE::Div; using hwy::HWY_NAMESPACE::Floor; using hwy::HWY_NAMESPACE::IfThenElse; -using hwy::HWY_NAMESPACE::Lanes; using hwy::HWY_NAMESPACE::Le; using hwy::HWY_NAMESPACE::LoadDup128; using hwy::HWY_NAMESPACE::LoadInterleaved2; @@ -166,9 +164,9 @@ HWY_INLINE V TempLuma_SIMD(D d, V r, V g, V b) { return MulAdd(kR, r, MulAdd(kG, g, Mul(kB, b))); } -// Vectorized RGBToYCbCr: computes Y, Cb, Cr from gamma-space R, G, B vectors. +// Vectorized RGBToCbCr: computes Cb, Cr from gamma-space R, G, B vectors. template -HWY_INLINE void RGBToYCbCr_SIMD(D d, V r, V g, V b, V* y, V* cb, V* cr) { +HWY_INLINE void RGBToCbCr_SIMD(D d, V r, V g, V b, V* cb, V* cr) { const auto kR = Set(d, 0.299f); const auto kG = Set(d, 0.587f); const auto kB = Set(d, 0.114f); @@ -177,7 +175,6 @@ HWY_INLINE void RGBToYCbCr_SIMD(D d, V r, V g, V b, V* y, V* cb, V* cr) { const auto v128 = Set(d, 128.0f); auto y_base = MulAdd(kR, r, MulAdd(kG, g, Mul(kB, b))); - *y = y_base; // cb = (b - y_base) * kNormB + 128 *cb = MulAdd(Sub(b, y_base), kNormB, v128); // cr = (r - y_base) * kNormR + 128 @@ -185,12 +182,12 @@ HWY_INLINE void RGBToYCbCr_SIMD(D d, V r, V g, V b, V* y, V* cb, V* cr) { } // Box-averages a 2x2 block of linear-light RGB pixels and converts to -// gamma-space YCbCr, writing only the Cb and Cr chroma values. +// gamma-space Cb and Cr chroma values. HWY_INLINE void BoxDownsampleRow(const float* lin_r0, const float* lin_g0, const float* lin_b0, const float* lin_r1, const float* lin_g1, const float* lin_b1, - size_t down_width, size_t xsize_padded, - float* out_cb, float* out_cr) { + size_t down_width, float* out_cb, + float* out_cr) { const size_t N = hwy::HWY_NAMESPACE::Lanes(d); const auto quarter = Set(d, 0.25f); size_t x = 0; @@ -223,16 +220,13 @@ HWY_INLINE void BoxDownsampleRow(const float* lin_r0, const float* lin_g0, auto avg_g = LinearToGamma_SIMD(d, Mul(sum_g, quarter)); auto avg_b = LinearToGamma_SIMD(d, Mul(sum_b, quarter)); - Vec vy; Vec vcb; Vec vcr; - RGBToYCbCr_SIMD(d, avg_r, avg_g, avg_b, &vy, &vcb, &vcr); + RGBToCbCr_SIMD(d, avg_r, avg_g, avg_b, &vcb, &vcr); StoreU(vcb, d, out_cb + x); StoreU(vcr, d, out_cr + x); } - - } // 2x tent-filter upsample of one downsampled chroma row into two @@ -269,8 +263,6 @@ HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row, StoreInterleaved2(val_tl, val_tr, d, out0 + 2 * x); StoreInterleaved2(val_bl, val_br, d, out1 + 2 * x); } - - } void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { @@ -378,7 +370,6 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { StoreU(lin_g, d, lg_out + x); StoreU(lin_b, d, lb_out + x); } - } // Box-average downsample of linear RGB to target Cb/Cr. @@ -392,19 +383,13 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { float* dcb = target_cb_ws->Row(y); float* dcr = target_cr_ws->Row(y); - BoxDownsampleRow(r0, g0, b0, r1, g1, b1, down_width, xsize_padded, dcb, - dcr); + BoxDownsampleRow(r0, g0, b0, r1, g1, b1, down_width, dcb, dcr); // Also initialize best_cb/cr = target_cb/cr memcpy(best_cb_ws->Row(y), dcb, down_width * sizeof(float)); memcpy(best_cr_ws->Row(y), dcr, down_width * sizeof(float)); } - for (size_t y = 0; y < actual_down_height; ++y) { - best_cb_ws->PadRow(y, down_width, 1); - best_cr_ws->PadRow(y, down_width, 1); - } - const auto vmin = Set(d, 0.0f); const auto vmax = Set(d, 255.0f); float prev_diff_y_sum = 1e30f; @@ -466,7 +451,6 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { StoreU(lin_g, d, row_lin_g + x); StoreU(lin_b, d, row_lin_b + x); } - } float* row_best_cb = best_cb_ws->Row(y); @@ -479,7 +463,7 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { BoxDownsampleRow(lin_r_ws->Row(y * 2 + 0), lin_g_ws->Row(y * 2 + 0), lin_b_ws->Row(y * 2 + 0), lin_r_ws->Row(y * 2 + 1), lin_g_ws->Row(y * 2 + 1), lin_b_ws->Row(y * 2 + 1), - down_width, xsize_padded, row_err_cb, row_err_cr); + down_width, row_err_cb, row_err_cr); size_t x = 0; for (; x + N <= down_width; x += N) { @@ -492,7 +476,6 @@ void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { StoreU(Add(bcb, Sub(tcb, dcb)), d, row_best_cb + x); StoreU(Add(bcr, Sub(tcr, dcr)), d, row_best_cr + x); } - } diff_y_sum += ReduceSum(d, acc_y);