diff --git a/lib/extras/enc/jpegli.cc b/lib/extras/enc/jpegli.cc index 14a7a398..cb2384e8 100644 --- a/lib/extras/enc/jpegli.cc +++ b/lib/extras/enc/jpegli.cc @@ -461,6 +461,9 @@ Status EncodeJpeg(const PackedPixelFile& ppf, const JpegSettings& jpeg_settings, } jpegli_enable_adaptive_quantization( &cinfo, TO_JXL_BOOL(jpeg_settings.use_adaptive_quantization)); + if (jpeg_settings.use_sharpyuv) { + jpegli_set_sharp_yuv(&cinfo, TRUE); + } if (jpeg_settings.psnr_target > 0.0) { jpegli_set_psnr(&cinfo, jpeg_settings.psnr_target, jpeg_settings.search_tolerance, diff --git a/lib/extras/enc/jpegli.h b/lib/extras/enc/jpegli.h index 7d841eb5..88db8074 100644 --- a/lib/extras/enc/jpegli.h +++ b/lib/extras/enc/jpegli.h @@ -24,6 +24,7 @@ class PackedPixelFile; struct JpegSettings { bool xyb = false; + bool use_sharpyuv = false; size_t target_size = 0; float quality = 0.0f; float distance = 1.f; diff --git a/lib/jpegli/common_internal.h b/lib/jpegli/common_internal.h index e41daa3d..80074b31 100644 --- a/lib/jpegli/common_internal.h +++ b/lib/jpegli/common_internal.h @@ -112,6 +112,7 @@ class RowBuffer { size_t xsize() const { return xsize_; }; size_t ysize() const { return ysize_; }; size_t stride() const { return stride_; } + bool HasData() const { return xsize_ > 0; } void PadRow(size_t y, size_t from, int border) { float* row = Row(y); diff --git a/lib/jpegli/encode.cc b/lib/jpegli/encode.cc index 4d9b0cae..2ae318d3 100644 --- a/lib/jpegli/encode.cc +++ b/lib/jpegli/encode.cc @@ -31,6 +31,7 @@ #include "lib/jpegli/quant.h" #include "lib/jpegli/simd.h" #include "lib/jpegli/types.h" +#include "lib/jpegli/sharp_yuv.h" namespace jpegli { @@ -430,6 +431,12 @@ void AllocateBuffers(j_compress_ptr cinfo) { for (int c = 0; c < num_all_components; ++c) { m->input_buffer[c].Allocate(cinfo, ysize_full, xsize_full); } + if (m->use_sharpyuv && cinfo->in_color_space == JCS_RGB && + cinfo->jpeg_color_space == JCS_YCbCr) { + for (int c = 0; c < 3; ++c) { + m->input_rgb[c].Allocate(cinfo, ysize_full, xsize_full); + } + } } for (int c = 0; c < cinfo->num_components; ++c) { jpeg_component_info* comp = &cinfo->comp_info[c]; @@ -490,6 +497,14 @@ void AllocateBuffers(j_compress_ptr cinfo) { memset(m->zero_bias_mul[c], 0, DCTSIZE2 * sizeof(float)); memset(m->zero_bias_offset[c], 0, DCTSIZE2 * sizeof(float)); } + if (m->use_sharpyuv) { + for (int i = 0; i < 7; ++i) { + m->sharpyuv_workspace[i].Allocate(cinfo, iMCU_height, xsize_full); + } + for (int i = 7; i < 13; ++i) { + m->sharpyuv_workspace[i].Allocate(cinfo, iMCU_height / 2, xsize_full / 2); + } + } } void InitProgressMonitor(j_compress_ptr cinfo) { @@ -600,7 +615,11 @@ void ProcessiMCURow(j_compress_ptr cinfo) { JPEGLI_CHECK(cinfo->master->next_iMCU_row < cinfo->total_iMCU_rows); if (!cinfo->raw_data_in) { ApplyInputSmoothing(cinfo); - DownsampleInputBuffer(cinfo); + if (cinfo->master->use_sharpyuv && cinfo->jpeg_color_space == JCS_YCbCr) { + ApplySharpYuvDownsample(cinfo); + } else { + DownsampleInputBuffer(cinfo); + } } ComputeAdaptiveQuantField(cinfo); if (IsStreamingSupported(cinfo)) { @@ -687,12 +706,14 @@ void jpegli_CreateCompress(j_compress_ptr cinfo, int version, memset(cinfo->arith_ac_K, 0, sizeof(cinfo->arith_ac_K)); cinfo->write_Adobe_marker = FALSE; cinfo->master = jpegli::Allocate(cinfo, 1); + memset(cinfo->master->input_rgb, 0, sizeof(cinfo->master->input_rgb)); jpegli::InitializeCompressParams(cinfo); cinfo->master->force_baseline = true; cinfo->master->xyb_mode = false; cinfo->master->cicp_transfer_function = 2; // unknown transfer function code cinfo->master->use_std_tables = false; cinfo->master->use_adaptive_quantization = true; + cinfo->master->use_sharpyuv = false; cinfo->master->progressive_level = jpegli::kDefaultProgressiveLevel; cinfo->master->data_type = JPEGLI_TYPE_UINT8; cinfo->master->endianness = JPEGLI_NATIVE_ENDIAN; @@ -927,6 +948,11 @@ void jpegli_enable_adaptive_quantization(j_compress_ptr cinfo, boolean value) { cinfo->master->use_adaptive_quantization = FROM_JXL_BOOL(value); } +void jpegli_set_sharp_yuv(j_compress_ptr cinfo, boolean enable) { + CheckState(cinfo, jpegli::kEncStart); + cinfo->master->use_sharpyuv = FROM_JXL_BOOL(enable); +} + void jpegli_simple_progression(j_compress_ptr cinfo) { CheckState(cinfo, jpegli::kEncStart); jpegli_set_progressive_level(cinfo, 2); @@ -1149,8 +1175,22 @@ JDIMENSION jpegli_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines, cinfo->next_scanline += input_lag; } float* rows[jpegli::kMaxComponents]; + const bool snapshot_rgb = m->use_sharpyuv && + cinfo->in_color_space == JCS_RGB && + cinfo->jpeg_color_space == JCS_YCbCr; for (size_t i = input_lag; i < num_lines; ++i) { jpegli::ReadInputRow(cinfo, scanlines[i], rows); + if (snapshot_rgb) { + size_t row_idx = m->next_input_row - 1; + size_t img_w = cinfo->image_width; + size_t pad_w = m->xsize_blocks * DCTSIZE; + for (int c = 0; c < 3; ++c) { + float* rgb_row = m->input_rgb[c].Row(row_idx); + memcpy(rgb_row, rows[c], img_w * sizeof(float)); + float last = rgb_row[img_w - 1]; + for (size_t x = img_w; x <= pad_w; ++x) rgb_row[x] = last; + } + } (*m->color_transform)(rows, cinfo->image_width); jpegli::PadInputBuffer(cinfo, rows); jpegli::ProcessiMCURows(cinfo); diff --git a/lib/jpegli/encode.h b/lib/jpegli/encode.h index 5f620b98..364894b1 100644 --- a/lib/jpegli/encode.h +++ b/lib/jpegli/encode.h @@ -145,6 +145,9 @@ void jpegli_set_input_format(j_compress_ptr cinfo, JpegliDataType data_type, // Enabled by default. void jpegli_enable_adaptive_quantization(j_compress_ptr cinfo, boolean value); +// Enables linear-light sharp YUV downsampling. +void jpegli_set_sharp_yuv(j_compress_ptr cinfo, boolean enable); + // Sets the default progression parameters, where level 0 is sequential, and // greater level value means more progression steps. Default is 2. void jpegli_set_progressive_level(j_compress_ptr cinfo, int level); diff --git a/lib/jpegli/encode_internal.h b/lib/jpegli/encode_internal.h index 3f5d1a76..b260b6e0 100644 --- a/lib/jpegli/encode_internal.h +++ b/lib/jpegli/encode_internal.h @@ -68,6 +68,7 @@ struct ScanTokenInfo { struct jpeg_comp_master { jpegli::RowBuffer input_buffer[jpegli::kMaxComponents]; + jpegli::RowBuffer input_rgb[3]; jpegli::RowBuffer* smooth_input[jpegli::kMaxComponents]; jpegli::RowBuffer* raw_data[jpegli::kMaxComponents]; bool force_baseline; @@ -75,6 +76,7 @@ struct jpeg_comp_master { uint8_t cicp_transfer_function; bool use_std_tables; bool use_adaptive_quantization; + bool use_sharpyuv; int progressive_level; size_t xsize_blocks; size_t ysize_blocks; @@ -138,6 +140,7 @@ struct jpeg_comp_master { float psnr_tolerance; float min_distance; float max_distance; + jpegli::RowBuffer sharpyuv_workspace[13]; }; #endif // LIB_JPEGLI_ENCODE_INTERNAL_H_ diff --git a/lib/jpegli/sharp_yuv.cc b/lib/jpegli/sharp_yuv.cc new file mode 100644 index 00000000..e85cade4 --- /dev/null +++ b/lib/jpegli/sharp_yuv.cc @@ -0,0 +1,519 @@ +// Copyright (c) the JPEG XL Project Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +#include "lib/jpegli/sharp_yuv.h" + +#include +#include +#include + +#include "lib/jpegli/common.h" +#include "lib/jpegli/encode_internal.h" + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "lib/jpegli/sharp_yuv.cc" +#include +#include + +HWY_BEFORE_NAMESPACE(); +namespace jpegli { +namespace HWY_NAMESPACE { + +using hwy::HWY_NAMESPACE::Abs; +using hwy::HWY_NAMESPACE::Add; +using hwy::HWY_NAMESPACE::BitCast; +using hwy::HWY_NAMESPACE::ConvertTo; +using hwy::HWY_NAMESPACE::Div; +using hwy::HWY_NAMESPACE::Floor; +using hwy::HWY_NAMESPACE::IfThenElse; +using hwy::HWY_NAMESPACE::Le; +using hwy::HWY_NAMESPACE::LoadDup128; +using hwy::HWY_NAMESPACE::LoadInterleaved2; +using hwy::HWY_NAMESPACE::LoadU; +using hwy::HWY_NAMESPACE::Max; +using hwy::HWY_NAMESPACE::Min; +using hwy::HWY_NAMESPACE::Mul; +using hwy::HWY_NAMESPACE::MulAdd; +using hwy::HWY_NAMESPACE::Rebind; +using hwy::HWY_NAMESPACE::ReduceSum; +using hwy::HWY_NAMESPACE::Set; +using hwy::HWY_NAMESPACE::ShiftLeft; +using hwy::HWY_NAMESPACE::ShiftRight; +using hwy::HWY_NAMESPACE::StoreInterleaved2; +using hwy::HWY_NAMESPACE::StoreU; +using hwy::HWY_NAMESPACE::Sub; +using hwy::HWY_NAMESPACE::Vec; +using hwy::HWY_NAMESPACE::Zero; + +using D = HWY_CAPPED(float, 8); +constexpr D d; + +// Fast log2/exp2 approximations (same approach as adaptive_quantization.cc) +template +HWY_INLINE V EvalRationalPoly(const DF df, const V x, const T (&p)[NP], + const T (&q)[NQ]) { + constexpr size_t kDegP = NP / 4 - 1; + constexpr size_t kDegQ = NQ / 4 - 1; + auto yp = LoadDup128(df, &p[kDegP * 4]); + auto yq = LoadDup128(df, &q[kDegQ * 4]); + HWY_FENCE; + if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(df, p + ((kDegP - 1) * 4))); + if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(df, q + ((kDegQ - 1) * 4))); + HWY_FENCE; + if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(df, p + ((kDegP - 2) * 4))); + if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(df, q + ((kDegQ - 2) * 4))); + HWY_FENCE; + if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(df, p + ((kDegP - 3) * 4))); + if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(df, q + ((kDegQ - 3) * 4))); + return Div(yp, yq); +} + +// Fast base-2 logarithm. L1 error ~3.9E-6. +template +HWY_INLINE V FastLog2f(const DF df, V x) { + HWY_ALIGN const float p[4 * 3] = {HWY_REP4(-1.8503833400518310E-06f), + HWY_REP4(1.4287160470083755E+00f), + HWY_REP4(7.4245873327820566E-01f)}; + HWY_ALIGN const float q[4 * 3] = {HWY_REP4(9.9032814277590719E-01f), + HWY_REP4(1.0096718572241148E+00f), + HWY_REP4(1.7409343003366853E-01f)}; + const Rebind di; + const auto x_bits = BitCast(di, x); + const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab)); + const auto exp_shifted = ShiftRight<23>(exp_bits); + const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted))); + const auto exp_val = ConvertTo(df, exp_shifted); + return Add(EvalRationalPoly(df, Sub(mantissa, Set(df, 1.0f)), p, q), exp_val); +} + +// Fast 2^x. Max relative error ~3e-7. +template +HWY_INLINE V FastPow2f(const DF df, V x) { + const Rebind di; + auto floorx = Floor(x); + auto exp = + BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127)))); + auto frac = Sub(x, floorx); + auto num = Add(frac, Set(df, 1.01749063e+01f)); + num = MulAdd(num, frac, Set(df, 4.88687798e+01f)); + num = MulAdd(num, frac, Set(df, 9.85506591e+01f)); + num = Mul(num, exp); + auto den = MulAdd(frac, Set(df, 2.10242958e-01f), Set(df, -2.22328856e-02f)); + den = MulAdd(den, frac, Set(df, -1.94414990e+01f)); + den = MulAdd(den, frac, Set(df, 9.85506633e+01f)); + return Div(num, den); +} + +// Fast pow(x, p) = 2^(p * log2(x)) for x > 0. +template +HWY_INLINE V FastPowf(const DF df, V x, float power) { + return FastPow2f(df, Mul(FastLog2f(df, x), Set(df, power))); +} + +// Highway SIMD gamma helpers using fast polynomial approximations +template +HWY_INLINE V GammaToLinear_SIMD(D d, V v) { + auto v_zero = Set(d, 0.0f); + auto v_max = Set(d, 255.0f); + auto n = Mul(Min(Max(v, v_zero), v_max), Set(d, 1.0f / 255.0f)); + auto is_low = Le(n, Set(d, 0.04045f)); + auto low_val = Mul(n, Set(d, 1.0f / 12.92f)); + // High path: ((n + 0.055) / 1.055) ^ 2.4 + auto base = Mul(Add(n, Set(d, 0.055f)), Set(d, 1.0f / 1.055f)); + base = Max(base, Set(d, 1e-6f)); + auto high_val = FastPowf(d, base, 2.4f); + return IfThenElse(is_low, low_val, high_val); +} + +template +HWY_INLINE V LinearToGamma_SIMD(D d, V v) { + v = Max(v, Set(d, 0.0f)); + auto is_low = Le(v, Set(d, 0.0031308f)); + auto low_val = Mul(v, Set(d, 12.92f * 255.0f)); + auto v_safe = Max(v, Set(d, 1e-6f)); + auto p = FastPowf(d, v_safe, 1.0f / 2.4f); + auto high_val = + Mul(Sub(Mul(p, Set(d, 1.055f)), Set(d, 0.055f)), Set(d, 255.0f)); + return IfThenElse(is_low, low_val, high_val); +} + +template +HWY_INLINE void YCbCrToRGB_SIMD(D d, V y, V cb, V cr, V* r, V* g, V* b) { + const auto v128 = Set(d, 128.0f); + cb = Sub(cb, v128); + cr = Sub(cr, v128); + const auto kCrR = Set(d, 1.402f); + const auto kCbG = Set(d, -0.114f * 1.772f / 0.587f); + const auto kCrG = Set(d, -0.299f * 1.402f / 0.587f); + const auto kCbB = Set(d, 1.772f); + *r = MulAdd(cr, kCrR, y); + *g = MulAdd(cb, kCbG, MulAdd(cr, kCrG, y)); + *b = MulAdd(cb, kCbB, y); +} + +template +HWY_INLINE V TempLuma_SIMD(D d, V r, V g, V b) { + // using bt 709 values (0.2126f, 0.7152f, 0.0722f) may be more perceptually + // accurate, but it needs more visual testing. + const auto kR = Set(d, 0.299f); + const auto kG = Set(d, 0.587f); + const auto kB = Set(d, 0.114f); + return MulAdd(kR, r, MulAdd(kG, g, Mul(kB, b))); +} + +// Vectorized RGBToCbCr: computes Cb, Cr from gamma-space R, G, B vectors. +template +HWY_INLINE void RGBToCbCr_SIMD(D d, V r, V g, V b, V* cb, V* cr) { + const auto kR = Set(d, 0.299f); + const auto kG = Set(d, 0.587f); + const auto kB = Set(d, 0.114f); + const auto kNormR = Set(d, 1.0f / (0.701f + 0.587f + 0.114f)); + const auto kNormB = Set(d, 1.0f / (0.299f + 0.587f + 0.886f)); + const auto v128 = Set(d, 128.0f); + + auto y_base = MulAdd(kR, r, MulAdd(kG, g, Mul(kB, b))); + // cb = (b - y_base) * kNormB + 128 + *cb = MulAdd(Sub(b, y_base), kNormB, v128); + // cr = (r - y_base) * kNormR + 128 + *cr = MulAdd(Sub(r, y_base), kNormR, v128); +} + +// Box-averages a 2x2 block of linear-light RGB pixels and converts to +// gamma-space Cb and Cr chroma values. +HWY_INLINE void BoxDownsampleRow(const float* lin_r0, const float* lin_g0, + const float* lin_b0, const float* lin_r1, + const float* lin_g1, const float* lin_b1, + size_t down_width, float* out_cb, + float* out_cr) { + const size_t N = hwy::HWY_NAMESPACE::Lanes(d); + const auto quarter = Set(d, 0.25f); + size_t x = 0; + + for (; x + N <= down_width; x += N) { + Vec r0e; + Vec r0o; + Vec g0e; + Vec g0o; + Vec b0e; + Vec b0o; + Vec r1e; + Vec r1o; + Vec g1e; + Vec g1o; + Vec b1e; + Vec b1o; + LoadInterleaved2(d, lin_r0 + 2 * x, r0e, r0o); + LoadInterleaved2(d, lin_g0 + 2 * x, g0e, g0o); + LoadInterleaved2(d, lin_b0 + 2 * x, b0e, b0o); + LoadInterleaved2(d, lin_r1 + 2 * x, r1e, r1o); + LoadInterleaved2(d, lin_g1 + 2 * x, g1e, g1o); + LoadInterleaved2(d, lin_b1 + 2 * x, b1e, b1o); + + auto sum_r = Add(Add(r0e, r0o), Add(r1e, r1o)); + auto sum_g = Add(Add(g0e, g0o), Add(g1e, g1o)); + auto sum_b = Add(Add(b0e, b0o), Add(b1e, b1o)); + + auto avg_r = LinearToGamma_SIMD(d, Mul(sum_r, quarter)); + auto avg_g = LinearToGamma_SIMD(d, Mul(sum_g, quarter)); + auto avg_b = LinearToGamma_SIMD(d, Mul(sum_b, quarter)); + + Vec vcb; + Vec vcr; + RGBToCbCr_SIMD(d, avg_r, avg_g, avg_b, &vcb, &vcr); + + StoreU(vcb, d, out_cb + x); + StoreU(vcr, d, out_cr + x); + } +} + +// 2x tent-filter upsample of one downsampled chroma row into two +// full-resolution output rows (out0 = even output row, out1 = odd output row). +HWY_INLINE void TentUpsampleRow(const float* prev_row, const float* cur_row, + const float* next_row, size_t down_width, + float* out0, float* out1) { + const size_t N = hwy::HWY_NAMESPACE::Lanes(d); + const auto k9 = Set(d, 9.0f); + const auto k3 = Set(d, 3.0f); + const auto kInv16 = Set(d, 1.0f / 16.0f); + size_t x = 0; + + for (; x + N <= down_width; x += N) { + auto vc = LoadU(d, cur_row + x); + auto vt = LoadU(d, prev_row + x); + auto vb = LoadU(d, next_row + x); + + auto vl = LoadU(d, cur_row + x - 1); + auto vtl = LoadU(d, prev_row + x - 1); + auto vbl = LoadU(d, next_row + x - 1); + + auto vr = LoadU(d, cur_row + x + 1); + auto vtr = LoadU(d, prev_row + x + 1); + auto vbr = LoadU(d, next_row + x + 1); + + auto c9 = Mul(vc, k9); + + auto val_tl = Mul(MulAdd(k3, Add(vt, vl), Add(c9, vtl)), kInv16); + auto val_tr = Mul(MulAdd(k3, Add(vt, vr), Add(c9, vtr)), kInv16); + auto val_bl = Mul(MulAdd(k3, Add(vb, vl), Add(c9, vbl)), kInv16); + auto val_br = Mul(MulAdd(k3, Add(vb, vr), Add(c9, vbr)), kInv16); + + StoreInterleaved2(val_tl, val_tr, d, out0 + 2 * x); + StoreInterleaved2(val_bl, val_br, d, out1 + 2 * x); + } +} + +void ApplySharpYuvDownsampleImpl(j_compress_ptr cinfo) { + jpeg_comp_master* m = cinfo->master; + const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor; + const size_t y0 = m->next_iMCU_row * iMCU_height; + const size_t y1 = y0 + iMCU_height; + const size_t xsize_padded = m->xsize_blocks * DCTSIZE; + + if (cinfo->num_components < 3) return; + const int h_factor = + cinfo->max_h_samp_factor / cinfo->comp_info[1].h_samp_factor; + const int v_factor = + cinfo->max_v_samp_factor / cinfo->comp_info[1].v_samp_factor; + if (h_factor != 2 || v_factor != 2) { + for (int c = 1; c < cinfo->num_components; ++c) { + jpeg_component_info* comp = &cinfo->comp_info[c]; + const int hf = cinfo->max_h_samp_factor / comp->h_samp_factor; + const int vf = cinfo->max_v_samp_factor / comp->v_samp_factor; + if (hf == 1 && vf == 1) continue; + const size_t y_out0 = y0 / vf; + float* rows_in[MAX_SAMP_FACTOR]; + auto& input = *m->smooth_input[c]; + auto& output = *m->raw_data[c]; + for (size_t y_in = y0, y_out = y_out0; y_in < y1; y_in += vf, ++y_out) { + for (int iy = 0; iy < vf; ++iy) { + rows_in[iy] = input.Row(y_in + iy); + } + (*m->downsample_method[c])(rows_in, xsize_padded, output.Row(y_out)); + } + } + return; + } + + const size_t down_width = xsize_padded / 2; + const size_t down_height = iMCU_height / 2; + const size_t actual_height = + (y0 >= cinfo->image_height) + ? 1 + : std::min(y1, static_cast(cinfo->image_height)) - y0; + const size_t actual_down_height = (actual_height + 1) / 2; + const size_t high_size = xsize_padded * iMCU_height; + + // Aliases to workspace buffers in master struct. + jpegli::RowBuffer* ws = m->sharpyuv_workspace; + auto* target_y_ws = &ws[0]; + auto* best_y_ws = &ws[1]; + auto* lin_r_ws = &ws[2]; + auto* lin_g_ws = &ws[3]; + auto* lin_b_ws = &ws[4]; + auto* up_cb_ws = &ws[5]; + auto* up_cr_ws = &ws[6]; + auto* target_cb_ws = &ws[7]; + auto* target_cr_ws = &ws[8]; + auto* best_cb_ws = &ws[9]; + auto* best_cr_ws = &ws[10]; + auto* err_cb_ws = &ws[11]; + auto* err_cr_ws = &ws[12]; + + const size_t N = hwy::HWY_NAMESPACE::Lanes(d); + + const bool have_rgb_input = m->input_rgb[0].HasData(); + // Read pre-transform sRGB directly (if available) or convert from YCbCr. + for (size_t y = 0; y < iMCU_height; ++y) { + const float* row_y = m->smooth_input[0]->Row(y0 + y); + const float* row_cb = + have_rgb_input ? nullptr : m->smooth_input[1]->Row(y0 + y); + const float* row_cr = + have_rgb_input ? nullptr : m->smooth_input[2]->Row(y0 + y); + const float* row_r = have_rgb_input ? m->input_rgb[0].Row(y0 + y) : nullptr; + const float* row_g = have_rgb_input ? m->input_rgb[1].Row(y0 + y) : nullptr; + const float* row_b = have_rgb_input ? m->input_rgb[2].Row(y0 + y) : nullptr; + float* ty_out = target_y_ws->Row(y); + float* by_out = best_y_ws->Row(y); + float* lr_out = lin_r_ws->Row(y); + float* lg_out = lin_g_ws->Row(y); + float* lb_out = lin_b_ws->Row(y); + + size_t x = 0; + for (; x + N <= xsize_padded; x += N) { + auto py = LoadU(d, row_y + x); + decltype(py) r; + decltype(py) g; + decltype(py) b; + if (have_rgb_input) { + r = LoadU(d, row_r + x); + g = LoadU(d, row_g + x); + b = LoadU(d, row_b + x); + } else { + auto pcb = LoadU(d, row_cb + x); + auto pcr = LoadU(d, row_cr + x); + YCbCrToRGB_SIMD(d, py, pcb, pcr, &r, &g, &b); + } + + auto lin_r = GammaToLinear_SIMD(d, r); + auto lin_g = GammaToLinear_SIMD(d, g); + auto lin_b = GammaToLinear_SIMD(d, b); + + auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b); + auto t_y = LinearToGamma_SIMD(d, lin_y); + + StoreU(t_y, d, ty_out + x); + StoreU(py, d, by_out + x); + StoreU(lin_r, d, lr_out + x); + StoreU(lin_g, d, lg_out + x); + StoreU(lin_b, d, lb_out + x); + } + } + + // Box-average downsample of linear RGB to target Cb/Cr. + for (size_t y = 0; y < actual_down_height; ++y) { + const float* r0 = lin_r_ws->Row(y * 2 + 0); + const float* g0 = lin_g_ws->Row(y * 2 + 0); + const float* b0 = lin_b_ws->Row(y * 2 + 0); + const float* r1 = lin_r_ws->Row(y * 2 + 1); + const float* g1 = lin_g_ws->Row(y * 2 + 1); + const float* b1 = lin_b_ws->Row(y * 2 + 1); + float* dcb = target_cb_ws->Row(y); + float* dcr = target_cr_ws->Row(y); + + BoxDownsampleRow(r0, g0, b0, r1, g1, b1, down_width, dcb, dcr); + + // Also initialize best_cb/cr = target_cb/cr + memcpy(best_cb_ws->Row(y), dcb, down_width * sizeof(float)); + memcpy(best_cr_ws->Row(y), dcr, down_width * sizeof(float)); + } + + const auto vmin = Set(d, 0.0f); + const auto vmax = Set(d, 255.0f); + float prev_diff_y_sum = 1e30f; + const float diff_y_threshold = 3.0f * high_size; + + for (int iter = 0; iter < 4; ++iter) { + float diff_y_sum = 0.0f; + auto acc_y = Zero(d); + + // Padding for upsampling (all rows for this iteration) + for (size_t y = 0; y < actual_down_height; ++y) { + best_cb_ws->PadRow(y, down_width, 1); + best_cr_ws->PadRow(y, down_width, 1); + } + + // Pass 1: Upsample current best_cb/cr + for (size_t y = 0; y < actual_down_height; ++y) { + size_t y_prev = (y == 0) ? 0 : y - 1; + size_t y_next = (y + 1 == actual_down_height) ? y : y + 1; + TentUpsampleRow(best_cb_ws->Row(y_prev), best_cb_ws->Row(y), + best_cb_ws->Row(y_next), down_width, up_cb_ws->Row(y * 2), + up_cb_ws->Row(y * 2 + 1)); + TentUpsampleRow(best_cr_ws->Row(y_prev), best_cr_ws->Row(y), + best_cr_ws->Row(y_next), down_width, up_cr_ws->Row(y * 2), + up_cr_ws->Row(y * 2 + 1)); + } + + // Pass 2: Evaluate Y and Prepare for Chroma update + for (size_t y = 0; y < actual_down_height; ++y) { + for (int iy = 0; iy < 2; ++iy) { + size_t yy = y * 2 + iy; + float* row_best_y = best_y_ws->Row(yy); + const float* row_up_cb = up_cb_ws->Row(yy); + const float* row_up_cr = up_cr_ws->Row(yy); + const float* row_target_y = target_y_ws->Row(yy); + float* row_lin_r = lin_r_ws->Row(yy); + float* row_lin_g = lin_g_ws->Row(yy); + float* row_lin_b = lin_b_ws->Row(yy); + + size_t x = 0; + for (; x + N <= xsize_padded; x += N) { + auto py = LoadU(d, row_best_y + x); + auto ucb = LoadU(d, row_up_cb + x); + auto ucr = LoadU(d, row_up_cr + x); + Vec r; + Vec g; + Vec b; + YCbCrToRGB_SIMD(d, py, ucb, ucr, &r, &g, &b); + auto lin_r = GammaToLinear_SIMD(d, r); + auto lin_g = GammaToLinear_SIMD(d, g); + auto lin_b = GammaToLinear_SIMD(d, b); + auto lin_y = TempLuma_SIMD(d, lin_r, lin_g, lin_b); + auto dec_y = LinearToGamma_SIMD(d, lin_y); + auto ey = Sub(LoadU(d, row_target_y + x), dec_y); + acc_y = Add(acc_y, Abs(ey)); + + StoreU(Min(Max(Add(py, ey), vmin), vmax), d, row_best_y + x); + StoreU(lin_r, d, row_lin_r + x); + StoreU(lin_g, d, row_lin_g + x); + StoreU(lin_b, d, row_lin_b + x); + } + } + + float* row_best_cb = best_cb_ws->Row(y); + float* row_best_cr = best_cr_ws->Row(y); + float* row_err_cb = err_cb_ws->Row(y); + float* row_err_cr = err_cr_ws->Row(y); + float* row_tgt_cb = target_cb_ws->Row(y); + float* row_tgt_cr = target_cr_ws->Row(y); + + BoxDownsampleRow(lin_r_ws->Row(y * 2 + 0), lin_g_ws->Row(y * 2 + 0), + lin_b_ws->Row(y * 2 + 0), lin_r_ws->Row(y * 2 + 1), + lin_g_ws->Row(y * 2 + 1), lin_b_ws->Row(y * 2 + 1), + down_width, row_err_cb, row_err_cr); + + size_t x = 0; + for (; x + N <= down_width; x += N) { + auto bcb = LoadU(d, row_best_cb + x); + auto bcr = LoadU(d, row_best_cr + x); + auto tcb = LoadU(d, row_tgt_cb + x); + auto tcr = LoadU(d, row_tgt_cr + x); + auto dcb = LoadU(d, row_err_cb + x); + auto dcr = LoadU(d, row_err_cr + x); + StoreU(Add(bcb, Sub(tcb, dcb)), d, row_best_cb + x); + StoreU(Add(bcr, Sub(tcr, dcr)), d, row_best_cr + x); + } + } + diff_y_sum += ReduceSum(d, acc_y); + + if (iter > 0 && + (diff_y_sum < diff_y_threshold || diff_y_sum > prev_diff_y_sum)) + break; + prev_diff_y_sum = diff_y_sum; + } + + // Write best_y and best_cb/cr to raw_data output buffers. + for (size_t y = 0; y < iMCU_height; ++y) { + float* row_out_y = m->raw_data[0]->Row(y0 + y); + memcpy(row_out_y, best_y_ws->Row(y), xsize_padded * sizeof(float)); + } + + const size_t y_out0 = y0 / 2; + for (size_t y_out = 0; y_out < down_height; ++y_out) { + size_t copy_y = std::min(y_out, actual_down_height - 1); + float* row_out_cb = m->raw_data[1]->Row(y_out0 + y_out); + float* row_out_cr = m->raw_data[2]->Row(y_out0 + y_out); + memcpy(row_out_cb, best_cb_ws->Row(copy_y), down_width * sizeof(float)); + memcpy(row_out_cr, best_cr_ws->Row(copy_y), down_width * sizeof(float)); + } +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace jpegli +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace jpegli { + +HWY_EXPORT(ApplySharpYuvDownsampleImpl); + +void ApplySharpYuvDownsample(j_compress_ptr cinfo) { + HWY_DYNAMIC_DISPATCH(ApplySharpYuvDownsampleImpl)(cinfo); +} + +} // namespace jpegli +#endif // HWY_ONCE diff --git a/lib/jpegli/sharp_yuv.h b/lib/jpegli/sharp_yuv.h new file mode 100644 index 00000000..703c6c48 --- /dev/null +++ b/lib/jpegli/sharp_yuv.h @@ -0,0 +1,22 @@ +// Copyright (c) the JPEG XL Project Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +#ifndef LIB_JPEGLI_SHARP_YUV_H_ +#define LIB_JPEGLI_SHARP_YUV_H_ + +#include "lib/jpegli/encode_internal.h" + +namespace jpegli { + +// Applies an iterative sharp chroma downsampling for 4:2:0 subsampling. +// The algorithm downsamples Cb/Cr in linear RGB space and iteratively refines +// the chroma values to minimize luma error after upsample, preserving color +// boundaries better than a simple box filter. +void ApplySharpYuvDownsample(j_compress_ptr cinfo); + +} // namespace jpegli + +#endif // LIB_JPEGLI_SHARP_YUV_H_ diff --git a/lib/jxl_lists.cmake b/lib/jxl_lists.cmake index f5fa51d9..83394772 100644 --- a/lib/jxl_lists.cmake +++ b/lib/jxl_lists.cmake @@ -198,6 +198,8 @@ set(JPEGXL_INTERNAL_JPEGLI_SOURCES jpegli/quant.h jpegli/render.cc jpegli/render.h + jpegli/sharp_yuv.cc + jpegli/sharp_yuv.h jpegli/simd.cc jpegli/simd.h jpegli/source_manager.cc diff --git a/tools/cjpegli.cc b/tools/cjpegli.cc index 7bcd598a..e48b4fa1 100644 --- a/tools/cjpegli.cc +++ b/tools/cjpegli.cc @@ -77,6 +77,10 @@ struct Args { cmdline->AddOptionFlag('\0', "xyb", "Convert to XYB colorspace", &settings.xyb, &SetBooleanTrue, 1); + cmdline->AddOptionFlag('\0', "sharp_yuv", + "Use true linear-light sharp YUV downsampling for 4:2:0", + &settings.use_sharpyuv, &SetBooleanTrue, 1); + cmdline->AddOptionFlag( '\0', "std_quant", "Use quantization tables based on Annex K of the JPEG standard.",