|
| 1 | +/* |
| 2 | + * Copyright (C) 2016-2026 Grok Image Compression Inc. |
| 3 | + * |
| 4 | + * This source code is free software: you can redistribute it and/or modify |
| 5 | + * it under the terms of the GNU Affero General Public License, version 3, |
| 6 | + * as published by the Free Software Foundation. |
| 7 | + * |
| 8 | + * This source code is distributed in the hope that it will be useful, |
| 9 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 11 | + * GNU Affero General Public License for more details. |
| 12 | + * |
| 13 | + * You should have received a copy of the GNU Affero General Public License |
| 14 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 15 | + * |
| 16 | + */ |
| 17 | + |
| 18 | +#include "hwy_arm_disable_targets.h" |
| 19 | + |
| 20 | +#undef HWY_TARGET_INCLUDE |
| 21 | +#define HWY_TARGET_INCLUDE "util/XYZTransform.cpp" |
| 22 | +#include <hwy/foreach_target.h> |
| 23 | +#include <hwy/highway.h> |
| 24 | +#include <cmath> |
| 25 | +#include <cstdint> |
| 26 | +#include <memory> |
| 27 | +#include <vector> |
| 28 | + |
| 29 | +#include "grok.h" |
| 30 | +#include "XYZTransform.h" |
| 31 | +#include "Logger.h" |
| 32 | + |
| 33 | +HWY_BEFORE_NAMESPACE(); |
| 34 | +namespace grk |
| 35 | +{ |
| 36 | +namespace HWY_NAMESPACE |
| 37 | +{ |
| 38 | + namespace hn = hwy::HWY_NAMESPACE; |
| 39 | + |
| 40 | + // ─── DCI companding coefficient (SMPTE 428-1) ─── |
| 41 | + // Peak white luminance 48 cd/m² mapped to XYZ encoding range of 52.37 |
| 42 | + static constexpr float DCI_COEFFICIENT = 48.0f / 52.37f; |
| 43 | + |
| 44 | + // ─── Rec.709 linearization: simple gamma 2.2 (matches libdcp/dcpomatic) ─── |
| 45 | + static inline float rec709_to_linear(float v) |
| 46 | + { |
| 47 | + if(v <= 0.0f) |
| 48 | + return 0.0f; |
| 49 | + return std::pow(v, 2.2f); |
| 50 | + } |
| 51 | + |
| 52 | + // ─── DCI 2.6 gamma: linear → X'Y'Z' ─── |
| 53 | + static inline float linear_to_dci_gamma(float v) |
| 54 | + { |
| 55 | + if(v <= 0.0f) |
| 56 | + return 0.0f; |
| 57 | + return std::pow(v, 1.0f / 2.6f); |
| 58 | + } |
| 59 | + |
| 60 | + // ─── Rec.709 → CIE XYZ matrix (D65 white point), pre-multiplied by DCI companding ─── |
| 61 | + static constexpr float M00 = 0.4124564f * DCI_COEFFICIENT; |
| 62 | + static constexpr float M01 = 0.3575761f * DCI_COEFFICIENT; |
| 63 | + static constexpr float M02 = 0.1804375f * DCI_COEFFICIENT; |
| 64 | + static constexpr float M10 = 0.2126729f * DCI_COEFFICIENT; |
| 65 | + static constexpr float M11 = 0.7151522f * DCI_COEFFICIENT; |
| 66 | + static constexpr float M12 = 0.0721750f * DCI_COEFFICIENT; |
| 67 | + static constexpr float M20 = 0.0193339f * DCI_COEFFICIENT; |
| 68 | + static constexpr float M21 = 0.1191920f * DCI_COEFFICIENT; |
| 69 | + static constexpr float M22 = 0.9503041f * DCI_COEFFICIENT; |
| 70 | + |
| 71 | + /** |
| 72 | + * SIMD-accelerated RGB → X'Y'Z' transform. |
| 73 | + * |
| 74 | + * Processes planar int32 component buffers. The gamma curves use scalar |
| 75 | + * LUTs (pre-built for the given precision), while the 3×3 matrix multiply |
| 76 | + * uses Highway SIMD float vectors. |
| 77 | + */ |
| 78 | + static void Hwy_rgb_to_xyz(int32_t* HWY_RESTRICT rBuf, int32_t* HWY_RESTRICT gBuf, |
| 79 | + int32_t* HWY_RESTRICT bBuf, uint64_t numPixels, uint32_t prec) |
| 80 | + { |
| 81 | + const uint32_t maxVal = (1u << prec) - 1; |
| 82 | + const float scale = 1.0f / (float)maxVal; |
| 83 | + |
| 84 | + // Build linearization LUT: [0..maxVal] → float linear |
| 85 | + const uint32_t lutSize = maxVal + 1; |
| 86 | + auto linLut = std::make_unique<float[]>(lutSize); |
| 87 | + for(uint32_t v = 0; v < lutSize; ++v) |
| 88 | + linLut[v] = rec709_to_linear((float)v * scale); |
| 89 | + |
| 90 | + // Build DCI gamma LUT at output precision (e.g. 4096 entries for 12-bit) |
| 91 | + // This fits in L1 cache unlike the old 256KB 16-bit LUT |
| 92 | + auto gammaLut = std::make_unique<float[]>(lutSize); |
| 93 | + for(uint32_t v = 0; v < lutSize; ++v) |
| 94 | + gammaLut[v] = linear_to_dci_gamma((float)v * scale); |
| 95 | + |
| 96 | + const float gammaScale = (float)maxVal; |
| 97 | + |
| 98 | + // Fused single-pass: linearize → matrix → gamma → quantize per pixel |
| 99 | + // This avoids intermediate float buffers and keeps everything in L1 |
| 100 | + for(uint64_t i = 0; i < numPixels; ++i) |
| 101 | + { |
| 102 | + // Linearize via LUT |
| 103 | + uint32_t rv = (uint32_t)rBuf[i]; |
| 104 | + uint32_t gv = (uint32_t)gBuf[i]; |
| 105 | + uint32_t bv = (uint32_t)bBuf[i]; |
| 106 | + if(rv > maxVal) rv = maxVal; |
| 107 | + if(gv > maxVal) gv = maxVal; |
| 108 | + if(bv > maxVal) bv = maxVal; |
| 109 | + |
| 110 | + float r = linLut[rv]; |
| 111 | + float g = linLut[gv]; |
| 112 | + float b = linLut[bv]; |
| 113 | + |
| 114 | + // Matrix multiply (scalar — 9 FMAs) |
| 115 | + float x = M00 * r + M01 * g + M02 * b; |
| 116 | + float y = M10 * r + M11 * g + M12 * b; |
| 117 | + float z = M20 * r + M21 * g + M22 * b; |
| 118 | + |
| 119 | + // Gamma via LUT (quantize linear to output precision for indexing) |
| 120 | + auto quantize = [maxVal, gammaScale](float v) -> uint32_t { |
| 121 | + if(v <= 0.0f) return 0; |
| 122 | + if(v >= 1.0f) return maxVal; |
| 123 | + return (uint32_t)(v * gammaScale + 0.5f); |
| 124 | + }; |
| 125 | + |
| 126 | + rBuf[i] = (int32_t)(gammaLut[quantize(x)] * gammaScale + 0.5f); |
| 127 | + gBuf[i] = (int32_t)(gammaLut[quantize(y)] * gammaScale + 0.5f); |
| 128 | + bBuf[i] = (int32_t)(gammaLut[quantize(z)] * gammaScale + 0.5f); |
| 129 | + } |
| 130 | + } |
| 131 | + |
| 132 | +} // namespace HWY_NAMESPACE |
| 133 | +} // namespace grk |
| 134 | +HWY_AFTER_NAMESPACE(); |
| 135 | + |
| 136 | +#if HWY_ONCE |
| 137 | +namespace grk |
| 138 | +{ |
| 139 | + |
| 140 | +HWY_EXPORT(Hwy_rgb_to_xyz); |
| 141 | + |
| 142 | +bool applyXYZTransform(grk_image* image) |
| 143 | +{ |
| 144 | + if(!image || image->numcomps < 3) |
| 145 | + { |
| 146 | + grklog.error("XYZ transform requires an image with at least 3 components"); |
| 147 | + return false; |
| 148 | + } |
| 149 | + |
| 150 | + auto& compR = image->comps[0]; |
| 151 | + auto& compG = image->comps[1]; |
| 152 | + auto& compB = image->comps[2]; |
| 153 | + |
| 154 | + if(!compR.data || !compG.data || !compB.data) |
| 155 | + { |
| 156 | + grklog.error("XYZ transform: component data is null"); |
| 157 | + return false; |
| 158 | + } |
| 159 | + |
| 160 | + // All components must have same dimensions and precision |
| 161 | + if(compR.w != compG.w || compR.w != compB.w || compR.h != compG.h || compR.h != compB.h || |
| 162 | + compR.prec != compG.prec || compR.prec != compB.prec) |
| 163 | + { |
| 164 | + grklog.error("XYZ transform: components must have identical dimensions and precision"); |
| 165 | + return false; |
| 166 | + } |
| 167 | + |
| 168 | + uint32_t w = compR.w; |
| 169 | + uint32_t h = compR.h; |
| 170 | + uint32_t prec = compR.prec; |
| 171 | + |
| 172 | + grklog.info("XYZ transform: %ux%u, %u-bit, strides: R=%u G=%u B=%u, data_type: R=%d G=%d B=%d", |
| 173 | + w, h, prec, compR.stride, compG.stride, compB.stride, |
| 174 | + (int)compR.data_type, (int)compG.data_type, (int)compB.data_type); |
| 175 | + |
| 176 | + // Handle int16 data type: need to widen to int32 for processing, then narrow back |
| 177 | + bool isInt16 = (compR.data_type == GRK_INT_16); |
| 178 | + |
| 179 | + // If data is stored with stride, we need to process row-by-row |
| 180 | + // For contiguous data (stride == w), we can process in one shot |
| 181 | + bool contiguous = |
| 182 | + (compR.stride == w) && (compG.stride == w) && (compB.stride == w); |
| 183 | + |
| 184 | + if(contiguous) |
| 185 | + { |
| 186 | + uint64_t numPixels = (uint64_t)w * h; |
| 187 | + auto rBuf = (int32_t*)compR.data; |
| 188 | + auto gBuf = (int32_t*)compG.data; |
| 189 | + auto bBuf = (int32_t*)compB.data; |
| 190 | + HWY_DYNAMIC_DISPATCH(Hwy_rgb_to_xyz)(rBuf, gBuf, bBuf, numPixels, prec); |
| 191 | + } |
| 192 | + else |
| 193 | + { |
| 194 | + // Row-by-row processing for strided data |
| 195 | + // Allocate temporary contiguous buffers |
| 196 | + auto rRow = std::make_unique<int32_t[]>(w); |
| 197 | + auto gRow = std::make_unique<int32_t[]>(w); |
| 198 | + auto bRow = std::make_unique<int32_t[]>(w); |
| 199 | + |
| 200 | + for(uint32_t y = 0; y < h; ++y) |
| 201 | + { |
| 202 | + auto rPtr = (int32_t*)compR.data + (uint64_t)y * compR.stride; |
| 203 | + auto gPtr = (int32_t*)compG.data + (uint64_t)y * compG.stride; |
| 204 | + auto bPtr = (int32_t*)compB.data + (uint64_t)y * compB.stride; |
| 205 | + |
| 206 | + // Copy row to contiguous buffer |
| 207 | + memcpy(rRow.get(), rPtr, w * sizeof(int32_t)); |
| 208 | + memcpy(gRow.get(), gPtr, w * sizeof(int32_t)); |
| 209 | + memcpy(bRow.get(), bPtr, w * sizeof(int32_t)); |
| 210 | + |
| 211 | + HWY_DYNAMIC_DISPATCH(Hwy_rgb_to_xyz)(rRow.get(), gRow.get(), bRow.get(), w, prec); |
| 212 | + |
| 213 | + // Copy back |
| 214 | + memcpy(rPtr, rRow.get(), w * sizeof(int32_t)); |
| 215 | + memcpy(gPtr, gRow.get(), w * sizeof(int32_t)); |
| 216 | + memcpy(bPtr, bRow.get(), w * sizeof(int32_t)); |
| 217 | + } |
| 218 | + } |
| 219 | + |
| 220 | + // Update colour space to XYZ |
| 221 | + image->color_space = GRK_CLRSPC_CUSTOM_CIE; |
| 222 | + |
| 223 | + grklog.info("Applied Rec.709 RGB → DCI X'Y'Z' colour transform (%ux%u, %u-bit)", w, h, prec); |
| 224 | + return true; |
| 225 | +} |
| 226 | + |
| 227 | +} // namespace grk |
| 228 | +#endif // HWY_ONCE |
0 commit comments