Merge pull request #245 from SwayamInSync/hash-impl

SwayamInSync · web-flow · commit db1ee6a94d2a · 2025-12-24T01:06:57.000+05:30
FEAT: Implementing hash support in QuadDtype
diff --git a/.gitignore b/.gitignore
@@ -141,4 +141,5 @@ compile_commands.json
 # quaddtype
 /quaddtype/subprojects/qblas/
 /quaddtype/subprojects/sleef/
+/quaddtype/subprojects/pythoncapi-compat/
 .wraplock
diff --git a/quaddtype/meson.build b/quaddtype/meson.build
@@ -79,6 +79,10 @@ incdir_numpy = run_command(py,
   check : true
 ).stdout().strip()
 
+# pythoncapi-compat for portable C API usage across Python versions
+pythoncapi_compat_subproj = subproject('pythoncapi-compat')
+pythoncapi_compat_inc = pythoncapi_compat_subproj.get_variable('incdir')
+
 # print numpy version used
 numpy_version = run_command(py,
   ['-c', 'import numpy; print(numpy.__version__)'],
@@ -154,6 +158,7 @@ includes = include_directories(
         'numpy_quaddtype/src',
     ]
 )
+pythoncapi_includes = pythoncapi_compat_inc
 
 srcs = [
     'numpy_quaddtype/src/quad_common.h',
@@ -208,5 +213,5 @@ py.extension_module('_quaddtype_main',
   dependencies: dependencies,
   install: true,
   subdir: 'numpy_quaddtype',
-  include_directories: [includes, build_includes],
+  include_directories: [includes, build_includes, pythoncapi_includes],
 )
diff --git a/quaddtype/numpy_quaddtype/src/scalar.c b/quaddtype/numpy_quaddtype/src/scalar.c
@@ -17,6 +17,8 @@
 #include "dtype.h"
 #include "lock.h"
 #include "utilities.h"
+#include "constants.hpp"
+#include "pythoncapi_compat.h"
 
 
 QuadPrecisionObject *
@@ -624,6 +626,112 @@ static PyGetSetDef QuadPrecision_getset[] = {
     {NULL}  /* Sentinel */
 };
 
+/*
+ * Hash function for QuadPrecision scalars.
+ * 
+ * This implements the same algorithm as CPython's _Py_HashDouble, adapted for
+ * 128-bit floating point. The algorithm computes a hash based
+ * on the reduction of the value modulo the prime P = 2**PYHASH_BITS - 1.
+ * https://github.com/python/cpython/blob/20b69aac0d19a5e5358362410d9710887762f0e7/Python/pyhash.c#L87
+ * 
+ * Key invariant: hash(x) == hash(y) whenever x and y are numerically equal,
+ * even if x and y have different types. This ensures that:
+ *   hash(QuadPrecision(1.0)) == hash(1.0) == hash(1)
+ * 
+ * The algorithm:
+ * 1. Handle special cases: inf returns PyHASH_INF, nan uses pointer hash
+ * 2. Extract mantissa m in [0.5, 1.0) and exponent e via frexp(v) = m * 2^e
+ * 3. Process mantissa 28 bits at a time, accumulating into hash value x
+ * 4. Adjust for exponent using bit rotation (since 2^PyHASH_BITS ≡ 1 mod P)
+ * 5. Apply sign and handle the special case of -1 -> -2
+ */
+
+static Py_hash_t
+QuadPrecision_hash(QuadPrecisionObject *self)
+{
+    Sleef_quad value;
+    int sign = 1;
+    
+    if (self->backend == BACKEND_SLEEF) {
+        value = self->value.sleef_value;
+    }
+    else {
+        value = Sleef_cast_from_doubleq1((double)self->value.longdouble_value);
+    }
+    
+    // Check for NaN - use pointer hash (each NaN instance gets unique hash)
+    // This prevents hash table catastrophic pileups from NaN instances
+    if (Sleef_iunordq1(value, value)) {
+        return Py_HashPointer((void *)self);
+    }
+    
+    if (Sleef_icmpeqq1(value, QUAD_PRECISION_INF)) {
+        return PyHASH_INF;
+    }
+    if (Sleef_icmpeqq1(value, QUAD_PRECISION_NINF)) {
+        return -PyHASH_INF;
+    }
+    
+    // Handle sign
+    Sleef_quad zero = Sleef_cast_from_int64q1(0);
+    if (Sleef_icmpltq1(value, zero)) {
+        sign = -1;
+        value = Sleef_negq1(value);
+    }
+    
+    // Get mantissa and exponent: value = m * 2^e, where 0.5 <= m < 1.0
+    int exponent;
+    Sleef_quad mantissa = Sleef_frexpq1(value, &exponent);
+    
+    // Process 28 bits at a time (same as CPython's _Py_HashDouble)
+    // This works well for both binary and hexadecimal floating point
+    Py_uhash_t x = 0;
+    // 2^28 = 268435456 - exactly representable in double, so cast is safe
+    Sleef_quad multiplier = Sleef_cast_from_int64q1(1LL << 28);
+    
+    // Continue until mantissa becomes zero (all bits processed)
+    while (Sleef_icmpneq1(mantissa, zero)) {
+        // Rotate x left by 28 bits within PyHASH_MODULUS
+        x = ((x << 28) & PyHASH_MODULUS) | (x >> (PyHASH_BITS - 28));
+        
+        // Scale mantissa by 2^28
+        mantissa = Sleef_mulq1_u05(mantissa, multiplier);
+        exponent -= 28;
+        
+        // Extract integer part
+        Sleef_quad int_part = Sleef_truncq1(mantissa);
+        Py_uhash_t y = (Py_uhash_t)Sleef_cast_to_int64q1(int_part);
+        
+        // Remove integer part from mantissa (keep fractional part)
+        mantissa = Sleef_subq1_u05(mantissa, int_part);
+        
+        // Accumulate
+        x += y;
+        if (x >= PyHASH_MODULUS) {
+            x -= PyHASH_MODULUS;
+        }
+    }
+    
+    // Adjust for exponent: reduce e modulo PyHASH_BITS
+    // For negative exponents: PyHASH_BITS - 1 - ((-1 - e) % PyHASH_BITS)
+    int e = exponent >= 0 
+            ? exponent % PyHASH_BITS 
+            : PyHASH_BITS - 1 - ((-1 - exponent) % PyHASH_BITS);
+    
+    // Rotate x left by e bits
+    x = ((x << e) & PyHASH_MODULUS) | (x >> (PyHASH_BITS - e));
+    
+    // Apply sign
+    x = x * sign;
+    
+    // -1 is reserved for errors, so use -2 instead
+    if (x == (Py_uhash_t)-1) {
+        x = (Py_uhash_t)-2;
+    }
+    
+    return (Py_hash_t)x;
+}
+
 PyTypeObject QuadPrecision_Type = {
         PyVarObject_HEAD_INIT(NULL, 0).tp_name = "numpy_quaddtype.QuadPrecision",
         .tp_basicsize = sizeof(QuadPrecisionObject),
@@ -632,6 +740,7 @@ PyTypeObject QuadPrecision_Type = {
         .tp_dealloc = (destructor)QuadPrecision_dealloc,
         .tp_repr = (reprfunc)QuadPrecision_repr_dragon4,
         .tp_str = (reprfunc)QuadPrecision_str_dragon4,
+        .tp_hash = (hashfunc)QuadPrecision_hash,
         .tp_as_number = &quad_as_scalar,
         .tp_as_buffer = &QuadPrecision_as_buffer,
         .tp_richcompare = (richcmpfunc)quad_richcompare,
diff --git a/quaddtype/pyproject.toml b/quaddtype/pyproject.toml
@@ -55,3 +55,7 @@ strict_equality_for_none = true
 exclude = ["build", "numpy_quaddtype/src", "subprojects", "tests"]
 enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"]
 warn_unreachable = false
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+norecursedirs = ["subprojects", "build", ".mesonpy*"]
diff --git a/quaddtype/reinstall.sh b/quaddtype/reinstall.sh
@@ -5,6 +5,7 @@ rm -rf build/
 rm -rf dist/
 rm -rf subprojects/qblas
 rm -rf subprojects/sleef
+rm -rf subprojects/pythoncapi-compat
 rm -rf .mesonpy-*
 
 python -m pip uninstall -y numpy_quaddtype
diff --git a/quaddtype/subprojects/pythoncapi-compat.wrap b/quaddtype/subprojects/pythoncapi-compat.wrap
@@ -0,0 +1,6 @@
+[wrap-git]
+directory=pythoncapi-compat
+url=https://github.com/python/pythoncapi-compat.git
+revision=main
+[provide]
+pythoncapi_compat = pythoncapi_compat_dep
diff --git a/quaddtype/tests/test_quaddtype.py b/quaddtype/tests/test_quaddtype.py
@@ -5229,3 +5229,142 @@ def test_add_regression_zero_plus_small(self):
         
         assert result_yx == result_xy, f"0 + x = {result_yx}, but x + 0 = {result_xy}"
         assert result_yx == x, f"0 + x = {result_yx}, expected {x}"
+
+
+class TestQuadPrecisionHash:
+    """Test suite for QuadPrecision hash function.
+    
+    The hash implementation follows CPython's _Py_HashDouble algorithm to ensure
+    the invariant: hash(x) == hash(y) when x and y are numerically equal,
+    even across different types.
+    """
+
+    @pytest.mark.parametrize("value", [
+        # Values that are exactly representable in binary floating point
+        "0.0", "1.0", "-1.0", "2.0", "-2.0",
+        "0.5", "0.25", "1.5", "-0.5",
+        "100.0", "-100.0",
+        # Powers of 2 are exactly representable
+        "0.125", "0.0625", "4.0", "8.0",
+    ])
+    def test_hash_matches_float(self, value):
+        """Test that hash(QuadPrecision) == hash(float) for exactly representable values.
+        
+        Note: Only values that are exactly representable in both float64 and float128
+        should match. Values like 0.1, 0.3 will have different hashes because they
+        have different binary representations at different precisions.
+        """
+        quad_val = QuadPrecision(value)
+        float_val = float(value)
+        assert hash(quad_val) == hash(float_val)
+
+    @pytest.mark.parametrize("value", [0.1, 0.3, 0.7, 1.1, 2.3, 1e300, 1e-300])
+    def test_hash_matches_float_from_float(self, value):
+        """Test that QuadPrecision created from float has same hash as that float.
+        
+        When creating QuadPrecision from a Python float, the value is converted
+        from the float's double precision representation, so they should be
+        numerically equal and have the same hash.
+        """
+        quad_val = QuadPrecision(value)  # Created from float, not string
+        assert hash(quad_val) == hash(value)
+
+    @pytest.mark.parametrize("value", [0, 1, -1, 2, -2, 100, -100, 1000, -1000])
+    def test_hash_matches_int(self, value):
+        """Test that hash(QuadPrecision) == hash(int) for integer values."""
+        quad_val = QuadPrecision(value)
+        assert hash(quad_val) == hash(value)
+
+    def test_hash_matches_large_int(self):
+        """Test that hash(QuadPrecision) == hash(int) for large integers."""
+        big_int = 10**20
+        quad_val = QuadPrecision(str(big_int))
+        assert hash(quad_val) == hash(big_int)
+
+    def test_hash_infinity(self):
+        """Test that infinity hash matches Python's float infinity hash."""
+        assert hash(QuadPrecision("inf")) == hash(float("inf"))
+        assert hash(QuadPrecision("-inf")) == hash(float("-inf"))
+        # Standard PyHASH_INF values
+        assert hash(QuadPrecision("inf")) == 314159
+        assert hash(QuadPrecision("-inf")) == -314159
+
+    def test_hash_nan_unique(self):
+        """Test that each NaN instance gets a unique hash (pointer-based)."""
+        nan1 = QuadPrecision("nan")
+        nan2 = QuadPrecision("nan")
+        # NaN instances should have different hashes (based on object identity)
+        assert hash(nan1) != hash(nan2)
+
+    def test_hash_nan_same_instance(self):
+        """Test that the same NaN instance has consistent hash."""
+        nan = QuadPrecision("nan")
+        assert hash(nan) == hash(nan)
+
+    def test_hash_negative_one(self):
+        """Test that hash(-1) returns -2 (Python's hash convention)."""
+        # In Python, hash(-1) returns -2 because -1 is reserved for errors
+        assert hash(QuadPrecision(-1.0)) == -2
+        assert hash(QuadPrecision("-1.0")) == -2
+
+    def test_hash_set_membership(self):
+        """Test that QuadPrecision values work correctly in sets."""
+        vals = [QuadPrecision(1.0), QuadPrecision(2.0), QuadPrecision(1.0)]
+        unique_set = set(vals)
+        assert len(unique_set) == 2
+
+    def test_hash_set_cross_type(self):
+        """Test that QuadPrecision and float with same value are in same set bucket."""
+        s = {QuadPrecision(1.0)}
+        s.add(1.0)
+        assert len(s) == 1
+
+    def test_hash_dict_key(self):
+        """Test that QuadPrecision values work as dict keys."""
+        d = {QuadPrecision(1.0): "one", QuadPrecision(2.0): "two"}
+        assert d[QuadPrecision(1.0)] == "one"
+        assert d[QuadPrecision(2.0)] == "two"
+
+    def test_hash_dict_cross_type_lookup(self):
+        """Test that dict lookup works with float keys when hash matches."""
+        d = {QuadPrecision(1.0): "one"}
+        # Float lookup should work if hash and eq both work
+        assert d.get(1.0) == "one"
+
+    @pytest.mark.parametrize("value", [
+        # Powers of 2 outside double range but within quad range
+        # Double max exponent is ~1024, quad max is ~16384
+        2**1100, 2**2000, 2**5000, 2**10000,
+        -(2**1100), -(2**2000),
+        # Small powers of 2 (subnormal in double, normal in quad)  
+        2**(-1100), 2**(-2000),
+    ])
+    def test_hash_extreme_integers_outside_double_range(self, value):
+        """Test hash matches Python int for values outside double range.
+        
+        We use powers of 2 which are exactly representable in quad precision.
+        Since these integers are exact, hash(QuadPrecision(x)) must equal hash(x).
+        """
+        quad_val = QuadPrecision(value)
+        assert hash(quad_val) == hash(value)
+
+    @pytest.mark.parametrize("value", [
+        "1e500", "-1e500", "1e1000", "-1e1000", "1e-500", "-1e-500",
+        "1.23456789e500", "-9.87654321e-600",
+    ])
+    def test_hash_matches_mpmath(self, value):
+        """Test hash matches mpmath at quad precision (113 bits).
+        
+        mpmath with 113-bit precision represents the same value as QuadPrecision,
+        so their hashes must match.
+        """
+        mp.prec = 113
+        quad_val = QuadPrecision(value)
+        mpf_val = mp.mpf(value)
+        assert hash(quad_val) == hash(mpf_val)
+
+    @pytest.mark.parametrize("backend", ["sleef", "longdouble"])
+    def test_hash_backends(self, backend):
+        """Test hash works for both backends."""
+        quad_val = QuadPrecision(1.5, backend=backend)
+        assert hash(quad_val) == hash(1.5)