Skip to content

Commit 9fe6163

Browse files
authored
HAC Tests fail with numpy multi threading (#1060)
* fix test bugs by forcing single-threaded numpy * enforce arrays contiguous * pre-commit
1 parent 70a154c commit 9fe6163

File tree

5 files changed

+77
-13
lines changed

5 files changed

+77
-13
lines changed

.github/workflows/ci-tests.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ jobs:
4343
- name: Run 'regular' tests
4444
run: |
4545
pixi run tests-regular
46+
pixi run tests-hac
4647
4748
- name: Upload coverage to Codecov (partial)
4849
uses: codecov/codecov-action@v4

pixi.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ jaxlib = ">=0.4.38, <0.8"
6666
"tests-regular" = 'pytest tests -n 9 -m "not (extended or against_r_core or against_r_extended or plots)" --cov=pyfixest --cov-report=xml'
6767
"tests-extended" = 'pytest tests -n 9 -m "extended" --cov=pyfixest --cov-report=xml'
6868
"tests-fixest" = "pytest -rs tests/test_vs_fixest.py -n 9 --cov=pyfixest --cov-report=xml"
69+
"tests-hac" = { cmd = "pytest tests/test_hac_vs_fixest.py -v -n 9", env = { OMP_NUM_THREADS = "1", OPENBLAS_NUM_THREADS = "1", MKL_NUM_THREADS = "1", VECLIB_MAXIMUM_THREADS = "1", NUMEXPR_NUM_THREADS = "1" } }
6970
"debug" = "python pyfixest/debug.py"
7071
"update-test-data" = "Rscript tests/r_test_comparisons.R"
7172
"install-r-extended" = "Rscript r_test_requirements.R"

pyfixest/estimation/vcov_utils.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,8 @@ def _hac_meat_loop(
118118
gamma_buffer.fill(0.0)
119119
weight = weights[lag_value]
120120

121-
scores_current = scores[lag_value:time_periods]
122-
scores_lagged = scores[: time_periods - lag_value]
121+
scores_current = np.ascontiguousarray(scores[lag_value:time_periods, :])
122+
scores_lagged = np.ascontiguousarray(scores[: time_periods - lag_value, :])
123123

124124
gamma_buffer[:, :] = scores_current.T @ scores_lagged
125125
meat += weight * (gamma_buffer + gamma_buffer.T)
@@ -142,10 +142,10 @@ def _get_bartlett_weights(lag: int):
142142
@nb.njit(parallel=False)
143143
def _nw_meat_time(scores: np.ndarray, time_arr: np.ndarray, lag: int):
144144
if time_arr is None:
145-
ordered_scores = scores
145+
ordered_scores = np.ascontiguousarray(scores)
146146
else:
147147
order = np.argsort(time_arr)
148-
ordered_scores = scores[order]
148+
ordered_scores = np.ascontiguousarray(scores[order])
149149

150150
time_periods, k = ordered_scores.shape
151151
weights = _get_bartlett_weights(lag=lag)
@@ -191,7 +191,7 @@ def _get_panel_idx(
191191
return order, units, starts, counts, panel_arr_sorted, time_arr_sorted
192192

193193

194-
# @nb.njit(parallel=False)
194+
@nb.njit(parallel=False)
195195
def _nw_meat_panel(
196196
scores: np.ndarray,
197197
time_arr: np.ndarray,
@@ -229,6 +229,7 @@ def _nw_meat_panel(
229229

230230
weights = _get_bartlett_weights(lag=lag)
231231

232+
scores = np.ascontiguousarray(scores)
232233
k = scores.shape[1]
233234

234235
meat_nw_panel = np.zeros((k, k))
@@ -240,14 +241,14 @@ def _nw_meat_panel(
240241
for start, count in zip(starts, counts):
241242
end = start + count
242243

243-
score_i = scores[start:end, :]
244+
score_i = np.ascontiguousarray(scores[start:end, :])
244245
gamma0 = score_i.T @ score_i
245246

246247
gamma_l_sum.fill(0.0)
247248
Lmax = min(lag, count - 1)
248249
for lag_value in range(1, Lmax + 1):
249-
score_curr = scores[start + lag_value : end, :]
250-
score_prev = scores[start : end - lag_value, :]
250+
score_curr = np.ascontiguousarray(scores[start + lag_value : end, :])
251+
score_prev = np.ascontiguousarray(scores[start : end - lag_value, :])
251252
gamma_l = score_curr.T @ score_prev
252253
gamma_l_sum += weights[lag_value] * (gamma_l + gamma_l.T)
253254

@@ -289,6 +290,7 @@ def _dk_meat_panel(
289290
time_periods, k = scores_time.shape
290291

291292
weights = _get_bartlett_weights(lag=lag)
293+
scores_time = np.ascontiguousarray(scores_time)
292294

293295
return _hac_meat_loop(
294296
scores=scores_time, weights=weights, time_periods=time_periods, k=k, lag=lag

tests/conftest.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"Pytest configuration for pyfixest tests."
2+
3+
import os
4+
5+
import pytest
6+
7+
8+
@pytest.fixture(scope="session", autouse=True)
9+
def single_thread_blas():
10+
"""
11+
Force single-threaded BLAS for deterministic HAC standard errors.
12+
13+
What Claude says:
14+
15+
Multi-threaded BLAS libraries (OpenBLAS, MKL, Accelerate) can produce
16+
slightly different numerical results due to different parallel reduction
17+
orders when computing matrix multiplications. This causes sporadic test
18+
failures in HAC variance calculations even though both R and Python
19+
implementations are mathematically correct.
20+
21+
The differences arise because floating-point arithmetic is not associative:
22+
(a + b) + c ≠ a + (b + c) in IEEE 754. Different thread scheduling can
23+
change the order of operations, leading to different rounding errors.
24+
25+
By forcing single-threaded execution, we ensure deterministic results
26+
that match R's fixest package exactly.
27+
"""
28+
# Store original values to restore after tests
29+
original_values = {}
30+
31+
env_vars = [
32+
"OMP_NUM_THREADS",
33+
"OPENBLAS_NUM_THREADS",
34+
"MKL_NUM_THREADS",
35+
"VECLIB_MAXIMUM_THREADS",
36+
"NUMEXPR_NUM_THREADS",
37+
]
38+
39+
for var in env_vars:
40+
original_values[var] = os.environ.get(var)
41+
os.environ[var] = "1"
42+
43+
yield
44+
45+
# Restore original values after all tests complete
46+
for var, value in original_values.items():
47+
if value is None:
48+
os.environ.pop(var, None)
49+
else:
50+
os.environ[var] = value

tests/test_hac_vs_fixest.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
"""
2+
Tests for HAC (Heteroskedasticity and Autocorrelation Consistent) standard errors.
3+
4+
IMPORTANT: These tests require single-threaded BLAS for deterministic results.
5+
Multi-threaded BLAS libraries can produce slightly different numerical results
6+
(~1-4% variance in vcov elements) due to different parallel reduction orders,
7+
even though both implementations are mathematically correct. The conftest.py
8+
fixture `single_thread_blas` handles this automatically.
9+
"""
10+
111
import numpy as np
212
import pandas as pd
313
import pytest
@@ -249,10 +259,10 @@ def _get_r_panel_kwargs(time_id, panel_id, lag, inference):
249259
"vcov_kwargs",
250260
[
251261
{"lag": 2, "time_id": "year"},
252-
{"lag": 4, "time_id": "year"},
262+
{"lag": 8, "time_id": "year"},
253263
# now add panel id
254264
{"lag": 2, "time_id": "year", "panel_id": "unit"},
255-
{"lag": 4, "time_id": "year", "panel_id": "unit"},
265+
{"lag": 8, "time_id": "year", "panel_id": "unit"},
256266
# lag not required when panel_id is provided
257267
{"time_id": "year", "panel_id": "unit"},
258268
],
@@ -354,9 +364,9 @@ def test_single_fit_feols_hac_panel(
354364
"balanced",
355365
[
356366
"balanced-consecutive",
357-
# "balanced-non-consecutive",
358-
# "non-balanced-consecutive",
359-
# "non-balanced-non-consecutive",
367+
"balanced-non-consecutive",
368+
"non-balanced-consecutive",
369+
"non-balanced-non-consecutive",
360370
],
361371
)
362372
@pytest.mark.parametrize("fml", poisson_fmls)

0 commit comments

Comments
 (0)