Merge pull request freqtrade#13227 from yongzhe2160cs/feature/backtest-profit-pvalue

xmatthias · web-flow · commit 064e67c42af3 · 2026-06-21T14:57:35.000+02:00
Add mean-trade-return p-value to backtest summary metrics
diff --git a/docs/backtesting.md b/docs/backtesting.md
@@ -229,6 +229,7 @@ A backtesting result will look like that:
 │ Sortino (closed trades)                │ 2.57                                      │
 │ Calmar (closed trades)                 │ 43.03                                     │
 │ SQN                                    │ 0.71                                      │
+│ Mean profit p-value                    │ 0.4768                                    │
 │ Profit factor                          │ 1.30                                      │
 │ Expectancy (Ratio)                     │ 0.74 (0.04)                               │
 │ Avg. daily profit                      │ 1.844 USDT                                │
@@ -362,6 +363,7 @@ It contains key metrics about the performance of your strategy on backtesting da
 │ Sortino (closed trades)                │ 2.57                                      │
 │ Calmar (closed trades)                 │ 43.03                                     │
 │ SQN                                    │ 0.71                                      │
+│ Mean profit p-value                    │ 0.4768                                    │
 │ Profit factor                          │ 1.30                                      │
 │ Expectancy (Ratio)                     │ 0.74 (0.04)                               │
 │ Avg. daily profit                      │ 1.844 USDT                                │
@@ -424,6 +426,7 @@ It contains key metrics about the performance of your strategy on backtesting da
 - `Sortino (closed trades)`: Annualized Sortino ratio including only closed trades (ignoring open trades with profits or losses).
 - `Calmar (closed trades)`: Annualized Calmar ratio including only closed trades (ignoring open trades with profits or losses).
 - `SQN`: System Quality Number (SQN) - by Van Tharp.
+- `Mean profit p-value`: Two-sided p-value of a one-sample Student's t-test against the null hypothesis that the mean per-trade return is zero - in short, "is the average profit distinguishable from noise?". A small value (the usual bar is below `0.05`) means the observed average is unlikely to be down to chance. Because the test is two-sided, this applies to consistently losing strategies just as much as to winning ones, so always read it together with the sign of the average profit. Its underlying t-statistic is identical to `SQN`. See the note below for how to read it in practice.
 - `Profit factor`: Sum of the profits of all winning trades divided by the sum of the losses of all losing trades.
 - `Expectancy (Ratio)`: Expectancy ratio, which is the average profit or loss per trade. A negative expectancy ratio means that your strategy is not profitable.
 - `Avg. daily profit`: Average profit per day, calculated as `(Total Profit / Backtest Days)`.
@@ -455,6 +458,11 @@ It contains key metrics about the performance of your strategy on backtesting da
 - `Sortino (wallet balance)` Annualized Sortino ratio calculation including unrealized profits.
 - `Calmar (wallet balance)` Annualized Calmar ratio calculation including unrealized profits.
 
+??? Note "Reading the mean profit p-value"
+    Think of the p-value as the answer to one question: *if the strategy truly had no edge, how often would pure chance still hand you an average per-trade result at least this far from zero?* A value of `0.4768` therefore means roughly a 48% chance of a swing this large turning up from randomness alone - in other words the average profit is not distinguishable from luck. The lower the p-value, the harder the result is to explain as a fluke, and a common rule of thumb is to treat anything below `0.05` (a 5% chance under the no-edge assumption) as "statistically significant".
+
+    Two things keep this honest. The test assumes trades are independent and identically distributed, which real strategies rarely are (trades overlap and cluster in time), so the figure is an *optimistic* lower bound - the true uncertainty is usually larger. And because backtesting and hyperopt evaluate many strategies, some will score a low p-value by chance alone, so a small value only tells you a result is hard to explain by noise; it is not by itself proof of a genuine edge.
+
 !!! Tip "Wallet based Metrics"
     The metrics under the "Wallet based Metrics" section are calculated based on the unrealized balance, which includes the capital tied in open trades. This provides a more comprehensive view of the strategy's performance, as it accounts for both realized and unrealized profits and losses.
 
diff --git a/freqtrade/data/metrics.py b/freqtrade/data/metrics.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 import pandas as pd
+from scipy import stats
 
 
 logger = logging.getLogger(__name__)
@@ -632,3 +633,24 @@ def calculate_sqn(trades: pd.DataFrame, starting_balance: float) -> float:
         sqn = -100.0
 
     return round(sqn, 4)
+
+
+def calculate_p_value(trades: pd.DataFrame, starting_balance: float) -> float:
+    """
+    Two-sided p-value of a one-sample t-test for the null hypothesis that the
+    mean per-trade profit (profit_abs / starting_balance) equals zero.
+
+    :param trades: DataFrame containing trades (requires column profit_abs)
+    :param starting_balance: Starting balance of the trading system
+    :return: Two-sided p-value in the range [0, 1]. Returns 1.0 (no evidence
+             against the null) when it cannot be computed - fewer than two
+             usable trades, identical returns, or a non-finite test result.
+    """
+    if len(trades) < 2:
+        return 1.0
+    returns = (trades["profit_abs"] / starting_balance).dropna()  # ignore NaN rows
+    # max == min spots constant samples exactly; std() may be off by rounding.
+    if len(returns) < 2 or returns.max() == returns.min():
+        return 1.0
+    _, p_value = stats.ttest_1samp(returns, popmean=0)
+    return float(p_value) if np.isfinite(p_value) else 1.0  # e.g. zero balance
diff --git a/freqtrade/optimize/optimize_reports/bt_output.py b/freqtrade/optimize/optimize_reports/bt_output.py
@@ -405,6 +405,10 @@ def text_table_add_metrics(strat_results: dict) -> None:
                 f"{strat_results['calmar']:.2f}" if "calmar" in strat_results else "N/A",
             ),
             ("SQN", f"{strat_results['sqn']:.2f}" if "sqn" in strat_results else "N/A"),
+            (
+                "Mean profit p-value",
+                (f"{strat_results['p_value']:.4g}" if "p_value" in strat_results else "N/A"),
+            ),
             (
                 "Profit factor",
                 (
diff --git a/freqtrade/optimize/optimize_reports/optimize_reports.py b/freqtrade/optimize/optimize_reports/optimize_reports.py
@@ -16,6 +16,7 @@
     calculate_market_change,
     calculate_max_drawdown,
     calculate_max_drawdown_from_balance,
+    calculate_p_value,
     calculate_sharpe,
     calculate_sharpe_from_balance,
     calculate_sortino,
@@ -224,6 +225,7 @@ def _generate_result_line(
         "sharpe": calculate_sharpe(result, min_date, max_date, starting_balance),
         "calmar": calculate_calmar(result, min_date, max_date, starting_balance),
         "sqn": calculate_sqn(result, starting_balance),
+        "p_value": calculate_p_value(result, starting_balance),
         "profit_factor": profit_factor,
         "max_drawdown_account": drawdown.relative_account_drawdown if drawdown else 0.0,
         "max_drawdown_abs": drawdown.drawdown_abs if drawdown else 0.0,
@@ -684,6 +686,7 @@ def generate_strategy_stats(
         "sharpe": calculate_sharpe(results, min_date, max_date, start_balance),
         "calmar": calculate_calmar(results, min_date, max_date, start_balance),
         "sqn": calculate_sqn(results, start_balance),
+        "p_value": calculate_p_value(results, start_balance),
         "wallet_stats": generate_wallet_stats(content.get("wallet_summary"), stake_currency),
         "profit_factor": profit_factor,
         "backtest_start": min_date.strftime(DATETIME_PRINT_FORMAT),
diff --git a/requirements-hyperopt.txt b/requirements-hyperopt.txt
@@ -2,7 +2,6 @@
 -r requirements.txt
 
 # Required for hyperopt
-scipy==1.17.1
 scikit-learn==1.9.0
 filelock==3.29.1
 optuna==4.9.0
diff --git a/requirements.txt b/requirements.txt
@@ -2,6 +2,7 @@ numpy==2.4.6
 pandas==3.0.3
 bottleneck==1.6.0
 numexpr==2.14.1
+scipy==1.17.1
 # Indicator libraries
 ft-pandas-ta==0.3.16
 ta-lib==0.6.8
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -533,6 +533,9 @@ def patch_torch_initlogs(mocker) -> None:
 
         module_name = "torch"
         mocked_module = types.ModuleType(module_name)
+        # SciPy's array-API dispatch probes ``torch.Tensor`` to classify inputs;
+        # expose a dummy so scipy.stats stays importable/usable under the mock.
+        mocked_module.Tensor = type("Tensor", (), {})
         sys.modules[module_name] = mocked_module
     else:
         try:
diff --git a/tests/data/test_metrics.py b/tests/data/test_metrics.py
@@ -18,6 +18,7 @@
     calculate_market_change,
     calculate_max_drawdown,
     calculate_max_drawdown_from_balance,
+    calculate_p_value,
     calculate_sharpe,
     calculate_sharpe_from_balance,
     calculate_sortino,
@@ -442,6 +443,56 @@ def test_calculate_sqn_cases(profits, starting_balance, expected_sqn, descriptio
     assert pytest.approx(sqn, rel=1e-4) == expected_sqn
 
 
+def test_calculate_p_value_edge_cases():
+    """Degenerate samples fall back to 1.0; regular samples stay in [0, 1]."""
+    # No trades, a single trade, and identical trades (zero variance) cannot
+    # be tested, so each must yield the "no evidence" default of 1.0.
+    for profits in ([], [1.0], [1.0, 1.0, 1.0]):
+        trades = DataFrame({"profit_abs": profits})
+        assert calculate_p_value(trades, 100) == 1.0
+
+    # A regular mixed sample must produce a valid probability.
+    mixed = DataFrame({"profit_abs": [1.0, -0.5, 2.0, -1.0]})
+    assert 0.0 <= calculate_p_value(mixed, 100) <= 1.0
+
+
+def test_calculate_p_value_scale_invariance():
+    # Dividing every profit by a different balance rescales mean and standard
+    # deviation alike, so the t-statistic and the p-value must not change.
+    trades = DataFrame({"profit_abs": [1.0, -0.5, 2.0, -1.0, 0.5, 1.5, -0.5, 1.0]})
+    small_balance = calculate_p_value(trades, starting_balance=10)
+    large_balance = calculate_p_value(trades, starting_balance=100_000)
+    assert pytest.approx(small_balance, rel=1e-9) == large_balance
+
+
+def test_calculate_p_value_matches_reference():
+    """
+    Cross-check calculate_p_value against scipy.stats.ttest_1samp, evaluated
+    on the same per-trade returns for every sample.
+    """
+    from scipy import stats
+
+    starting_balance = 1000.0
+    samples = (
+        [0.01, -0.005, 0.02, 0.015, -0.01],
+        [0.05, 0.04, 0.06, 0.045, 0.055],
+        [-0.01, -0.02, -0.015, -0.005, -0.025],
+        [0.001, -0.001, 0.001, -0.001],
+    )
+    for sample in samples:
+        profits = [value * starting_balance for value in sample]
+        observed = calculate_p_value(DataFrame({"profit_abs": profits}), starting_balance)
+        reference = float(stats.ttest_1samp(sample, popmean=0).pvalue)
+        assert abs(observed - reference) < 1e-10
+
+
+def test_calculate_p_value_zero_mean():
+    # Wins and losses cancel exactly, so the mean return and the t-statistic
+    # are zero and the two-sided p-value is exactly 1.0.
+    break_even = DataFrame({"profit_abs": [1.0, -1.0, 2.0, -2.0]})
+    assert calculate_p_value(break_even, starting_balance=100) == 1.0
+
+
 @pytest.mark.parametrize(
     "start,end,days, expected",
     [
diff --git a/tests/optimize/test_optimize_reports.py b/tests/optimize/test_optimize_reports.py
@@ -232,6 +232,9 @@ def test_generate_backtest_stats(default_conf, testdatadir, tmp_path):
     assert strat_stats["drawdown_end_ts"] == 1510699380000
     assert strat_stats["drawdown_start_ts"] == 1510697400000
     assert strat_stats["pairlist"] == ["UNITTEST/BTC"]
+    # Statistical significance of the mean trade return
+    assert "p_value" in strat_stats
+    assert strat_stats["p_value"] == pytest.approx(0.8957701627)
 
     # Test storing stats
     filename = tmp_path / "btresult.json"
@@ -666,13 +669,17 @@ def test_text_table_add_metrics_shows_wallet_ratios(testdatadir, capsys):
         "max_drawdown_low": 0.95,
     }
 
+    strat_results["p_value"] = 0.0321
+
     text_table_add_metrics(strat_results)
     text = capsys.readouterr().out
 
     assert "Sharpe (daily wallet balance)" in text
     assert "Sortino (daily wallet balance)" in text
     assert "Calmar (daily wallet balance)" in text
     assert "Max % of account underwater (balance)" in text
+    assert "Mean profit p-value" in text
+    assert "0.0321" in text
 
 
 def test_generate_periodic_breakdown_stats(testdatadir):

Original file line number	Diff line number	Diff line change
`@@ -405,6 +405,10 @@ def text_table_add_metrics(strat_results: dict) -> None:`
`405`	`405`	`f"{strat_results['calmar']:.2f}" if "calmar" in strat_results else "N/A",`
`406`	`406`	`),`
`407`	`407`	`("SQN", f"{strat_results['sqn']:.2f}" if "sqn" in strat_results else "N/A"),`
	`408`	`+ (`
	`409`	`+ "Mean profit p-value",`
	`410`	`+ (f"{strat_results['p_value']:.4g}" if "p_value" in strat_results else "N/A"),`
	`411`	`+ ),`
`408`	`412`	`(`
`409`	`413`	`"Profit factor",`
`410`	`414`	`(`