diff --git a/docs/backtesting.md b/docs/backtesting.md index 00776ed661b..8840e50ab36 100644 --- a/docs/backtesting.md +++ b/docs/backtesting.md @@ -229,6 +229,7 @@ A backtesting result will look like that: │ Sortino (closed trades) │ 2.57 │ │ Calmar (closed trades) │ 43.03 │ │ SQN │ 0.71 │ +│ Mean profit p-value │ 0.4768 │ │ Profit factor │ 1.30 │ │ Expectancy (Ratio) │ 0.74 (0.04) │ │ Avg. daily profit │ 1.844 USDT │ @@ -362,6 +363,7 @@ It contains key metrics about the performance of your strategy on backtesting da │ Sortino (closed trades) │ 2.57 │ │ Calmar (closed trades) │ 43.03 │ │ SQN │ 0.71 │ +│ Mean profit p-value │ 0.4768 │ │ Profit factor │ 1.30 │ │ Expectancy (Ratio) │ 0.74 (0.04) │ │ Avg. daily profit │ 1.844 USDT │ @@ -424,6 +426,7 @@ It contains key metrics about the performance of your strategy on backtesting da - `Sortino (closed trades)`: Annualized Sortino ratio including only closed trades (ignoring open trades with profits or losses). - `Calmar (closed trades)`: Annualized Calmar ratio including only closed trades (ignoring open trades with profits or losses). - `SQN`: System Quality Number (SQN) - by Van Tharp. +- `Mean profit p-value`: Two-sided p-value of a one-sample Student's t-test against the null hypothesis that the mean per-trade return is zero - in short, "is the average profit distinguishable from noise?". A small value (the usual bar is below `0.05`) means the observed edge is unlikely to be down to chance. Its underlying t-statistic is identical to `SQN`. See the note below for how to read it in practice. - `Profit factor`: Sum of the profits of all winning trades divided by the sum of the losses of all losing trades. - `Expectancy (Ratio)`: Expectancy ratio, which is the average profit or loss per trade. A negative expectancy ratio means that your strategy is not profitable. - `Avg. daily profit`: Average profit per day, calculated as `(Total Profit / Backtest Days)`. 
@@ -455,6 +458,11 @@ It contains key metrics about the performance of your strategy on backtesting da - `Sortino (wallet balance)` Annualized Sortino ratio calculation including unrealized profits. - `Calmar (wallet balance)` Annualized Calmar ratio calculation including unrealized profits. +??? Note "Reading the mean profit p-value" + Think of the p-value as the answer to one question: *if the strategy truly had no edge, how often would pure chance still hand you an average per-trade result at least this far from zero?* A value of `0.4768` therefore means roughly a 48% chance of a swing this large turning up from randomness alone - in other words the average profit is not distinguishable from luck. The lower the p-value, the harder the result is to explain by chance alone (it is *not* the probability that the strategy has no edge), and a common rule of thumb is to treat anything below `0.05` (a 5% chance) as "statistically significant". + + Two things keep this honest. The test assumes trades are independent and identically distributed, which real strategies rarely are (trades overlap and cluster in time), so the reported p-value tends to be *optimistic* (too small) - the true uncertainty is usually larger. And because backtesting and hyperopt evaluate many strategies, some will score a low p-value by chance alone, so a small value only tells you a result is hard to explain by noise; it is not by itself proof of a genuine edge. + !!! Tip "Wallet based Metrics" The metrics under the "Wallet based Metrics" section are calculated based on the unrealized balance, which includes the capital tied in open trades. This provides a more comprehensive view of the strategy's performance, as it accounts for both realized and unrealized profits and losses. 
def calculate_p_value(trades: pd.DataFrame, starting_balance: float) -> float:
    """
    Calculate the two-sided p-value of a one-sample Student's t-test for the
    null hypothesis that the mean per-trade return
    (profit_abs / starting_balance) equals zero.

    :param trades: DataFrame containing trades (requires column profit_abs)
    :param starting_balance: Starting balance of the trading system
    :return: Two-sided p-value in the range [0, 1]. Returns 1.0 (no evidence
             against the null) when it cannot be computed - fewer than two
             trades, non-finite returns (NaN profits or a zero starting
             balance) or zero return variance.
    """
    if len(trades) < 2:
        return 1.0

    returns = (trades["profit_abs"] / starting_balance).to_numpy(dtype=float)

    # NaN / inf would propagate through the t-test and leak a value outside
    # the documented [0, 1] range into reports.
    if not np.isfinite(returns).all():
        return 1.0

    # Zero variance means all returns are identical. Compare max and min
    # rather than testing `std() == 0`: the standard deviation of identical
    # floats is frequently a tiny non-zero rounding residue (e.g. three
    # returns of 0.1), which would yield a huge t-statistic and a spurious
    # p-value close to 0.
    if returns.max() == returns.min():
        return 1.0

    _, p_value = stats.ttest_1samp(returns, popmean=0)
    p_value = float(p_value)
    # Keep the documented fallback even if the test itself degenerates.
    return p_value if np.isfinite(p_value) else 1.0
calculate_sqn(results, start_balance), + "p_value": calculate_p_value(results, start_balance), "wallet_stats": generate_wallet_stats(content.get("wallet_summary"), stake_currency), "profit_factor": profit_factor, "backtest_start": min_date.strftime(DATETIME_PRINT_FORMAT), diff --git a/requirements-hyperopt.txt b/requirements-hyperopt.txt index 9d9228bda45..e85e5137b25 100644 --- a/requirements-hyperopt.txt +++ b/requirements-hyperopt.txt @@ -2,7 +2,6 @@ -r requirements.txt # Required for hyperopt -scipy==1.17.1 scikit-learn==1.9.0 filelock==3.29.1 optuna==4.9.0 diff --git a/requirements.txt b/requirements.txt index 5fad3e6c3aa..7004e7d22b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ numpy==2.4.6 pandas==3.0.3 bottleneck==1.6.0 numexpr==2.14.1 +scipy==1.17.1 # Indicator libraries ft-pandas-ta==0.3.16 ta-lib==0.6.8 diff --git a/tests/conftest.py b/tests/conftest.py index dc086046618..1a47d65a6c7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -533,6 +533,9 @@ def patch_torch_initlogs(mocker) -> None: module_name = "torch" mocked_module = types.ModuleType(module_name) + # SciPy's array-API dispatch probes ``torch.Tensor`` to classify inputs; + # expose a dummy so scipy.stats stays importable/usable under the mock. 
def test_calculate_p_value_edge_cases():
    """Inputs for which no t-test can be computed fall back to 1.0."""
    not_computable = (
        [],  # no trades at all
        [1.0],  # a single trade
        [1.0, 1.0, 1.0],  # identical returns -> zero variance
    )
    for profit_list in not_computable:
        assert calculate_p_value(DataFrame({"profit_abs": profit_list}), 100) == 1.0

    # Any computable p-value is a probability.
    mixed_trades = DataFrame({"profit_abs": [1.0, -0.5, 2.0, -1.0]})
    assert 0.0 <= calculate_p_value(mixed_trades, 100) <= 1.0


def test_calculate_p_value_scale_invariance():
    """Rescaling every return by the same balance must not move the p-value."""
    trades = DataFrame({"profit_abs": [1.0, -0.5, 2.0, -1.0, 0.5, 1.5, -0.5, 1.0]})
    p_small, p_large = (
        calculate_p_value(trades, starting_balance=balance) for balance in (10, 100_000)
    )
    assert pytest.approx(p_small, rel=1e-9) == p_large


def test_calculate_p_value_matches_reference():
    """
    calculate_p_value must agree with scipy.stats.ttest_1samp evaluated
    directly on the same per-trade returns.
    """
    from scipy import stats

    starting_balance = 1000.0
    return_sets = [
        [0.01, -0.005, 0.02, 0.015, -0.01],
        [0.05, 0.04, 0.06, 0.045, 0.055],
        [-0.01, -0.02, -0.015, -0.005, -0.025],
        [0.001, -0.001, 0.001, -0.001],
    ]
    for returns in return_sets:
        profits = [ret * starting_balance for ret in returns]
        result = calculate_p_value(DataFrame({"profit_abs": profits}), starting_balance)
        expected = stats.ttest_1samp(returns, popmean=0)[1]
        assert abs(result - float(expected)) < 1e-10


def test_calculate_p_value_zero_mean():
    """A perfectly break-even sample has t == 0 and therefore p == 1.0."""
    break_even = DataFrame({"profit_abs": [1.0, -1.0, 2.0, -2.0]})
    assert calculate_p_value(break_even, starting_balance=100) == 1.0
account underwater (balance)" in text + assert "Mean profit p-value" in text + assert "0.0321" in text def test_generate_periodic_breakdown_stats(testdatadir):