chunk_stratified performs very well; chunk is deprecated

kongdd · kongdd · commit fd0a344c218b · 2026-04-20T21:56:47.000+08:00
diff --git a/NAMESPACE b/NAMESPACE
@@ -9,6 +9,7 @@ S3method(print,kfold)
 S3method(select,matrix)
 export(GOF)
 export(NSE)
+export(chunk_stratified)
 export(cv_coef)
 export(kfold_calib)
 export(kfold_lm)
diff --git a/R/kford_ml.R b/R/kford_ml.R
@@ -13,7 +13,8 @@ kfold_ml <- function(X, Y, kfold = 5, FUN, ...){ #, threshold = 5000
     Y = as.matrix(Y)
 
     # ind_lst <- createFolds(1:nrow(X), k = kfold, list = TRUE)
-    ind_lst <- Ipaper::chunk(1:nrow(X), kfold)
+    # ind_lst <- Ipaper::chunk(1:nrow(X), kfold)
+    ind_lst <- chunk_stratified(Y, kfold)
 
     res <- future_map(ind_lst, kfold_calib,
         X = X, Y = Y,
@@ -23,6 +24,32 @@ kfold_ml <- function(X, Y, kfold = 5, FUN, ...){ #, threshold = 5000
     kfold_tidy(res, ind_lst, Y)
 }
 
+# chunk <- function(x, nchunk = 6) {
+#   split(x, cut(seq_along(x), nchunk, labels = FALSE)) %>% set_names(NULL)
+# }
+
+#' @export
+chunk_stratified <- function(y, kfold = 5) { # stratified k-fold split: returns an unnamed list of index vectors into y, one per fold label
+    # 1. Indices of y ordered by ascending value of the target variable.
+    idx_sorted <- order(y) # NOTE(review): a matrix y is treated as a flat vector here and in length(y); the caller passes as.matrix(Y) - confirm Y has a single column
+
+    # 2. Number of consecutive blocks of size kfold needed to cover all of y.
+    n_blocks <- ceiling(length(y) / kfold)
+
+    # 3. Draw an independent random permutation of the labels 1:kfold for every block, so that
+    #    each run of kfold neighbouring sorted values is spread over kfold different folds.
+    set.seed(42) # fixed seed: the same folds on every call. NOTE(review): this also resets the caller's global RNG state
+    groups <- unlist(lapply(1:n_blocks, function(x) sample(1:kfold))) # NOTE(review): 1:n_blocks is c(1, 0) when y is empty
+
+    # 4. Drop the surplus labels at the tail (length(y) not a multiple of kfold).
+    groups <- groups[1:length(y)] # NOTE(review): 1:length(y) is c(1, 0) when y is empty
+
+    # 5. Group the sorted indices by their shuffled fold label and strip the list names.
+    ind_lst <- unname(split(idx_sorted, groups))
+    return(ind_lst)
+}
+
+
 #' @inheritParams ranger::ranger
 #' @rdname kfold_ml
 #' @export
diff --git a/README.Rmd b/README.Rmd
@@ -2,6 +2,10 @@
 output: github_document
 ---
 
+[![R-CMD-check](https://github.com/rpkgs/rtrend/workflows/R-CMD-check/badge.svg)](https://github.com/rpkgs/rtrend/actions)
+[![codecov](https://codecov.io/gh/rpkgs/rtrend/branch/master/graph/badge.svg)](https://codecov.io/gh/rpkgs/rtrend)
+
+
 <!-- README.md is generated from README.Rmd. Please edit that file -->
 
 ```{r, include = FALSE}
diff --git a/README.md b/README.md
@@ -4,6 +4,8 @@
 # kfold
 
 <!-- badges: start -->
+[![R-CMD-check](https://github.com/rpkgs/rtrend/workflows/R-CMD-check/badge.svg)](https://github.com/rpkgs/rtrend/actions)
+[![codecov](https://codecov.io/gh/rpkgs/rtrend/branch/master/graph/badge.svg)](https://codecov.io/gh/rpkgs/rtrend)
 <!-- badges: end -->
 
 The goal of kfold is to …