|
4 | 4 | #' subset with the highest trait diversity according to a either pooled or mean |
5 | 5 | #' diversity index estimate. \loadmathjax |
6 | 6 | #' |
7 | | -#' For each cluster/group, multiple candidate subsets are sampled randomly and |
8 | | -#' the subset with the highest trait diversity according to either pooled or |
9 | | -#' mean diversity index estimate is retained. This is similar to the |
10 | | -#' "Maximization" or M strategy of |
11 | | -#' \insertCite{schoen_conservation_1993;textual}{SampleCore}. |
| 7 | +#' To identify subsets with highest diversity estimates, the following |
| 8 | +#' strategies are available. These strategies are similar to the "Maximization" |
| 9 | +#' or M strategy of \insertCite{schoen_conservation_1993;textual}{SampleCore}. |
| 10 | +#' |
| 11 | +#' \subsection{Random search / Monte Carlo Method}{For each cluster/group, |
| 12 | +#' multiple candidate subsets are sampled randomly and the subset with the |
| 13 | +#' highest trait diversity according to either pooled or mean diversity index |
| 14 | +#' estimate is retained. The quality of the solution improves with increasing |
| 15 | +#' \code{n.iter} but is not guaranteed to find the global optimum. |
| 16 | +#' } |
| 17 | +#' |
| 18 | +#' \subsection{Greedy search with 1-opt}{This method builds a solution |
| 19 | +#' incrementally by adding the accession that maximises the diversity score at |
| 20 | +#' each step, starting from the \code{always.selected} accessions (or a single |
| 21 | +#' randomly drawn accession when there are no accessions specified in |
| 22 | +#' \code{always.selected} present in the particular cluster/group ). The |
| 23 | +#' 'greedy' solution is then refined by a 1-opt local search controlled by |
| 24 | +#' \code{local.search} and \code{max.iter}. Greedy search is deterministic |
| 25 | +#' given a fixed \code{always.selected} set; when there are no accessions |
| 26 | +#' specified in \code{always.selected} present in the particular cluster/group |
| 27 | +#' results may vary across runs due to the random initialisation. |
| 28 | +#' |
| 29 | +#' \code{local.search = "best.improvement"} scans all possible single swaps |
| 30 | +#' in each pass and applies the one yielding the greatest improvement before |
| 31 | +#' restarting. his guarantees the steepest ascent at each pass but requires |
| 32 | +#' evaluating all \mjseqn{k \times (n - k)} swap pairs per pass, where |
| 33 | +#' \mjseqn{k} is the number of swappable accessions and \mjseqn{n - k} is the |
| 34 | +#' size of the candidate pool. |
| 35 | +#' |
| 36 | +#' \code{local.search = "first.improvement"} applies the first swap that |
| 37 | +#' improves the score and immediately restarts the search. This typically |
| 38 | +#' requires fewer score evaluations per pass and converges faster, but may |
| 39 | +#' find a different local optimum than \code{"best.improvement"}. |
| 40 | +#' |
| 41 | +#' Both strategies terminate when no improving swap exists (local optimum) |
| 42 | +#' or when \code{max.iter} passes have been completed. |
| 43 | +#' |
| 44 | +#' } |
| 45 | +#' |
12 | 46 | #' |
13 | 47 | #' Entries listed as \code{always.selected} are mandatorily included in the |
14 | 48 | #' selection. Warnings are issued if requested allocation is smaller than the |
|
25 | 59 | #' (\code{search = "random"}). |
26 | 60 | #' @param max.iter The maximum number of 1-opt passes for greedy search |
27 | 61 | #' (\code{search = "greedy"}). |
| 62 | +#' @param search Character string specifying the search strategy used to find |
| 63 | +#' the subset with the highest diversity score. Either \code{"random"} |
| 64 | +#' (default) or \code{"greedy"} (See \strong{Details}). |
| 65 | +#' @param local.search Character string specifying the local search strategy |
| 66 | +#' used in the 1-opt improvement phase of the greedy search (\code{search = |
| 67 | +#' "greedy"}). Either \code{"best.improvement"} (default) or |
| 68 | +#' \code{"first.improvement"}. Ignored when \code{search = "random"}. |
28 | 69 | #' |
29 | 70 | #' @returns A named list where each element contains the selected entry |
30 | 71 | #' identifiers for a cluster/group. |
|
40 | 81 | select.diversity <- function(data, names, group, alloc, |
41 | 82 | quantitative, qualitative, |
42 | 83 | always.selected = NULL, |
43 | | - div.index = c("shannon", "simpson", "mcintosh"), |
| 84 | + div.index = c("richness", "shannon", |
| 85 | + "simpson", "mcintosh"), |
44 | 86 | shannon.base = exp(1), |
45 | 87 | div.fun = NULL, |
46 | 88 | metric = c("mean", "pooled"), |
47 | 89 | search = c("random", "greedy"), |
| 90 | + local.search = c("best.improvement", |
| 91 | + "first.improvement"), |
48 | 92 | n.iter = 1000, |
49 | 93 | max.iter = 30) { |
50 | 94 |
|
51 | 95 | div.index <- match.arg(div.index) |
52 | 96 | metric <- match.arg(metric) |
53 | 97 | search <- match.arg(search) |
| 98 | + local.search <- match.arg(local.search) |
| 99 | + |
| 100 | + if (search == "random" && !missing(local.search)) { |
| 101 | + warning('"local.search" is ignored when search = "random"', |
| 102 | + call. = FALSE) |
| 103 | + } |
54 | 104 |
|
55 | 105 | checks.sample.core(data = data, size = NULL, |
56 | 106 | names = names, group = group, |
@@ -79,6 +129,7 @@ select.diversity <- function(data, names, group, alloc, |
79 | 129 | div_fun_internal <- |
80 | 130 | switch( |
81 | 131 | div.index, |
| 132 | + richness = function(x) length(unique(x)), |
82 | 133 | shannon = function(x) DiversityStats::shannon(x, base = shannon.base), |
83 | 134 | simpson = DiversityStats::gini_simpson, |
84 | 135 | mcintosh = DiversityStats::mcintosh_diversity |
@@ -195,86 +246,153 @@ select.diversity <- function(data, names, group, alloc, |
195 | 246 |
|
196 | 247 | ### Greedy initialization ---- |
197 | 248 |
|
| 249 | + if (SampleCore.debug) { |
| 250 | + message("--Greedy initialization started.---------------------\n\n") |
| 251 | + } |
| 252 | + |
198 | 253 | # Ignores max.iter |
199 | 254 |
|
200 | 255 | # when fixed_accns is NULL |
201 | 256 | if (length(fixed_accns) == 0L) { |
202 | 257 | seed_acc <- sample(rem_accns, 1L) |
203 | 258 | selected <- seed_acc # start from always-selected set |
204 | | - pool <- setdiff(rem_accns, seed_acc) # remaining candidates |
| 259 | + pool <- setdiff(rem_accns, seed_acc) # remaining candidates |
205 | 260 | n_to_add <- max(0L, n_rem - 1L) |
206 | 261 | } else { |
207 | 262 | selected <- fixed_accns |
208 | | - pool <- rem_accns |
| 263 | + pool <- rem_accns |
209 | 264 | n_to_add <- n_rem |
210 | 265 | } |
211 | 266 |
|
| 267 | + idx_lookup <- setNames(seq_len(nrow(sub_df)), group_accns) |
| 268 | + selected_idx <- idx_lookup[selected] |
| 269 | + pool_idx <- idx_lookup[pool] |
| 270 | + |
212 | 271 | for (i in seq_len(n_to_add)) { |
213 | | - # Score each candidate added to the current selected set |
214 | | - scores <- vapply(pool, function(cand) { |
215 | | - idx <- match(c(selected, cand), group_accns) |
216 | | - compute_score(idx = idx, |
217 | | - traits_mat = traits_mat, |
218 | | - div_fun = div_fun_internal, |
219 | | - metric = metric) |
| 272 | + scores <- vapply(pool_idx, function(cand_i) { |
| 273 | + compute_score(c(selected_idx, cand_i), traits_mat, div_fun_internal, metric) |
220 | 274 | }, numeric(1)) |
221 | 275 |
|
222 | | - best_cand <- pool[which.max(scores)] |
223 | | - selected <- c(selected, best_cand) |
224 | | - pool <- setdiff(pool, best_cand) |
| 276 | + best_pos <- which.max(scores) |
| 277 | + selected_idx <- c(selected_idx, pool_idx[best_pos]) |
| 278 | + pool_idx <- pool_idx[-best_pos] # integer remove by position — faster than setdiff |
225 | 279 | } |
226 | 280 |
|
227 | 281 | ### 1-opt local search ---- |
228 | 282 |
|
229 | | - current_sel <- selected |
230 | | - # idx <- match(selected, group_accns) |
| 283 | + if (SampleCore.debug) { |
| 284 | + message("--Local search started.------------------------------\n\n") |
| 285 | + } |
231 | 286 |
|
232 | | - # integer positions into traits_mat |
233 | | - idx_lookup <- setNames(seq_len(nrow(sub_df)), group_accns) |
234 | | - current_idx <- idx_lookup[current_sel] |
235 | | - fixed_idx <- idx_lookup[fixed_accns] |
236 | | - rem_idx <- idx_lookup[rem_accns] |
| 287 | + current_idx <- selected_idx |
| 288 | + fixed_idx <- idx_lookup[fixed_accns] |
| 289 | + rem_idx <- idx_lookup[rem_accns] |
| 290 | + current_score <- compute_score(current_idx, traits_mat, |
| 291 | + div_fun_internal, metric) |
237 | 292 |
|
238 | | - current_score <- |
239 | | - compute_score(idx = idx_lookup[selected], |
240 | | - traits_mat = traits_mat, |
241 | | - div_fun = div_fun_internal, |
242 | | - metric = metric) |
| 293 | + # Initialize indices ONCE |
| 294 | + swappable_idx <- setdiff(current_idx, fixed_idx) |
| 295 | + candidate_idx <- setdiff(rem_idx, current_idx) |
243 | 296 |
|
244 | 297 | iter_1opt <- 0L |
245 | 298 | repeat { |
246 | | - |
247 | 299 | if (iter_1opt >= max.iter) break # cap check |
248 | 300 |
|
249 | 301 | iter_1opt <- iter_1opt + 1L |
250 | 302 |
|
251 | | - # swappable and candidate pools as integer indices |
252 | | - swappable_idx <- setdiff(current_idx, fixed_idx) |
253 | | - candidate_idx <- setdiff(rem_idx, current_idx) # recomputed each pass — current_idx mutates |
254 | | - |
| 303 | + # Exit if no swaps are possible |
255 | 304 | if (length(swappable_idx) == 0L || length(candidate_idx) == 0L) break |
256 | 305 |
|
257 | | - # all (out, in) pairs — integer matrix, nrow = n_pairs |
258 | | - pairs <- expand.grid(out_i = swappable_idx, |
259 | | - in_i = candidate_idx) |
260 | | - |
261 | | - # score every pair in one vapply call |
262 | | - trial_scores <- vapply(seq_len(nrow(pairs)), function(k) { |
263 | | - trial_idx <- c(current_idx[current_idx != pairs$out_i[k]], |
264 | | - pairs$in_i[k]) |
265 | | - compute_score(trial_idx, traits_mat, div_fun_internal, metric) |
266 | | - }, numeric(1)) |
267 | | - |
268 | | - best_k <- which.max(trial_scores) |
269 | | - best_delta <- trial_scores[best_k] - current_score |
270 | | - |
271 | | - if (is.na(best_delta) || best_delta <= 0) break # local optimum - natural exit |
272 | | - |
273 | | - # apply best swap found in this pass |
274 | | - current_idx[current_idx == pairs$out_i[best_k]] <- pairs$in_i[best_k] |
275 | | - # current_score <- current_score + best_delta |
276 | | - current_score <- trial_scores[best_k] |
277 | | - |
| 306 | + improved <- FALSE |
| 307 | + |
| 308 | + #### Best-Improvement Strategy ---- |
| 309 | + if (local.search == "best.improvement") { |
| 310 | + best_overall_score <- current_score |
| 311 | + best_out_val <- NULL |
| 312 | + best_in_val <- NULL |
| 313 | + best_out_pos_in_swappable <- NULL |
| 314 | + best_in_pos_in_candidate <- NULL |
| 315 | + |
| 316 | + # Nested Loops: Scanning all possible swaps |
| 317 | + for (i in seq_along(swappable_idx)) { |
| 318 | + out_val <- swappable_idx[i] |
| 319 | + # Pre-calculate the subset excluding the 'out' candidate |
| 320 | + subset_minus_out <- current_idx[current_idx != out_val] |
| 321 | + |
| 322 | + for (j in seq_along(candidate_idx)) { |
| 323 | + in_val <- candidate_idx[j] |
| 324 | + trial_score <- |
| 325 | + compute_score(idx = c(subset_minus_out, in_val), |
| 326 | + traits_mat = traits_mat, |
| 327 | + div_fun =div_fun_internal, |
| 328 | + metric = metric) |
| 329 | + |
| 330 | + # Track the best improvement found so far |
| 331 | + if (trial_score > best_overall_score) { |
| 332 | + best_overall_score <- trial_score |
| 333 | + best_out_val <- out_val |
| 334 | + best_in_val <- in_val |
| 335 | + best_out_pos_in_swappable <- i |
| 336 | + best_in_pos_in_candidate <- j |
| 337 | + } |
| 338 | + } |
| 339 | + } |
| 340 | + |
| 341 | + # Check if an improvement was actually found in this pass |
| 342 | + if (!is.null(best_out_val)) { |
| 343 | + # Update current collection |
| 344 | + current_idx[match(best_out_val, current_idx)] <- best_in_val |
| 345 | + current_score <- best_overall_score |
| 346 | + |
| 347 | + # Update indices in-place |
| 348 | + swappable_idx[best_out_pos_in_swappable] <- best_in_val |
| 349 | + candidate_idx[best_in_pos_in_candidate] <- best_out_val |
| 350 | + improved <- TRUE |
| 351 | + |
| 352 | + if (SampleCore.debug) { |
| 353 | + message(sprintf("Best-improvement | Iter %d: Swapped out %d for %d. New score: %f", |
| 354 | + iter_1opt, best_out_val, best_in_val, |
| 355 | + current_score)) |
| 356 | + } |
| 357 | + } |
| 358 | + |
| 359 | + } else { |
| 360 | + |
| 361 | + ### First-improvement strategy ---- |
| 362 | + |
| 363 | + # Nested loops |
| 364 | + for (i in seq_along(swappable_idx)) { |
| 365 | + out_val <- swappable_idx[i] |
| 366 | + subset_minus_out <- current_idx[current_idx != out_val] |
| 367 | + |
| 368 | + for (j in seq_along(candidate_idx)) { |
| 369 | + in_val <- candidate_idx[j] |
| 370 | + trial_score <- compute_score(c(subset_minus_out, in_val), |
| 371 | + traits_mat, div_fun_internal, metric) |
| 372 | + |
| 373 | + if (trial_score > current_score) { |
| 374 | + # First improvement found - Apply swap immediately |
| 375 | + current_idx[match(out_val, current_idx)] <- in_val |
| 376 | + current_score <- trial_score |
| 377 | + |
| 378 | + # swap in-place |
| 379 | + swappable_idx[i] <- in_val # best_in enters swappable pool |
| 380 | + candidate_idx[j] <- out_val # best_out enters candidate pool |
| 381 | + improved <- TRUE |
| 382 | + |
| 383 | + if (SampleCore.debug) { |
| 384 | + message(sprintf("First-improvement | Iter %d: Swapped out %d for %d. New score: %f", |
| 385 | + iter_1opt, out_val, in_val, current_score)) |
| 386 | + } |
| 387 | + |
| 388 | + break # Break inner loop |
| 389 | + } |
| 390 | + } |
| 391 | + if (improved) break # Break outer loop to restart 1-opt with new current_idx |
| 392 | + } |
| 393 | + } |
| 394 | + |
| 395 | + if (!improved) break # Local optimum reached |
278 | 396 | } |
279 | 397 |
|
280 | 398 | best_subset <- group_accns[current_idx] |
|
0 commit comments