Skip to content

Commit ba43bbb

Browse files
committed
Added greedy search 1opt with both first and best improvment
1 parent cebaaf8 commit ba43bbb

2 files changed

Lines changed: 224 additions & 62 deletions

File tree

R/select.diversity.R

Lines changed: 174 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,45 @@
44
#' subset with the highest trait diversity according to a either pooled or mean
55
#' diversity index estimate. \loadmathjax
66
#'
7-
#' For each cluster/group, multiple candidate subsets are sampled randomly and
8-
#' the subset with the highest trait diversity according to either pooled or
9-
#' mean diversity index estimate is retained. This is similar to the
10-
#' "Maximization" or M strategy of
11-
#' \insertCite{schoen_conservation_1993;textual}{SampleCore}.
7+
#' To identify subsets with highest diversity estimates, the following
8+
#' strategies are available. These strategies are similar to the "Maximization"
9+
#' or M strategy of \insertCite{schoen_conservation_1993;textual}{SampleCore}.
10+
#'
11+
#' \subsection{Random search / Monte Carlo Method}{For each cluster/group,
12+
#' multiple candidate subsets are sampled randomly and the subset with the
13+
#' highest trait diversity according to either pooled or mean diversity index
14+
#' estimate is retained. The quality of the solution improves with increasing
15+
#' \code{n.iter} but is not guaranteed to find the global optimum.
16+
#' }
17+
#'
18+
#' \subsection{Greedy search with 1-opt}{This method builds a solution
19+
#' incrementally by adding the accession that maximises the diversity score at
20+
#' each step, starting from the \code{always.selected} accessions (or a single
21+
#' randomly drawn accession when there are no accessions specified in
22+
#' \code{always.selected} present in the particular cluster/group ). The
23+
#' 'greedy' solution is then refined by a 1-opt local search controlled by
24+
#' \code{local.search} and \code{max.iter}. Greedy search is deterministic
25+
#' given a fixed \code{always.selected} set; when there are no accessions
26+
#' specified in \code{always.selected} present in the particular cluster/group
27+
#' results may vary across runs due to the random initialisation.
28+
#'
29+
#' \code{local.search = "best.improvement"} scans all possible single swaps
30+
#' in each pass and applies the one yielding the greatest improvement before
31+
#' restarting. his guarantees the steepest ascent at each pass but requires
32+
#' evaluating all \mjseqn{k \times (n - k)} swap pairs per pass, where
33+
#' \mjseqn{k} is the number of swappable accessions and \mjseqn{n - k} is the
34+
#' size of the candidate pool.
35+
#'
36+
#' \code{local.search = "first.improvement"} applies the first swap that
37+
#' improves the score and immediately restarts the search. This typically
38+
#' requires fewer score evaluations per pass and converges faster, but may
39+
#' find a different local optimum than \code{"best.improvement"}.
40+
#'
41+
#' Both strategies terminate when no improving swap exists (local optimum)
42+
#' or when \code{max.iter} passes have been completed.
43+
#'
44+
#' }
45+
#'
1246
#'
1347
#' Entries listed as \code{always.selected} are mandatorily included in the
1448
#' selection. Warnings are issued if requested allocation is smaller than the
@@ -25,6 +59,13 @@
2559
#' (\code{search = "random"}).
2660
#' @param max.iter The maximum number of 1-opt passes for greedy search
2761
#' (\code{search = "greedy"}).
62+
#' @param search Character string specifying the search strategy used to find
63+
#' the subset with the highest diversity score. Either \code{"random"}
64+
#' (default) or \code{"greedy"} (See \strong{Details}).
65+
#' @param local.search Character string specifying the local search strategy
66+
#' used in the 1-opt improvement phase of the greedy search (\code{search =
67+
#' "greedy"}). Either \code{"best.improvement"} (default) or
68+
#' \code{"first.improvement"}. Ignored when \code{search = "random"}.
2869
#'
2970
#' @returns A named list where each element contains the selected entry
3071
#' identifiers for a cluster/group.
@@ -40,17 +81,26 @@
4081
select.diversity <- function(data, names, group, alloc,
4182
quantitative, qualitative,
4283
always.selected = NULL,
43-
div.index = c("shannon", "simpson", "mcintosh"),
84+
div.index = c("richness", "shannon",
85+
"simpson", "mcintosh"),
4486
shannon.base = exp(1),
4587
div.fun = NULL,
4688
metric = c("mean", "pooled"),
4789
search = c("random", "greedy"),
90+
local.search = c("best.improvement",
91+
"first.improvement"),
4892
n.iter = 1000,
4993
max.iter = 30) {
5094

5195
div.index <- match.arg(div.index)
5296
metric <- match.arg(metric)
5397
search <- match.arg(search)
98+
local.search <- match.arg(local.search)
99+
100+
if (search == "random" && !missing(local.search)) {
101+
warning('"local.search" is ignored when search = "random"',
102+
call. = FALSE)
103+
}
54104

55105
checks.sample.core(data = data, size = NULL,
56106
names = names, group = group,
@@ -79,6 +129,7 @@ select.diversity <- function(data, names, group, alloc,
79129
div_fun_internal <-
80130
switch(
81131
div.index,
132+
richness = function(x) length(unique(x)),
82133
shannon = function(x) DiversityStats::shannon(x, base = shannon.base),
83134
simpson = DiversityStats::gini_simpson,
84135
mcintosh = DiversityStats::mcintosh_diversity
@@ -195,86 +246,153 @@ select.diversity <- function(data, names, group, alloc,
195246

196247
### Greedy initialization ----
197248

249+
if (SampleCore.debug) {
250+
message("--Greedy initialization started.---------------------\n\n")
251+
}
252+
198253
# Ignores max.iter
199254

200255
# when fixed_accns is NULL
201256
if (length(fixed_accns) == 0L) {
202257
seed_acc <- sample(rem_accns, 1L)
203258
selected <- seed_acc # start from always-selected set
204-
pool <- setdiff(rem_accns, seed_acc) # remaining candidates
259+
pool <- setdiff(rem_accns, seed_acc) # remaining candidates
205260
n_to_add <- max(0L, n_rem - 1L)
206261
} else {
207262
selected <- fixed_accns
208-
pool <- rem_accns
263+
pool <- rem_accns
209264
n_to_add <- n_rem
210265
}
211266

267+
idx_lookup <- setNames(seq_len(nrow(sub_df)), group_accns)
268+
selected_idx <- idx_lookup[selected]
269+
pool_idx <- idx_lookup[pool]
270+
212271
for (i in seq_len(n_to_add)) {
213-
# Score each candidate added to the current selected set
214-
scores <- vapply(pool, function(cand) {
215-
idx <- match(c(selected, cand), group_accns)
216-
compute_score(idx = idx,
217-
traits_mat = traits_mat,
218-
div_fun = div_fun_internal,
219-
metric = metric)
272+
scores <- vapply(pool_idx, function(cand_i) {
273+
compute_score(c(selected_idx, cand_i), traits_mat, div_fun_internal, metric)
220274
}, numeric(1))
221275

222-
best_cand <- pool[which.max(scores)]
223-
selected <- c(selected, best_cand)
224-
pool <- setdiff(pool, best_cand)
276+
best_pos <- which.max(scores)
277+
selected_idx <- c(selected_idx, pool_idx[best_pos])
278+
pool_idx <- pool_idx[-best_pos] # integer remove by position — faster than setdiff
225279
}
226280

227281
### 1-opt local search ----
228282

229-
current_sel <- selected
230-
# idx <- match(selected, group_accns)
283+
if (SampleCore.debug) {
284+
message("--Local search started.------------------------------\n\n")
285+
}
231286

232-
# integer positions into traits_mat
233-
idx_lookup <- setNames(seq_len(nrow(sub_df)), group_accns)
234-
current_idx <- idx_lookup[current_sel]
235-
fixed_idx <- idx_lookup[fixed_accns]
236-
rem_idx <- idx_lookup[rem_accns]
287+
current_idx <- selected_idx
288+
fixed_idx <- idx_lookup[fixed_accns]
289+
rem_idx <- idx_lookup[rem_accns]
290+
current_score <- compute_score(current_idx, traits_mat,
291+
div_fun_internal, metric)
237292

238-
current_score <-
239-
compute_score(idx = idx_lookup[selected],
240-
traits_mat = traits_mat,
241-
div_fun = div_fun_internal,
242-
metric = metric)
293+
# Initialize indices ONCE
294+
swappable_idx <- setdiff(current_idx, fixed_idx)
295+
candidate_idx <- setdiff(rem_idx, current_idx)
243296

244297
iter_1opt <- 0L
245298
repeat {
246-
247299
if (iter_1opt >= max.iter) break # cap check
248300

249301
iter_1opt <- iter_1opt + 1L
250302

251-
# swappable and candidate pools as integer indices
252-
swappable_idx <- setdiff(current_idx, fixed_idx)
253-
candidate_idx <- setdiff(rem_idx, current_idx) # recomputed each pass — current_idx mutates
254-
303+
# Exit if no swaps are possible
255304
if (length(swappable_idx) == 0L || length(candidate_idx) == 0L) break
256305

257-
# all (out, in) pairs — integer matrix, nrow = n_pairs
258-
pairs <- expand.grid(out_i = swappable_idx,
259-
in_i = candidate_idx)
260-
261-
# score every pair in one vapply call
262-
trial_scores <- vapply(seq_len(nrow(pairs)), function(k) {
263-
trial_idx <- c(current_idx[current_idx != pairs$out_i[k]],
264-
pairs$in_i[k])
265-
compute_score(trial_idx, traits_mat, div_fun_internal, metric)
266-
}, numeric(1))
267-
268-
best_k <- which.max(trial_scores)
269-
best_delta <- trial_scores[best_k] - current_score
270-
271-
if (is.na(best_delta) || best_delta <= 0) break # local optimum - natural exit
272-
273-
# apply best swap found in this pass
274-
current_idx[current_idx == pairs$out_i[best_k]] <- pairs$in_i[best_k]
275-
# current_score <- current_score + best_delta
276-
current_score <- trial_scores[best_k]
277-
306+
improved <- FALSE
307+
308+
#### Best-Improvement Strategy ----
309+
if (local.search == "best.improvement") {
310+
best_overall_score <- current_score
311+
best_out_val <- NULL
312+
best_in_val <- NULL
313+
best_out_pos_in_swappable <- NULL
314+
best_in_pos_in_candidate <- NULL
315+
316+
# Nested Loops: Scanning all possible swaps
317+
for (i in seq_along(swappable_idx)) {
318+
out_val <- swappable_idx[i]
319+
# Pre-calculate the subset excluding the 'out' candidate
320+
subset_minus_out <- current_idx[current_idx != out_val]
321+
322+
for (j in seq_along(candidate_idx)) {
323+
in_val <- candidate_idx[j]
324+
trial_score <-
325+
compute_score(idx = c(subset_minus_out, in_val),
326+
traits_mat = traits_mat,
327+
div_fun =div_fun_internal,
328+
metric = metric)
329+
330+
# Track the best improvement found so far
331+
if (trial_score > best_overall_score) {
332+
best_overall_score <- trial_score
333+
best_out_val <- out_val
334+
best_in_val <- in_val
335+
best_out_pos_in_swappable <- i
336+
best_in_pos_in_candidate <- j
337+
}
338+
}
339+
}
340+
341+
# Check if an improvement was actually found in this pass
342+
if (!is.null(best_out_val)) {
343+
# Update current collection
344+
current_idx[match(best_out_val, current_idx)] <- best_in_val
345+
current_score <- best_overall_score
346+
347+
# Update indices in-place
348+
swappable_idx[best_out_pos_in_swappable] <- best_in_val
349+
candidate_idx[best_in_pos_in_candidate] <- best_out_val
350+
improved <- TRUE
351+
352+
if (SampleCore.debug) {
353+
message(sprintf("Best-improvement | Iter %d: Swapped out %d for %d. New score: %f",
354+
iter_1opt, best_out_val, best_in_val,
355+
current_score))
356+
}
357+
}
358+
359+
} else {
360+
361+
### First-improvement strategy ----
362+
363+
# Nested loops
364+
for (i in seq_along(swappable_idx)) {
365+
out_val <- swappable_idx[i]
366+
subset_minus_out <- current_idx[current_idx != out_val]
367+
368+
for (j in seq_along(candidate_idx)) {
369+
in_val <- candidate_idx[j]
370+
trial_score <- compute_score(c(subset_minus_out, in_val),
371+
traits_mat, div_fun_internal, metric)
372+
373+
if (trial_score > current_score) {
374+
# First improvement found - Apply swap immediately
375+
current_idx[match(out_val, current_idx)] <- in_val
376+
current_score <- trial_score
377+
378+
# swap in-place
379+
swappable_idx[i] <- in_val # best_in enters swappable pool
380+
candidate_idx[j] <- out_val # best_out enters candidate pool
381+
improved <- TRUE
382+
383+
if (SampleCore.debug) {
384+
message(sprintf("First-improvement | Iter %d: Swapped out %d for %d. New score: %f",
385+
iter_1opt, out_val, in_val, current_score))
386+
}
387+
388+
break # Break inner loop
389+
}
390+
}
391+
if (improved) break # Break outer loop to restart 1-opt with new current_idx
392+
}
393+
}
394+
395+
if (!improved) break # Local optimum reached
278396
}
279397

280398
best_subset <- group_accns[current_idx]

man/select.diversity.Rd

Lines changed: 50 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)