Skip to content

Commit ae3b198

Browse files
authored
Improvements to stats run-time performance (#877)
* Added total_size() methods to allele count and variant stats classes These methods return the total size of the classes' buffers in bytes. * Improved AlleleCount process and flush run-time This was done by using an unordered set for tracking sample names. * Improved SampleStats process and flush run-time This was done by using an unordered map for saving sample stats, minimizing map lookups via memoization, and replacing a nested map with structs. * Improved VariantStats process and flush run-time This was done by reusing vectors for (missing)GT values, using an unordered set for tracking sample names, minimizing map lookups via memoization, and creating separate codepaths for updating v2 and v3 arrays, the prior of which can be done much more efficiently via appending instead of using the v3 insertion strategy.
1 parent a130d21 commit ae3b198

6 files changed

Lines changed: 334 additions & 182 deletions

File tree

libtiledbvcf/src/stats/allele_count.cc

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,16 @@ void AlleleCount::flush(bool finalize) {
371371
}
372372
}
373373

374+
size_t AlleleCount::total_size() const {
375+
return contig_buffer_.size() + sizeof(uint64_t) * contig_offsets_.size() +
376+
sizeof(uint32_t) * pos_buffer_.size() + ref_buffer_.size() +
377+
sizeof(uint64_t) * ref_offsets_.size() + alt_buffer_.size() +
378+
sizeof(uint64_t) * alt_offsets_.size() + filter_buffer_.size() +
379+
sizeof(uint64_t) * filter_offsets_.size() + gt_buffer_.size() +
380+
sizeof(uint64_t) * gt_offsets_.size() +
381+
sizeof(int32_t) * count_buffer_.size();
382+
}
383+
374384
void AlleleCount::process(
375385
const bcf_hdr_t* hdr,
376386
const std::string& sample_name,
@@ -525,13 +535,13 @@ void AlleleCount::update_results() {
525535
std::tuple<size_t, size_t, size_t, size_t, size_t>
526536
AlleleCountReader::allele_count_buffer_sizes() {
527537
size_t ref = 0, alt = 0, filter = 0, gt = 0;
528-
for (std::pair<AlleleCountKey, int32_t> ac : AlleleCountGroupBy) {
538+
for (std::pair<AlleleCountKey, int32_t> ac : alleleCountGroupBy) {
529539
ref += ac.first.ref.size();
530540
alt += ac.first.alt.size();
531541
filter += ac.first.filter.size();
532542
gt += ac.first.gt.size();
533543
}
534-
return {AlleleCountGroupBy.size(), ref, alt, filter, gt};
544+
return {alleleCountGroupBy.size(), ref, alt, filter, gt};
535545
}
536546

537547
void AlleleCountReader::prepare_allele_count(Region region) {
@@ -554,7 +564,7 @@ void AlleleCountReader::prepare_allele_count(Region region) {
554564
// TODO: add function to generate key by concatenating pos, ref, alt,
555565
// filter, gt
556566
AlleleCountKey key(pos, ref, alt, filter, gt);
557-
AlleleCountGroupBy[key] += mq.data<uint32_t>("count")[i];
567+
alleleCountGroupBy[key] += mq.data<uint32_t>("count")[i];
558568
}
559569
}
560570
}
@@ -581,7 +591,7 @@ void AlleleCountReader::read_from_allele_count(
581591
filter_offsets[0] = 0;
582592
gt_offsets[0] = 0;
583593
size_t i = 0;
584-
for (std::pair<AlleleCountKey, int32_t> ac : AlleleCountGroupBy) {
594+
for (std::pair<AlleleCountKey, int32_t> ac : alleleCountGroupBy) {
585595
pos[i] = ac.first.pos;
586596
std::strncpy(
587597
ref + ref_offsets[i], ac.first.ref.data(), ac.first.ref.size());

libtiledbvcf/src/stats/allele_count.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@
3030
#include <atomic>
3131
#include <map>
3232
#include <mutex>
33+
#include <set>
3334
#include <string>
35+
#include <unordered_set>
3436
#include <vector>
3537

3638
#include <htslib/vcf.h>
@@ -187,6 +189,11 @@ class AlleleCount {
187189
*/
188190
void flush(bool finalize = false);
189191

192+
/**
193+
* Returns the sum of sizes of all buffers (in bytes).
194+
*/
195+
size_t total_size() const;
196+
190197
private:
191198
// give the AlleleCountReader access to AlleleCount's private members
192199
friend class AlleleCountReader;
@@ -252,7 +259,7 @@ class AlleleCount {
252259
int count_delta_ = 1;
253260

254261
// Set of sample names in this query (per thread)
255-
std::set<std::string> sample_names_;
262+
std::unordered_set<std::string> sample_names_;
256263

257264
// Counts grouped by "key" at the current locus.
258265
// Use map to keep dimension keys sorted and maintain global order.
@@ -395,7 +402,7 @@ class AlleleCountReader {
395402
AlleleCountReader(std::shared_ptr<Context> ctx, const Group& group);
396403

397404
private:
398-
std::map<AlleleCountKey, int32_t> AlleleCountGroupBy;
405+
std::map<AlleleCountKey, int32_t> alleleCountGroupBy;
399406
std::shared_ptr<Array> array_;
400407
};
401408

libtiledbvcf/src/stats/sample_stats.cc

Lines changed: 73 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
* THE SOFTWARE.
2525
*/
2626

27+
#include <ranges>
28+
2729
#include <htslib/vcf.h>
2830
#include <htslib/vcfutils.h>
2931

@@ -206,31 +208,35 @@ void SampleStats::process(
206208
const std::string& contig,
207209
uint32_t pos,
208210
bcf1_t* rec) {
211+
Stats& stats = stats_[sample];
212+
209213
if (bcf_get_format_int32(hdr, rec, "DP", &dst_, &ndst_) > 0) {
210214
if (dst_[0] != bcf_int32_missing) {
211-
auto dp = static_cast<uint64_t>(dst_[0]);
212-
stats_[sample]["dp_sum"] += dp;
213-
stats_[sample]["dp_sum2"] += dp * dp;
214-
stats_[sample]["dp_count"] += 1;
215-
if (!stats_[sample].contains("dp_min")) {
216-
stats_[sample]["dp_min"] = std::numeric_limits<uint64_t>::max();
215+
std::unique_ptr<DPStats>& dp_stats = stats.dp_stats;
216+
if (dp_stats == nullptr) {
217+
dp_stats.reset(new DPStats());
217218
}
218-
stats_[sample]["dp_min"] = std::min(stats_[sample]["dp_min"], dp);
219-
stats_[sample]["dp_max"] = std::max(stats_[sample]["dp_max"], dp);
219+
auto dp = static_cast<uint64_t>(dst_[0]);
220+
dp_stats->dp_sum += dp;
221+
dp_stats->dp_sum2 += dp * dp;
222+
dp_stats->dp_count += 1;
223+
dp_stats->dp_min = std::min(dp_stats->dp_min, dp);
224+
dp_stats->dp_max = std::max(dp_stats->dp_max, dp);
220225
}
221226
}
222227

223228
if (bcf_get_format_int32(hdr, rec, "GQ", &dst_, &ndst_) > 0) {
224229
if (dst_[0] != bcf_int32_missing) {
225-
auto gq = static_cast<uint64_t>(dst_[0]);
226-
stats_[sample]["gq_sum"] += gq;
227-
stats_[sample]["gq_sum2"] += gq * gq;
228-
stats_[sample]["gq_count"] += 1;
229-
if (!stats_[sample].contains("gq_min")) {
230-
stats_[sample]["gq_min"] = std::numeric_limits<uint64_t>::max();
230+
std::unique_ptr<GQStats>& gq_stats = stats.gq_stats;
231+
if (gq_stats == nullptr) {
232+
gq_stats.reset(new GQStats());
231233
}
232-
stats_[sample]["gq_min"] = std::min(stats_[sample]["gq_min"], gq);
233-
stats_[sample]["gq_max"] = std::max(stats_[sample]["gq_max"], gq);
234+
auto gq = static_cast<uint64_t>(dst_[0]);
235+
gq_stats->gq_sum += gq;
236+
gq_stats->gq_sum2 += gq * gq;
237+
gq_stats->gq_count += 1;
238+
gq_stats->gq_min = std::min(gq_stats->gq_min, gq);
239+
gq_stats->gq_max = std::max(gq_stats->gq_max, gq);
234240
}
235241
}
236242

@@ -307,19 +313,19 @@ void SampleStats::process(
307313
n_singleton += count == 1;
308314
}
309315

310-
stats_[sample]["n_records"] += 1;
311-
stats_[sample]["n_called"] += !is_missing;
312-
stats_[sample]["n_not_called"] += is_missing;
313-
stats_[sample]["n_hom_ref"] += is_hom_ref;
314-
stats_[sample]["n_het"] += is_het;
315-
stats_[sample]["n_singleton"] += n_singleton;
316-
stats_[sample]["n_snp"] += n_ti + n_tv;
317-
stats_[sample]["n_transition"] += n_ti;
318-
stats_[sample]["n_transversion"] += n_tv;
319-
stats_[sample]["n_insertion"] += n_ins;
320-
stats_[sample]["n_deletion"] += n_del;
321-
stats_[sample]["n_star"] += n_star;
322-
stats_[sample]["n_multiallelic"] += is_multi;
316+
stats.n_records += 1;
317+
stats.n_called += !is_missing;
318+
stats.n_not_called += is_missing;
319+
stats.n_hom_ref += is_hom_ref;
320+
stats.n_het += is_het;
321+
stats.n_singleton += n_singleton;
322+
stats.n_snp += n_ti + n_tv;
323+
stats.n_transition += n_ti;
324+
stats.n_transversion += n_tv;
325+
stats.n_insertion += n_ins;
326+
stats.n_deletion += n_del;
327+
stats.n_star += n_star;
328+
stats.n_multiallelic += is_multi;
323329
}
324330

325331
void SampleStats::flush(bool finalize) {
@@ -351,30 +357,54 @@ void SampleStats::flush(bool finalize) {
351357
add_buffer_(name);
352358
}
353359

354-
bool found_dp = false;
355-
bool found_gq = false;
356-
357-
for (const auto& [sample, stat] : stats_) {
360+
// Sort the stats keys and iterate them in order
361+
auto keys_view = stats_ | std::views::keys;
362+
std::vector<std::string> samples(keys_view.begin(), keys_view.end());
363+
std::sort(samples.begin(), samples.end());
364+
for (const std::string& sample : samples) {
365+
Stats& stats = stats_[sample];
358366
buffers["sample"]->push_back(sample);
359367

360368
// Add stats to buffers
361-
for (const auto& [field, value] : stat) {
362-
buffers[field]->push_back(value);
363-
found_dp |= field == "dp_sum";
364-
found_gq |= field == "gq_sum";
365-
}
366-
367-
// Add nulls for missing DP fields
368-
if (!found_dp) {
369+
buffers["n_records"]->push_back(stats.n_records);
370+
buffers["n_called"]->push_back(stats.n_called);
371+
buffers["n_not_called"]->push_back(stats.n_not_called);
372+
buffers["n_hom_ref"]->push_back(stats.n_hom_ref);
373+
buffers["n_het"]->push_back(stats.n_het);
374+
buffers["n_singleton"]->push_back(stats.n_singleton);
375+
buffers["n_snp"]->push_back(stats.n_snp);
376+
buffers["n_transition"]->push_back(stats.n_transition);
377+
buffers["n_transversion"]->push_back(stats.n_transversion);
378+
buffers["n_insertion"]->push_back(stats.n_insertion);
379+
buffers["n_deletion"]->push_back(stats.n_deletion);
380+
buffers["n_star"]->push_back(stats.n_star);
381+
buffers["n_multiallelic"]->push_back(stats.n_multiallelic);
382+
383+
// Add DP fields
384+
const std::unique_ptr<DPStats>& dp_stats = stats.dp_stats;
385+
if (dp_stats != nullptr) {
386+
buffers["dp_sum"]->push_back(dp_stats->dp_sum);
387+
buffers["dp_sum2"]->push_back(dp_stats->dp_sum2);
388+
buffers["dp_count"]->push_back(dp_stats->dp_count);
389+
buffers["dp_min"]->push_back(dp_stats->dp_min);
390+
buffers["dp_max"]->push_back(dp_stats->dp_max);
391+
} else {
369392
buffers["dp_sum"]->push_null();
370393
buffers["dp_sum2"]->push_null();
371394
buffers["dp_count"]->push_null();
372395
buffers["dp_min"]->push_null();
373396
buffers["dp_max"]->push_null();
374397
}
375398

376-
// Add nulls for missing GQ fields
377-
if (!found_gq) {
399+
// Add GQ fields
400+
const std::unique_ptr<GQStats>& gq_stats = stats.gq_stats;
401+
if (gq_stats != nullptr) {
402+
buffers["gq_sum"]->push_back(gq_stats->gq_sum);
403+
buffers["gq_sum2"]->push_back(gq_stats->gq_sum2);
404+
buffers["gq_count"]->push_back(gq_stats->gq_count);
405+
buffers["gq_min"]->push_back(gq_stats->gq_min);
406+
buffers["gq_max"]->push_back(gq_stats->gq_max);
407+
} else {
378408
buffers["gq_sum"]->push_null();
379409
buffers["gq_sum2"]->push_null();
380410
buffers["gq_count"]->push_null();

libtiledbvcf/src/stats/sample_stats.h

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,40 @@ class SampleStats {
187187
void flush(bool finalize = false);
188188

189189
private:
190+
struct DPStats {
191+
uint64_t dp_sum = 0;
192+
uint64_t dp_sum2 = 0;
193+
uint64_t dp_count = 0;
194+
uint64_t dp_min = std::numeric_limits<uint64_t>::max();
195+
uint64_t dp_max = 0;
196+
};
197+
198+
struct GQStats {
199+
uint64_t gq_sum = 0;
200+
uint64_t gq_sum2 = 0;
201+
uint64_t gq_count = 0;
202+
uint64_t gq_min = std::numeric_limits<uint64_t>::max();
203+
uint64_t gq_max = 0;
204+
};
205+
206+
struct Stats {
207+
uint64_t n_records = 0;
208+
uint64_t n_called = 0;
209+
uint64_t n_not_called = 0;
210+
uint64_t n_hom_ref = 0;
211+
uint64_t n_het = 0;
212+
uint64_t n_singleton = 0;
213+
uint64_t n_snp = 0;
214+
uint64_t n_transition = 0;
215+
uint64_t n_transversion = 0;
216+
uint64_t n_insertion = 0;
217+
uint64_t n_deletion = 0;
218+
uint64_t n_star = 0;
219+
uint64_t n_multiallelic = 0;
220+
std::unique_ptr<DPStats> dp_stats = nullptr;
221+
std::unique_ptr<GQStats> gq_stats = nullptr;
222+
};
223+
190224
// Array URI basename
191225
inline static const std::string SAMPLE_STATS_ARRAY = "sample_stats";
192226

@@ -208,8 +242,8 @@ class SampleStats {
208242
// Current contig
209243
std::string contig_;
210244

211-
// Aggregate stats for each sample. map: sample -> (map: field -> count)
212-
std::map<std::string, std::unordered_map<std::string, uint64_t>> stats_;
245+
// Aggregate stats for each sample.
246+
std::unordered_map<std::string, Stats> stats_;
213247

214248
// Get the URI for the array from the root group
215249
static std::string group_uri(const Group& group) {

0 commit comments

Comments
 (0)