diff --git a/README.md b/README.md index 7928e5eaf..be877c3d5 100644 --- a/README.md +++ b/README.md @@ -692,6 +692,14 @@ The earth sciences folder contain subfolders for different data formats encounte - 1000GP.chr*.chunks.txt: chunks of the chromosome obtain with GLIMPSE_chunk - AFR.gwas: Study locus file. From [SuShiE](https://github.com/mancusolab/sushie). - AFR.ld: LD matrix file. From [SuShiE](https://github.com/mancusolab/sushie). + - hdl/reference/chr1.1_toy.bim: Synthetic toy HDL-format BIM sidecar for chunk 1.1, generated by `hdl/generate_toy_hdl_data.R` for HDL-compatible inputs. + - hdl/reference/chr1.1_toy.rda: Synthetic toy HDL-format LD reference payload for chunk 1.1, generated by `hdl/generate_toy_hdl_data.R` for HDL-compatible inputs. + - hdl/reference/chr1.2_toy.bim: Synthetic toy HDL-format BIM sidecar for chunk 1.2, generated by `hdl/generate_toy_hdl_data.R` for HDL-compatible inputs. + - hdl/reference/chr1.2_toy.rda: Synthetic toy HDL-format LD reference payload for chunk 1.2, generated by `hdl/generate_toy_hdl_data.R` for HDL-compatible inputs. + - hdl/reference/toy_snp_counter.RData: Synthetic toy HDL-format SNP count metadata, generated by `hdl/generate_toy_hdl_data.R` for HDL-compatible inputs. + - hdl/reference/toy_snp_list.RData: Synthetic toy HDL-format SNP list metadata, generated by `hdl/generate_toy_hdl_data.R` for HDL-compatible inputs. + - sumstats/trait1_canonical.tsv: Synthetic canonical toy summary statistics for trait 1, generated by `hdl/generate_toy_hdl_data.R` for small GWAS-style module inputs. + - sumstats/trait2_canonical.tsv: Synthetic canonical toy summary statistics for trait 2, generated by `hdl/generate_toy_hdl_data.R` for small GWAS-style module inputs. 
- svsig: - NA03697B2_new.pbmm2.repeats.svsig.gz: structural variant file for NA03697B2_new.pbmm2.repeats.bam, created with PBSV discover version (2.9.0 default settings) diff --git a/data/genomics/homo_sapiens/popgen/hdl/README.md b/data/genomics/homo_sapiens/popgen/hdl/README.md new file mode 100644 index 000000000..f82369fe7 --- /dev/null +++ b/data/genomics/homo_sapiens/popgen/hdl/README.md @@ -0,0 +1,41 @@ +# HDL Toy Test Dataset + +These files are synthetic toy fixtures for HDL module testing in the companion +`nf-core/modules` work for `nf-core/modules#10912`. They are intended to exercise +[HDL](https://github.com/zhenin/HDL) inputs in tests, not to provide a +scientific LD reference panel or a redistributed upstream reference bundle. + +## Layout + +- `reference/`: toy HDL LD reference chunks and metadata sidecars +- `../sumstats/`: canonical toy summary-statistics tables aligned to the toy SNPs + +## Regeneration + +From this directory: + +```bash +Rscript generate_toy_hdl_data.R +``` + +From the root of the `nf-core/test-datasets` worktree: + +```bash +Rscript data/genomics/homo_sapiens/popgen/hdl/generate_toy_hdl_data.R +``` + +## R Objects + +The `.bim` sidecars, both canonical `../sumstats/*.tsv` files, and the R binary +payloads are all generated locally by `generate_toy_hdl_data.R` from fully +synthetic constants hard-coded in that script. + +- `reference/chr1.1_toy.rda` and `reference/chr1.2_toy.rda` each contain + synthetic `LDsc`, `lam`, and `V` objects for one toy HDL chunk. +- `reference/toy_snp_counter.RData` contains `nsnps.list` and + `nsnps.list.imputed`, each as a named one-element list with the toy chunk SNP + counts. +- `reference/toy_snp_list.RData` contains `snps.list.imputed.vector`, the four + synthetic SNP IDs shared by the toy fixtures. +- `../sumstats/trait1_canonical.tsv` and `../sumstats/trait2_canonical.tsv` are + tiny canonical summary-statistics tables keyed to those synthetic SNP IDs. 
diff --git a/data/genomics/homo_sapiens/popgen/hdl/generate_toy_hdl_data.R b/data/genomics/homo_sapiens/popgen/hdl/generate_toy_hdl_data.R
new file mode 100644
index 000000000..799271dfe
--- /dev/null
+++ b/data/genomics/homo_sapiens/popgen/hdl/generate_toy_hdl_data.R
@@ -0,0 +1,135 @@
+#!/usr/bin/env Rscript
+
+# Regenerate the synthetic HDL toy fixtures from fixed constants.
+#
+# Outputs are written relative to this script's own location, so it can be
+# run from any working directory:
+#   reference/    toy LD chunks (.rda), .bim sidecars, SNP metadata (.RData)
+#   ../sumstats/  canonical toy summary-statistics tables (.tsv)
+
+# Return the path of the running script.
+#
+# `args` is the full argument vector from `commandArgs(trailingOnly = FALSE)`.
+# Stops unless exactly one non-empty `--file=` argument is present.
+find_script_path <- function(args) {
+  file_arg <- "--file="
+  # Match the prefix literally at the start of each argument; a regex search
+  # would also accept arguments that merely contain "--file=".
+  found <- substring(args[startsWith(args, file_arg)], nchar(file_arg) + 1L)
+  # Spaces in the script path may be reported as "~+~"; decode them so a
+  # checkout under a directory containing spaces still resolves.
+  found <- gsub("~+~", " ", found, fixed = TRUE)
+  if (length(found) != 1L || !nzchar(found)) {
+    stop(
+      "Unable to determine the script path from commandArgs().",
+      call. = FALSE
+    )
+  }
+  found
+}
+
+# Write a six-column, space-separated .bim sidecar for the rows of `snps`.
+# The first column is always 1 and the third always 0 in these fixtures.
+write_bim <- function(snps, path) {
+  writeLines(paste(1, snps$id, 0, snps$pos, snps$a1, snps$a2), path)
+}
+
+# Save one toy LD chunk to `path`.
+#
+# save() stores objects under their variable names, so the parameters and the
+# local keep the names `LDsc`, `lam`, and `V` that the fixture files contain.
+save_ld_chunk <- function(LDsc, lam, path) {
+  V <- diag(length(lam))
+  save(LDsc, lam, V, file = path, compress = "gzip")
+}
+
+# Write one canonical summary-statistics table for `snps` to `path`.
+# `n` is the sample size used for every row; `z` holds one Z value per SNP.
+write_sumstats <- function(snps, n, z, path) {
+  stopifnot(length(z) == nrow(snps))
+  trait <- data.frame(
+    SNP = snps$id,
+    A1 = snps$a1,
+    A2 = snps$a2,
+    CHR = 1,
+    POS = snps$pos,
+    RSID = snps$id,
+    EffectAllele = snps$a1,
+    OtherAllele = snps$a2,
+    N = n,
+    Z = z
+  )
+  write.table(trait, path, sep = "\t", quote = FALSE, row.names = FALSE)
+}
+
+script_path <- find_script_path(commandArgs(trailingOnly = FALSE))
+# mustWork = TRUE: stop here rather than write fixtures to a bogus location.
+script_dir <- dirname(normalizePath(script_path, mustWork = TRUE))
+reference_dir <- file.path(script_dir, "reference")
+sumstats_dir <- file.path(script_dir, "..", "sumstats")
+
+dir.create(reference_dir, recursive = TRUE, showWarnings = FALSE)
+dir.create(sumstats_dir, recursive = TRUE, showWarnings = FALSE)
+
+# Single definition of the four toy SNPs. Every fixture below is derived from
+# this table so the .bim sidecars, SNP metadata, and sumstats cannot drift.
+toy_snps <- data.frame(
+  chunk = c("1.1", "1.1", "1.2", "1.2"),
+  id = c("rs1", "rs2", "rs3", "rs4"),
+  pos = c(101, 102, 201, 202),
+  a1 = c("A", "C", "A", "G"),
+  a2 = c("G", "T", "C", "T"),
+  stringsAsFactors = FALSE
+)
+
+# Per-chunk LD payloads, keyed by chunk label and listed in chunk order.
+ld_chunks <- list(
+  "1.1" = list(LDsc = c(1.1, 1.4), lam = c(1.3, 0.85)),
+  "1.2" = list(LDsc = c(1.2, 1.35), lam = c(1.25, 0.9))
+)
+
+for (chunk in names(ld_chunks)) {
+  stem <- file.path(reference_dir, paste0("chr", chunk, "_toy"))
+  write_bim(toy_snps[toy_snps$chunk == chunk, ], paste0(stem, ".bim"))
+  save_ld_chunk(
+    ld_chunks[[chunk]]$LDsc,
+    ld_chunks[[chunk]]$lam,
+    paste0(stem, ".rda")
+  )
+}
+
+# SNP count per chunk as an unnamed double vector, i.e. c(2, 2) here.
+chunk_sizes <- vapply(
+  names(ld_chunks),
+  function(chunk) sum(toy_snps$chunk == chunk),
+  numeric(1),
+  USE.NAMES = FALSE
+)
+nsnps.list <- list("1" = chunk_sizes)
+nsnps.list.imputed <- list("1" = chunk_sizes)
+save(
+  nsnps.list.imputed,
+  nsnps.list,
+  file = file.path(reference_dir, "toy_snp_counter.RData"),
+  compress = "gzip"
+)
+
+snps.list.imputed.vector <- toy_snps$id
+save(
+  snps.list.imputed.vector,
+  file = file.path(reference_dir, "toy_snp_list.RData"),
+  compress = "gzip"
+)
+
+write_sumstats(
+  toy_snps,
+  n = 10000,
+  z = c(0.5, -0.2, 0.4, -0.1),
+  path = file.path(sumstats_dir, "trait1_canonical.tsv")
+)
+write_sumstats(
+  toy_snps,
+  n = 12000,
+  z = c(0.3, -0.4, 0.2, -0.2),
+  path = file.path(sumstats_dir, "trait2_canonical.tsv")
+)
diff --git a/data/genomics/homo_sapiens/popgen/hdl/reference/chr1.1_toy.bim b/data/genomics/homo_sapiens/popgen/hdl/reference/chr1.1_toy.bim
new file mode 100644
index 000000000..8535c8640
--- /dev/null
+++ b/data/genomics/homo_sapiens/popgen/hdl/reference/chr1.1_toy.bim
@@ -0,0 +1,2 @@
+1 rs1 0 101 A G
+1 rs2 0 102 C T
diff --git a/data/genomics/homo_sapiens/popgen/hdl/reference/chr1.1_toy.rda b/data/genomics/homo_sapiens/popgen/hdl/reference/chr1.1_toy.rda
new file mode 100644
index 000000000..219f1b159
Binary files /dev/null and b/data/genomics/homo_sapiens/popgen/hdl/reference/chr1.1_toy.rda differ
diff --git a/data/genomics/homo_sapiens/popgen/hdl/reference/chr1.2_toy.bim b/data/genomics/homo_sapiens/popgen/hdl/reference/chr1.2_toy.bim
new file mode 100644
index 000000000..c001831a0
--- /dev/null
+++ b/data/genomics/homo_sapiens/popgen/hdl/reference/chr1.2_toy.bim
@@ -0,0 +1,2 @@
+1 rs3 0 201 A C
+1 rs4 0 202 G T
diff --git a/data/genomics/homo_sapiens/popgen/hdl/reference/chr1.2_toy.rda b/data/genomics/homo_sapiens/popgen/hdl/reference/chr1.2_toy.rda
new file mode 100644
index 000000000..109808183
Binary files /dev/null and
b/data/genomics/homo_sapiens/popgen/hdl/reference/chr1.2_toy.rda differ diff --git a/data/genomics/homo_sapiens/popgen/hdl/reference/toy_snp_counter.RData b/data/genomics/homo_sapiens/popgen/hdl/reference/toy_snp_counter.RData new file mode 100644 index 000000000..2d9852327 Binary files /dev/null and b/data/genomics/homo_sapiens/popgen/hdl/reference/toy_snp_counter.RData differ diff --git a/data/genomics/homo_sapiens/popgen/hdl/reference/toy_snp_list.RData b/data/genomics/homo_sapiens/popgen/hdl/reference/toy_snp_list.RData new file mode 100644 index 000000000..9324b958c Binary files /dev/null and b/data/genomics/homo_sapiens/popgen/hdl/reference/toy_snp_list.RData differ diff --git a/data/genomics/homo_sapiens/popgen/sumstats/README.md b/data/genomics/homo_sapiens/popgen/sumstats/README.md new file mode 100644 index 000000000..48aa5f55e --- /dev/null +++ b/data/genomics/homo_sapiens/popgen/sumstats/README.md @@ -0,0 +1,31 @@ +# Toy Population-Genetics Summary Statistics + +These files are tiny synthetic GWAS-style summary-statistics tables intended for +module testing. They are generated from fixed constants by the companion HDL +fixture generator at `../hdl/generate_toy_hdl_data.R`. + +## Layout + +- `trait1_canonical.tsv`: synthetic canonical summary statistics for trait 1 +- `trait2_canonical.tsv`: synthetic canonical summary statistics for trait 2 + +## Regeneration + +From the `hdl/` directory: + +```bash +Rscript generate_toy_hdl_data.R +``` + +From the root of the `nf-core/test-datasets` worktree: + +```bash +Rscript data/genomics/homo_sapiens/popgen/hdl/generate_toy_hdl_data.R +``` + +## Notes + +These tables are not HDL-specific at the file-format level. They are kept under +`popgen/sumstats/` so they can be reused by modules that consume small +GWAS-style tabular inputs, while the HDL reference panel assets remain grouped +under `popgen/hdl/reference/`. 
diff --git a/data/genomics/homo_sapiens/popgen/sumstats/trait1_canonical.tsv b/data/genomics/homo_sapiens/popgen/sumstats/trait1_canonical.tsv new file mode 100644 index 000000000..2ac8d27cd --- /dev/null +++ b/data/genomics/homo_sapiens/popgen/sumstats/trait1_canonical.tsv @@ -0,0 +1,5 @@ +SNP A1 A2 CHR POS RSID EffectAllele OtherAllele N Z +rs1 A G 1 101 rs1 A G 10000 0.5 +rs2 C T 1 102 rs2 C T 10000 -0.2 +rs3 A C 1 201 rs3 A C 10000 0.4 +rs4 G T 1 202 rs4 G T 10000 -0.1 diff --git a/data/genomics/homo_sapiens/popgen/sumstats/trait2_canonical.tsv b/data/genomics/homo_sapiens/popgen/sumstats/trait2_canonical.tsv new file mode 100644 index 000000000..6d84622e4 --- /dev/null +++ b/data/genomics/homo_sapiens/popgen/sumstats/trait2_canonical.tsv @@ -0,0 +1,5 @@ +SNP A1 A2 CHR POS RSID EffectAllele OtherAllele N Z +rs1 A G 1 101 rs1 A G 12000 0.3 +rs2 C T 1 102 rs2 C T 12000 -0.4 +rs3 A C 1 201 rs3 A C 12000 0.2 +rs4 G T 1 202 rs4 G T 12000 -0.2