Skip to content

Commit e8b411e

Browse files
committed
Merge branch 'ar/burn17' into 'master'
[v0.5.0] Adds open chromatin, refactor to workspaces See merge request machine-learning/modkit!283
2 parents a6fda78 + 6aea84f commit e8b411e

114 files changed

Lines changed: 6011 additions & 250 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,6 @@ scripts
2020
scripts/*
2121
*.bak
2222
artifacts
23-
artifacts/
23+
artifacts/
24+
venv
25+
venv/

.gitlab-ci.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ stages:
6464
6565
# build images
6666
.rust_image: &rust_image
67-
# 1.84.1 (https://hub.docker.com/layers/library/rust/1.84.1-bookworm/images/sha256-9397ddee68c0d42dbef8d0b8a2ad2b7889928ea9ecc87499a53ea7370a9502b1)
68-
image: rust@sha256:738ae99a3d75623f41e6882566b4ef37e38a9840244a47efd4a0ca22e9628b88
67+
# 1.86.0 (https://hub.docker.com/layers/library/rust/1.86.0-bookworm/images/sha256-afcf03f9dc98b318f8c5e98d9b9699475fe1dafd1c53b76b541bd96749d6b4c3)
68+
image: rust@sha256:300ec56abce8cc9448ddea2172747d048ed902a3090e6b57babb2bf19f754081
6969
7070
.ubuntu_1604_image: &ub16_image
7171
image: ubuntu@sha256:1f1a2d56de1d604801a9671f301190704c25d604a416f59e03c04f5c6ffee0d6
@@ -142,6 +142,7 @@ manual_build_modkit_linux:
142142
cp LICENCE.txt ${dist_dir}
143143
cp target/release/modkit ${dist_dir}
144144
cp -r docs ${dist_dir}
145+
cp -r ochm/models ${dist_dir}
145146
echo "contents if distribution $(ls ${dist_dir})"
146147
tar -cvzf modkit_u16_x86_64.tar.gz ${dist_dir}
147148
ls modkit_u16_x86_64.tar.gz

Cargo.toml

Lines changed: 3 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,3 @@
1-
[package]
2-
name = "mod_kit"
3-
version = "0.4.5"
4-
edition = "2021"
5-
6-
[[bin]]
7-
name = "modkit"
8-
path = "src/bin/main.rs"
9-
10-
[lib]
11-
path = "src/lib.rs"
12-
13-
[dependencies]
14-
ansi_term = "0.12.1"
15-
anyhow = "1.0.68"
16-
bigtools = "0.5.4"
17-
bio = "1.0.0"
18-
bitvec = "1.0.1"
19-
charming = "0.3.1"
20-
clap = { version = "4.0.29", features = ["derive", "wrap_help"] }
21-
common_macros = "0.1.1"
22-
crossbeam = "0.8.2"
23-
crossbeam-channel = "0.5.6"
24-
csv = "1.3.0"
25-
derive-new = "0.6.0"
26-
gzp = { version = "0.11.3", default-features = false, features = ["deflate_rust"] }
27-
humantime = "2.1.0"
28-
indexmap = "2.2.6"
29-
indicatif = { version = "0.17.1", features = ["rayon"] }
30-
itertools = "0.12.1"
31-
lazy_static = "1.4"
32-
linear-map = "1.2.0"
33-
log = "0.4.0"
34-
log-once = "0.4.0"
35-
log4rs = { version = "1.2.0", features = ["file_appender", "json_encoder"] }
36-
memchr = "2.6.4"
37-
ndarray = "0.15.6"
38-
nom = "7.1.3"
39-
num = "0.4.3"
40-
num-traits = "0.2.19"
41-
prettytable-rs = "0.10.0"
42-
pulp = "0.18.10"
43-
rand = "0.8.5"
44-
random_color = "1.0.0"
45-
rayon = "1.8.0"
46-
regex = "1.4"
47-
rust-htslib = "0.46.0"
48-
rust-lapper = "1.1.0"
49-
rustc-hash = "1.1.0"
50-
rv = "=0.16.0"
51-
statrs = "0.16.0"
52-
substring = "1.4.5"
53-
thiserror = "2.0.11"
54-
tokio = "1.42.0"
55-
tracing = "0.1.41"
56-
tracing-appender = "0.2.3"
57-
tracing-subscriber = { version = "0.3.19", features = ["json"] }
58-
59-
[dev-dependencies]
60-
assert_approx_eq = "1.1.0"
61-
similar-asserts = "1.4.2"
62-
tempfile = "3.2"
63-
serde = {version = "1.0.219", features = ["derive"]}
64-
1+
[workspace]
2+
resolver = "2"
3+
members = ["modkit", "modkit-core", "modkit-logging", "ochm"]

book/src/SUMMARY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
- [Narrow output to specific positions](./intro_include_bed.md)
2424
- [Manipulate bedMethyl files](./intro_bedmethyl_merge.md)
2525
- [Check modified base tags](./intro_modbam_check_tags.md)
26+
- [Find regions of accessible chromatin](./intro_open_chromatin.md)
2627
- [Extended subcommand help](./advanced_usage.md)
2728
- [Troubleshooting](./troubleshooting.md)
2829
- [Frequently asked questions](./faq.md)
83.2 KB
Loading
69.3 KB
Loading

book/src/intro_open_chromatin.md

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# Find regions of accessible chromatin
2+
3+
Nanopore sequencing can detect multiple base modifications simultaneously and we can leverage this capability by introducing exogenous base modifications at specific functional regions.
4+
One such method uses a 6mA methyltransferase such as EcoGII or Hia5 to label accessible regions of chromatinized DNA, usually by treatment of cell nuclei with the enzyme.
5+
6+
![Chromatin Accessibility treatment](./images/chromatin_stenciling_3.png "Cartoon schematic showing exogenous 6mA introduced to regions of accessible chromatin")
7+
8+
## Predict regions of open chromatin
9+
Modkit comes with a machine learning model that has been trained to identify regions of open chromatin based on 6mA signal.
10+
You can invoke this model with the following command:
11+
12+
```bash
13+
$ modkit open-chromatin predict ${mod_bam} \
14+
--model ${model} \
15+
--log modkit_predict.log \
16+
-o ./accessible_regions.bedgraph \
17+
--device 0
18+
```
19+
20+
Where `${model}` is the path to the directory with the model, for example `dist_modkit_v0.5.0_38fda16/models/r1041_e82_400bps_hac_v5.2.0@v0.1.0`.
21+
The output of the command is a bedGraph file with the following schema:
22+
23+
| column | name | description | type |
24+
|--------|------|-------------|------|
25+
| 1 | chrom | contig or scaffold | string |
26+
| 2 | start | start of the region | int |
27+
| 3 | end | end of the region | int |
28+
| 4 | probability | accessibility probability, (0.0, 1.0) | float |
29+
30+
## Configuring `--step-size`
31+
32+
The open chromatin model in Modkit works by making predictions on 100 base pair windows of the genome.
33+
The output of the model is a prediction _probability_ between 0.0 and 1.0, non-inclusive.
34+
To make predictions on the whole genome (or regions when the `--include-bed` option is provided) Modkit applies the model to overlapping windows.
35+
The `--step-size` determines how much to advance before making another prediction:
36+
37+
```text
38+
Genome <--//------------------//->
39+
Window 1 |---w----|
40+
Window 2 |_s_|---w----|
41+
```
42+
In the above `w` is the window size (100bp with the current model) and `s` is the step size which can be configured via the `--step-size` parameter.
43+
The smaller the step size, the finer-grained the resolution of the output, at the cost of more computation.
44+
This can be seen in the following example browser image:
45+
46+
![Step size examples](./images/open_chromatin_step_size.png "Decreasing step size increases resolution")
47+
48+
Step size of 25 base pairs is the default.
49+
50+
## Using `--threshold` to get regions of high confidence
51+
52+
As seen above, the output from `open-chromatin predict` is a stream of probabilities over the genome or desired region of the genome.
53+
A lot of these predictions are going to be very close to zero.
54+
You can remove low-probability regions with the `--threshold` option so that only intervals with a probability greater than or equal to this value are reported.
55+
Using this option in combination with `bedtools merge` can transform the bedGraph file into a BED of predicted open chromatin regions.
56+
57+
```bash
58+
$ modkit open-chromatin predict ${bam} \
59+
--model ${model} \
60+
--threshold 0.8 \
61+
--include-bed promoters_slop2000.bed \
62+
-o stdout \
63+
| bedtools merge -i - > accessible_regions.bed
64+
```
65+
66+
## Running with a GPU
67+
68+
Modkit comes distributed with the capability to run open chromatin prediction on a normal CPU or on NVIDIA GPUs, and can be built to run on Apple GPUs.
69+
The normal distribution only has the capability to run on CPU hardware.
70+
At the current stage of development this configuration is probably too slow for most practical purposes outside of small region checks.
71+
The `candle` distribution can be downloaded and used with NVIDIA GPUs directly, however we cannot guarantee that it will work on _every_ GPU setup.
72+
For the best performance on GPU _or_ CPU, the `tch` (pytorch) distribution is recommended, however it requires that you download libtorch from the internet.
73+
Instructions can be found in the `BUILD_NOTES_*.txt` packaged with the software.
74+
75+
## Troubleshooting: Checking your input data
76+
77+
To predict regions of open chromatin, the input modBAM should have 6mA base modification calls.
78+
You can quickly check that you have valid 6mA calls using the [`check-tags` command](./intro_modbam_check_tags.md):
79+
80+
```bash
81+
$ modkit modbam check-tags ${bam} --head 1000
82+
```
83+
84+
This command will return non-zero when you have invalid MM/ML [SAMTags](https://samtools.github.io/hts-specs/SAMtags.pdf).
85+
You should expect to see an output similar to this:
86+
87+
```text
88+
> checking tags on first 1000 reads
89+
> no errors
90+
> num PASS records: 1000 (100.00%)
91+
> num records: 1000
92+
> valid record tag headers:
93+
+------------+-------+
94+
| tag_header | count |
95+
+------------+-------+
96+
| A+a. | 1000 |
97+
+------------+-------+
98+
99+
100+
> modified bases:
101+
+--------+--------------+----------+------+
102+
| strand | primary_base | mod_code | mode |
103+
+--------+--------------+----------+------+
104+
| + | A | a | . |
105+
+--------+--------------+----------+------+
106+
```
107+
108+
109+

book/src/quick_start.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ cargo install --git https://github.com/nanoporetech/modkit.git
3232
11. [Performing differential methylation scoring with `dmr`](./intro_dmr.md)
3333
12. [Checking modified base tags in a modBAM](./intro_modbam_check_tags.md)
3434
13. [Convert bedMethyl files to bigWig for visualization](./intro_bedmethyl_merge.md#convert-bedmethyl-to-bigwig)
35+
14. [Predict regions of open chromatin on MTase-treated DNA](./intro_open_chromatin.md)
3536

3637
## Notes and troubleshooting
3738
1. [General troubleshooting](./troubleshooting.md)

modkit-core/Cargo.toml

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
[package]
2+
name = "mod_kit"
3+
version = "0.4.5"
4+
edition = "2021"
5+
6+
#[[bin]]
7+
#name = "modkit"
8+
#path = "src/bin/main.rs"
9+
10+
[lib]
11+
path = "src/lib.rs"
12+
13+
[dependencies]
14+
ansi_term = "0.12.1"
15+
anyhow = "1.0.68"
16+
bigtools = "0.5.4"
17+
bio = "1.0.0"
18+
bitvec = "1.0.1"
19+
charming = "0.3.1"
20+
clap = { version = "4.0.29", features = ["derive", "wrap_help"] }
21+
common_macros = "0.1.1"
22+
crossbeam = "0.8.2"
23+
crossbeam-channel = "0.5.6"
24+
csv = "1.3.0"
25+
derive-new = "0.6.0"
26+
gzp = { version = "0.11.3", default-features = false, features = ["deflate_rust"] }
27+
humantime = "2.1.0"
28+
indexmap = "2.2.6"
29+
indicatif = { version = "0.17.1", features = ["rayon"] }
30+
itertools = "0.12.1"
31+
lazy_static = "1.4"
32+
linear-map = "1.2.0"
33+
log = "0.4.0"
34+
log-once = "0.4.0"
35+
memchr = "2.6.4"
36+
ndarray = "0.15.6"
37+
nom = "7.1.3"
38+
num = "0.4.3"
39+
num-traits = "0.2.19"
40+
prettytable-rs = "0.10.0"
41+
pulp = "0.18.10"
42+
rand = "0.8.5"
43+
random_color = "1.0.0"
44+
rayon = "1.8.0"
45+
regex = "1.4"
46+
rust-htslib = "0.46.0"
47+
rust-lapper = "1.1.0"
48+
rustc-hash = "1.1.0"
49+
rv = "=0.16.0"
50+
statrs = "0.16.0"
51+
substring = "1.4.5"
52+
thiserror = "2.0.11"
53+
tokio = "1.42.0"
54+
tracing = "0.1.41"
55+
modkit-logging = { path = "../modkit-logging" }
56+
57+
[dev-dependencies]
58+
assert_approx_eq = "1.1.0"
59+
similar-asserts = "1.4.2"
60+
tempfile = "3.2"
61+
serde = {version = "1.0.219", features = ["derive"]}
62+

src/adjust.rs renamed to modkit-core/src/adjust.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,10 @@ use crate::threshold_mod_caller::MultipleThresholdModCaller;
1919
use crate::util::{format_errors_table, get_query_name_string, get_ticker};
2020

2121
#[derive(new)]
22-
pub(crate) struct OverlappingRegexOffset(OverlappingRegex, usize);
22+
pub struct OverlappingRegexOffset(OverlappingRegex, usize);
2323

2424
impl OverlappingRegexOffset {
25-
pub(crate) fn as_str(&self) -> &str {
25+
pub fn as_str(&self) -> &str {
2626
self.0.as_str()
2727
}
2828

@@ -208,7 +208,7 @@ fn adjust_mod_probs<'a>(
208208
Ok(record)
209209
}
210210

211-
pub(crate) fn adjust_modbam(
211+
pub fn adjust_modbam(
212212
reader: &mut bam::Reader,
213213
writer: &mut bam::Writer,
214214
collapse_methods: &[CollapseMethod],
@@ -327,7 +327,7 @@ mod adjust_tests {
327327

328328
#[test]
329329
fn test_motif_filtering() {
330-
let bam_fp = "tests/resources/testing_all_context_calls.bam";
330+
let bam_fp = "../tests/resources/testing_all_context_calls.bam";
331331
let mut reader = bam::Reader::from_path(bam_fp).unwrap();
332332

333333
let regex_motifs = vec![
@@ -397,7 +397,7 @@ mod adjust_tests {
397397
}
398398
assert!(tested);
399399

400-
let bam_fp = "tests/resources/testing_all_context_calls.bam";
400+
let bam_fp = "../tests/resources/testing_all_context_calls.bam";
401401
let mut reader = bam::Reader::from_path(bam_fp).unwrap();
402402
let iter = reader
403403
.records()

0 commit comments

Comments
 (0)