|
15 | 15 |
|
16 | 16 |
|
17 | 17 | # %% |
| 18 | +import argparse |
18 | 19 | import json |
19 | 20 | import sys |
20 | 21 | from pathlib import Path |
|
23 | 24 | from tqdm import tqdm |
24 | 25 |
|
25 | 26 |
|
26 | | -sys.path.append("/workspace/codon_fm") |
| 27 | +sys.path.append("/workspace/codonfm") |
27 | 28 | from src.tokenizer import Tokenizer |
28 | 29 |
|
29 | 30 |
|
30 | | -data_path = Path("/data/ncbi/processed_unfiltered") |
31 | | -tax_ids_to_remove = json.load(open("/data/ncbi/taxids_to_remove.json")) |
32 | | -metadata = json.load(open(data_path / "metadata.json")) |
33 | | -tokenizer = Tokenizer() |
34 | | - |
35 | | - |
36 | | -groups = set([x["file_name"][:-4] for x in metadata["file_metadata"]]) # noqa: C403 |
37 | | -counts = {g: np.zeros(tokenizer.vocab_size) for g in groups} |
38 | | -for fm, cm in tqdm(zip(metadata["file_metadata"], metadata["chunks"]), total=len(metadata["file_metadata"])): |
39 | | - group = fm["file_name"][:-4] |
40 | | - if group in tax_ids_to_remove: |
41 | | - curr_taxids_to_remove = set(tax_ids_to_remove[group]) |
42 | | - else: |
43 | | - curr_taxids_to_remove = set() |
44 | | - mmap = np.memmap( |
45 | | - data_path / cm["sequences"]["path"], |
46 | | - dtype=cm["sequences"]["dtype"], |
47 | | - mode="r", |
48 | | - shape=tuple(cm["sequences"]["shape"]), |
49 | | - ) |
50 | | - idx_mmap = np.memmap( |
51 | | - data_path / cm["index"]["path"], dtype=cm["index"]["dtype"], mode="r", shape=tuple(cm["index"]["shape"]) |
52 | | - ) |
53 | | - for start, end, taxid in idx_mmap: |
54 | | - if taxid in curr_taxids_to_remove: |
55 | | - continue |
56 | | - seq = mmap[start:end] |
57 | | - idx, count = np.unique(seq, return_counts=True) |
58 | | - counts[group][idx] += count |
def main(pretraining_processed_data_dir: Path, data_dir: Path) -> None:
    """Count token occurrences per group and write them to a JSON file.

    Loads ``metadata.json`` from ``pretraining_processed_data_dir`` and
    ``taxids_to_remove.json`` from ``data_dir``. For every
    (``file_metadata``, ``chunks``) pair in the metadata, the chunk's
    sequence and index arrays are memory-mapped, index rows whose taxid is
    listed for removal in that group are skipped, and the values found in
    each remaining sequence slice are tallied. The per-group totals are
    written to ``data_dir / "codon_counts_nopathogen.json"``.

    Args:
        pretraining_processed_data_dir: Directory holding ``metadata.json``
            and the memmap files named by its chunk entries.
        data_dir: Directory holding ``taxids_to_remove.json``; the output
            JSON file is also written here.
    """
    # Context managers guarantee the handles are closed (and the output is
    # flushed) even if parsing or serialization raises.
    with open(data_dir / "taxids_to_remove.json", encoding="utf-8") as fh:
        tax_ids_to_remove = json.load(fh)
    with open(pretraining_processed_data_dir / "metadata.json", encoding="utf-8") as fh:
        metadata = json.load(fh)
    tokenizer = Tokenizer()

    # A group is a file name minus its last four characters
    # (presumably the extension -- confirm against the metadata writer).
    groups = {x["file_name"][:-4] for x in metadata["file_metadata"]}
    # One float count vector per group, with one slot per vocabulary entry.
    counts = {g: np.zeros(tokenizer.vocab_size) for g in groups}

    # NOTE: zip() stops at the shorter list; "file_metadata" and "chunks" are
    # assumed to be parallel (same length, same order) -- TODO confirm.
    for fm, cm in tqdm(zip(metadata["file_metadata"], metadata["chunks"]), total=len(metadata["file_metadata"])):
        group = fm["file_name"][:-4]
        # Groups without an entry in the removal file drop nothing.
        curr_taxids_to_remove = set(tax_ids_to_remove.get(group, ()))

        mmap = np.memmap(
            pretraining_processed_data_dir / cm["sequences"]["path"],
            dtype=cm["sequences"]["dtype"],
            mode="r",
            shape=tuple(cm["sequences"]["shape"]),
        )
        idx_mmap = np.memmap(
            pretraining_processed_data_dir / cm["index"]["path"],
            dtype=cm["index"]["dtype"],
            mode="r",
            shape=tuple(cm["index"]["shape"]),
        )
        # Each index row unpacks to (start, end, taxid); start:end slices the
        # sequence array for one record.
        for start, end, taxid in idx_mmap:
            if taxid in curr_taxids_to_remove:
                continue
            seq = mmap[start:end]
            # np.unique yields distinct values, so the fancy-indexed += below
            # never hits the same slot twice. Assumes the stored values are
            # valid indices into the vocab-sized vector -- TODO confirm.
            idx, count = np.unique(seq, return_counts=True)
            counts[group][idx] += count

    # NumPy arrays are not JSON-serializable; convert to plain lists first.
    serializable = {g: totals.tolist() for g, totals in counts.items()}
    with open(data_dir / "codon_counts_nopathogen.json", "w", encoding="utf-8") as fh:
        json.dump(serializable, fh)
| 68 | + |
| 69 | + |
if __name__ == "__main__":
    # Command-line entry point: both directories are mandatory and are
    # converted to Path objects by argparse before being handed to main().
    parser = argparse.ArgumentParser(description="Check codon frequency")
    parser.add_argument(
        "--pretraining_processed_data_dir",
        type=Path,
        required=True,
        help="Directory containing metadata.json and the memmap files it references.",
    )
    parser.add_argument(
        "--data_dir",
        type=Path,
        required=True,
        help="Directory containing taxids_to_remove.json; codon_counts_nopathogen.json is written here.",
    )
    args = parser.parse_args()
    main(args.pretraining_processed_data_dir, args.data_dir)
0 commit comments