|
42 | 42 | FASTA_EXTENSIONS = {'.fa', '.fasta', '.fna', '.fa.gz', '.fasta.gz', '.fna.gz'} |
43 | 43 |
|
44 | 44 |
|
| 45 | +def _logical_fasta_key(basename: str) -> str: |
| 46 | + """Plain and gzip of the same file share a key (e.g. a.fna and a.fna.gz -> a.fna).""" |
| 47 | + return basename[:-3] if basename.endswith(".gz") else basename |
| 48 | + |
| 49 | + |
45 | 50 | def deduplicate_contig_table(contig_table_path: str, contig_id_col: int = 1) -> None: |
46 | 51 | """ |
47 | 52 | Deduplicate a contig table in-place by contig ID (second column by default). |
def get_fasta_files(directory: str) -> List[str]:
    """
    Get all FASTA files from a directory.

    If both ``assembly.fna`` and ``assembly.fna.gz`` exist, keep only the ``.gz``
    path so directories may ship compressed assets (CI) while developers keep
    uncompressed copies locally.

    Args:
        directory: Path to the directory containing FASTA files. The path is
            taken literally; glob metacharacters in it are not expanded.

    Returns:
        Sorted list of paths to FASTA files, one per logical file.
    """
    # Escape the directory so names such as ``run[1]`` or ``batch*`` are not
    # interpreted as glob patterns; only the extension part is a wildcard.
    safe_dir = glob.escape(directory)
    found = set()
    for ext in FASTA_EXTENSIONS:
        found.update(glob.glob(os.path.join(safe_dir, f"*{ext}")))

    by_key: Dict[str, str] = {}
    for path in found:
        key = _logical_fasta_key(os.path.basename(path))
        # The first path seen for a key is kept unless its compressed twin
        # turns up, in which case the ``.gz`` path replaces it. The check is
        # independent of iteration order.
        if key not in by_key or path.endswith(".gz"):
            by_key[key] = path

    return sorted(by_key.values())
126 | 149 |
|
127 | 150 |
|
128 | 151 | def prepare_blast_database( |
|
0 commit comments