-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
91 lines (79 loc) · 2.79 KB
/
Copy pathutils.py
File metadata and controls
91 lines (79 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from email import utils
from zipfile import Path, ZipFile
import shutil
def check_zip_contains_file(input_file_path, expected_file):
    """Verify that a ZIP archive has a member named exactly ``expected_file``.

    Args:
        input_file_path: Path to (or file object for) the ZIP archive.
        expected_file: Member name to look for, compared against the
            entries returned by ``ZipFile.namelist()``.

    Raises:
        AssertionError: If ``expected_file`` is not a member of the archive.
    """
    with ZipFile(input_file_path, "r") as zf:
        members = zf.namelist()
    # Raise explicitly rather than using an ``assert`` statement: asserts are
    # removed under ``python -O``, which would turn this check into a no-op.
    if expected_file not in members:
        raise AssertionError(f"{expected_file} not found in ZIP")
def unzip_file(input_file_path, output_dir):
    """Extract every member of the ZIP archive into ``output_dir``.

    Args:
        input_file_path: Path to the ZIP archive to read.
        output_dir: Directory the archive members are extracted into.
    """
    archive = ZipFile(input_file_path, mode='r')
    try:
        archive.extractall(path=output_dir)
    finally:
        # Always release the archive handle, even if extraction fails.
        archive.close()
def read_file(file_path, encoding=None):
    """Return the full text content of ``file_path``.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding handed to ``open()``. The default ``None``
            keeps the previous behaviour (platform default encoding); pass
            e.g. ``"utf-8"`` to make reads independent of the host locale.

    Returns:
        The file content as a single string.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()
def remove_directory(dir_path):
    """Recursively delete ``dir_path`` if it is an existing directory.

    Does nothing when the path does not exist or is not a directory.

    Args:
        dir_path: Directory to remove, as a ``str`` or any ``os.PathLike``
            (for example ``pathlib.Path``).
    """
    # Function-scope import: ``os`` is not imported at module level.
    import os

    # ``os.path.isdir`` works for plain strings as well as path objects,
    # whereas calling ``dir_path.is_dir()`` failed on ``str`` input.
    if os.path.isdir(dir_path):
        shutil.rmtree(dir_path)
def is_numeric(s: str) -> bool:
    """Report whether ``s`` can be converted by ``float()``.

    Args:
        s: Candidate string.

    Returns:
        ``True`` if ``float(s)`` succeeds, ``False`` if it raises
        ``ValueError``.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True
def extract_path_metadata(zip_path: str) -> dict:
    """
    Parse run metadata out of the location of a FastQC zip file.

    Only the path string is inspected; the file itself is never opened.

    Expected structure:
    /datasets/caeg_production/libraires/lv7/008/003/{libid}/{date}_{fc}/{version}/{hash}/
        stats/reads/fastqc/{data_type}/{filename}_fastqc.zip

    Args:
        zip_path: Absolute POSIX-style path laid out as shown above.

    Returns:
        A dict with the keys ``libid``, ``run_date``, ``flowcell``,
        ``pipeline_version``, ``pipeline_hash``, ``data_type``, ``lane`` and
        ``subtype``. ``run_date`` is a ``datetime.date``, or ``None`` when the
        date component is not ``YYYYMMDD``. ``lane`` and ``subtype`` are
        ``None`` when the filename does not follow
        ``Lib_{libid}_{lane}_{subtype}_fastqc.zip``.

    Raises:
        ValueError: If the path has fewer components than the expected layout.
    """
    # Function-scope imports: the module-level ``Path`` comes from ``zipfile``
    # (it opens the archive and has no ``.parts``), and ``re`` is not imported
    # at module level at all.
    import re
    from datetime import datetime
    from pathlib import PurePosixPath

    parts = PurePosixPath(zip_path).parts
    # parts[0]  = '/'
    # parts[1]  = 'datasets'
    # parts[2]  = 'caeg_production'
    # parts[3]  = 'libraires'
    # parts[4]  = 'lv7'  \
    # parts[5]  = '008'   } sharding dirs — ignored
    # parts[6]  = '003'  /
    # parts[7]  = libid
    # parts[8]  = date_fc    e.g. '20231015_HV3TWDSX7'
    # parts[9]  = version    e.g. 'v1.08'
    # parts[10] = hash
    # parts[11] = 'stats'
    # parts[12] = 'reads'
    # parts[13] = 'fastqc'
    # parts[14] = data_type  e.g. 'trim', 'raw'
    # parts[15] = filename   e.g. 'Lib_LV7001856478_L004_singleton_fastqc.zip'
    expected_parts = 16
    if len(parts) < expected_parts:
        raise ValueError(
            f"Path has {len(parts)} components, expected at least "
            f"{expected_parts}: {zip_path!r}"
        )

    libid = parts[7]
    date_fc = parts[8]
    pipeline_version = parts[9]
    pipeline_hash = parts[10]
    data_type = parts[14]
    filename = parts[15]

    # Split 'date_fc' into date and flowcell on the FIRST underscore only,
    # since flowcell IDs can also contain underscores.
    date_str, _, flowcell = date_fc.partition("_")
    try:
        run_date = datetime.strptime(date_str, "%Y%m%d").date()
    except ValueError:
        run_date = None  # handle unexpected formats gracefully

    # Parse filename: Lib_{libid}_{lane}_{subtype}_fastqc.zip
    fname_match = re.match(
        r"Lib_(?P<lib>[^_]+)_(?P<lane>L\d+)_(?P<subtype>.+)_fastqc\.zip",
        filename,
    )
    if fname_match:
        lane = fname_match.group("lane")
        subtype = fname_match.group("subtype")  # R1, R2, singleton, etc.
    else:
        lane = None
        subtype = None

    return {
        "libid": libid,
        "run_date": run_date,
        "flowcell": flowcell,
        "pipeline_version": pipeline_version,
        "pipeline_hash": pipeline_hash,
        "data_type": data_type,
        "lane": lane,
        "subtype": subtype,
    }