-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_input.py
More file actions
127 lines (116 loc) · 5.8 KB
/
process_input.py
File metadata and controls
127 lines (116 loc) · 5.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import gzip
import json
import os
import sys
import shutil
import yaml
def get_section_string(f, start_line, end_line, return_string=False):
    """Return the lines found between two delimiter lines of an open file.

    Lines are read from the current position of *f*. Everything up to and
    including the first line equal to *start_line* is discarded. The lines
    that follow are collected until a line equal to *end_line* is read; that
    closing line is consumed but not returned.

    Reading also stops at end of file, so a missing delimiter gives an empty
    (start not found) or shorter (end not found) section instead of looping
    forever.

    Args:
        f: file-like object opened in text mode, providing ``readline()``.
        start_line: exact line (newline included) that opens the section.
        end_line: exact line (newline included) that closes the section.
        return_string: when true, return the section as a single string;
            otherwise return a list of lines.

    Returns:
        The section lines, each keeping its trailing newline, either joined
        into one ``str`` or as a ``list`` of ``str``.
    """
    def read_until(sentinel):
        collected = []
        # readline() returns '' once the file is exhausted; using '' as the
        # iter() sentinel guarantees termination even if `sentinel` is absent.
        for line in iter(f.readline, ''):
            if line == sentinel:
                break
            collected.append(line)
        return collected

    # Skip everything before (and including) the opening delimiter.
    read_until(start_line)
    section_lines = read_until(end_line)
    if return_string:
        return ''.join(section_lines)
    return section_lines
def fill_from_yaml_data(yaml_only_dict, studies_samples_dict):
    """Record the platform of every experiment under its study and sample.

    Each value of ``yaml_only_dict['ENA_experiment']`` must provide the keys
    ``study_alias``, ``sample_alias`` and ``platform``. For each of them a
    ``{'platform': ...}`` entry is appended to
    ``studies_samples_dict[study_alias][sample_alias]['experiments']``,
    creating the intermediate dictionaries on first use.

    *studies_samples_dict* is modified in place; nothing is returned.
    """
    for experiment in yaml_only_dict['ENA_experiment'].values():
        study_key = experiment['study_alias']
        sample_key = experiment['sample_alias']
        # Build the record before touching the target dict so that a missing
        # 'platform' key leaves it unmodified.
        platform_record = {'platform': experiment['platform']}
        samples_of_study = studies_samples_dict.setdefault(study_key, {})
        sample_entry = samples_of_study.setdefault(sample_key, {'experiments': []})
        sample_entry['experiments'].append(platform_record)
def _read_accession_pairs(input_file_path, section_header):
    """Return the (alias, accession) pairs listed under *section_header*."""
    # The file is reopened for every section so that the sections may appear
    # in any order inside the receipt.
    with open(input_file_path) as input_file:
        section_lines = get_section_string(
            input_file, start_line=section_header, end_line='\n')
    pairs = []
    for line in section_lines:
        if line == '\n':
            continue
        # Strip the newline first: on a two-column line it would otherwise
        # stay glued to the accession.
        alias, accession, *_ = line.rstrip('\n').split('\t')
        pairs.append((alias, accession))
    return pairs


def load_receipt_data(input_file_path):
    """Parse a submission receipt into a nested study/sample dictionary.

    Three sections of the receipt are read:

    * the text enclosed between two ``YAML -------------`` lines, parsed
      with ``yaml.safe_load`` and handed to ``fill_from_yaml_data``;
    * the lines following ``Study accession details:`` up to the next
      blank line;
    * the lines following ``Sample accession details:`` up to the next
      blank line.

    Accession lines are tab separated; the first column is the alias, the
    second the accession, and any further columns are ignored.

    Args:
        input_file_path: path of the receipt text file.

    Returns:
        dict shaped as ``{study_alias: {'accession': str,
        sample_alias: {'experiments': [{'platform': str}, ...],
        'accession': str}}}``. An ``'accession'`` key is only present when
        the receipt lists the corresponding alias.

    Raises:
        ValueError: if an accession line has fewer than two columns.
    """
    # TODO: health check of the input file.
    loaded_data = {}

    yaml_delimiter = 'YAML -------------\n'
    with open(input_file_path) as input_file:
        yaml_only_section = yaml.safe_load(
            get_section_string(input_file, start_line=yaml_delimiter,
                               end_line=yaml_delimiter, return_string=True))
    fill_from_yaml_data(yaml_only_section, loaded_data)

    # NOTE: the study-level 'accession' key lives in the same dict as the
    # sample aliases of that study.
    for alias, accession in _read_accession_pairs(
            input_file_path, 'Study accession details:\n'):
        try:
            loaded_data[alias]['accession'] = accession
        except KeyError:
            # The study is listed in the receipt but no experiment of the
            # YAML section refers to it.
            print(f"Study {alias} is not referenced by any experiment")

    # The receipt does not say which study a sample belongs to, so every
    # study is searched and the first one holding the alias gets the value.
    for alias, accession in _read_accession_pairs(
            input_file_path, 'Sample accession details:\n'):
        for samples in loaded_data.values():
            if alias in samples:
                samples[alias]['accession'] = accession
                break
    return loaded_data
"""
Takes as input:
1. A receipt obtained from ENA submission tool:
a txt file that contains sections describing submission details.
2. A json file with the list of fasta that the user loaded
3. Path to write generated manifests
4. Manifest template path: the manifest with the global values set
(e.g. COVERAGE, MINGAPLENGTH, ...)
"""
def _sample_alias_from_fasta(fasta_file):
    """Return *fasta_file* without its ``.fasta.gz`` / ``.fasta`` suffix."""
    for suffix in ('.fasta.gz', '.fasta'):
        if fasta_file.endswith(suffix):
            return fasta_file[:-len(suffix)]
    # Unknown extension: keep the name intact rather than chopping off
    # arbitrary characters.
    return fasta_file


def main():
    """Write one manifest file per FASTA file that has submission metadata.

    Command-line arguments:
        1. path of the submission receipt (parsed by ``load_receipt_data``);
        2. path of a JSON file holding the list of FASTA file names;
        3. directory where the manifests are written;
        4. path of the manifest template, whose content is copied at the top
           of every generated manifest.

    For each FASTA name the sample alias is the name without its
    ``.fasta.gz`` or ``.fasta`` suffix. When a study of the receipt contains
    that alias, ``<alias>.manifest.txt`` is written with the template
    followed by the ASSEMBLYNAME, PLATFORM, STUDY, SAMPLE and FASTA lines,
    and its path is appended to ``submit_list.tab`` in the current working
    directory. Samples without metadata are reported on standard output.
    """
    if len(sys.argv) < 5:
        sys.exit(f'Usage: {sys.argv[0]} <receipt.txt> <fasta_names.json> '
                 '<manifests_dir> <manifest_template>')
    input_file_path, fasta_names_list_path, out_manifest_base, manifest_template = sys.argv[1:5]

    # load submitted data from receipt file
    data_dict = load_receipt_data(input_file_path)
    with open(fasta_names_list_path, 'r') as fasta_files_json_file:
        fasta_files_list = json.load(fasta_files_json_file)
    # The template holds the global values and is identical for every
    # sample, so it is read only once.
    with open(manifest_template) as m_template:
        template_text = m_template.read()

    with open('submit_list.tab', 'w') as written_manifests_out:
        for fasta_file in fasta_files_list:
            sample_alias = _sample_alias_from_fasta(fasta_file)
            print(f'Processing {sample_alias}')
            found_metadata = False
            for study_data in data_dict.values():
                if sample_alias not in study_data:
                    continue
                sample_data = study_data[sample_alias]
                sample_accession = sample_data['accession']
                study_accession = study_data['accession']
                # TODO: concatenate the platform information of all the
                # experiments instead of using only the first one.
                platform = sample_data['experiments'][0]['platform']
                manifest_path = os.path.join(out_manifest_base, sample_alias + '.manifest.txt')
                with open(manifest_path, "w") as output_handle:
                    # global values first, then the per-sample fields
                    output_handle.write(template_text)
                    output_handle.write("ASSEMBLYNAME\tconsensus_" + sample_alias + "\n")
                    output_handle.write("PLATFORM\t" + platform + "\n")
                    output_handle.write("STUDY\t" + study_accession + "\n")
                    output_handle.write("SAMPLE\t" + sample_accession + "\n")
                    # files should be available in the corresponding dir
                    # and named: sample_alias.fasta.gz
                    output_handle.write("FASTA\t" + sample_alias + '.fasta.gz' + "\n")
                found_metadata = True
                written_manifests_out.write(manifest_path + '\n')
                break
            if not found_metadata:
                print(f'No metadata found for sample {sample_alias}')
# Script entry point: generate the manifests only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()