-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrawfile2hdf_cospred.py
More file actions
300 lines (257 loc) · 12.5 KB
/
rawfile2hdf_cospred.py
File metadata and controls
300 lines (257 loc) · 12.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
import logging
import numpy as np
import pandas as pd
import os
import warnings
from argparse import ArgumentParser
from pyteomics import mzml, mgf
from prosit_model import io_local
import io_cospred
import params.constants_location as constants_location
from params.constants import (
VAL_SPLIT, SPECTRA_DIMENSION, BIN_MAXMZ, BIN_SIZE,
)
def filterPSM(dbsearch_df, csvfile):
"""
Filter the dbsearch_df DataFrame to include only rows where the combination of
`file`, `scan`, `modifiedseq`, and `charge` matches the corresponding
columns in the `csvfile`. Retain all columns in dbsearch_df and avoid column name collisions.
Args:
dbsearch_df (pd.DataFrame): DataFrame containing PSM data.
csvfile (str): Path to the CSV file containing the required columns.
Returns:
pd.DataFrame: Filtered DataFrame.
"""
# Load the CSV file
if not os.path.exists(csvfile):
raise FileNotFoundError(f"File not found: {csvfile}")
csv_df = pd.read_csv(csvfile)
# Ensure the required columns exist in both DataFrames
csv_to_dbsearch_mapping = {
'raw_file': 'file',
'scan_number': 'scan',
'modified_sequence': 'modifiedseq',
'precursor_charge': 'charge'
}
for csv_col, db_col in csv_to_dbsearch_mapping.items():
assert db_col in dbsearch_df.columns, f"dbsearch_df must contain '{db_col}' column."
assert csv_col in csv_df.columns, f"csvfile must contain '{csv_col}' column."
# Subset and rename columns in csv_df to match dbsearch_df
csv_df = csv_df[list(csv_to_dbsearch_mapping.keys())].rename(columns=csv_to_dbsearch_mapping)
# Perform the filtering based on all specified columns
filtered_df = dbsearch_df.merge(csv_df, on=list(csv_to_dbsearch_mapping.values()), how='inner')
logging.info(f"[USER] Filtered PSM DataFrame: {len(filtered_df)} rows remaining out of {len(dbsearch_df)}.")
return filtered_df
def constructCospredVec(mz_arr, intensity_arr):
intensity_arr = intensity_arr / np.max(intensity_arr)
vector_intensity = np.zeros(SPECTRA_DIMENSION, dtype=np.float16)
vector_mass = np.arange(0, BIN_MAXMZ, BIN_SIZE, dtype=np.float16)
index_arr = mz_arr / BIN_SIZE
index_arr = np.around(index_arr).astype('int16')
for i, index in enumerate(index_arr):
if (index > SPECTRA_DIMENSION - 1): # add intensity to last bin for high m/z
vector_intensity[-1] += intensity_arr[i]
else:
vector_intensity[index] += intensity_arr[i]
return vector_intensity, vector_mass
# Contruct ML friendly spectra matrix for transformer full prediction
def generateHDF5_transformer(usimgffile, reformatmgffile, dbsearch_df, csvfile, contrastcsvfile):
assert "file" in dbsearch_df.columns
assert "scan" in dbsearch_df.columns
assert "charge" in dbsearch_df.columns
assert "seq" in dbsearch_df.columns
assert "modifiedseq" in dbsearch_df.columns
assert "proforma" in dbsearch_df.columns
assert "score" in dbsearch_df.columns
assert "reverse" in dbsearch_df.columns
# retrieve spectrum of PSM from MGF
spectra = mgf.read(usimgffile)
mzs_df = []
for index, row in dbsearch_df.iterrows():
if (index % 100 == 0):
logging.info('Generating CSV Progress: {}%'.format(
index/dbsearch_df.shape[0]*100))
try:
# title format:'mzspec:repoID:phospho_sample1.raw:scan:9'
spectrum = spectra.get_spectrum(row['title'])
retention_time = spectrum['params']['rtinseconds']
collision_energy = float(spectrum['params']['ce'])
charge_state = int(spectrum['params']['charge'][0])
method = spectrum['params']['method']
mod_num = spectrum['params']['mod_num']
raw_file = row['file']
scan_number = row['scan']
sequence = row['seq']
score = row['score']
modified_sequence = row['modifiedseq']
proforma = row['proforma']
reverse = row['reverse']
# Transformer specific vector
intensity_vec, mz_vec = constructCospredVec(
spectrum['m/z array'], spectrum['intensity array'])
masses = mz_vec
intensities = intensity_vec
mzs_df.append(pd.Series([raw_file, scan_number, sequence, score,
modified_sequence, proforma,
mod_num, reverse,
collision_energy, charge_state,
masses, intensities,
retention_time, method
]))
except:
next
# construct CSV
mzs_df = pd.concat(mzs_df, axis=1).transpose()
mzs_df.columns = ['raw_file', 'scan_number', 'sequence', 'score',
'modified_sequence', 'proforma',
'mod_num', 'reverse',
'collision_energy', 'precursor_charge',
'masses', 'intensities',
'retention_time', 'method']
mzs_df = mzs_df.reset_index(drop=True)
mzs_df = mzs_df.dropna()
mzs_df.columns = mzs_df.columns.str.replace('[\r]', '')
# To prevent data leaking, only keep the peptides that are not in the contrast dataset
if (contrastcsvfile is not None):
constrast_dataset = pd.read_csv(contrastcsvfile, sep=',')
mzs_df = mzs_df[~mzs_df['proforma'].isin(constrast_dataset['proforma'])]
mzs_df.to_csv(csvfile, index=False) # CSV discards values in large vec
logging.info('[STATUS] Generating peptide list CSV ... DONE!')
# construct Dataset based on CoSpred Transformer definition
dataset = {
"collision_energy": mzs_df['collision_energy'].astype(np.uint8),
"collision_energy_aligned": io_cospred.get_float(mzs_df['collision_energy']),
"collision_energy_aligned_normed": io_cospred.get_float(mzs_df['collision_energy']/100.0),
"intensities_raw": io_cospred.get_2darray(mzs_df['intensities']).astype(np.float16),
"masses_pred": io_cospred.get_2darray(mzs_df['masses']).astype(np.float32),
"masses_raw": io_cospred.get_2darray(mzs_df['masses']).astype(np.float32),
"method": io_cospred.get_method_onehot(mzs_df['method']).astype(np.uint8),
"precursor_charge": mzs_df['precursor_charge'].astype(np.uint8),
"precursor_charge_onehot": io_cospred.get_precursor_charge_onehot(mzs_df['precursor_charge']).astype(np.uint8),
"raw_file": mzs_df['raw_file'].astype('S60'),
"reverse": io_cospred.get_boolean(mzs_df['reverse']),
"scan_number": io_cospred.get_number(mzs_df['scan_number']).astype(np.uint8),
"score": io_cospred.get_float(mzs_df['score']),
"modified_sequence": mzs_df['modified_sequence'].astype('S60'),
"sequence_integer": io_cospred.get_sequence_integer(mzs_df['modified_sequence']).astype(np.uint8),
"sequence_onehot": io_cospred.get_sequence_onehot(mzs_df['modified_sequence']).astype(np.uint8),
}
io_cospred.modifyMGFtitle(usimgffile, reformatmgffile)
return dataset
# def load_dataframe(input_data):
# """
# Load data into a pandas DataFrame based on the input type.
# Args:
# input_data (str or dict): Input data, either a file path to a CSV or a dictionary.
# Returns:
# pd.DataFrame: Loaded DataFrame.
# """
# if isinstance(input_data, dict):
# # Input is a dictionary
# logging.info("Input is a dictionary. Converting to DataFrame...")
# df = pd.DataFrame.from_dict(input_data)
# elif isinstance(input_data, str) and os.path.isfile(input_data):
# # Input is a file path
# logging.info("Input is a file path. Reading CSV...")
# df = pd.read_csv(input_data, sep=',', index_col=False)
# else:
# raise ValueError("Input must be either a dictionary or a valid file path to a CSV.")
# return df
def main():
# Suppress warning message of tensorflow compatibility
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
warnings.filterwarnings("ignore")
# Configure logging
log_file_prep = os.path.join(constants_location.LOGS_DIR, "cospred_prep.log")
logging.basicConfig(
filename=log_file_prep,
filemode="w", # Overwrite the log file each time the script runs
format="%(asctime)s - %(levelname)s - %(message)s",
level=logging.INFO # Set the logging level (INFO, DEBUG, WARNING, ERROR, CRITICAL)
)
# Optionally, log to both file and console
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
console.setFormatter(formatter)
logging.getLogger().addHandler(console)
parser = ArgumentParser()
parser.add_argument('-w', '--workflow', default='test',
help='workflow to use')
args = parser.parse_args()
workflow = args.workflow
temp_dir = constants_location.TEMP_DIR
trainsetfile = constants_location.TRAINFILE
testsetfile = constants_location.TESTFILE
traincsvfile = constants_location.TRAINCSV_PATH
testcsvfile = constants_location.TESTCSV_PATH
mgf_dir = constants_location.MGF_DIR
mgffile = constants_location.MGF_PATH
mzml_dir = constants_location.MZML_DIR
psmfile = constants_location.PSM_PATH
mappingfile = constants_location.MAPPINGFILE_PATH
if (workflow == 'split'):
pass
elif (workflow == 'train'):
usimgffile = constants_location.REFORMAT_TRAIN_USITITLE_PATH
reformatmgffile = constants_location.REFORMAT_TRAIN_PATH
hdf5file = constants_location.TRAINDATA_PATH
datasetfile = trainsetfile
csvfile = traincsvfile
elif (workflow == 'test'):
usimgffile = constants_location.REFORMAT_TEST_USITITLE_PATH
reformatmgffile = constants_location.REFORMAT_TEST_PATH
hdf5file = constants_location.TESTDATA_PATH
datasetfile = testsetfile
csvfile = testcsvfile
elif (workflow == 'predict'):
csvfile = constants_location.PREDICT_ORIGINAL
predict_csv = constants_location.PREDICTCSV_PATH
hdf5file = constants_location.PREDDATA_PATH
else:
logging.error("Unknown workflow choice.")
if not os.path.exists(temp_dir):
os.makedirs(temp_dir)
# get psm result
# if (workflow != 'predict' and workflow != 'combine'):
# dbsearch_df = io_cospred.getPSM(psmfile, mappingfile)
if (workflow == 'split'):
# hold out N records as testset
logging.info('[INFO] Workflow: Splitting the dataset ...')
io_cospred.splitMGF(mgf_dir, mgffile, trainsetfile, testsetfile, test_ratio=1-VAL_SPLIT)
logging.info('[STATUS] Splitting train vs test set ... DONE!')
# reformat the Spectra
elif (workflow == 'train' or workflow == 'test'):
logging.info(f'[INFO] Workflow: Annotating {workflow} set ...')
dbsearch_df = io_cospred.getPSM(psmfile, mgf_dir, mappingfile)
# regenerate header for downstream annotation
if not os.path.exists(usimgffile):
io_cospred.reformatMGF(datasetfile, mzml_dir, dbsearch_df, usimgffile, temp_dir)
# match peptide from PSM with spectra MGF to generate CSV with full spectra bins
if not os.path.exists(reformatmgffile):
if (workflow == 'train'):
dataset = generateHDF5_transformer(usimgffile, reformatmgffile, dbsearch_df,
csvfile, None)
elif (workflow == 'test'):
dataset = generateHDF5_transformer(usimgffile, reformatmgffile, dbsearch_df,
csvfile, traincsvfile)
io_local.to_hdf5(dataset, hdf5file)
logging.info(f'[STATUS] Generating {workflow} set ... DONE!')
else:
logging.info(f'[USER] {workflow} set is already existed')
elif (workflow == 'predict'):
logging.info('[INFO] Workflow: Spectrum prediction ...')
if not os.path.exists(csvfile):
logging.error('No peptide list available!')
else:
# transform to hdf5
dataset = io_cospred.constructDataset_frompep(csvfile, predict_csv)
io_cospred.to_hdf5(dataset, hdf5file)
# check generated hdf
io_cospred.read_hdf5(hdf5file)
logging.info(f'[STATUS] Generating {workflow} set ... DONE!')
else:
logging.error("Unknown workflow choice.")
if __name__ == "__main__":
main()