Skip to content

Commit 6807467

Browse files
committed
Add curateModelFromTables for batch curation from TSV inputs
Generic batch curation engine extracted from yeast-GEM's curateMetsRxnsGenes (see yeast-GEM phase 6 of the porting plan). Adds or updates metabolites, reactions and genes from TSV files; the schemas match yeast-GEM's data/modelCuration/template/ layout exactly so existing TSVs work unchanged. Differences from the yeast-GEM original: - Renamed to curateModelFromTables (the organism-agnostic name). yeast-GEM keeps a thin curateMetsRxnsGenes shim that adds the yeast prefixes and forwards here. - metPrefix and rxnPrefix are now parameters with defaults 'M_' and 'R_' (cobrapy / BiGG convention). Pass 's_' and 'r_' for yeast-GEM. - Behaviour otherwise identical: match metabolites by name[comp], reactions by stoichiometry, genes by id; anything after the listed core columns becomes a MIRIAM annotation whose namespace key is the column header.
1 parent cf06afe commit 6807467

1 file changed

Lines changed: 345 additions & 0 deletions

File tree

core/curateModelFromTables.m

Lines changed: 345 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,345 @@
1+
function newModel=curateModelFromTables(model,metsInfo,genesInfo,rxnsCoeffs,rxnsInfo,metPrefix,rxnPrefix)
2+
% curateModelFromTables
3+
% Curate existing and/or add new metabolites, reactions and genes
4+
% from tabular data files. Originally extracted from yeast-GEM's
5+
% curateMetsRxnsGenes; generalised here so any GEM project can drive
6+
% batch curation from the same set of *.tsv files.
7+
%
8+
% If the *.tsv files contain metabolites, reactions and/or genes that are
9+
% already present in the model, then information in the model will be
10+
% overwritten. Note that this includes empty annotations in the *.tsv
11+
% files! Metabolites are matched by metaboliteName[comp]; reactions by
12+
% the stoichiometry of its reactants and products; genes by their gene
13+
% name. This function can therefore be used to add new entities in the
14+
% model, or curate those already existing in the model.
15+
%
16+
% Input:
17+
% model RAVEN model structure to be curated.
18+
% metsInfo path to a *.tsv file with metabolite information, or
19+
% 'none' to skip metabolite curation. Columns:
20+
% metNames, comps, formula, charge, inchi, metNotes,
21+
% then any number of MIRIAM-namespace columns.
22+
% genesInfo path to a *.tsv file with gene information, or
23+
% 'none'. Columns: genes, geneShortNames, then MIRIAM.
24+
% rxnsCoeffs path to a *.tsv file with reaction stoichiometric
25+
% coefficients, or 'none'. Columns: rxnIdx, rxnNames,
26+
% metNames, comps, coefficient. One row per
27+
% (reaction, metabolite) pair.
28+
% rxnsInfo path to a *.tsv file with reaction information, or
29+
% 'none'. Columns: rxnIdx, rxnNames, grRules, lb, ub,
30+
% rev, subSystems, eccodes, rxnNotes, rxnReferences,
31+
% rxnConfidenceScores, then MIRIAM.
32+
% metPrefix prefix used to mint fresh metabolite ids (e.g. 's_'
33+
% for yeast-GEM, 'M_' for the cobrapy/BiGG default).
34+
% Default: 'M_'.
35+
% rxnPrefix prefix used to mint fresh reaction ids. Default: 'R_'.
36+
%
37+
% Output:
38+
% newModel curated RAVEN model structure.
39+
%
40+
% The 'everything after the core columns is MIRIAM' convention applies
41+
% to all three info tables: any column whose header is not one of the
42+
% listed core fields is treated as a MIRIAM annotation namespace and
43+
% stored on the matching entity.
44+
%
45+
% Usage: newModel = curateModelFromTables(model, metsInfo, genesInfo, ...
46+
% rxnsCoeffs, rxnsInfo, metPrefix, rxnPrefix)
47+
48+
if nargin==4
49+
error('Provide both a ''rxnsInfo'' and a ''rxnsCoeffs'' file')
50+
end
51+
if nargin<4
52+
rxnsInfo='none';
53+
rxnsCoeffs='none';
54+
end
55+
if nargin<3
56+
genesInfo='none';
57+
end
58+
if nargin<6 || isempty(metPrefix)
59+
metPrefix = 'M_';
60+
end
61+
if nargin<7 || isempty(rxnPrefix)
62+
rxnPrefix = 'R_';
63+
end
64+
newModel=model;
65+
66+
%% Metabolites
67+
if ~strcmp(metsInfo,'none')
68+
fid = fopen(metsInfo);
69+
raw = textscan(fid,['%q' repmat(' %q',1,17)],'Delimiter','\t');
70+
fclose(fid);
71+
metsToAdd.metNames = raw{1}(2:end);
72+
metsToAdd.compartments = raw{2}(2:end);
73+
metsToAdd.metFormulas = raw{3}(2:end);
74+
metsToAdd.metCharges = cellfun(@str2num,raw{4}(2:end),'UniformOutput',false);
75+
emptyEntry = cellfun(@isempty,metsToAdd.metCharges);
76+
if all(emptyEntry)
77+
metsToAdd = rmfield(metsToAdd,'metCharges');
78+
elseif any(emptyEntry) % If some charges are given, assume 0 for those without charges specified
79+
metsToAdd.metCharges(emptyEntry) = {0};
80+
metsToAdd.metCharges = cell2mat(metsToAdd.metCharges);
81+
else
82+
metsToAdd.metCharges = cell2mat(metsToAdd.metCharges);
83+
end
84+
metsToAdd.inchis = raw{5}(2:end);
85+
metsToAdd.metNotes = raw{6}(2:end);
86+
87+
% Check if metabolite already exists (check by metName[comp])
88+
existingMets = [];
89+
existingMetsIdx = [];
90+
newMets = [];
91+
for i=1:numel(metsToAdd.metNames)
92+
metIdx = find(strcmp(newModel.metNames,metsToAdd.metNames{i}));
93+
existMet = strcmp(metsToAdd.compartments{i},newModel.comps(newModel.metComps(metIdx)));
94+
if any(existMet)
95+
existingMets = [existingMets, i];
96+
existingMetsIdx = [existingMetsIdx, metIdx(existMet)];
97+
else
98+
newMets = [newMets, i];
99+
end
100+
end
101+
102+
% Overwrite annotation of existing entries
103+
if any(existingMets)
104+
if isfield(newModel,'metFormulas')
105+
newModel.metFormulas(existingMetsIdx) = metsToAdd.metFormulas(existingMets);
106+
end
107+
if isfield(newModel,'metCharges') && isfield(metsToAdd,'metCharges')
108+
newModel.metCharges(existingMetsIdx) = metsToAdd.metCharges(existingMets);
109+
end
110+
if isfield(newModel,'inchis')
111+
newModel.inchis(existingMetsIdx) = metsToAdd.inchis(existingMets);
112+
end
113+
newModel = extractAndAddMiriam(newModel,raw(7:end),existingMets,existingMetsIdx,'met');
114+
warning(['The following metabolites are already present in the model, '...
115+
'their annotation will be overwritten to match the metsInfo file. '...
116+
'If you do not particularly want to curate their annotations, it '...
117+
'would be better to removes these metabolites from metsInfo:\n\t\t%s'],...
118+
strjoin(strcat(metsToAdd.metNames(existingMets),'[',metsToAdd.compartments(existingMets),']'),'\n\t\t'));
119+
end
120+
121+
% Continue with new metabolites
122+
if numel(newMets)>0
123+
metsToAdd.metNames(existingMets) = [];
124+
metsToAdd.compartments(existingMets) = [];
125+
metsToAdd.metFormulas(existingMets) = [];
126+
if all(cellfun(@isempty,metsToAdd.metFormulas))
127+
metsToAdd = rmfield(metsToAdd,'metFormulas');
128+
end
129+
if isfield(metsToAdd,'metCharges')
130+
metsToAdd.metCharges(existingMets) = [];
131+
end
132+
metsToAdd.inchis(existingMets) = [];
133+
if all(cellfun(@isempty,metsToAdd.inchis))
134+
metsToAdd = rmfield(metsToAdd,'inchis');
135+
end
136+
metsToAdd.metNotes(existingMets) = [];
137+
if all(cellfun(@isempty,metsToAdd.metNotes))
138+
metsToAdd = rmfield(metsToAdd,'metNotes');
139+
end
140+
% Add metabolites
141+
newModel = addMets(newModel,metsToAdd,true,metPrefix);
142+
addedIdx = numel(newModel.mets)-numel(newMets)+1:numel(newModel.mets);
143+
newModel = extractAndAddMiriam(newModel,raw(7:end),newMets,addedIdx,'met');
144+
end
145+
end
146+
%% Genes
147+
if ~strcmp(genesInfo,'none')
148+
% Gather all data, first about reaction stoichiometries...
149+
fid = fopen(genesInfo);
150+
raw = textscan(fid,['%q' repmat(' %q',1,10)],'Delimiter','\t');
151+
fclose(fid);
152+
genesToAdd.genes = raw{1}(2:end);
153+
genesToAdd.geneShortNames = raw{2}(2:end);
154+
155+
[~,notNewGene,existingGene] = intersect(genesToAdd.genes,newModel.genes);
156+
if ~isempty(notNewGene)
157+
warning(['The following genes are already present in the model, their annotation '...
158+
'will be overwritten to match the genesInfo file: \n\t\t%s'],...
159+
strjoin(genesToAdd.genes(notNewGene),'\n\t\t'));
160+
if isfield(newModel,'geneShortNames')
161+
newModel.geneShortNames(existingGene) = genesToAdd.geneShortNames(notNewGene);
162+
end
163+
newModel = extractAndAddMiriam(newModel,raw(3:end),notNewGene,existingGene,'gene');
164+
end
165+
166+
% Continue with new genes
167+
toAdd = 1:numel(genesToAdd.genes);
168+
toAdd(notNewGene) = [];
169+
if ~isempty(toAdd)
170+
genesToAdd.genes = genesToAdd.genes(toAdd);
171+
genesToAdd.geneShortNames = genesToAdd.geneShortNames(toAdd);
172+
if all(cellfun(@isempty,genesToAdd.geneShortNames))
173+
genesToAdd = rmfield(genesToAdd,'geneShortNames');
174+
end
175+
% Add genes
176+
newModel = addGenesRaven(newModel,genesToAdd);
177+
addedIdx = numel(newModel.genes)-numel(toAdd)+1:numel(newModel.genes);
178+
newModel = extractAndAddMiriam(newModel,raw(3:end),toAdd,addedIdx,'gene');
179+
end
180+
end
181+
%% Reactions
182+
if ~any(strcmp({rxnsCoeffs,rxnsInfo},'none'))
183+
% Gather all data, first about reaction stoichiometries...
184+
fid = fopen(rxnsCoeffs);
185+
raw = textscan(fid,'%q %q %q %q %f','Delimiter','\t','HeaderLines',1);
186+
fclose(fid);
187+
rxnCheck.coeffsIdx = str2double(raw{1});
188+
rxns.rxnNames = raw{2};
189+
rxns.metNames = raw{3};
190+
rxns.comps = raw{4};
191+
rxns.coeffs = raw{5};
192+
rxnCheck.coeffs = strcat(raw{1},'***',rxns.rxnNames);
193+
194+
% ... and then additional reaction-specific data.
195+
fid = fopen(rxnsInfo);
196+
raw = textscan(fid,['%q' repmat(' %q',1,20)],'Delimiter','\t');
197+
fclose(fid);
198+
rxnCheck.rxnsIdx = str2double(raw{1}(2:end));
199+
rxnCheck.rxns = strcat(raw{1}(2:end),'***',raw{2}(2:end));
200+
201+
notMatching = setxor(rxnCheck.coeffs,rxnCheck.rxns);
202+
if numel(notMatching)>1
203+
error(['The following reactions ánd/or their indices are not matched '...
204+
'between the rxnsInfo and rxnsCoeffs files:\n\t\t%s'],...
205+
strjoin(regexprep(notMatching,'^\d+***',''),'\n\t\t'))
206+
end
207+
208+
rxnsToAdd.rxnNames = raw{2}(2:end);
209+
rxnsToAdd.grRules = raw{3}(2:end);
210+
rxnsToAdd.lb = cellfun(@str2num,raw{4}(2:end));
211+
rxnsToAdd.ub = cellfun(@str2num,raw{5}(2:end));
212+
rxnsToAdd.rev = cellfun(@str2num,raw{6}(2:end));
213+
rxnsToAdd.subSystems = raw{7}(2:end);
214+
rxnsToAdd.eccodes = raw{8}(2:end);
215+
rxnsToAdd.rxnNotes = raw{9}(2:end);
216+
rxnsToAdd.rxnReferences = raw{10}(2:end);
217+
rxnsToAdd.rxnConfidenceScores = cellfun(@str2num,raw{11}(2:end),'UniformOutput',false);
218+
emptyEntry = cellfun(@isempty,rxnsToAdd.rxnConfidenceScores);
219+
if all(emptyEntry)
220+
rxnsToAdd = rmfield(rxnsToAdd,'rxnConfidenceScores');
221+
elseif any(emptyEntry) % If some rxnConfidenceScores are given, assume 0 for those without rxnConfidenceScores specified
222+
rxnsToAdd.rxnConfidenceScores(emptyEntry) = {0};
223+
rxnsToAdd.rxnConfidenceScores = cell2mat(rxnsToAdd.rxnConfidenceScores);
224+
else
225+
rxnsToAdd.rxnConfidenceScores = cell2mat(rxnsToAdd.rxnConfidenceScores);
226+
end
227+
228+
existingRxn=[];
229+
notNewRxn=[];
230+
for i=1:numel(rxnCheck.rxnsIdx)
231+
rxnRows = find(rxnCheck.rxnsIdx(i)==rxnCheck.coeffsIdx);
232+
rxnsToAdd.mets{i,1} = cell(1,1);
233+
rxnsToAdd.stoichCoeffs{i,1} = cell(1,1);
234+
for j=1:numel(rxnRows)
235+
newMetComp = [rxns.metNames{rxnRows(j)}, '[', rxns.comps{rxnRows(j)}, ']'];
236+
try
237+
newMetComp = getIndexes(newModel,newMetComp,'metcomps');
238+
catch
239+
error(['Not all metabolites in reaction "%s" are present in the '...
240+
'model or in the provided table with new metabolites.'],rxnsToAdd.rxnNames{i})
241+
end
242+
rxnsToAdd.mets{i}(j) = newModel.mets(newMetComp);
243+
rxnsToAdd.stoichCoeffs{i}{j} = rxns.coeffs(rxnRows(j));
244+
end
245+
rxnsToAdd.stoichCoeffs{i}=cell2mat(rxnsToAdd.stoichCoeffs{i});
246+
%Check if the reaction not already exists, by stoichiometry of its
247+
%products and reactants
248+
modelCoeffs = transpose(newModel.S(getIndexes(newModel,rxnsToAdd.mets{i},'mets'),:));
249+
modelCoeffs2 = find(~all(modelCoeffs==0,2));
250+
modelCoeffs = modelCoeffs(modelCoeffs2,:);
251+
[~, duplicateRxn] = intersect(modelCoeffs,rxnsToAdd.stoichCoeffs{i},'rows');
252+
if ~isempty(duplicateRxn)
253+
existingRxn = [existingRxn,modelCoeffs2(duplicateRxn)];
254+
notNewRxn = [notNewRxn,i];
255+
end
256+
end
257+
258+
% If some reactions already existed, then replace its information
259+
if ~isempty(existingRxn)
260+
warning(['The following reactions are the same as existing model '...
261+
'reactions, their annotation will be overwritten to match the '...
262+
'rxnCoeffs file: \n\t\t(Existing reaction ID): New reaction name\n\t\t%s'],...
263+
strjoin(strcat('(',newModel.rxns(existingRxn),{'): '},rxnsToAdd.rxnNames(notNewRxn)),'\n\t\t'));
264+
newModel.rxnNames(existingRxn) = rxnsToAdd.rxnNames(notNewRxn);
265+
newModel.lb(existingRxn) = rxnsToAdd.lb(notNewRxn);
266+
newModel.ub(existingRxn) = rxnsToAdd.ub(notNewRxn);
267+
newModel.rev(existingRxn) = rxnsToAdd.rev(notNewRxn);
268+
if isfield(newModel,'subSystems')
269+
newModel.subSystems(existingRxn) = rxnsToAdd.subSystems(notNewRxn);
270+
end
271+
if isfield(newModel,'eccodes')
272+
newModel.eccodes(existingRxn) = rxnsToAdd.eccodes(notNewRxn);
273+
end
274+
if isfield(newModel,'rxnNotes')
275+
newModel.rxnNotes(existingRxn) = rxnsToAdd.rxnNotes(notNewRxn);
276+
end
277+
if isfield(newModel,'rxnConfidenceScores') && isfield(rxnsToAdd,'rxnConfidenceScores')
278+
newModel.rxnConfidenceScores(existingRxn) = rxnsToAdd.rxnConfidenceScores(notNewRxn);
279+
end
280+
emptyEntries = cellfun(@isempty,rxnsToAdd.grRules);
281+
if ~all(emptyEntries)
282+
newModel = changeGrRules(newModel,newModel.rxns(existingRxn(~emptyEntries)),rxnsToAdd.grRules(notNewRxn(~emptyEntries)),true);
283+
end
284+
newModel = extractAndAddMiriam(newModel,raw(11:end),notNewRxn,existingRxn,'rxn');
285+
end
286+
287+
% Continue with new reactions
288+
toAdd = 1:numel(rxnsToAdd.rxnNames);
289+
toAdd(notNewRxn) = [];
290+
if ~isempty(toAdd)
291+
rxnsToAdd.rxnNames = rxnsToAdd.rxnNames(toAdd);
292+
rxnsToAdd.lb = rxnsToAdd.lb(toAdd);
293+
rxnsToAdd.ub = rxnsToAdd.ub(toAdd);
294+
rxnsToAdd.rev = rxnsToAdd.rev(toAdd);
295+
rxnsToAdd.subSystems = rxnsToAdd.subSystems(toAdd);
296+
if all(cellfun(@isempty,rxnsToAdd.subSystems))
297+
rxnsToAdd = rmfield(rxnsToAdd,'subSystems');
298+
end
299+
rxnsToAdd.eccodes = rxnsToAdd.eccodes(toAdd);
300+
if all(cellfun(@isempty,rxnsToAdd.eccodes))
301+
rxnsToAdd = rmfield(rxnsToAdd,'eccodes');
302+
end
303+
rxnsToAdd.rxnNotes = rxnsToAdd.rxnNotes(toAdd);
304+
if all(cellfun(@isempty,rxnsToAdd.rxnNotes))
305+
rxnsToAdd = rmfield(rxnsToAdd,'rxnNotes');
306+
end
307+
if isfield(rxnsToAdd,'rxnConfidenceScores')
308+
rxnsToAdd.rxnConfidenceScores = rxnsToAdd.rxnConfidenceScores(toAdd);
309+
end
310+
rxnsToAdd.rxnReferences = rxnsToAdd.rxnReferences(toAdd);
311+
if all(cellfun(@isempty,rxnsToAdd.rxnReferences))
312+
rxnsToAdd = rmfield(rxnsToAdd,'rxnReferences');
313+
end
314+
rxnsToAdd.grRules = rxnsToAdd.grRules(toAdd);
315+
rxnsToAdd.mets = rxnsToAdd.mets(toAdd);
316+
rxnsToAdd.stoichCoeffs = rxnsToAdd.stoichCoeffs(toAdd);
317+
rxnsToAdd.rxns = generateNewIds(model,'rxns',rxnPrefix,numel(rxnsToAdd.rxnNames));
318+
319+
newModel = addRxns(newModel,rxnsToAdd,1,[],false,false);
320+
rxnsModelIdx = numel(newModel.rxns)-numel(toAdd)+1:numel(newModel.rxns);
321+
newModel = extractAndAddMiriam(newModel,raw(11:end),toAdd,rxnsModelIdx,'rxn');
322+
end
323+
end
324+
end
325+
326+
function newModel = extractAndAddMiriam(model,raw,inputIndex,modelIndex,type)
327+
newModel=model;
328+
miriamName = cell(numel(inputIndex),1);
329+
miriamValues = cell(1,numel(inputIndex));
330+
for i=1:numel(raw)
331+
miriamName{i} = raw{i}{1};
332+
miriamValues(1:numel(inputIndex),i) = raw{i}(inputIndex+1);
333+
end
334+
emptyMiriam = all(cellfun(@isempty,miriamValues),1);
335+
miriamName(emptyMiriam) = [];
336+
miriamValues(:,emptyMiriam) = [];
337+
if ~isfield(newModel,[type 'Miriams']);
338+
newModel.([type 'Miriams'])=cell(numel(newModel.([type 's'])),1);
339+
end
340+
if ~isempty(miriamName)
341+
for i=1:numel(miriamName)
342+
newModel = editMiriam(newModel,type,modelIndex,miriamName{i},miriamValues(:,i),'replace');
343+
end
344+
end
345+
end

0 commit comments

Comments
 (0)