|
| 1 | +function newModel=curateModelFromTables(model,metsInfo,genesInfo,rxnsCoeffs,rxnsInfo,metPrefix,rxnPrefix) |
| 2 | +% curateModelFromTables |
| 3 | +% Curate existing and/or add new metabolites, reactions and genes |
| 4 | +% from tabular data files. Originally extracted from yeast-GEM's |
| 5 | +% curateMetsRxnsGenes; generalised here so any GEM project can drive |
| 6 | +% batch curation from the same set of *.tsv files. |
| 7 | +% |
| 8 | +% If the *.tsv files contain metabolites, reactions and/or genes that are |
| 9 | +% already present in the model, then information in the model will be |
| 10 | +% overwritten. Note that this includes empty annotations in the *.tsv |
| 11 | +% files! Metabolites are matched by metaboliteName[comp]; reactions by |
| 12 | +% the stoichiometry of its reactants and products; genes by their gene |
| 13 | +% name. This function can therefore be used to add new entities in the |
| 14 | +% model, or curate those already existing in the model. |
| 15 | +% |
| 16 | +% Input: |
| 17 | +% model RAVEN model structure to be curated. |
| 18 | +% metsInfo path to a *.tsv file with metabolite information, or |
| 19 | +% 'none' to skip metabolite curation. Columns: |
| 20 | +% metNames, comps, formula, charge, inchi, metNotes, |
| 21 | +% then any number of MIRIAM-namespace columns. |
| 22 | +% genesInfo path to a *.tsv file with gene information, or |
| 23 | +% 'none'. Columns: genes, geneShortNames, then MIRIAM. |
| 24 | +% rxnsCoeffs path to a *.tsv file with reaction stoichiometric |
| 25 | +% coefficients, or 'none'. Columns: rxnIdx, rxnNames, |
| 26 | +% metNames, comps, coefficient. One row per |
| 27 | +% (reaction, metabolite) pair. |
| 28 | +% rxnsInfo path to a *.tsv file with reaction information, or |
| 29 | +% 'none'. Columns: rxnIdx, rxnNames, grRules, lb, ub, |
| 30 | +% rev, subSystems, eccodes, rxnNotes, rxnReferences, |
| 31 | +% rxnConfidenceScores, then MIRIAM. |
| 32 | +% metPrefix prefix used to mint fresh metabolite ids (e.g. 's_' |
| 33 | +% for yeast-GEM, 'M_' for the cobrapy/BiGG default). |
| 34 | +% Default: 'M_'. |
| 35 | +% rxnPrefix prefix used to mint fresh reaction ids. Default: 'R_'. |
| 36 | +% |
| 37 | +% Output: |
| 38 | +% newModel curated RAVEN model structure. |
| 39 | +% |
| 40 | +% The 'everything after the core columns is MIRIAM' convention applies |
| 41 | +% to all three info tables: any column whose header is not one of the |
| 42 | +% listed core fields is treated as a MIRIAM annotation namespace and |
| 43 | +% stored on the matching entity. |
| 44 | +% |
| 45 | +% Usage: newModel = curateModelFromTables(model, metsInfo, genesInfo, ... |
| 46 | +% rxnsCoeffs, rxnsInfo, metPrefix, rxnPrefix) |
| 47 | + |
| 48 | +if nargin==4 |
| 49 | + error('Provide both a ''rxnsInfo'' and a ''rxnsCoeffs'' file') |
| 50 | +end |
| 51 | +if nargin<4 |
| 52 | + rxnsInfo='none'; |
| 53 | + rxnsCoeffs='none'; |
| 54 | +end |
| 55 | +if nargin<3 |
| 56 | + genesInfo='none'; |
| 57 | +end |
| 58 | +if nargin<6 || isempty(metPrefix) |
| 59 | + metPrefix = 'M_'; |
| 60 | +end |
| 61 | +if nargin<7 || isempty(rxnPrefix) |
| 62 | + rxnPrefix = 'R_'; |
| 63 | +end |
| 64 | +newModel=model; |
| 65 | + |
| 66 | +%% Metabolites |
| 67 | +if ~strcmp(metsInfo,'none') |
| 68 | + fid = fopen(metsInfo); |
| 69 | + raw = textscan(fid,['%q' repmat(' %q',1,17)],'Delimiter','\t'); |
| 70 | + fclose(fid); |
| 71 | + metsToAdd.metNames = raw{1}(2:end); |
| 72 | + metsToAdd.compartments = raw{2}(2:end); |
| 73 | + metsToAdd.metFormulas = raw{3}(2:end); |
| 74 | + metsToAdd.metCharges = cellfun(@str2num,raw{4}(2:end),'UniformOutput',false); |
| 75 | + emptyEntry = cellfun(@isempty,metsToAdd.metCharges); |
| 76 | + if all(emptyEntry) |
| 77 | + metsToAdd = rmfield(metsToAdd,'metCharges'); |
| 78 | + elseif any(emptyEntry) % If some charges are given, assume 0 for those without charges specified |
| 79 | + metsToAdd.metCharges(emptyEntry) = {0}; |
| 80 | + metsToAdd.metCharges = cell2mat(metsToAdd.metCharges); |
| 81 | + else |
| 82 | + metsToAdd.metCharges = cell2mat(metsToAdd.metCharges); |
| 83 | + end |
| 84 | + metsToAdd.inchis = raw{5}(2:end); |
| 85 | + metsToAdd.metNotes = raw{6}(2:end); |
| 86 | + |
| 87 | + % Check if metabolite already exists (check by metName[comp]) |
| 88 | + existingMets = []; |
| 89 | + existingMetsIdx = []; |
| 90 | + newMets = []; |
| 91 | + for i=1:numel(metsToAdd.metNames) |
| 92 | + metIdx = find(strcmp(newModel.metNames,metsToAdd.metNames{i})); |
| 93 | + existMet = strcmp(metsToAdd.compartments{i},newModel.comps(newModel.metComps(metIdx))); |
| 94 | + if any(existMet) |
| 95 | + existingMets = [existingMets, i]; |
| 96 | + existingMetsIdx = [existingMetsIdx, metIdx(existMet)]; |
| 97 | + else |
| 98 | + newMets = [newMets, i]; |
| 99 | + end |
| 100 | + end |
| 101 | + |
| 102 | + % Overwrite annotation of existing entries |
| 103 | + if any(existingMets) |
| 104 | + if isfield(newModel,'metFormulas') |
| 105 | + newModel.metFormulas(existingMetsIdx) = metsToAdd.metFormulas(existingMets); |
| 106 | + end |
| 107 | + if isfield(newModel,'metCharges') && isfield(metsToAdd,'metCharges') |
| 108 | + newModel.metCharges(existingMetsIdx) = metsToAdd.metCharges(existingMets); |
| 109 | + end |
| 110 | + if isfield(newModel,'inchis') |
| 111 | + newModel.inchis(existingMetsIdx) = metsToAdd.inchis(existingMets); |
| 112 | + end |
| 113 | + newModel = extractAndAddMiriam(newModel,raw(7:end),existingMets,existingMetsIdx,'met'); |
| 114 | + warning(['The following metabolites are already present in the model, '... |
| 115 | + 'their annotation will be overwritten to match the metsInfo file. '... |
| 116 | + 'If you do not particularly want to curate their annotations, it '... |
| 117 | + 'would be better to removes these metabolites from metsInfo:\n\t\t%s'],... |
| 118 | + strjoin(strcat(metsToAdd.metNames(existingMets),'[',metsToAdd.compartments(existingMets),']'),'\n\t\t')); |
| 119 | + end |
| 120 | + |
| 121 | + % Continue with new metabolites |
| 122 | + if numel(newMets)>0 |
| 123 | + metsToAdd.metNames(existingMets) = []; |
| 124 | + metsToAdd.compartments(existingMets) = []; |
| 125 | + metsToAdd.metFormulas(existingMets) = []; |
| 126 | + if all(cellfun(@isempty,metsToAdd.metFormulas)) |
| 127 | + metsToAdd = rmfield(metsToAdd,'metFormulas'); |
| 128 | + end |
| 129 | + if isfield(metsToAdd,'metCharges') |
| 130 | + metsToAdd.metCharges(existingMets) = []; |
| 131 | + end |
| 132 | + metsToAdd.inchis(existingMets) = []; |
| 133 | + if all(cellfun(@isempty,metsToAdd.inchis)) |
| 134 | + metsToAdd = rmfield(metsToAdd,'inchis'); |
| 135 | + end |
| 136 | + metsToAdd.metNotes(existingMets) = []; |
| 137 | + if all(cellfun(@isempty,metsToAdd.metNotes)) |
| 138 | + metsToAdd = rmfield(metsToAdd,'metNotes'); |
| 139 | + end |
| 140 | + % Add metabolites |
| 141 | + newModel = addMets(newModel,metsToAdd,true,metPrefix); |
| 142 | + addedIdx = numel(newModel.mets)-numel(newMets)+1:numel(newModel.mets); |
| 143 | + newModel = extractAndAddMiriam(newModel,raw(7:end),newMets,addedIdx,'met'); |
| 144 | + end |
| 145 | +end |
| 146 | +%% Genes |
| 147 | +if ~strcmp(genesInfo,'none') |
| 148 | + % Gather all data, first about reaction stoichiometries... |
| 149 | + fid = fopen(genesInfo); |
| 150 | + raw = textscan(fid,['%q' repmat(' %q',1,10)],'Delimiter','\t'); |
| 151 | + fclose(fid); |
| 152 | + genesToAdd.genes = raw{1}(2:end); |
| 153 | + genesToAdd.geneShortNames = raw{2}(2:end); |
| 154 | + |
| 155 | + [~,notNewGene,existingGene] = intersect(genesToAdd.genes,newModel.genes); |
| 156 | + if ~isempty(notNewGene) |
| 157 | + warning(['The following genes are already present in the model, their annotation '... |
| 158 | + 'will be overwritten to match the genesInfo file: \n\t\t%s'],... |
| 159 | + strjoin(genesToAdd.genes(notNewGene),'\n\t\t')); |
| 160 | + if isfield(newModel,'geneShortNames') |
| 161 | + newModel.geneShortNames(existingGene) = genesToAdd.geneShortNames(notNewGene); |
| 162 | + end |
| 163 | + newModel = extractAndAddMiriam(newModel,raw(3:end),notNewGene,existingGene,'gene'); |
| 164 | + end |
| 165 | + |
| 166 | + % Continue with new genes |
| 167 | + toAdd = 1:numel(genesToAdd.genes); |
| 168 | + toAdd(notNewGene) = []; |
| 169 | + if ~isempty(toAdd) |
| 170 | + genesToAdd.genes = genesToAdd.genes(toAdd); |
| 171 | + genesToAdd.geneShortNames = genesToAdd.geneShortNames(toAdd); |
| 172 | + if all(cellfun(@isempty,genesToAdd.geneShortNames)) |
| 173 | + genesToAdd = rmfield(genesToAdd,'geneShortNames'); |
| 174 | + end |
| 175 | + % Add genes |
| 176 | + newModel = addGenesRaven(newModel,genesToAdd); |
| 177 | + addedIdx = numel(newModel.genes)-numel(toAdd)+1:numel(newModel.genes); |
| 178 | + newModel = extractAndAddMiriam(newModel,raw(3:end),toAdd,addedIdx,'gene'); |
| 179 | + end |
| 180 | +end |
| 181 | +%% Reactions |
| 182 | +if ~any(strcmp({rxnsCoeffs,rxnsInfo},'none')) |
| 183 | + % Gather all data, first about reaction stoichiometries... |
| 184 | + fid = fopen(rxnsCoeffs); |
| 185 | + raw = textscan(fid,'%q %q %q %q %f','Delimiter','\t','HeaderLines',1); |
| 186 | + fclose(fid); |
| 187 | + rxnCheck.coeffsIdx = str2double(raw{1}); |
| 188 | + rxns.rxnNames = raw{2}; |
| 189 | + rxns.metNames = raw{3}; |
| 190 | + rxns.comps = raw{4}; |
| 191 | + rxns.coeffs = raw{5}; |
| 192 | + rxnCheck.coeffs = strcat(raw{1},'***',rxns.rxnNames); |
| 193 | + |
| 194 | + % ... and then additional reaction-specific data. |
| 195 | + fid = fopen(rxnsInfo); |
| 196 | + raw = textscan(fid,['%q' repmat(' %q',1,20)],'Delimiter','\t'); |
| 197 | + fclose(fid); |
| 198 | + rxnCheck.rxnsIdx = str2double(raw{1}(2:end)); |
| 199 | + rxnCheck.rxns = strcat(raw{1}(2:end),'***',raw{2}(2:end)); |
| 200 | + |
| 201 | + notMatching = setxor(rxnCheck.coeffs,rxnCheck.rxns); |
| 202 | + if numel(notMatching)>1 |
| 203 | + error(['The following reactions ánd/or their indices are not matched '... |
| 204 | + 'between the rxnsInfo and rxnsCoeffs files:\n\t\t%s'],... |
| 205 | + strjoin(regexprep(notMatching,'^\d+***',''),'\n\t\t')) |
| 206 | + end |
| 207 | + |
| 208 | + rxnsToAdd.rxnNames = raw{2}(2:end); |
| 209 | + rxnsToAdd.grRules = raw{3}(2:end); |
| 210 | + rxnsToAdd.lb = cellfun(@str2num,raw{4}(2:end)); |
| 211 | + rxnsToAdd.ub = cellfun(@str2num,raw{5}(2:end)); |
| 212 | + rxnsToAdd.rev = cellfun(@str2num,raw{6}(2:end)); |
| 213 | + rxnsToAdd.subSystems = raw{7}(2:end); |
| 214 | + rxnsToAdd.eccodes = raw{8}(2:end); |
| 215 | + rxnsToAdd.rxnNotes = raw{9}(2:end); |
| 216 | + rxnsToAdd.rxnReferences = raw{10}(2:end); |
| 217 | + rxnsToAdd.rxnConfidenceScores = cellfun(@str2num,raw{11}(2:end),'UniformOutput',false); |
| 218 | + emptyEntry = cellfun(@isempty,rxnsToAdd.rxnConfidenceScores); |
| 219 | + if all(emptyEntry) |
| 220 | + rxnsToAdd = rmfield(rxnsToAdd,'rxnConfidenceScores'); |
| 221 | + elseif any(emptyEntry) % If some rxnConfidenceScores are given, assume 0 for those without rxnConfidenceScores specified |
| 222 | + rxnsToAdd.rxnConfidenceScores(emptyEntry) = {0}; |
| 223 | + rxnsToAdd.rxnConfidenceScores = cell2mat(rxnsToAdd.rxnConfidenceScores); |
| 224 | + else |
| 225 | + rxnsToAdd.rxnConfidenceScores = cell2mat(rxnsToAdd.rxnConfidenceScores); |
| 226 | + end |
| 227 | + |
| 228 | + existingRxn=[]; |
| 229 | + notNewRxn=[]; |
| 230 | + for i=1:numel(rxnCheck.rxnsIdx) |
| 231 | + rxnRows = find(rxnCheck.rxnsIdx(i)==rxnCheck.coeffsIdx); |
| 232 | + rxnsToAdd.mets{i,1} = cell(1,1); |
| 233 | + rxnsToAdd.stoichCoeffs{i,1} = cell(1,1); |
| 234 | + for j=1:numel(rxnRows) |
| 235 | + newMetComp = [rxns.metNames{rxnRows(j)}, '[', rxns.comps{rxnRows(j)}, ']']; |
| 236 | + try |
| 237 | + newMetComp = getIndexes(newModel,newMetComp,'metcomps'); |
| 238 | + catch |
| 239 | + error(['Not all metabolites in reaction "%s" are present in the '... |
| 240 | + 'model or in the provided table with new metabolites.'],rxnsToAdd.rxnNames{i}) |
| 241 | + end |
| 242 | + rxnsToAdd.mets{i}(j) = newModel.mets(newMetComp); |
| 243 | + rxnsToAdd.stoichCoeffs{i}{j} = rxns.coeffs(rxnRows(j)); |
| 244 | + end |
| 245 | + rxnsToAdd.stoichCoeffs{i}=cell2mat(rxnsToAdd.stoichCoeffs{i}); |
| 246 | + %Check if the reaction not already exists, by stoichiometry of its |
| 247 | + %products and reactants |
| 248 | + modelCoeffs = transpose(newModel.S(getIndexes(newModel,rxnsToAdd.mets{i},'mets'),:)); |
| 249 | + modelCoeffs2 = find(~all(modelCoeffs==0,2)); |
| 250 | + modelCoeffs = modelCoeffs(modelCoeffs2,:); |
| 251 | + [~, duplicateRxn] = intersect(modelCoeffs,rxnsToAdd.stoichCoeffs{i},'rows'); |
| 252 | + if ~isempty(duplicateRxn) |
| 253 | + existingRxn = [existingRxn,modelCoeffs2(duplicateRxn)]; |
| 254 | + notNewRxn = [notNewRxn,i]; |
| 255 | + end |
| 256 | + end |
| 257 | + |
| 258 | + % If some reactions already existed, then replace its information |
| 259 | + if ~isempty(existingRxn) |
| 260 | + warning(['The following reactions are the same as existing model '... |
| 261 | + 'reactions, their annotation will be overwritten to match the '... |
| 262 | + 'rxnCoeffs file: \n\t\t(Existing reaction ID): New reaction name\n\t\t%s'],... |
| 263 | + strjoin(strcat('(',newModel.rxns(existingRxn),{'): '},rxnsToAdd.rxnNames(notNewRxn)),'\n\t\t')); |
| 264 | + newModel.rxnNames(existingRxn) = rxnsToAdd.rxnNames(notNewRxn); |
| 265 | + newModel.lb(existingRxn) = rxnsToAdd.lb(notNewRxn); |
| 266 | + newModel.ub(existingRxn) = rxnsToAdd.ub(notNewRxn); |
| 267 | + newModel.rev(existingRxn) = rxnsToAdd.rev(notNewRxn); |
| 268 | + if isfield(newModel,'subSystems') |
| 269 | + newModel.subSystems(existingRxn) = rxnsToAdd.subSystems(notNewRxn); |
| 270 | + end |
| 271 | + if isfield(newModel,'eccodes') |
| 272 | + newModel.eccodes(existingRxn) = rxnsToAdd.eccodes(notNewRxn); |
| 273 | + end |
| 274 | + if isfield(newModel,'rxnNotes') |
| 275 | + newModel.rxnNotes(existingRxn) = rxnsToAdd.rxnNotes(notNewRxn); |
| 276 | + end |
| 277 | + if isfield(newModel,'rxnConfidenceScores') && isfield(rxnsToAdd,'rxnConfidenceScores') |
| 278 | + newModel.rxnConfidenceScores(existingRxn) = rxnsToAdd.rxnConfidenceScores(notNewRxn); |
| 279 | + end |
| 280 | + emptyEntries = cellfun(@isempty,rxnsToAdd.grRules); |
| 281 | + if ~all(emptyEntries) |
| 282 | + newModel = changeGrRules(newModel,newModel.rxns(existingRxn(~emptyEntries)),rxnsToAdd.grRules(notNewRxn(~emptyEntries)),true); |
| 283 | + end |
| 284 | + newModel = extractAndAddMiriam(newModel,raw(11:end),notNewRxn,existingRxn,'rxn'); |
| 285 | + end |
| 286 | + |
| 287 | + % Continue with new reactions |
| 288 | + toAdd = 1:numel(rxnsToAdd.rxnNames); |
| 289 | + toAdd(notNewRxn) = []; |
| 290 | + if ~isempty(toAdd) |
| 291 | + rxnsToAdd.rxnNames = rxnsToAdd.rxnNames(toAdd); |
| 292 | + rxnsToAdd.lb = rxnsToAdd.lb(toAdd); |
| 293 | + rxnsToAdd.ub = rxnsToAdd.ub(toAdd); |
| 294 | + rxnsToAdd.rev = rxnsToAdd.rev(toAdd); |
| 295 | + rxnsToAdd.subSystems = rxnsToAdd.subSystems(toAdd); |
| 296 | + if all(cellfun(@isempty,rxnsToAdd.subSystems)) |
| 297 | + rxnsToAdd = rmfield(rxnsToAdd,'subSystems'); |
| 298 | + end |
| 299 | + rxnsToAdd.eccodes = rxnsToAdd.eccodes(toAdd); |
| 300 | + if all(cellfun(@isempty,rxnsToAdd.eccodes)) |
| 301 | + rxnsToAdd = rmfield(rxnsToAdd,'eccodes'); |
| 302 | + end |
| 303 | + rxnsToAdd.rxnNotes = rxnsToAdd.rxnNotes(toAdd); |
| 304 | + if all(cellfun(@isempty,rxnsToAdd.rxnNotes)) |
| 305 | + rxnsToAdd = rmfield(rxnsToAdd,'rxnNotes'); |
| 306 | + end |
| 307 | + if isfield(rxnsToAdd,'rxnConfidenceScores') |
| 308 | + rxnsToAdd.rxnConfidenceScores = rxnsToAdd.rxnConfidenceScores(toAdd); |
| 309 | + end |
| 310 | + rxnsToAdd.rxnReferences = rxnsToAdd.rxnReferences(toAdd); |
| 311 | + if all(cellfun(@isempty,rxnsToAdd.rxnReferences)) |
| 312 | + rxnsToAdd = rmfield(rxnsToAdd,'rxnReferences'); |
| 313 | + end |
| 314 | + rxnsToAdd.grRules = rxnsToAdd.grRules(toAdd); |
| 315 | + rxnsToAdd.mets = rxnsToAdd.mets(toAdd); |
| 316 | + rxnsToAdd.stoichCoeffs = rxnsToAdd.stoichCoeffs(toAdd); |
| 317 | + rxnsToAdd.rxns = generateNewIds(model,'rxns',rxnPrefix,numel(rxnsToAdd.rxnNames)); |
| 318 | + |
| 319 | + newModel = addRxns(newModel,rxnsToAdd,1,[],false,false); |
| 320 | + rxnsModelIdx = numel(newModel.rxns)-numel(toAdd)+1:numel(newModel.rxns); |
| 321 | + newModel = extractAndAddMiriam(newModel,raw(11:end),toAdd,rxnsModelIdx,'rxn'); |
| 322 | + end |
| 323 | +end |
| 324 | +end |
| 325 | + |
| 326 | +function newModel = extractAndAddMiriam(model,raw,inputIndex,modelIndex,type) |
| 327 | +newModel=model; |
| 328 | +miriamName = cell(numel(inputIndex),1); |
| 329 | +miriamValues = cell(1,numel(inputIndex)); |
| 330 | +for i=1:numel(raw) |
| 331 | + miriamName{i} = raw{i}{1}; |
| 332 | + miriamValues(1:numel(inputIndex),i) = raw{i}(inputIndex+1); |
| 333 | +end |
| 334 | +emptyMiriam = all(cellfun(@isempty,miriamValues),1); |
| 335 | +miriamName(emptyMiriam) = []; |
| 336 | +miriamValues(:,emptyMiriam) = []; |
| 337 | +if ~isfield(newModel,[type 'Miriams']); |
| 338 | + newModel.([type 'Miriams'])=cell(numel(newModel.([type 's'])),1); |
| 339 | +end |
| 340 | +if ~isempty(miriamName) |
| 341 | + for i=1:numel(miriamName) |
| 342 | + newModel = editMiriam(newModel,type,modelIndex,miriamName{i},miriamValues(:,i),'replace'); |
| 343 | + end |
| 344 | +end |
| 345 | +end |
0 commit comments