Skip to content

Commit f595c49

Browse files
authored
chore: updated with KEGG116 (#598)
* chore: update MAFFT to 7.526 * chore: update HMMER to 3.4.0 * chore: update DIAMOND to 2.1.17 * chore: update BLAST+ to 2.17.0 * fix: use HMMER via WSL2 * chore: update kegg*.mat files with KEGG 116 * feat: getKEGGModelForOrganism refer to new HMMs * feat: getKEGGModelForOrganism HMM zip URL * fix: exportModel concatenate subSystems for groups
1 parent c246045 commit f595c49

114 files changed

Lines changed: 1666 additions & 1437 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ mafft binary
1010
.gitattributes export-ignore
1111
.gitignore export-ignore
1212
.github export-ignore
13+
*.mat filter=lfs diff=lfs merge=lfs -text

doc/external/kegg/getKEGGModelForOrganism.html

Lines changed: 523 additions & 508 deletions
Large diffs are not rendered by default.

doc/external/kegg/getModelFromKEGG.html

Lines changed: 243 additions & 242 deletions
Large diffs are not rendered by default.

doc/external/kegg/getRxnsFromKEGG.html

Lines changed: 430 additions & 429 deletions
Large diffs are not rendered by default.

doc/io/exportModel.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -631,7 +631,7 @@ <h2><a name="_source"></a>SOURCE CODE <a href="#_top"><img alt="^" border="0" sr
631631
0568 model.subSystems = cellfun(@(c) cellfun(@char, c, <span class="string">'UniformOutput'</span>, false), model.subSystems, <span class="string">'UniformOutput'</span>, false);
632632
0569
633633
0570 <span class="comment">% === 2) Flatten once: names and their reaction indices (vectorized) ===</span>
634-
0571 flatNames = [model.subSystems{:}]; <span class="comment">% 1×M cellstr of all subsystem labels</span>
634+
0571 flatNames = vertcat(model.subSystems{:}); <span class="comment">% 1×M cellstr of all subsystem labels</span>
635635
0572 <span class="keyword">if</span> isempty(flatNames)
636636
0573 <span class="comment">% Nothing to do: no subsystems present</span>
637637
0574 <span class="keyword">return</span>

doc/testing/unit_tests/hmmerTests.html

Lines changed: 128 additions & 116 deletions
Large diffs are not rendered by default.

doc/tutorial/index.html

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,7 @@ <h2>Matlab files in this directory:</h2>
2424
<h2>Other Matlab-specific files in this directory:</h2>
2525
<ul style="list-style-image:url(../matlabicon.gif)">
2626
<li>empty.mat</li><li>iMK1208+suppInfo.mat</li><li>pathway.mat</li><li>pcPathway.mat</li></ul>
27-
<h2>Subsequent directories:</h2>
28-
<ul style="list-style-image:url(../matlabicon.gif)">
29-
<li>struct_conversion</li></ul>
27+
3028

3129
<hr><address>Generated by <strong><a href="http://www.artefact.tk/software/matlab/m2html/" title="Matlab Documentation in HTML">m2html</a></strong> &copy; 2005</address>
3230
</body>

external/kegg/getKEGGModelForOrganism.m

Lines changed: 70 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
% The hidden Markov models as generated in 2b or
3939
% downloaded from BioMet Toolbox (see below)
4040
% The final directory in dataDir should be styled as
41-
% prok90_kegg105 or euk90_kegg105, indicating whether
41+
% prok90_kegg116 or euk90_kegg116, indicating whether
4242
% the HMMs were trained on pro- or eukaryotic
4343
% sequences; using which sequence similarity threshold
4444
% (first set of digits); using which KEGG version
@@ -99,12 +99,12 @@
9999
% If -1 is provided, CD-HIT is skipped (optional, default 0.9)
100100
% globalModel structure containing both model and KOModel
101101
% structures as generated by getModelFromKEGG. These
102-
% will otherwise be loaded by via getModelFromKEGG.
102+
% will otherwise be loaded via getModelFromKEGG.
103103
% Providing globalKEGGmodel can speed up model
104104
% generation if getKEGGModelForOrganism is run
105105
% multiple times for different strains. Example:
106106
% [globalModel.model,globalModel.KOModel] = getModelFromKEGG;
107-
% (optional, default empty, global model is loaded by
107+
% (optional, default empty, global model is loaded by
108108
% getModelFromKEGG)
109109
%
110110
% Output:
@@ -259,36 +259,36 @@
259259
else
260260
outDir=char(outDir);
261261
end
262-
if nargin<5
262+
if nargin<5 || isempty(keepSpontaneous)
263263
keepSpontaneous=true;
264264
end
265-
if nargin<6
265+
if nargin<6 || isempty(keepUndefinedStoich)
266266
keepUndefinedStoich=true;
267267
end
268-
if nargin<7
268+
if nargin<7 || isempty(keepIncomplete)
269269
keepIncomplete=true;
270270
end
271-
if nargin<8
271+
if nargin<8 || isempty(keepGeneral)
272272
keepGeneral=false;
273273
end
274-
if nargin<9
274+
if nargin<9 || isempty(cutOff)
275275
cutOff=10^-50;
276276
end
277-
if nargin<10
277+
if nargin<10 || isempty(minScoreRatioKO)
278278
minScoreRatioKO=0.3;
279279
end
280-
if nargin<11
280+
if nargin<11 || isempty(minScoreRatioG)
281281
minScoreRatioG=0.8;
282282
end
283-
if nargin<12
283+
if nargin<12 || isempty(maxPhylDist)
284284
maxPhylDist=inf;
285285
%Include all sequences for each reaction
286286
end
287-
if nargin<13
287+
if nargin<13 || isempty(nSequences)
288288
nSequences=inf;
289289
%Include all sequences for each reaction
290290
end
291-
if nargin<14
291+
if nargin<14 || isempty(seqIdentity)
292292
seqIdentity=0.9;
293293
end
294294

@@ -315,9 +315,9 @@
315315
%required zip file already in working directory or have it extracted. If
316316
%the zip file and directory is not here, it is downloaded from the cloud
317317
if ~isempty(dataDir)
318-
hmmOptions={'euk90_kegg105','prok90_kegg105'};
318+
hmmOptions={'euk90_kegg116','prok90_kegg116'};
319319
if ~endsWith(dataDir,hmmOptions) %Check if dataDir ends with any of the hmmOptions.
320-
%If not, then check whether the required folders exist anyway.
320+
%If not, then check whether the required folders exist anyway.
321321
if ~isfile(fullfile(dataDir,'keggdb','genes.pep')) && ...
322322
~isfolder(fullfile(dataDir,'fasta')) && ...
323323
~isfolder(fullfile(dataDir,'aligned')) && ...
@@ -339,14 +339,14 @@
339339
else
340340
fprintf('Downloading the HMMs archive file... ');
341341
try
342-
websave([dataDir,'.zip'],['https://github.com/SysBioChalmers/RAVEN/releases/download/v2.8.0/',hmmOptions{hmmIndex},'.zip']);
342+
websave([dataDir,'.zip'],['https://github.com/SysBioChalmers/RAVEN/releases/download/v2.11.0/',hmmOptions{hmmIndex},'.zip']);
343343
catch ME
344344
if strcmp(ME.identifier,'MATLAB:webservices:HTTP404StatusCodeError')
345345
error('Failed to download the HMMs archive file, the server returned a 404 error, try again later. If the problem persists please report it on the RAVEN GitHub Issues page: https://github.com/SysBioChalmers/RAVEN/issues')
346346
end
347347
end
348348
end
349-
349+
350350
fprintf('COMPLETE\n');
351351
fprintf('Extracting the HMMs archive file... ');
352352
unzip([dataDir,'.zip']);
@@ -406,7 +406,7 @@
406406
if ~ismember(organismID,[phylDistsFull.ids 'eukaryotes' 'prokaryotes'])
407407
error('Provided organismID is incorrect. Only species abbreviations from KEGG Species List or "eukaryotes"/"prokaryotes" are allowed.');
408408
end
409-
409+
410410
fprintf(['Pruning the model from <strong>non-' organismID '</strong> genes... ']);
411411
if ismember(organismID,{'eukaryotes','prokaryotes'})
412412
phylDists=getPhylDist(fullfile(dataDir,'keggdb'),maxPhylDist==-1);
@@ -552,6 +552,20 @@
552552
return
553553
end
554554

555+
tmpFile=tempname;
556+
%On Windows, paths need to be translated to Unix format before passing them to WSL
557+
if ispc
558+
wslPath.tmpFile=getWSLpath(tmpFile);
559+
%mafft has problems writing to terminal (/dev/stderr) when running
560+
%on WSL via MATLAB, instead write and read progress file
561+
mafftOutput = tempname;
562+
wslPath.mafftOutput=getWSLpath(mafftOutput);
563+
wslPath.mafft=getWSLpath(fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat'));
564+
wslPath.hmmbuild=getWSLpath(fullfile(ravenPath,'software','hmmer','hmmbuild'));
565+
wslPath.hmmsearch=getWSLpath(fullfile(ravenPath,'software','hmmer','hmmsearch'));
566+
wslPath.cdhit=getWSLpath(fullfile(ravenPath,'software','cd-hit','cd-hit'));
567+
end
568+
555569
%Check if alignment of FASTA files should be performed
556570
missingAligned=setdiff(KOModel.rxns,[alignedFiles;hmmFiles;alignedWorking;outFiles]);
557571
if ~isempty(missingAligned)
@@ -561,18 +575,7 @@
561575
fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... 0%% complete');
562576
end
563577
missingAligned=missingAligned(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingAligned)));
564-
tmpFile=tempname;
565-
%On Windows, paths need to be translated to Unix before parsing it to WSL
566-
if ispc
567-
wslPath.tmpFile=getWSLpath(tmpFile);
568-
%mafft has problems writing to terminal (/dev/stderr) when running
569-
%on WSL via MATLAB, instead write and read progress file
570-
mafftOutput = tempname;
571-
wslPath.mafftOutput=getWSLpath(mafftOutput);
572-
wslPath.mafft=getWSLpath(fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat'));
573-
wslPath.cdhit=getWSLpath(fullfile(ravenPath,'software','cd-hit','cd-hit'));
574-
end
575-
578+
576579
for i=1:numel(missingAligned)
577580
%This is checked here because it could be that it is created by a
578581
%parallel process. The faw-files are saved as temporary files to
@@ -587,7 +590,7 @@
587590
dispEM(EM,false);
588591
continue;
589592
end
590-
593+
591594
%If the multi-FASTA file is empty then save an empty aligned
592595
%file and continue
593596
s=dir(fullfile(dataDir,'fasta',[missingAligned{i} '.fa']));
@@ -596,17 +599,17 @@
596599
fclose(fid);
597600
continue;
598601
end
599-
602+
600603
%Create an empty file to prevent other threads from starting to work
601604
%on the same alignment
602605
fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'w');
603606
fclose(fid);
604-
607+
605608
%First load the FASTA file, then select up to nSequences
606609
%sequences of the most closely related species, apply any
607610
%constraints from maxPhylDist, and save it as a temporary file,
608611
%and create the model from that
609-
612+
610613
fastaStruct=fastaread(fullfile(dataDir,'fasta',[missingAligned{i} '.fa']));
611614
phylDist=inf(numel(fastaStruct),1);
612615
for j=1:numel(fastaStruct)
@@ -620,24 +623,24 @@
620623
end
621624
end
622625
end
623-
626+
624627
%Inf means that it should not be included
625628
phylDist(phylDist>maxPhylDist)=[];
626-
629+
627630
%Sort based on phylDist
628631
[~, order]=sort(phylDist);
629-
632+
630633
%Save the first nSequences hits to a temporary FASTA file
631634
if nSequences<=numel(fastaStruct)
632635
fastaStruct=fastaStruct(order(1:nSequences));
633636
else
634637
fastaStruct=fastaStruct(order);
635638
end
636-
639+
637640
%Do the clustering and alignment if there are more than one
638641
%sequences, otherwise just save the sequence (or an empty file)
639642
if numel(fastaStruct)>1
640-
if seqIdentity~=-1
643+
if seqIdentity~=-1
641644
cdhitInpCustom=tempname;
642645
fastawrite(cdhitInpCustom,fastaStruct);
643646
if seqIdentity<=1 && seqIdentity>0.7
@@ -712,7 +715,7 @@
712715
end
713716
%Move the temporary file to the real one
714717
movefile(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'f');
715-
718+
716719
%Print the progress every 25 files
717720
if rem(i-1,25) == 0
718721
progress=num2str(floor(100*numel(listFiles(fullfile(dataDir,'aligned','*.fa')))/numel(KOModel.rxns)));
@@ -750,7 +753,7 @@
750753
dispEM(EM,false);
751754
continue;
752755
end
753-
756+
754757
%If the multi-FASTA file is empty then save an empty aligned
755758
%file and continue
756759
s=dir(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']));
@@ -763,14 +766,20 @@
763766
%KO. This is because hmmbuild cannot overwrite existing files
764767
fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw']),'w');
765768
fclose(fid);
766-
769+
767770
%Create HMM
768-
[status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']) '" "' fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']) '"']);
771+
if ismac || isunix
772+
[status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']) '" "' fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']) '"']);
773+
else
774+
wslPath.hmmFile = getWSLpath(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']));
775+
wslPath.alignFile = getWSLpath(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']));
776+
[status, output] = system(['wsl "' wslPath.hmmbuild '" --cpu "' num2str(cores) '" "' wslPath.hmmFile '" "' wslPath.alignFile '"']);
777+
end
769778
if status~=0
770779
EM=['Error when training HMM for ' missingHMMs{i} ':\n' output];
771780
dispEM(EM);
772781
end
773-
782+
774783
%Delete the temporary file
775784
delete(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw']));
776785

@@ -805,30 +814,36 @@
805814
dispEM(EM,false);
806815
continue;
807816
end
808-
817+
809818
%Save an empty file to prevent several threads working on the
810819
%same file
811820
fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w');
812821
fclose(fid);
813-
822+
814823
%If the HMM file is empty then save an out file and continue
815824
s=dir(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']));
816825
if s.bytes<=0
817826
continue;
818827
end
819-
828+
820829
%Check each gene in the input file against this model
821-
[status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']) '" "' fastaFile '"']);
830+
if ismac || isunix
831+
[status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']) '" "' fastaFile '"']);
832+
else
833+
wslPath.hmmFile = getWSLpath(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']));
834+
wslPath.fastaFile = getWSLpath(fastaFile);
835+
[status, output]=system(['wsl "' wslPath.hmmsearch '" --cpu "' num2str(cores) '" "' wslPath.hmmFile '" "' wslPath.fastaFile '"']);
836+
end
822837
if status~=0
823838
EM=['Error when querying HMM for ' missingOUT{i} ':\n' output];
824839
dispEM(EM);
825840
end
826-
841+
827842
%Save the output to a file
828843
fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w');
829844
fwrite(fid,output);
830845
fclose(fid);
831-
846+
832847
%Print the progress every 25 files
833848
if rem(i-1,25) == 0
834849
progress=num2str(floor(100*numel(listFiles(fullfile(outDir,'*.out')))/numel(KOModel.rxns)));
@@ -861,16 +876,16 @@
861876
while 1
862877
%Get the next line
863878
tline = fgetl(fid);
864-
879+
865880
%Abort at end of file
866881
if ~ischar(tline)
867882
break;
868883
end
869-
884+
870885
if and(beginMatches,strcmp(tline,' ------ inclusion threshold ------'))
871886
break;
872887
end
873-
888+
874889
if beginMatches==false
875890
%This is how the listing of matches begins
876891
if any(strfind(tline,'E-value '))
@@ -883,7 +898,7 @@
883898
if ~strcmp(tline,' [No hits detected that satisfy reporting thresholds]') && ~isempty(tline)
884899
elements=regexp(tline,' ','split');
885900
elements=elements(cellfun(@any,elements));
886-
901+
887902
%Check if the match is below the threshold
888903
score=str2double(elements{1});
889904
gene=elements{9};
@@ -952,7 +967,7 @@
952967
%Find the KOs and the corresponding genes
953968
J=ismember(KOModel.rxns,KOs);
954969
[~, K]=find(koGeneMat(J,:));
955-
970+
956971
if any(K)
957972
model.rxnGeneMat(i,K)=1;
958973
%Also delete KOs for which no genes were found. If no genes at

external/kegg/getModelFromKEGG.m

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,10 @@
4444
% Usage: [model,KOModel]=getModelFromKEGG(keggPath,keepSpontaneous,...
4545
% keepUndefinedStoich,keepIncomplete,keepGeneral)
4646

47+
ravenPath=findRAVENroot();
48+
4749
if nargin<1
48-
keggPath='RAVEN/external/kegg';
50+
keggPath=fullfile(ravenPath,'external','kegg');
4951
else
5052
keggPath=char(keggPath);
5153
end
@@ -62,7 +64,6 @@
6264
keepGeneral=false;
6365
end
6466

65-
ravenPath=findRAVENroot();
6667
modelFile=fullfile(ravenPath,'external','kegg','keggModel.mat');
6768
if exist(modelFile, 'file') && isNewestFile(ravenPath)
6869
fprintf(['Importing the global KEGG model from ' strrep(modelFile,'\','/') '... ']);

external/kegg/getRxnsFromKEGG.m

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,15 +68,16 @@
6868
% (except for '///')
6969
%
7070

71+
ravenPath=findRAVENroot();
72+
7173
if nargin<1
72-
keggPath='RAVEN/external/kegg';
74+
keggPath=fullfile(ravenPath,'external','kegg');
7375
else
7476
keggPath=char(keggPath);
7577
end
7678

7779
%Check if the reactions have been parsed before and saved. If so, load the
7880
%model
79-
ravenPath=findRAVENroot();
8081
rxnsFile=fullfile(ravenPath,'external','kegg','keggRxns.mat');
8182
if exist(rxnsFile, 'file')
8283
fprintf(['Importing KEGG reactions from ' strrep(rxnsFile,'\','/') '... ']);

0 commit comments

Comments
 (0)