|
38 | 38 | % The hidden Markov models as generated in 2b or |
39 | 39 | % downloaded from BioMet Toolbox (see below) |
40 | 40 | % The final directory in dataDir should be styled as |
41 | | -% prok90_kegg105 or euk90_kegg105, indicating whether |
| 41 | +% prok90_kegg116 or euk90_kegg116, indicating whether |
42 | 42 | % the HMMs were trained on pro- or eukaryotic |
43 | 43 | % sequences; using which sequence similarity treshold |
44 | 44 | % (first set of digits); using which KEGG version |
|
99 | 99 | % If -1 is provided, CD-HIT is skipped (optional, default 0.9) |
100 | 100 | % globalModel structure containing both model and KOModel |
101 | 101 | % structures as generated by getModelFromKEGG. These |
102 | | -% will otherwise be loaded by via getModelFromKEGG. |
| 102 | +% will otherwise be loaded by via getModelFromKEGG. |
103 | 103 | % Providing globalKEGGmodel can speed up model |
104 | 104 | % generation if getKEGGModelForOrganism is run |
105 | 105 | % multiple times for different strains. Example: |
106 | 106 | % [globalModel.model,globalModel.KOModel] = getModelFromKEGG; |
107 | | -% (optional, default empty, global model is loaded by |
| 107 | +% (optional, default empty, global model is loaded by |
108 | 108 | % getModelFromKEGG) |
109 | 109 | % |
110 | 110 | % Output: |
|
259 | 259 | else |
260 | 260 | outDir=char(outDir); |
261 | 261 | end |
262 | | -if nargin<5 |
| 262 | +if nargin<5 || isempty(keepSpontaneous) |
263 | 263 | keepSpontaneous=true; |
264 | 264 | end |
265 | | -if nargin<6 |
| 265 | +if nargin<6 || isempty(keepUndefinedStoich) |
266 | 266 | keepUndefinedStoich=true; |
267 | 267 | end |
268 | | -if nargin<7 |
| 268 | +if nargin<7 || isempty(keepIncomplete) |
269 | 269 | keepIncomplete=true; |
270 | 270 | end |
271 | | -if nargin<8 |
| 271 | +if nargin<8 || isempty(keepGeneral) |
272 | 272 | keepGeneral=false; |
273 | 273 | end |
274 | | -if nargin<9 |
| 274 | +if nargin<9 || isempty(cutOff) |
275 | 275 | cutOff=10^-50; |
276 | 276 | end |
277 | | -if nargin<10 |
| 277 | +if nargin<10 || isempty(minScoreRatioKO) |
278 | 278 | minScoreRatioKO=0.3; |
279 | 279 | end |
280 | | -if nargin<11 |
| 280 | +if nargin<11 || isempty(minScoreRatioG) |
281 | 281 | minScoreRatioG=0.8; |
282 | 282 | end |
283 | | -if nargin<12 |
| 283 | +if nargin<12 || isempty(maxPhylDist) |
284 | 284 | maxPhylDist=inf; |
285 | 285 | %Include all sequences for each reaction |
286 | 286 | end |
287 | | -if nargin<13 |
| 287 | +if nargin<13 || isempty(nSequences) |
288 | 288 | nSequences=inf; |
289 | 289 | %Include all sequences for each reaction |
290 | 290 | end |
291 | | -if nargin<14 |
| 291 | +if nargin<14 || isempty(seqIdentity) |
292 | 292 | seqIdentity=0.9; |
293 | 293 | end |
294 | 294 |
|
|
315 | 315 | %required zip file already in working directory or have it extracted. If |
316 | 316 | %the zip file and directory is not here, it is downloaded from the cloud |
317 | 317 | if ~isempty(dataDir) |
318 | | - hmmOptions={'euk90_kegg105','prok90_kegg105'}; |
| 318 | + hmmOptions={'euk90_kegg116','prok90_kegg116'}; |
319 | 319 | if ~endsWith(dataDir,hmmOptions) %Check if dataDir ends with any of the hmmOptions. |
320 | | - %If not, then check whether the required folders exist anyway. |
| 320 | + %If not, then check whether the required folders exist anyway. |
321 | 321 | if ~isfile(fullfile(dataDir,'keggdb','genes.pep')) && ... |
322 | 322 | ~isfolder(fullfile(dataDir,'fasta')) && ... |
323 | 323 | ~isfolder(fullfile(dataDir,'aligned')) && ... |
|
339 | 339 | else |
340 | 340 | fprintf('Downloading the HMMs archive file... '); |
341 | 341 | try |
342 | | - websave([dataDir,'.zip'],['https://github.com/SysBioChalmers/RAVEN/releases/download/v2.8.0/',hmmOptions{hmmIndex},'.zip']); |
| 342 | + websave([dataDir,'.zip'],['https://github.com/SysBioChalmers/RAVEN/releases/download/v2.11.0/',hmmOptions{hmmIndex},'.zip']); |
343 | 343 | catch ME |
344 | 344 | if strcmp(ME.identifier,'MATLAB:webservices:HTTP404StatusCodeError') |
345 | 345 | error('Failed to download the HMMs archive file, the server returned a 404 error, try again later. If the problem persists please report it on the RAVEN GitHub Issues page: https://github.com/SysBioChalmers/RAVEN/issues') |
346 | 346 | end |
347 | 347 | end |
348 | 348 | end |
349 | | - |
| 349 | + |
350 | 350 | fprintf('COMPLETE\n'); |
351 | 351 | fprintf('Extracting the HMMs archive file... '); |
352 | 352 | unzip([dataDir,'.zip']); |
|
406 | 406 | if ~ismember(organismID,[phylDistsFull.ids 'eukaryotes' 'prokaryotes']) |
407 | 407 | error('Provided organismID is incorrect. Only species abbreviations from KEGG Species List or "eukaryotes"/"prokaryotes" are allowed.'); |
408 | 408 | end |
409 | | - |
| 409 | + |
410 | 410 | fprintf(['Pruning the model from <strong>non-' organismID '</strong> genes... ']); |
411 | 411 | if ismember(organismID,{'eukaryotes','prokaryotes'}) |
412 | 412 | phylDists=getPhylDist(fullfile(dataDir,'keggdb'),maxPhylDist==-1); |
|
552 | 552 | return |
553 | 553 | end |
554 | 554 |
|
| 555 | +tmpFile=tempname; |
| 556 | +%On Windows, paths need to be translated to Unix before parsing it to WSL |
| 557 | +if ispc |
| 558 | + wslPath.tmpFile=getWSLpath(tmpFile); |
| 559 | + %mafft has problems writing to terminal (/dev/stderr) when running |
| 560 | + %on WSL via MATLAB, instead write and read progress file |
| 561 | + mafftOutput = tempname; |
| 562 | + wslPath.mafftOutput=getWSLpath(mafftOutput); |
| 563 | + wslPath.mafft=getWSLpath(fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat')); |
| 564 | + wslPath.hmmbuild=getWSLpath(fullfile(ravenPath,'software','hmmer','hmmbuild')); |
| 565 | + wslPath.hmmsearch=getWSLpath(fullfile(ravenPath,'software','hmmer','hmmsearch')); |
| 566 | + wslPath.cdhit=getWSLpath(fullfile(ravenPath,'software','cd-hit','cd-hit')); |
| 567 | +end |
| 568 | + |
555 | 569 | %Check if alignment of FASTA files should be performed |
556 | 570 | missingAligned=setdiff(KOModel.rxns,[alignedFiles;hmmFiles;alignedWorking;outFiles]); |
557 | 571 | if ~isempty(missingAligned) |
|
561 | 575 | fprintf('Performing clustering and multiple alignment for KEGG Orthology specific protein sets... 0%% complete'); |
562 | 576 | end |
563 | 577 | missingAligned=missingAligned(randperm(RandStream.create('mrg32k3a','Seed',cputime()),numel(missingAligned))); |
564 | | - tmpFile=tempname; |
565 | | - %On Windows, paths need to be translated to Unix before parsing it to WSL |
566 | | - if ispc |
567 | | - wslPath.tmpFile=getWSLpath(tmpFile); |
568 | | - %mafft has problems writing to terminal (/dev/stderr) when running |
569 | | - %on WSL via MATLAB, instead write and read progress file |
570 | | - mafftOutput = tempname; |
571 | | - wslPath.mafftOutput=getWSLpath(mafftOutput); |
572 | | - wslPath.mafft=getWSLpath(fullfile(ravenPath,'software','mafft','mafft-linux64','mafft.bat')); |
573 | | - wslPath.cdhit=getWSLpath(fullfile(ravenPath,'software','cd-hit','cd-hit')); |
574 | | - end |
575 | | - |
| 578 | + |
576 | 579 | for i=1:numel(missingAligned) |
577 | 580 | %This is checked here because it could be that it is created by a |
578 | 581 | %parallel process. The faw-files are saved as temporary files to |
|
587 | 590 | dispEM(EM,false); |
588 | 591 | continue; |
589 | 592 | end |
590 | | - |
| 593 | + |
591 | 594 | %If the multi-FASTA file is empty then save an empty aligned |
592 | 595 | %file and continue |
593 | 596 | s=dir(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); |
|
596 | 599 | fclose(fid); |
597 | 600 | continue; |
598 | 601 | end |
599 | | - |
| 602 | + |
600 | 603 | %Create an empty file to prevent other threads to start to work |
601 | 604 | %on the same alignment |
602 | 605 | fid=fopen(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),'w'); |
603 | 606 | fclose(fid); |
604 | | - |
| 607 | + |
605 | 608 | %First load the FASTA file, then select up to nSequences |
606 | 609 | %sequences of the most closely related species, apply any |
607 | 610 | %constraints from maxPhylDist, and save it as a temporary file, |
608 | 611 | %and create the model from that |
609 | | - |
| 612 | + |
610 | 613 | fastaStruct=fastaread(fullfile(dataDir,'fasta',[missingAligned{i} '.fa'])); |
611 | 614 | phylDist=inf(numel(fastaStruct),1); |
612 | 615 | for j=1:numel(fastaStruct) |
|
620 | 623 | end |
621 | 624 | end |
622 | 625 | end |
623 | | - |
| 626 | + |
624 | 627 | %Inf means that it should not be included |
625 | 628 | phylDist(phylDist>maxPhylDist)=[]; |
626 | | - |
| 629 | + |
627 | 630 | %Sort based on phylDist |
628 | 631 | [~, order]=sort(phylDist); |
629 | | - |
| 632 | + |
630 | 633 | %Save the first nSequences hits to a temporary FASTA file |
631 | 634 | if nSequences<=numel(fastaStruct) |
632 | 635 | fastaStruct=fastaStruct(order(1:nSequences)); |
633 | 636 | else |
634 | 637 | fastaStruct=fastaStruct(order); |
635 | 638 | end |
636 | | - |
| 639 | + |
637 | 640 | %Do the clustering and alignment if there are more than one |
638 | 641 | %sequences, otherwise just save the sequence (or an empty file) |
639 | 642 | if numel(fastaStruct)>1 |
640 | | - if seqIdentity~=-1 |
| 643 | + if seqIdentity~=-1 |
641 | 644 | cdhitInpCustom=tempname; |
642 | 645 | fastawrite(cdhitInpCustom,fastaStruct); |
643 | 646 | if seqIdentity<=1 && seqIdentity>0.7 |
|
712 | 715 | end |
713 | 716 | %Move the temporary file to the real one |
714 | 717 | movefile(fullfile(dataDir,'aligned',[missingAligned{i} '.faw']),fullfile(dataDir,'aligned',[missingAligned{i} '.fa']),'f'); |
715 | | - |
| 718 | + |
716 | 719 | %Print the progress every 25 files |
717 | 720 | if rem(i-1,25) == 0 |
718 | 721 | progress=num2str(floor(100*numel(listFiles(fullfile(dataDir,'aligned','*.fa')))/numel(KOModel.rxns))); |
|
750 | 753 | dispEM(EM,false); |
751 | 754 | continue; |
752 | 755 | end |
753 | | - |
| 756 | + |
754 | 757 | %If the multi-FASTA file is empty then save an empty aligned |
755 | 758 | %file and continue |
756 | 759 | s=dir(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa'])); |
|
763 | 766 | %KO. This is because hmmbuild cannot overwrite existing files |
764 | 767 | fid=fopen(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw']),'w'); |
765 | 768 | fclose(fid); |
766 | | - |
| 769 | + |
767 | 770 | %Create HMM |
768 | | - [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']) '" "' fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']) '"']); |
| 771 | + if ismac || isunix |
| 772 | + [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmbuild' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm']) '" "' fullfile(dataDir,'aligned',[missingHMMs{i} '.fa']) '"']); |
| 773 | + else |
| 774 | + wslPath.hmmFile = getWSLpath(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmm'])); |
| 775 | + wslPath.alignFile = getWSLpath(fullfile(dataDir,'aligned',[missingHMMs{i} '.fa'])); |
| 776 | + [status, output] = system(['wsl "' wslPath.hmmbuild '" --cpu "' num2str(cores) '" "' wslPath.hmmFile '" "' wslPath.alignFile '"']); |
| 777 | + end |
769 | 778 | if status~=0 |
770 | 779 | EM=['Error when training HMM for ' missingHMMs{i} ':\n' output]; |
771 | 780 | dispEM(EM); |
772 | 781 | end |
773 | | - |
| 782 | + |
774 | 783 | %Delete the temporary file |
775 | 784 | delete(fullfile(dataDir,'hmms',[missingHMMs{i} '.hmw'])); |
776 | 785 |
|
|
805 | 814 | dispEM(EM,false); |
806 | 815 | continue; |
807 | 816 | end |
808 | | - |
| 817 | + |
809 | 818 | %Save an empty file to prevent several threads working on the |
810 | 819 | %same file |
811 | 820 | fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); |
812 | 821 | fclose(fid); |
813 | | - |
| 822 | + |
814 | 823 | %If the HMM file is empty then save an out file and continue |
815 | 824 | s=dir(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm'])); |
816 | 825 | if s.bytes<=0 |
817 | 826 | continue; |
818 | 827 | end |
819 | | - |
| 828 | + |
820 | 829 | %Check each gene in the input file against this model |
821 | | - [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']) '" "' fastaFile '"']); |
| 830 | + if ismac || isunix |
| 831 | + [status, output]=system(['"' fullfile(ravenPath,'software','hmmer',['hmmsearch' binEnd]) '" --cpu "' num2str(cores) '" "' fullfile(dataDir,'hmms',[missingOUT{i} '.hmm']) '" "' fastaFile '"']); |
| 832 | + else |
| 833 | + wslPath.hmmFile = getWSLpath(fullfile(dataDir,'hmms',[missingOUT{i} '.hmm'])); |
| 834 | + wslPath.fastaFile = getWSLpath(fastaFile); |
| 835 | + [status, output]=system(['wsl "' wslPath.hmmsearch '" --cpu "' num2str(cores) '" "' wslPath.hmmFile '" "' wslPath.fastaFile '"']); |
| 836 | + end |
822 | 837 | if status~=0 |
823 | 838 | EM=['Error when querying HMM for ' missingOUT{i} ':\n' output]; |
824 | 839 | dispEM(EM); |
825 | 840 | end |
826 | | - |
| 841 | + |
827 | 842 | %Save the output to a file |
828 | 843 | fid=fopen(fullfile(outDir,[missingOUT{i} '.out']),'w'); |
829 | 844 | fwrite(fid,output); |
830 | 845 | fclose(fid); |
831 | | - |
| 846 | + |
832 | 847 | %Print the progress every 25 files |
833 | 848 | if rem(i-1,25) == 0 |
834 | 849 | progress=num2str(floor(100*numel(listFiles(fullfile(outDir,'*.out')))/numel(KOModel.rxns))); |
|
861 | 876 | while 1 |
862 | 877 | %Get the next line |
863 | 878 | tline = fgetl(fid); |
864 | | - |
| 879 | + |
865 | 880 | %Abort at end of file |
866 | 881 | if ~ischar(tline) |
867 | 882 | break; |
868 | 883 | end |
869 | | - |
| 884 | + |
870 | 885 | if and(beginMatches,strcmp(tline,' ------ inclusion threshold ------')) |
871 | 886 | break; |
872 | 887 | end |
873 | | - |
| 888 | + |
874 | 889 | if beginMatches==false |
875 | 890 | %This is how the listing of matches begins |
876 | 891 | if any(strfind(tline,'E-value ')) |
|
883 | 898 | if ~strcmp(tline,' [No hits detected that satisfy reporting thresholds]') && ~isempty(tline) |
884 | 899 | elements=regexp(tline,' ','split'); |
885 | 900 | elements=elements(cellfun(@any,elements)); |
886 | | - |
| 901 | + |
887 | 902 | %Check if the match is below the treshhold |
888 | 903 | score=str2double(elements{1}); |
889 | 904 | gene=elements{9}; |
|
952 | 967 | %Find the KOs and the corresponding genes |
953 | 968 | J=ismember(KOModel.rxns,KOs); |
954 | 969 | [~, K]=find(koGeneMat(J,:)); |
955 | | - |
| 970 | + |
956 | 971 | if any(K) |
957 | 972 | model.rxnGeneMat(i,K)=1; |
958 | 973 | %Also delete KOs for which no genes were found. If no genes at |
|
0 commit comments