Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
7beb2f1
Integrate dnaseq-nextflow into ReFlow via new dnaseqExperiment template
jbrestel Apr 5, 2026
85a599f
Simplify dnaseq workflow: remove unused params, move shared queries t…
jbrestel Apr 14, 2026
95de88a
getTaxonId instead of orgAbbrev for cnvandploidyqueries script
jbrestel Apr 14, 2026
d4a7ae4
Adding ortho group file parameter to RetrieveGeneCNVAndPloidyQueries
rdemko2332 Apr 14, 2026
d79e9f5
Updating runGeneCNVAndPloidyQuery to function from ortho group flat f…
rdemko2332 Apr 14, 2026
3838280
Merge branch 'master' into dnaseq-reflow
jbrestel May 13, 2026
3e3692c
add repeatmasker bed and fix dependencies
jbrestel May 15, 2026
f21e855
minor
jbrestel May 15, 2026
c1c3a80
debug gusConfigFile
sufenhu May 18, 2026
2a22f73
fix syntax error
sufenhu May 18, 2026
ae0cc77
add gusConfigFile and fullOrthoGroupsFile parameters
sufenhu May 18, 2026
f71ca1e
more debug of gusConfigFile
sufenhu May 18, 2026
b15c9b1
No need to include the fullOrthoGroupsFile paramValue
sufenhu May 18, 2026
6f4620a
Removing entry parameter from ngs sample workflow in runNextflowOnClu…
rdemko2332 May 20, 2026
2de5def
add gusConfigFile parameter
sufenhu Jun 2, 2026
d463146
use the gusConfigFile provided as a parameter instead of the one in $…
sufenhu Jun 2, 2026
6b506ba
gate clusterOptions on lsf executor; add maxMemoryGigs param for bwaMem
jbrestel Jun 3, 2026
448d72f
dynamically size bwaMem memory from genome fasta using (Gb*3.3)+2 rule
jbrestel Jun 3, 2026
269c121
pass genomeFastaFile to nextflowConfig step for dynamic memory sizing
jbrestel Jun 3, 2026
d28c319
debug $genomeFastaFile
sufenhu Jun 4, 2026
13a14b6
add step to copy DNASeq bigwig files to webservices directory
jbrestel Jun 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 62 additions & 43 deletions Main/bin/runGeneCNVAndPloidyQuery
Original file line number Diff line number Diff line change
Expand Up @@ -7,45 +7,33 @@ use GUS::ObjRelP::DbiDatabase;
use GUS::Supported::GusConfig;
use CBIL::Util::PropertySet;

my ($gusConfigFile,$organismAbbrev,$geneSourceIdOrthologFile,$chrsForCalcsFile);
&GetOptions("organismAbbrev=s" => \$organismAbbrev,
my ($gusConfigFile,$taxonId,$orthoGroupFile,$geneSourceIdOrthologFile,$chrsForCalcsFile);
&GetOptions("taxonId=s" => \$taxonId,
"orthoGroupFile=s" => \$orthoGroupFile,
"geneSourceIdOrthologFile=s" => \$geneSourceIdOrthologFile,
"gusConfigFile=s" => \$gusConfigFile,
"chrsForCalcsFile=s" => \$chrsForCalcsFile);
my $ploidy = 2;

my $geneSourceSql = "with sequence as (
select gf.source_id as gene_source_id
, gf.na_feature_id
, ns.source_id as contig_source_id
, ns.source_id as sequence_source_id
, ns.TAXON_ID
from dots.genefeature gf
, DOTS.NASEQUENCE ns
, SRES.ONTOLOGYTERM ot
where gf.na_sequence_id = ns.na_sequence_id
and ot.name = 'chromosome'
and ns.SEQUENCE_ONTOLOGY_ID = ot.ONTOLOGY_TERM_ID
and ns.taxon_id = (select taxon_id from apidb.organism where abbrev = '$organismAbbrev')
), orthologs as (
select gf.na_feature_id, sg.name
from dots.genefeature gf
, dots.SequenceSequenceGroup ssg
, dots.SequenceGroup sg
, core.TableInfo ti
where gf.na_feature_id = ssg.sequence_id
and ssg.sequence_group_id = sg.sequence_group_id
and ssg.source_table_id = ti.table_id
and ti.name = 'GeneFeature'
)
select s.gene_source_id
, o.name
from sequence s
, orthologs o
where s.na_feature_id = o.na_feature_id";

my $chrsForCalcsSql = "select ns.source_id from dots.nasequence ns, sres.ontologyterm ot where ot.name = 'chromosome' and ot.ontology_term_id = ns.sequence_ontology_id and ns.taxon_id = (select taxon_id from apidb.organism where abbrev = '$organismAbbrev')";

$gusConfigFile = $ENV{GUS_HOME}."/config/gus.config";

my $proteinToGeneSql = "
SELECT aas.source_id AS protein_source_id,
gf.source_id AS gene_source_id
FROM dots.AASequence aas
JOIN dots.TranslatedAASequence tas ON aas.aa_sequence_id = tas.aa_sequence_id
JOIN dots.TranslatedAAFeature taf ON taf.aa_sequence_id = tas.aa_sequence_id
JOIN dots.Transcript t ON taf.na_feature_id = t.na_feature_id
JOIN dots.GeneFeature gf ON t.parent_id = gf.na_feature_id
WHERE aas.subclass_view = 'TranslatedAASequence'
AND aas.taxon_id = $taxonId
AND aas.taxon_id IN (
SELECT taxon_id
FROM apidb.organism
WHERE is_annotated_genome = 1
)
";

my $chrsForCalcsSql = "select ns.source_id from dots.nasequence ns, sres.ontologyterm ot where ot.name = 'chromosome' and ot.ontology_term_id = ns.sequence_ontology_id and ns.taxon_id = $taxonId";

#$gusConfigFile = $ENV{GUS_HOME}."/config/gus.config";
die "Config file $gusConfigFile does not exist" unless -e $gusConfigFile;

my @properties = ();
Expand All @@ -59,19 +47,50 @@ my $db = GUS::ObjRelP::DbiDatabase-> new($gusConfig->{props}->{dbiDsn},

my $dbh = $db->getQueryHandle();

my $orthoMclStmt = $dbh->prepare($geneSourceSql);
$orthoMclStmt->execute();
my $proteinToGeneStmt = $dbh->prepare($proteinToGeneSql);
$proteinToGeneStmt->execute();

my %proteinToGene;
while (my @row = $proteinToGeneStmt->fetchrow_array()){
$proteinToGene{$row[0]} = $row[1];
}

open(GENE,">$geneSourceIdOrthologFile");
while (my @row = $orthoMclStmt->fetchrow_array()){
print GENE "$row[0]\t$row[1]\n";
my %proteinToGroup;
open(GROUPS, "<$orthoGroupFile") or die "Cannot open $orthoGroupFile: $!";
while (my $line = <GROUPS>) {
chomp $line;
my ($groupId, $proteinList) = split(/:\s*/, $line, 2);
next unless defined $proteinList;
foreach my $protein (split(/\s+/, $proteinList)) {
$proteinToGroup{$protein} = $groupId;
}
}
close GROUPS;

my @proteinsWithNoGroup;
open(GENE, ">$geneSourceIdOrthologFile") or die "Cannot open $geneSourceIdOrthologFile: $!";
while (my ($protein, $gene) = each %proteinToGene) {
my $group = $proteinToGroup{$protein};
unless ($group) {
(my $altProtein = $protein) =~ s/:/\_/g;
$group = $proteinToGroup{$altProtein};
}
if ($group) {
print GENE "$gene\t$group\n";
} else {
push @proteinsWithNoGroup, $protein;
}
}
close GENE;

if (@proteinsWithNoGroup) {
print STDERR "The following proteins have no group assignment in $orthoGroupFile:\n" . join("\n", @proteinsWithNoGroup) . "\n";
}

my $chrsForCalcs = $dbh->prepare($chrsForCalcsSql);
$chrsForCalcs->execute();

open(CHRS,">$chrsForCalcsFile");
open(CHRS, ">$chrsForCalcsFile") or die "Cannot open $chrsForCalcsFile: $!";
while (my @row = $chrsForCalcs->fetchrow_array()){
print CHRS "$row[0]\t\n";
}
Expand Down
49 changes: 49 additions & 0 deletions Main/lib/perl/WorkflowSteps/CopyDnaseqBigwigToWebSvc.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package ApiCommonWorkflow::Main::WorkflowSteps::CopyDnaseqBigwigToWebSvc;

@ISA = (ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep);
use strict;
use warnings;
use ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep;
use ApiCommonWorkflow::Main::Util::OrganismInfo;

# Publishes the per-sample DNASeq bigwig (*.bw) files of one experiment under
# the website files directory, or removes the published copy on undo.
#
# Arguments:
#   $self - the workflow step object (supplies params, paths and runCmd)
#   $test - forwarded to getWebsiteFilesDir, getOrganismInfo and runCmd
#   $undo - when true, delete the experiment's destination directory instead
#
# Destination layout:
#   <websiteFilesDir>/<relativeDir>/<organismNameForFiles>/dnaseq/bigwig/<experimentDatasetName>/<sample>/
sub run {
    my ($self, $test, $undo) = @_;

    # Step parameters.
    my $copyFromDir           = $self->getParamValue('copyFromDir');
    my $organismAbbrev        = $self->getParamValue('organismAbbrev');
    my $relativeDir           = $self->getParamValue('relativeDir');
    my $experimentDatasetName = $self->getParamValue('experimentDatasetName');
    my $gusConfigParam        = $self->getParamValue('gusConfigFile');

    # The gusConfigFile param is relative to the workflow data directory.
    my $dataDir       = $self->getWorkflowDataDir();
    my $gusConfigPath = $dataDir . "/" . $gusConfigParam;
    my $websiteRoot   = $self->getWebsiteFilesDir($test);

    my $organismInfo = $self->getOrganismInfo($test, $organismAbbrev, $gusConfigPath);
    my $nameForFiles = $organismInfo->getNameForFiles();

    my $targetRoot = "$websiteRoot/$relativeDir/$nameForFiles/dnaseq/bigwig/$experimentDatasetName";
    my $resultsDir = "$dataDir/$copyFromDir";

    $self->testInputFile('copyFromDir', $resultsDir);

    # Undo: drop everything this step published for the experiment.
    return $self->runCmd(0, "rm -rf $targetRoot") if $undo;

    $self->runCmd($test, "mkdir -p $targetRoot");

    # Every non-hidden subdirectory of the results directory is one sample.
    opendir(my $dirHandle, $resultsDir) or die "Cannot open results directory '$resultsDir': $!";
    my @sampleDirs;
    while (defined(my $entry = readdir($dirHandle))) {
        next if $entry =~ /^\./;
        next unless -d "$resultsDir/$entry";
        push @sampleDirs, $entry;
    }
    closedir($dirHandle);

    die "No sample subdirectories found in '$resultsDir'" unless @sampleDirs;

    # Mirror each sample directory and copy its bigwig files across.
    foreach my $sampleName (@sampleDirs) {
        my $sampleTarget = "$targetRoot/$sampleName";
        $self->runCmd($test, "mkdir -p $sampleTarget");
        $self->runCmd($test, "cp $resultsDir/$sampleName/*.bw $sampleTarget/");
    }
}

1;
39 changes: 39 additions & 0 deletions Main/lib/perl/WorkflowSteps/MakeDnaSeqLoadNextflowConfig.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package ApiCommonWorkflow::Main::WorkflowSteps::MakeDnaSeqLoadNextflowConfig;

@ISA = (ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep);

use strict;
use warnings;
use ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep;

# Writes the Nextflow config file for the DNASeq load workflow, or removes it
# on undo.
#
# Arguments:
#   $self - the workflow step object (supplies params, paths and runCmd)
#   $test - not used by this step
#   $undo - when true, delete the config file instead of writing it
#
# The config path is the nextflowConfigFile param resolved against the
# workflow data directory. Dies if the file cannot be opened or if the
# buffered output cannot be flushed at close.
sub run {
    my ($self, $test, $undo) = @_;

    my $indelDir           = $self->getParamValue("indelDir");
    my $extDbRlsSpec       = $self->getParamValue("extDbRlsSpec");
    my $genomeExtDbRlsSpec = $self->getParamValue("genomeExtDbRlsSpec");

    my $configPath = $self->getWorkflowDataDir() . "/" . $self->getParamValue("nextflowConfigFile");

    if ($undo) {
        $self->runCmd(0, "rm -rf $configPath");
    } else {
        # Lexical handle: a bareword F is a package global that any other code
        # using the same name could clobber.
        open(my $configFh, ">", $configPath) or die "$! :Can't open config file '$configPath' for writing";

        # The two spec values are emitted wrapped in both single and double
        # quotes, exactly as the original step wrote them.
        print {$configFh}
"
params {
indelDir = \"$indelDir\"
extDbRlsSpec = '\"$extDbRlsSpec\"'
genomeExtDbRlsSpec = '\"$genomeExtDbRlsSpec\"'
}

singularity {
enabled = true
autoMounts = true
}
";

        # Write errors on a buffered handle surface at close; without this
        # check a truncated config would be left behind silently.
        close($configFh) or die "$! :Can't close config file '$configPath' after writing";
    }
}

1;
135 changes: 135 additions & 0 deletions Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
package ApiCommonWorkflow::Main::WorkflowSteps::MakeDnaSeqNextflowConfig;

@ISA = (ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep);

use strict;
use warnings;
use ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep;

# Writes the Nextflow config file for the DNASeq workflow, or removes it on
# undo.
#
# Arguments:
#   $self - the workflow step object (supplies params, config, paths, runCmd)
#   $test - not used by this step
#   $undo - when true, delete the config file instead of writing it
#
# Optional params (looked up inside eval, so a failing lookup yields undef):
#   maxMemoryGigs   - fixed bwaMem memory in GB; disables the retry strategy
#   genomeFastaFile - genome fasta, relative to the workflow data directory,
#                     used to size bwaMem memory when maxMemoryGigs is not set
#
# Dies if the genome fasta (when needed) or the config file cannot be opened,
# or if the config file cannot be flushed at close.
sub run {
    my ($self, $test, $undo) = @_;

    my $workingDirRelativePath = $self->getParamValue("workingDirRelativePath");

    my $sampleSheetFile          = $self->getParamValue("sampleSheetFile");
    my $genomeFile               = $self->getParamValue("genomeFile");
    my $gtfFile                  = $self->getParamValue("gtfFile");
    my $footprintFile            = $self->getParamValue("footprintFile");
    my $ploidy                   = $self->getParamValue("ploidy");
    my $resultsDirectory         = $self->getParamValue("resultsDirectory");
    my $geneSourceIdOrthologFile = $self->getParamValue("geneSourceIdOrthologFile");
    my $chrsForCalcFile          = $self->getParamValue("chrsForCalcFile");

    my $nextflowConfigFile = $self->getWorkflowDataDir() . "/" . $self->getParamValue("nextflowConfigFile");

    # Translate local paths to cluster-side paths
    my $digestedSampleSheet     = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $sampleSheetFile);
    my $digestedGenomeFile      = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $genomeFile);
    my $digestedGtfFile         = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $gtfFile);
    my $digestedFootprintFile   = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $footprintFile);
    my $digestedResultsDir      = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $resultsDirectory);
    my $digestedOrthologFile    = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $geneSourceIdOrthologFile);
    my $digestedChrsForCalcFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $chrsForCalcFile);

    # Workflow config values
    my $minCoverage = $self->getConfig("minCoverage");
    my $winLen      = $self->getConfig("winLen");
    my $bwaThreads  = $self->getConfig("bwaThreads");

    my $executor = $self->getClusterExecutor();
    my $queue    = $self->getClusterQueue();

    # An empty or blank maxMemoryGigs is treated as "not provided"; otherwise
    # it would be turned into a meaningless '-M 0' request.
    my $maxMemoryGigs = eval { $self->getParamValue("maxMemoryGigs") };
    $maxMemoryGigs = undef unless defined($maxMemoryGigs) && $maxMemoryGigs =~ /\S/;

    # Only build a path when the param actually has a value. Concatenating
    # first would always give a defined string and make the fallback in
    # _bwaMemDefaultMemMb unreachable.
    my $genomeFastaParam = eval { $self->getParamValue("genomeFastaFile") };
    my $genomeFastaFile;
    if (defined($genomeFastaParam) && $genomeFastaParam =~ /\S/) {
        $genomeFastaFile = $self->getWorkflowDataDir() . "/" . $genomeFastaParam;
    }

    # Undo needs nothing but the config path; in particular it must not depend
    # on the genome fasta still being present.
    return $self->runCmd(0, "rm -rf $nextflowConfigFile") if $undo;

    my $isLsf = lc($executor) eq 'lsf';

    my $freebayesBlock = _freebayesProcessBlock($isLsf);
    my $bwaBlock       = _bwaMemProcessBlock($isLsf, $maxMemoryGigs, $genomeFastaFile);

    open(my $configFh, ">", $nextflowConfigFile) or die "$! :Can't open config file '$nextflowConfigFile' for writing";
    print {$configFh} "
params {
samplesheet = \"$digestedSampleSheet\"
bwaThreads = $bwaThreads
minCoverage = $minCoverage
genomeFastaFile = \"$digestedGenomeFile\"
gtfFile = \"$digestedGtfFile\"
footprintFile = \"$digestedFootprintFile\"
winLen = $winLen
ploidy = $ploidy
outputDir = \"$digestedResultsDir\"
geneSourceIdOrthologFile = \"$digestedOrthologFile\"
chrsForCalcFile = \"$digestedChrsForCalcFile\"
}

process {
executor = '$executor'
queue = '$queue'
$freebayesBlock
$bwaBlock
}

singularity {
enabled = true
autoMounts = true
}
";
    # Write errors on a buffered handle surface at close.
    close($configFh) or die "$! :Can't close config file '$nextflowConfigFile' after writing";
}

# Returns the default bwaMem memory request in MB.
# Rule: ((genome size in Gb * 3.3) + 2) GB, plus 25% headroom, rounded up to
# the next power of two. Falls back to 4 GB when no genome fasta is given.
# Dies if the fasta is given but cannot be opened.
sub _bwaMemDefaultMemMb {
    my ($genomeFastaFile) = @_;

    return 4 * 1024 unless defined($genomeFastaFile);

    # Genome size = total length of all non-header lines.
    my $genomeSize = 0;
    open(my $fastaFh, "<", $genomeFastaFile) or die "Cannot open genome fasta '$genomeFastaFile': $!";
    while (my $line = <$fastaFh>) {
        next if $line =~ /^>/;
        chomp $line;
        $genomeSize += length($line);
    }
    close($fastaFh);

    my $genomeGb = $genomeSize / 1_000_000_000;
    my $rawGb    = ($genomeGb * 3.3) + 2;
    my $safeGb   = $rawGb * 1.25;

    my $memGb = 1;
    $memGb *= 2 while $memGb < $safeGb;

    return $memGb * 1024;
}

# Returns the "withName: 'runFreebayes'" process block. The LSF clusterOptions
# closure (4000 MB, 12000 MB on a retry after exit status 130..140) is only
# emitted when the executor is LSF.
sub _freebayesProcessBlock {
    my ($isLsf) = @_;

    my $block = " withName: 'runFreebayes' {\n";
    $block .= " maxRetries = 1\n";
    $block .= " errorStrategy = { task.exitStatus in 130..140 ? 'retry' : 'finish' }\n";
    if ($isLsf) {
        $block .= " clusterOptions = {\n";
        $block .= " (task.attempt > 1 && task.exitStatus in 130..140)\n";
        $block .= " ? '-M 12000 -R \"rusage [mem=12000] span[hosts=1]\"'\n";
        $block .= " : '-M 4000 -R \"rusage [mem=4000] span[hosts=1]\"'\n";
        $block .= " }\n";
    }
    $block .= " }\n";

    return $block;
}

# Returns the "withName: 'bwaMem'" process block.
# With $maxMemoryGigs: no retries and, on LSF, a fixed memory request.
# Without it: one retry on exit status 130..140 and, on LSF, a memory request
# sized from the genome fasta that doubles on the retry.
sub _bwaMemProcessBlock {
    my ($isLsf, $maxMemoryGigs, $genomeFastaFile) = @_;

    my $hasFixedMemory = defined($maxMemoryGigs);
    my $maxRetries     = $hasFixedMemory ? 0 : 1;

    my $block = " withName: 'bwaMem' {\n";
    $block .= " maxRetries = $maxRetries\n";
    if (!$hasFixedMemory) {
        $block .= " errorStrategy = { task.exitStatus in 130..140 ? 'retry' : 'finish' }\n";
    }
    if ($isLsf) {
        if ($hasFixedMemory) {
            my $memMb = int($maxMemoryGigs * 1024);
            $block .= " clusterOptions = '-M $memMb -R \"rusage [mem=$memMb] span[hosts=1]\"'\n";
        } else {
            # The fasta is scanned only here, the one place its size is used.
            my $defaultMemMb = _bwaMemDefaultMemMb($genomeFastaFile);
            my $retryMemMb   = $defaultMemMb * 2;
            $block .= " clusterOptions = {\n";
            $block .= " (task.attempt > 1 && task.exitStatus in 130..140)\n";
            $block .= " ? '-M $retryMemMb -R \"rusage [mem=$retryMemMb] span[hosts=1]\"'\n";
            $block .= " : '-M $defaultMemMb -R \"rusage [mem=$defaultMemMb] span[hosts=1]\"'\n";
            $block .= " }\n";
        }
    }
    $block .= " }\n";

    return $block;
}

1;
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,8 @@ sub run {
my $varscanPValue = $self->getConfig("varscanPValue");
my $varscanMinVarFreqSnp = $self->getConfig("varscanMinVarFreqSnp");
my $varscanMinVarFreqCons = $self->getConfig("varscanMinVarFreqCons");
my $maxNumberOfReads = $self->getConfig("maxNumberOfReads");
my $hisat2Index = $self->getConfig("hisat2Index");
my $createIndex = $self->getConfig("createIndex");
my $trimmomaticAdaptorsFile = $self->getConfig("trimmomaticAdaptorsFile");
my $ebiFtpUser = $self->getConfig("ebiFtpUser");
my $ebiFtpPassword = $self->getConfig("ebiFtpPassword");

Expand Down Expand Up @@ -93,11 +91,9 @@ params {
hisat2Index = $hisat2Index
createIndex = $createIndex
outputDir = \"$clusterResultDir\"
trimmomaticAdaptorsFile = $trimmomaticAdaptorsFile
varscanPValue = $varscanPValue
varscanMinVarFreqSnp = $varscanMinVarFreqSnp
varscanMinVarFreqCons = $varscanMinVarFreqCons
maxNumberOfReads = $maxNumberOfReads
taxonId = \"$taxonId\"
geneSourceIdOrthologFile = \"$geneSourceIdOrthologFile\"
chrsForCalcFile = \"$chrsForCalcFile\"
Expand Down
2 changes: 1 addition & 1 deletion Main/lib/perl/WorkflowSteps/MakeGtfForGuidedCufflinks.pm
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ sub run {
my $project = $self->getParamValue("project");
my $genomeExtDbRlsSpec = $self->getParamValue("genomeExtDbRlsSpec");
my $cdsOnly = $self->getBooleanParamValue("cdsOnly");
my $gusConfigFile = $self->getGusConfigFile();
my $gusConfigFile = $self->getWorkflowDataDir() . "/" . $self->getParamValue('gusConfigFile');

my $cmd = "makeGtf.pl --outputFile $workflowDataDir/$gtfDir/$outputFile --project $project --genomeExtDbRlsSpec '$genomeExtDbRlsSpec' --gusConfigFile $gusConfigFile";

Expand Down
Loading