Merge pull request #496 from GoekeLab/devel

Chen Ying · web-flow · commit 14081c6ca181 · 2025-06-23T11:12:59.000+08:00
update multiplex_major_patch to keep track of the changes made to main branch
diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
@@ -54,9 +54,9 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - { os: ubuntu-latest, r: '4.4', bioc: '3.19', cont: "bioconductor/bioconductor_docker:RELEASE_3_19", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
-          - { os: macOS-latest, r: '4.4', bioc: '3.19'}
-          - { os: windows-latest, r: '4.4', bioc: '3.19'}
+          - { os: ubuntu-latest, r: '4.4.2', bioc: '3.20', cont: "bioconductor/bioconductor_docker:RELEASE_3_20", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
+          - { os: macOS-latest, r: '4.4.2', bioc: '3.20'}
+          ## - { os: windows-latest, r: '4.4', bioc: '3.20'}
           ## Check https://github.com/r-lib/actions/tree/master/examples
           ## for examples using the http-user-agent
     env:
@@ -107,16 +107,16 @@ jobs:
         uses: actions/cache@v4
         with:
           path: ${{ env.R_LIBS_USER }}
-          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE-r-4.3-${{ hashFiles('.github/depends.Rds') }}
-          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE-r-4.3-
+          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE-r-4.4.2-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE-r-4.4.2-
 
       - name: Cache R packages on Linux
         if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' "
         uses: actions/cache@v4
         with:
           path: /home/runner/work/_temp/Library
-          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.3-${{ hashFiles('.github/depends.Rds') }}
-          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.3-
+          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.4.2-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.4.2-
 
       - name: Install Linux system dependencies
         if: runner.os == 'Linux'
@@ -339,7 +339,7 @@ jobs:
         if: failure()
         uses: actions/upload-artifact@v4
         with:
-          name: ${{ runner.os }}-biocversion-RELEASE-r-4.4-results
+          name: ${{ runner.os }}-biocversion-RELEASE-r-4.4.2-results
           path: check
 
       - uses: docker/build-push-action@v1
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: bambu
 Type: Package
 Title: Context-Aware Transcript Quantification from Long Read RNA-Seq data
-Version: 3.5.1
+Version: 3.11.1
 Authors@R: c(person("Ying", "Chen", role = c("cre","aut"),
              email = "chen_ying@gis.a-star.edu.sg"),
              person("Andre", "Sim", role = "aut",
diff --git a/R/bambu-extendAnnotations-utilityCombine.R b/R/bambu-extendAnnotations-utilityCombine.R
@@ -17,7 +17,7 @@ isore.combineTranscriptCandidates <- function(readClassList,
     combinedSplicedTranscripts <- 
         combineSplicedTranscriptModels(readClassList, bpParameters, 
         min.readCount, min.readFractionByGene, 
-        min.txScore.multiExon, min.txScore.singleExon, verbose) %>% data.table()
+        min.txScore.multiExon, min.txScore.singleExon, verbose)
     combinedSplicedTranscripts[,confidenceType := "highConfidenceJunctionReads"]
     # when single exon min score is greater than 1, skip unspliced transcripts combination
     # this is a very customized config, useful when data is very big 
@@ -92,40 +92,34 @@ sequentialCombineFeatureTibble <- function(readClassList,
 
 #' @noRd 
 updateStartEndReadCount <- function(combinedFeatureTibble){
-    combinedFeatureTibble <- combinedFeatureTibble %>% 
-        mutate(rowID = row_number())
-    
-    startEndCountTibble <- combinedFeatureTibble %>% 
-        select(rowID, starts_with("start"),starts_with("end"),
-            starts_with("readCount")) %>%
-        tidyr::pivot_longer(c(starts_with("start"),starts_with("end"),
-            starts_with("readCount")), names_to = c(".value","set"),
-            names_pattern = "(.*)\\.(.)") %>%
-        group_by(rowID) %>% 
-        mutate(sumReadCount = sum(readCount,na.rm = TRUE))
+    setDT(combinedFeatureTibble)
+    combinedFeatureTibble[, rowID := .I]
     
-    startTibble <- select(startEndCountTibble, rowID, start, readCount, 
-        sumReadCount) %>% 
-        arrange(start) %>%
-        filter(cumsum(readCount)/sumReadCount>=0.5) %>% 
-        filter(row_number()==1)
-    endTibble <- select(startEndCountTibble, rowID, end, readCount, 
-        sumReadCount) %>% 
-        arrange(end) %>% 
-        filter(cumsum(readCount)/sumReadCount>=0.5) %>% 
-        filter(row_number()==1)
+    colNames <- colnames(combinedFeatureTibble)
+    readCountCols <- sort(colNames[grep("^readCount", colNames)]) # to make sure it's ordered by sample name
+    startCols <- sort(colNames[grep("^start", colNames)])
+    endCols <- sort(colNames[grep("^end", colNames)])
     
-    combinedFeatureTibble <- combinedFeatureTibble %>% 
-        dplyr::select(intronStarts, intronEnds, chr, strand, maxTxScore, 
-            maxTxScore.noFit, NSampleReadCount, NSampleReadProp, 
-            NSampleTxScore, rowID) %>%
-        full_join(select(startTibble, rowID, start), by = "rowID") %>% 
-        full_join(select(endTibble, rowID, end, readCount=sumReadCount), 
-        by = "rowID") %>%
-        select(-rowID)
+    startEndDt <- combinedFeatureTibble[, 
+        .(start = readCountWeightedMedian(.SD,x,y),
+        end = readCountWeightedMedian(.SD,z,y),
+        readCount = sum(.SD[,y], na.rm = TRUE)),
+        by = rowID,  env = I(list(x = startCols, y = readCountCols,z = endCols))]
+
+    combinedFeatureTibble <- startEndDt[combinedFeatureTibble[,.(intronStarts, intronEnds, chr, strand, maxTxScore, 
+                                                                 maxTxScore.noFit, NSampleReadCount, NSampleReadProp, 
+                                                                 NSampleTxScore, rowID)], on = "rowID"]
+    combinedFeatureTibble[, rowID := NULL]
     return(combinedFeatureTibble)
 }
 
+#' Read-count-weighted median of a set of columns, without interpolation
+#'
+#' Treats every entry of the `valuevar` columns as occurring as many times as
+#' the matching entry of the `timesvar` columns (truncated to integer) and
+#' returns the upper median of that expanded multiset, i.e. the smallest value
+#' whose cumulative count exceeds half of the total count. The result is
+#' always one of the observed values, never an interpolated one.
+#'
+#' @param dt a data.table or data.frame holding the columns
+#' @param valuevar character vector of value column names
+#' @param timesvar character vector of count column names, given in the same
+#'     order as `valuevar` so that values and counts pair up by position
+#' @return a single value taken from the `valuevar` columns, or an NA of the
+#'     same type when no entry has both a value and a positive count
+#' @noRd
+readCountWeightedMedian <- function(dt, valuevar, timesvar){
+    values <- unlist(lapply(valuevar, function(col) dt[[col]]),
+        use.names = FALSE)
+    counts <- unlist(lapply(timesvar, function(col) dt[[col]]),
+        use.names = FALSE)
+    if (length(values) != length(counts)) {
+        stop("value and count columns must pair up one-to-one")
+    }
+    counts <- as.integer(counts)
+    # drop missing entries pairwise, so that a value can never end up being
+    # weighted by the count that belongs to a different column
+    keep <- !is.na(values) & !is.na(counts) & counts > 0L
+    # nothing to summarise: return NA (typed like the values) instead of the
+    # Inf-with-warning that min() gives on an empty vector
+    if (!any(keep)) return(c(values, NA)[length(values) + 1L])
+    values <- values[keep]
+    counts <- counts[keep]
+    valueOrder <- order(values)
+    # accumulate as double so that very large read counts cannot overflow
+    cumCounts <- cumsum(as.numeric(counts[valueOrder]))
+    # first value whose cumulative count is strictly above half of the total;
+    # identical to min(x[x >= quantile(x, 0.5)]) on the rep()-expanded vector
+    medianIndex <- which(cumCounts > cumCounts[length(cumCounts)] / 2)[1]
+    return(values[valueOrder][medianIndex])
+}
 
 
 #' Function to combine featureTibble and create the NSample variables 
diff --git a/R/prepareAnnotations_utilityFunctions.R b/R/prepareAnnotations_utilityFunctions.R
@@ -25,7 +25,7 @@ prepareAnnotationsFromGTF <- function(file) {
             "score", "strand", "frame", "attribute")
         data <- data[data$type == "exon", ]
         data$strand[data$strand == "."] <- "*"
-        data$GENEID <- gsub("gene_id (.*?);.*", "\\1", data$attribute)
+        data$GENEID <- gsub(".*gene_id (.*?);.*", "\\1", data$attribute)
         data$TXNAME <- gsub(".*transcript_id (.*?);.*", "\\1", data$attribute)
         data$NDR <- NULL
         data$maxTxScore <- NULL
diff --git a/README.md b/README.md
@@ -216,10 +216,16 @@ By default bambu will write four .gtf files
 ```rscript
 writeBambuOutput(se, path = "./bambu/")
 ```
-If you would like to avoid outputting any of the above .gtf for space concerns, each can be toggled off with the below arguments.
+
+If you are only interested in the novel transcripts, you can filter the 'se' object first to remove the reference annotations:
 ```rscript
-writeBambuOutput(se.novel, path = "./bambu/", outputExtendedAnno = FALSE, outputAll = FALSE, outputBambuModels = FALSE, outputNovelOnly = FALSE)
+se.novel = se[mcols(se)$novelTranscript,]
+writeBambuOutput(se.novel, path = "./bambu/")
 ```
+If you are only interested in novel transcripts with a full-length count of at least 1 in at least 1 sample, filter the 'se' object as follows:
+```rscript
+se.novel = se[mcols(se)$novelTranscript&(apply(assays(se)$fullLengthCounts >= 1,1,sum)>=1),]
+writeBambuOutput(se.novel, path = "./bambu/")
+```
 
 If quant is set to FALSE i.e. only transcript discovery is performed, only the rowRanges output of the extended annotations is returned (a GRangesList object). The equivalent rowData can be accessed with mcols()
 These annotations can be written to a .gtf file using writeAnnotationsToGTF(GRangesList_object, output_path).
@@ -679,31 +685,16 @@ metadata(rowRanges(se))$warnings
 
 ### Release History
 
-**bambu v3.9.0**
-
-Release date: 2025-xxx-xx
-
-- Subset transcripts and those above the NDR threshold are placed into the metadata of the annotations in $subsetTranscripts and $lowConfidenceTranscripts respectively (when filtered out by default).
-- adds the setNDR function
-- outputs the NDR, txScore and txScore.noFit as attributes to the gtf file and these are also read in with prepareAnnotations.
-- Added setNDR as part of quant, which means that users can provide their extendedAnnotations alongside an NDR threshold when running bambu and it will automatically adjust the NDR used for quant. This means users do not need to manually filter the NDR value themselves.
-- NDR and other stats are now copied over to equal transcripts even if above the NDR threshold (previously only happened for those below the NDR threshold)
-- Read class to transcript assignment is now its own step instead of being done with quant. This is turned on and off with assignDist. 
-- Added demultiplexed argument
-- Added spatial argument
-- Added sampleNames argument
-- Added cleanReads argument
-- Added dedupUMI argument
-- Added clusters argument
-- Deprecated lowMemory - This has been replaced by processByChromosome
-- Added processByChomosome (the old memory)
-- Added processByBam argument
-- Added importBambuResults()
-- writeBambuOutput now outputs all information needed to import Bambu results from text files
-- Count outputs are all now in sparse matrix format
+
+**bambu v3.8.2**
+
+Release date: 2025-02-06
 
 Minor changes:
-- Warnings will no longer occur if there are seqlevels in the readGrgList that are not in the annotations or genome. This was done by setting seqlevels of the reads to only those in the reads. Warning was constantly occuring because all the scaffolds used in alignment were in the bam files, even if no reads from these scaffolds existed.
+
+- Fix large number of samples [issue](https://github.com/GoekeLab/bambu/issues/450)  
+- Fix denovo bug issue 
+
 
 **bambu v3.2.6**
 
diff --git a/inst/extdata/seIsoReCombined_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds b/inst/extdata/seIsoReCombined_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds
diff --git a/inst/extdata/seIsoReRef_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds b/inst/extdata/seIsoReRef_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds
diff --git a/tests/testthat/test_isore.R b/tests/testthat/test_isore.R
@@ -71,8 +71,8 @@ test_that("isore.combineTranscriptCandidates completes successfully", {
     
     expect_equal(seIsoReCombined, seIsoReCombinedExpected)
     expect_named(seIsoReCombined,
-                 c('intronStarts', 'intronEnds', 'chr', 'strand', 'maxTxScore', 'maxTxScore.noFit',
-                   'NSampleReadCount', 'NSampleReadProp', 'NSampleTxScore', 'start', 'end', 'readCount', 'confidenceType') 
+                 c('start', 'end', 'readCount','intronStarts', 'intronEnds', 'chr', 'strand', 'maxTxScore', 'maxTxScore.noFit',
+                   'NSampleReadCount', 'NSampleReadProp', 'NSampleTxScore',  'confidenceType') 
     )
 })
 

Original file line number	Diff line number	Diff line change
`@@ -71,8 +71,8 @@ test_that("isore.combineTranscriptCandidates completes successfully", {`
`71`	`71`
`72`	`72`	`expect_equal(seIsoReCombined, seIsoReCombinedExpected)`
`73`	`73`	`expect_named(seIsoReCombined,`
`74`		`- c('intronStarts', 'intronEnds', 'chr', 'strand', 'maxTxScore', 'maxTxScore.noFit',`
`75`		`- 'NSampleReadCount', 'NSampleReadProp', 'NSampleTxScore', 'start', 'end', 'readCount', 'confidenceType')`
	`74`	`+ c('start', 'end', 'readCount','intronStarts', 'intronEnds', 'chr', 'strand', 'maxTxScore', 'maxTxScore.noFit',`
	`75`	`+ 'NSampleReadCount', 'NSampleReadProp', 'NSampleTxScore', 'confidenceType')`
`76`	`76`	`)`
`77`	`77`	`})`
`78`	`78`