-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathbarcode_faker.R
More file actions
125 lines (105 loc) · 6.06 KB
/
barcode_faker.R
File metadata and controls
125 lines (105 loc) · 6.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# This function adds fake barcodes to the start of reads in Illumina FASTQ files for compatibility with TASSEL5 GBSv2.
# fastq_dir is the directory path (string) of *unzipped* FASTQ files
# Returns unzipped FASTQ files with fake barcodes and quality scores, named according to the GBSv2 requirement,
# and a key, "barcode_key.csv", which matches original file name, new file name, flowcell, lane, and barcode
# All files are output to the working directory.
#
# Example usage:
# barcode_faker(fastq_dir = "C:/Users/yourname/yourfastqs")
#
# WARNINGS:
# Always run barcode_faker on a folder containing all files you intend to use in a given GBSv2 run. Multiple separate runs
# of barcode_faker can cause barcodes to be repeated across sample files (though unlikely), and will certainly cause
# file names (flowcell and lane) to be repeated.
# The barcodes generated by this function may recreate enzyme cut sites. The function is not intended to make barcodes for practical use.
# Never delete your original FASTQ files after running barcode_faker, and remember to back up the originals.
# Marlee R. Labroo, dulynoted713@gmail.com.
barcode_faker <- function(fastq_dir){
print("Always run barcode_faker on a folder containing all files you intend to use in a given GBSv2 run. Multiple separate runs of barcode_faker can cause barcodes to be repeated across sample files, though unlikely, and will certainly cause file names to be repeated.")
#get the fastq file paths
fastqs <- list.files(path = fastq_dir, full.names = TRUE)
fastq.check <- list.files(path = fastq_dir, full.names = FALSE)
#check that all files in dir are fastqs #bug fixed 28 July 2020- reported by Dr. Smit Dhakal
long <- length(grep(".fastq", fastq.check))
short <- length(grep(".fq", fastq.check))
if(long != 0){
if(length(fastq.check) != long){
stop("Not all of the files in your input folder seem to have .fq or .fastq extensions. Please move or delete such files.")
}
}
if(short != 0){
if(length(fastq.check) != short){
stop("Not all of the files in your input folder seem to have .fq or .fastq extensions. Please move or delete such files.")
}
}
# new method: given a number of input files, determine the min barcode length needed to ensure enough unique barcodes are generated
# the old way failed if a lot of files had short barcodes of the same length- reported 18 Aug 2022 by Meghan Brady
# https://www.calculatorsoup.com/calculators/discretemathematics/combinationsreplacement.php
# n = 4
# r = min barcode length
# C = number of files
# solve for r considering n and C are given
C <- length(fastqs)
constant <- (factorial(3) * C) - 6 # get the constant
r <- round(polyroot(c(constant, -11, -6, -1)), 2) # get the roots of the polynomial
r <- ceiling(Re(r)[Re(r) > 0]) # subset the real and positive roots, and round up
if(r < 4){r <- 4} # keep a minimum barcode length of 4, probably unnecessary
#this function recursively makes a vector of unique fake barcodes to ensure no non-unique barcodes are generated
#this step does not protect against enzyme cut site being found in the barcode
barcode_inventor <- function(fake_barcodes, r){
fake_barcodes <- c()
# make some barcodes of the length needed to ensure unique barcodes for all files are possible
for(i in 1:length(fastqs)){
fake_barcodes[i] <- paste(sample(c("A", "T", "C", "G"), size = r, replace = TRUE), collapse = "")
}
# check if the barcodes are all unique
if(length(fake_barcodes) == length(unique(fake_barcodes))){
return(fake_barcodes)
} else {
barcode_inventor
}
}
fake_barcodes <- barcode_inventor(fake_barcodes, r)
#make the fake perfect quality scores matching the barcode length
fake_quals <- rep(paste(rep("E", times = r), collapse = ""), times = length(fastqs))
#make fake header files for FASTQ reads
#headers are totally fake (do not refer to input file)
#fake headers allows interoperability between TASSEL and demultiplexing software FASTQ formats
fake_flowcell_no <- seq(from = 1, to = length(fastqs), by = 1)
fake_headers <- paste("D00553R:56:", "C8B56ANXX", fake_flowcell_no, ":3:1101:1203:2037 1:N:0:3", sep = "")
fake_flowcells <- paste("C8B56ANXX", fake_flowcell_no, sep = "")
fake_lanes <- rep(3, times = length(fastqs))
#make fake file names that appropriately reference fake flowcell and lane
newfile_names <- paste(fake_flowcells, "_", "3","_","fastq.fastq", sep = "")
#stop if files with the fake file names already exist
workdir <- getwd()
dirfil <- list.files(path = workdir, full.names = FALSE)
if(sum(newfile_names %in% dirfil) != 0){
stop("Files in your working directory seem to have same names as those
output by this function. Please move them, delete them, or choose a new working directory.")
}
#write a key of barcode, flowcell, lane and new file names
key <- cbind(fastqs, newfile_names, fake_flowcells, fake_lanes, fake_barcodes)
write.csv(key, "fakebarcodes_key.csv", row.names = FALSE)
#read 400 lines per file at a time, paste on the barcodes and quality scores, make unique flowcells,
#and write them out with unique names
for(i in 1:length(fastqs)){
incon <- file(fastqs[i], open = "r")
outcon <- file(newfile_names[i], open = "w")
while(length(mylines <- readLines(incon, n = 400, warn = FALSE))){
head_pos <- seq(1, length(mylines), by = 4)
seq_pos <- seq(2, length(mylines), by = 4)
qual_pos <- seq(4, length(mylines), by = 4)
for(j in 1:length(mylines)){
mylines[head_pos[j]] <- fake_headers[i]
mylines[seq_pos[j]] <- paste(fake_barcodes[i], mylines[seq_pos[j]], sep = "")
mylines[qual_pos[j]] <- paste(fake_quals[i], mylines[qual_pos[j]], sep = "")
}
writeLines(mylines, outcon)
}
print(paste("Finished writing File", i, "...", sep = " "))
#close the connections
close(outcon, warn = FALSE)
close(incon, warn = FALSE)
}
}