comparison test-data/gentest.R @ 6:088b980f5f09 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/dada2 commit ea6c9c638e742c097b0ef294161eeea447c09e06
author iuc
date Fri, 30 Jun 2023 07:52:55 +0000
parents 724307021d1e
children 16e32ba74679
comparison
equal deleted inserted replaced
5:e55eb3d22f79 6:088b980f5f09
1 library(dada2, quietly = T) 1 library(dada2, quietly = TRUE)
2 library(ggplot2, quietly = T) 2 library(ggplot2, quietly = TRUE)
3 3
4 sample_names <- c("F3D0_S188_L001", "F3D141_S207_L001") 4 sample_names <- c("F3D0_S188_L001", "F3D141_S207_L001")
5 fwd <- c("F3D0_S188_L001_R1_001.fastq.gz", "F3D141_S207_L001_R1_001.fastq.gz") 5 fwd <- c("F3D0_S188_L001_R1_001.fastq.gz", "F3D141_S207_L001_R1_001.fastq.gz")
6 rev <- c("F3D0_S188_L001_R2_001.fastq.gz", "F3D141_S207_L001_R2_001.fastq.gz") 6 rev <- c("F3D0_S188_L001_R2_001.fastq.gz", "F3D141_S207_L001_R2_001.fastq.gz")
7 7
10 10
11 print("filterAndTrim") 11 print("filterAndTrim")
12 12
13 for (i in seq_len(fwd)) { 13 for (i in seq_len(fwd)) {
14 ftout <- dada2::filterAndTrim(fwd[i], filt_fwd[i], rev[i], filt_rev[i]) 14 ftout <- dada2::filterAndTrim(fwd[i], filt_fwd[i], rev[i], filt_rev[i])
15 b <- paste(strsplit(fwd[i], ".", fixed = T)[[1]][1], "tab", sep = ".") 15 b <- paste(strsplit(fwd[i], ".", fixed = TRUE)[[1]][1], "tab", sep = ".")
16 write.table(ftout, b, quote = F, sep = "\t", col.names = NA) 16 write.table(ftout, b, quote = FALSE, sep = "\t", col.names = NA)
17 } 17 }
18 18
19 # In the test only the 1st data set is used 19 # In the test only the 1st data set is used
20 t <- data.frame() 20 t <- data.frame()
21 t <- rbind(t, ftout[1, ]) 21 t <- rbind(t, ftout[1, ])
22 colnames(t) <- colnames(ftout) 22 colnames(t) <- colnames(ftout)
23 rownames(t) <- rownames(ftout)[1] 23 rownames(t) <- rownames(ftout)[1]
24 write.table(t, "filterAndTrim.tab", quote = F, sep = "\t", col.names = NA) 24 write.table(t, "filterAndTrim.tab", quote = FALSE, sep = "\t", col.names = NA)
25 25
26 names(fwd) <- sample_names 26 names(fwd) <- sample_names
27 names(rev) <- sample_names 27 names(rev) <- sample_names
28 names(filt_fwd) <- sample_names 28 names(filt_fwd) <- sample_names
29 names(filt_rev) <- sample_names 29 names(filt_rev) <- sample_names
77 77
78 78
79 # make sequence table 79 # make sequence table
80 print("makeSequenceTable") 80 print("makeSequenceTable")
81 seqtab <- makeSequenceTable(merged) 81 seqtab <- makeSequenceTable(merged)
82 write.table(t(seqtab), file = "makeSequenceTable.tab", quote = F, sep = "\t", row.names = T, col.names = NA) 82 write.table(t(seqtab), file = "makeSequenceTable.tab", quote = FALSE, sep = "\t", row.names = TRUE, col.names = NA)
83 83
84 reads_per_seqlen <- tapply(colSums(seqtab), factor(nchar(getSequences(seqtab))), sum) 84 reads_per_seqlen <- tapply(colSums(seqtab), factor(nchar(getSequences(seqtab))), sum)
85 df <- data.frame(length = as.numeric(names(reads_per_seqlen)), count = reads_per_seqlen) 85 df <- data.frame(length = as.numeric(names(reads_per_seqlen)), count = reads_per_seqlen)
86 pdf("makeSequenceTable.pdf") 86 pdf("makeSequenceTable.pdf")
87 ggplot(data = df, aes(x = length, y = count)) + 87 ggplot(data = df, aes(x = length, y = count)) +
90 bequiet <- dev.off() 90 bequiet <- dev.off()
91 91
92 # remove bimera 92 # remove bimera
93 print("removeBimera") 93 print("removeBimera")
94 seqtab_nochim <- dada2::removeBimeraDenovo(seqtab) 94 seqtab_nochim <- dada2::removeBimeraDenovo(seqtab)
95 write.table(t(seqtab), file = "removeBimeraDenovo.tab", quote = F, sep = "\t", row.names = T, col.names = NA) 95 write.table(t(seqtab), file = "removeBimeraDenovo.tab", quote = FALSE, sep = "\t", row.names = TRUE, col.names = NA)
96 96
97 # assign taxonomy/species 97 # assign taxonomy/species
98 tl <- "Level1,Level2,Level3,Level4,Level5" 98 tl <- "Level1,Level2,Level3,Level4,Level5"
99 tl <- strsplit(tl, ",")[[1]] 99 tl <- strsplit(tl, ",")[[1]]
100 100
101 set.seed(42) 101 set.seed(42)
102 print("assignTaxonomyAndSpecies") 102 print("assignTaxonomyAndSpecies")
103 taxa <- dada2::assignTaxonomy(seqtab_nochim, "reference.fa.gz", outputBootstraps = T, taxLevels = tl, multithread = 1) 103 taxa <- dada2::assignTaxonomy(seqtab_nochim, "reference.fa.gz", outputBootstraps = TRUE, taxLevels = tl, multithread = 1)
104 104
105 taxa$tax <- dada2::addSpecies(taxa$tax, "reference_species.fa.gz") 105 taxa$tax <- dada2::addSpecies(taxa$tax, "reference_species.fa.gz")
106 write.table(taxa$tax, file = "assignTaxonomyAddspecies.tab", quote = F, sep = "\t", row.names = T, col.names = NA) 106 write.table(taxa$tax, file = "assignTaxonomyAddspecies.tab", quote = FALSE, sep = "\t", row.names = TRUE, col.names = NA)
107 107
108 write.table(taxa$boot, file = "assignTaxonomyAddspecies_boot.tab", quote = F, sep = "\t", row.names = T, col.names = NA) 108 write.table(taxa$boot, file = "assignTaxonomyAddspecies_boot.tab", quote = FALSE, sep = "\t", row.names = TRUE, col.names = NA)
109 109
110 110
111 ## Generate extra test data for parameter testing 111 ## Generate extra test data for parameter testing
112 print("alternatives") 112 print("alternatives")
113 dada2::filterAndTrim(fwd, c("filterAndTrim_single_F3D0_R1.fq.gz", "filterAndTrim_single_F3D141_R1.fq.gz"), rm.phix = T, orient.fwd = "TACGG") 113 dada2::filterAndTrim(fwd, c("filterAndTrim_single_F3D0_R1.fq.gz", "filterAndTrim_single_F3D141_R1.fq.gz"), rm.phix = TRUE, orient.fwd = "TACGG")
114 114
115 dada2::filterAndTrim(fwd, c("filterAndTrim_single_trimmers_F3D0_R1.fq.gz", "filterAndTrim_single_trimmers_F3D141_R1.fq.gz"), truncQ = 30, truncLen = 2, trimLeft = 150, trimRight = 2) 115 dada2::filterAndTrim(fwd, c("filterAndTrim_single_trimmers_F3D0_R1.fq.gz", "filterAndTrim_single_trimmers_F3D141_R1.fq.gz"), truncQ = 30, truncLen = 2, trimLeft = 150, trimRight = 2)
116 116
117 dada2::filterAndTrim(fwd, c("filterAndTrim_single_filters_F3D0_R1.fq.gz", "filterAndTrim_single_filters_F3D141_R1.fq.gz"), maxLen = 255, minLen = 60, maxN = 100, minQ = 13, maxEE = 1) 117 dada2::filterAndTrim(fwd, c("filterAndTrim_single_filters_F3D0_R1.fq.gz", "filterAndTrim_single_filters_F3D141_R1.fq.gz"), maxLen = 255, minLen = 60, maxN = 100, minQ = 13, maxEE = 1)
118 118
120 merged_nondef <- dada2::mergePairs(dada_fwd, filt_fwd, dada_rev, filt_rev, minOverlap = 8, maxMismatch = 1, justConcatenate = TRUE, trimOverhang = TRUE) 120 merged_nondef <- dada2::mergePairs(dada_fwd, filt_fwd, dada_rev, filt_rev, minOverlap = 8, maxMismatch = 1, justConcatenate = TRUE, trimOverhang = TRUE)
121 for (id in sample_names) { 121 for (id in sample_names) {
122 saveRDS(merged_nondef[[id]], file = paste("mergePairs_", id, "_nondefault.Rdata", sep = "")) 122 saveRDS(merged_nondef[[id]], file = paste("mergePairs_", id, "_nondefault.Rdata", sep = ""))
123 } 123 }
124 rb_dada_fwd <- dada2::removeBimeraDenovo(dada_fwd[["F3D0_S188_L001"]]) 124 rb_dada_fwd <- dada2::removeBimeraDenovo(dada_fwd[["F3D0_S188_L001"]])
125 write.table(rb_dada_fwd, file = "removeBimeraDenovo_F3D0_dada_uniques.tab", quote = F, sep = "\t", row.names = T, col.names = F) 125 write.table(rb_dada_fwd, file = "removeBimeraDenovo_F3D0_dada_uniques.tab", quote = FALSE, sep = "\t", row.names = TRUE, col.names = FALSE)
126 126
127 rb_merged <- dada2::removeBimeraDenovo(merged, method = "pooled") 127 rb_merged <- dada2::removeBimeraDenovo(merged, method = "pooled")
128 saveRDS(rb_merged, file = "removeBimeraDenovo_F3D0_mergepairs.Rdata") 128 saveRDS(rb_merged, file = "removeBimeraDenovo_F3D0_mergepairs.Rdata")
129 129
130 # SeqCounts 130 # SeqCounts
132 sum(dada2::getUniques(x)) 132 sum(dada2::getUniques(x))
133 } 133 }
134 134
135 print("seqCounts ft") 135 print("seqCounts ft")
136 samples <- list() 136 samples <- list()
137 samples[["F3D0_S188_L001_R1_001.tab"]] <- read.table("F3D0_S188_L001_R1_001.tab", header = T, sep = "\t", row.names = 1) 137 samples[["F3D0_S188_L001_R1_001.tab"]] <- read.table("F3D0_S188_L001_R1_001.tab", header = TRUE, sep = "\t", row.names = 1)
138 dname <- "filter" 138 dname <- "filter"
139 tdf <- samples[["F3D0_S188_L001_R1_001.tab"]] 139 tdf <- samples[["F3D0_S188_L001_R1_001.tab"]]
140 names(tdf) <- paste(dname, names(tdf)) 140 names(tdf) <- paste(dname, names(tdf))
141 tdf <- cbind(data.frame(samples = names(samples)), tdf) 141 tdf <- cbind(data.frame(samples = names(samples)), tdf)
142 write.table(tdf, "seqCounts_filter.tab", quote = F, sep = "\t", row.names = F, col.names = T) 142 write.table(tdf, "seqCounts_filter.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
143 143
144 samples <- list() 144 samples <- list()
145 samples[["F3D0_S188_L001_R1_001.tab"]] <- read.table("F3D0_S188_L001_R1_001.tab", header = T, sep = "\t", row.names = 1) 145 samples[["F3D0_S188_L001_R1_001.tab"]] <- read.table("F3D0_S188_L001_R1_001.tab", header = TRUE, sep = "\t", row.names = 1)
146 samples[["F3D141_S207_L001_R1_001.tab"]] <- read.table("F3D141_S207_L001_R1_001.tab", header = T, sep = "\t", row.names = 1) 146 samples[["F3D141_S207_L001_R1_001.tab"]] <- read.table("F3D141_S207_L001_R1_001.tab", header = TRUE, sep = "\t", row.names = 1)
147 dname <- "filter" 147 dname <- "filter"
148 tdf <- samples[["F3D0_S188_L001_R1_001.tab"]] 148 tdf <- samples[["F3D0_S188_L001_R1_001.tab"]]
149 tdf <- rbind(tdf, samples[["F3D141_S207_L001_R1_001.tab"]]) 149 tdf <- rbind(tdf, samples[["F3D141_S207_L001_R1_001.tab"]])
150 names(tdf) <- paste(dname, names(tdf)) 150 names(tdf) <- paste(dname, names(tdf))
151 tdf <- cbind(data.frame(samples = names(samples)), tdf) 151 tdf <- cbind(data.frame(samples = names(samples)), tdf)
152 write.table(tdf, "seqCounts_filter_both.tab", quote = F, sep = "\t", row.names = F, col.names = T) 152 write.table(tdf, "seqCounts_filter_both.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
153 153
154 print("seqCounts dada") 154 print("seqCounts dada")
155 samples <- list() 155 samples <- list()
156 samples[["dada_F3D0_S188_L001_R1.Rdata"]] <- readRDS("dada_F3D0_S188_L001_R1.Rdata") 156 samples[["dada_F3D0_S188_L001_R1.Rdata"]] <- readRDS("dada_F3D0_S188_L001_R1.Rdata")
157 samples[["dada_F3D141_S207_L001_R1.Rdata"]] <- readRDS("dada_F3D141_S207_L001_R1.Rdata") 157 samples[["dada_F3D141_S207_L001_R1.Rdata"]] <- readRDS("dada_F3D141_S207_L001_R1.Rdata")
158 dname <- "dadaF" 158 dname <- "dadaF"
159 tdf <- data.frame(samples = names(samples)) 159 tdf <- data.frame(samples = names(samples))
160 tdf[[dname]] <- sapply(samples, get_n) 160 tdf[[dname]] <- sapply(samples, get_n)
161 write.table(tdf, "seqCounts_dadaF.tab", quote = F, sep = "\t", row.names = F, col.names = T) 161 write.table(tdf, "seqCounts_dadaF.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
162 162
163 print("seqCounts mp") 163 print("seqCounts mp")
164 samples <- list() 164 samples <- list()
165 samples[["mergePairs_F3D0_S188_L001.Rdata"]] <- readRDS("mergePairs_F3D0_S188_L001.Rdata") 165 samples[["mergePairs_F3D0_S188_L001.Rdata"]] <- readRDS("mergePairs_F3D0_S188_L001.Rdata")
166 samples[["mergePairs_F3D141_S207_L001.Rdata"]] <- readRDS("mergePairs_F3D141_S207_L001.Rdata") 166 samples[["mergePairs_F3D141_S207_L001.Rdata"]] <- readRDS("mergePairs_F3D141_S207_L001.Rdata")
167 dname <- "merge" 167 dname <- "merge"
168 tdf <- data.frame(samples = names(samples)) 168 tdf <- data.frame(samples = names(samples))
169 tdf[[dname]] <- sapply(samples, get_n) 169 tdf[[dname]] <- sapply(samples, get_n)
170 write.table(tdf, "seqCounts_merge.tab", quote = F, sep = "\t", row.names = F, col.names = T) 170 write.table(tdf, "seqCounts_merge.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
171 171
172 print("seqCounts st") 172 print("seqCounts st")
173 samples <- list() 173 samples <- list()
174 samples <- t(as.matrix(read.table("makeSequenceTable.tab", header = T, sep = "\t", row.names = 1))) 174 samples <- t(as.matrix(read.table("makeSequenceTable.tab", header = TRUE, sep = "\t", row.names = 1)))
175 dname <- "seqtab" 175 dname <- "seqtab"
176 tdf <- data.frame(samples = row.names(samples)) 176 tdf <- data.frame(samples = row.names(samples))
177 tdf[[dname]] <- rowSums(samples) 177 tdf[[dname]] <- rowSums(samples)
178 write.table(tdf, "seqCounts_seqtab.tab", quote = F, sep = "\t", row.names = F, col.names = T) 178 write.table(tdf, "seqCounts_seqtab.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
179 179
180 print("seqCounts rb") 180 print("seqCounts rb")
181 samples <- list() 181 samples <- list()
182 samples <- t(as.matrix(read.table("removeBimeraDenovo.tab", header = T, sep = "\t", row.names = 1))) 182 samples <- t(as.matrix(read.table("removeBimeraDenovo.tab", header = TRUE, sep = "\t", row.names = 1)))
183 dname <- "nochim" 183 dname <- "nochim"
184 tdf <- data.frame(samples = row.names(samples)) 184 tdf <- data.frame(samples = row.names(samples))
185 tdf[[dname]] <- rowSums(samples) 185 tdf[[dname]] <- rowSums(samples)
186 write.table(tdf, "seqCounts_nochim.tab", quote = F, sep = "\t", row.names = F, col.names = T) 186 write.table(tdf, "seqCounts_nochim.tab", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)