1
|
1 #!/usr/bin/env Rscript
|
|
2
|
|
3 # A command-line interface to DESeq2 for use with Galaxy
|
|
4 # written by Bjoern Gruening and modified by Michael Love 2016.03.30
|
|
5 #
|
|
6 # one of these arguments is required:
|
|
7 #
|
|
8 # 'factors' a JSON list object from Galaxy
|
|
9 #
|
|
10 # 'sample_table' is a sample table as described in ?DESeqDataSetFromHTSeq
|
|
11 # with columns: sample name, filename, then factors (variables)
|
|
12 #
|
|
13 # the output file has columns:
|
|
14 #
|
|
15 # baseMean (mean normalized count)
|
|
16 # log2FoldChange (by default a moderated LFC estimate)
|
|
17 # lfcSE (the standard error)
|
|
18 # stat (the Wald statistic)
|
|
19 # pvalue (p-value from comparison of Wald statistic to a standard Normal)
|
|
20 # padj (adjusted p-value, Benjamini Hochberg correction on genes which pass the mean count filter)
|
|
21 #
|
|
22 # the first variable in 'factors' and first column in 'sample_table' will be the primary factor.
|
|
23 # the levels of the primary factor are used in the order of appearance in factors or in sample_table.
|
|
24 #
|
|
25 # by default, levels in the order A,B,C produces a single comparison of B vs A, to a single file 'outfile'
|
|
26 #
|
|
27 # for the 'many_contrasts' flag, levels in the order A,B,C produces comparisons C vs A, B vs A, C vs B,
|
|
28 # to a number of files using the 'outfile' prefix: 'outfile.condition_C_vs_A' etc.
|
|
29 # all plots will still be sent to a single PDF, named by the arg 'plots', with extra pages.
|
|
30 #
|
|
31 # fit_type is an integer valued argument, with the options from ?estimateDisperions
|
|
32 # 1 "parametric"
|
|
33 # 2 "local"
|
|
34 # 3 "mean"
|
|
35
|
|
36 # setup R error handling to go to stderr
|
|
37 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
|
|
38
|
|
39 # we need that to not crash galaxy with an UTF8 error on German LC settings.
|
|
40 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
|
|
41
|
|
42 library("getopt")
|
|
43 library("tools")
|
|
44 options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
|
|
45 args <- commandArgs(trailingOnly = TRUE)
|
|
46
|
|
47 # get options, using the spec as defined by the enclosed list.
|
|
48 # we read the options from the default: commandArgs(TRUE).
|
|
49 spec <- matrix(c(
|
|
50 "quiet", "q", 0, "logical",
|
|
51 "help", "h", 0, "logical",
|
|
52 "outfile", "o", 1, "character",
|
|
53 "countsfile", "n", 1, "character",
|
|
54 "factors", "f", 1, "character",
|
|
55 "plots" , "p", 1, "character",
|
|
56 "sample_table", "s", 1, "character",
|
|
57 "tximport", "i", 0, "logical",
|
|
58 "tx2gene", "x", 1, "character", # a space-sep tx-to-gene map or GTF file (auto detect .gtf/.GTF)
|
|
59 "fit_type", "t", 1, "integer",
|
|
60 "many_contrasts", "m", 0, "logical",
|
|
61 "outlier_replace_off" , "a", 0, "logical",
|
|
62 "outlier_filter_off" , "b", 0, "logical",
|
|
63 "auto_mean_filter_off", "c", 0, "logical",
|
|
64 "beta_prior_off", "d", 0, "logical"),
|
|
65 byrow=TRUE, ncol=4)
|
|
66 opt <- getopt(spec)
|
|
67
|
|
68 # if help was asked for print a friendly message
|
|
69 # and exit with a non-zero error code
|
|
70 if (!is.null(opt$help)) {
|
|
71 cat(getopt(spec, usage=TRUE))
|
|
72 q(status=1)
|
|
73 }
|
|
74
|
|
75 # enforce the following required arguments
|
|
76 if (is.null(opt$outfile)) {
|
|
77 cat("'outfile' is required\n")
|
|
78 q(status=1)
|
|
79 }
|
|
80 if (is.null(opt$sample_table) & is.null(opt$factors)) {
|
|
81 cat("'factors' or 'sample_table' is required\n")
|
|
82 q(status=1)
|
|
83 }
|
|
84
|
|
85 verbose <- if (is.null(opt$quiet)) {
|
|
86 TRUE
|
|
87 } else {
|
|
88 FALSE
|
|
89 }
|
|
90
|
|
91 if (!is.null(opt$tximport)) {
|
|
92 if (is.null(opt$tx2gene)) stop("A transcript-to-gene map or a GTF file is required for tximport")
|
|
93 if (tolower(file_ext(opt$tx2gene)) == "gtf") {
|
|
94 gtfFile <- opt$tx2gene
|
|
95 } else {
|
|
96 gtfFile <- NULL
|
|
97 tx2gene <- read.table(opt$tx2gene, header=FALSE)
|
|
98 }
|
|
99 useTXI <- TRUE
|
|
100 } else {
|
|
101 useTXI <- FALSE
|
|
102 }
|
|
103
|
|
104 suppressPackageStartupMessages({
|
|
105 library("DESeq2")
|
|
106 library("RColorBrewer")
|
|
107 library("gplots")
|
|
108 })
|
|
109
|
|
110 # build or read sample table
|
|
111
|
|
112 trim <- function (x) gsub("^\\s+|\\s+$", "", x)
|
|
113
|
|
114 # switch on if 'factors' was provided:
|
|
115 if (!is.null(opt$factors)) {
|
|
116 library("rjson")
|
|
117 parser <- newJSONParser()
|
|
118 parser$addData(opt$factors)
|
|
119 factorList <- parser$getObject()
|
|
120 factors <- sapply(factorList, function(x) x[[1]])
|
|
121 primaryFactor <- factors[1]
|
|
122 filenamesIn <- unname(unlist(factorList[[1]][[2]]))
|
|
123 sampleTable <- data.frame(sample=basename(filenamesIn),
|
|
124 filename=filenamesIn,
|
|
125 row.names=filenamesIn,
|
|
126 stringsAsFactors=FALSE)
|
|
127 for (factor in factorList) {
|
|
128 factorName <- trim(factor[[1]])
|
|
129 sampleTable[[factorName]] <- character(nrow(sampleTable))
|
|
130 lvls <- sapply(factor[[2]], function(x) names(x))
|
|
131 for (i in seq_along(factor[[2]])) {
|
|
132 files <- factor[[2]][[i]][[1]]
|
|
133 sampleTable[files,factorName] <- trim(lvls[i])
|
|
134 }
|
|
135 sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls)
|
|
136 }
|
|
137 rownames(sampleTable) <- sampleTable$sample
|
|
138 } else {
|
|
139 # read the sample_table argument
|
|
140 # this table is described in ?DESeqDataSet
|
|
141 # one column for the sample name, one for the filename, and
|
|
142 # the remaining columns for factors in the analysis
|
|
143 sampleTable <- read.delim(opt$sample_table, stringsAsFactors=FALSE)
|
|
144 factors <- colnames(sampleTable)[-c(1:2)]
|
|
145 for (factor in factors) {
|
|
146 lvls <- unique(as.character(sampleTable[[factor]]))
|
|
147 sampleTable[[factor]] <- factor(sampleTable[[factor]], levels=lvls)
|
|
148 }
|
|
149 }
|
|
150
|
|
151 primaryFactor <- factors[1]
|
|
152 designFormula <- as.formula(paste("~", paste(rev(factors), collapse=" + ")))
|
|
153
|
|
154 if (verbose) {
|
|
155 cat("DESeq2 run information\n\n")
|
|
156 cat("sample table:\n")
|
|
157 print(sampleTable[,-c(1:2),drop=FALSE])
|
|
158 cat("\ndesign formula:\n")
|
|
159 print(designFormula)
|
|
160 cat("\n\n")
|
|
161 }
|
|
162
|
|
163 # these are plots which are made once for each analysis
|
|
164 generateGenericPlots <- function(dds, factors) {
|
|
165 rld <- rlog(dds)
|
|
166 d=plotPCA(rld, intgroup=rev(factors), returnData=TRUE)
|
|
167 labs <- paste0(seq_len(ncol(dds)), ": ", do.call(paste, as.list(colData(dds)[factors])))
|
|
168 library(ggplot2)
|
|
169 print(ggplot(d, aes(x=PC1,y=PC2, col=group,label=factor(labs)), environment = environment()) + geom_point() + geom_text(size=3))
|
|
170 dat <- assay(rld)
|
|
171 colnames(dat) <- labs
|
|
172 distsRL <- dist(t(dat))
|
|
173 mat <- as.matrix(distsRL)
|
|
174 hc <- hclust(distsRL)
|
|
175 hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
|
|
176 heatmap.2(mat, Rowv=as.dendrogram(hc), symm=TRUE, trace="none", col = rev(hmcol),
|
|
177 main="Sample-to-sample distances", margin=c(13,13))
|
|
178 plotDispEsts(dds, main="Dispersion estimates")
|
|
179 }
|
|
180
|
|
181 # these are plots which can be made for each comparison, e.g.
|
|
182 # once for C vs A and once for B vs A
|
|
183 generateSpecificPlots <- function(res, threshold, title_suffix) {
|
|
184 use <- res$baseMean > threshold
|
|
185 if (sum(!use) == 0) {
|
|
186 h <- hist(res$pvalue, breaks=0:50/50, plot=FALSE)
|
|
187 barplot(height = h$counts,
|
|
188 col = "powderblue", space = 0, xlab="p-values", ylab="frequency",
|
|
189 main=paste("Histogram of p-values for",title_suffix))
|
|
190 text(x = c(0, length(h$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA)
|
|
191 } else {
|
|
192 h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE)
|
|
193 h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE)
|
|
194 colori <- c("filtered (low count)"="khaki", "not filtered"="powderblue")
|
|
195 barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
|
|
196 col = colori, space = 0, xlab="p-values", ylab="frequency",
|
|
197 main=paste("Histogram of p-values for",title_suffix))
|
|
198 text(x = c(0, length(h1$counts)), y = 0, label=paste(c(0,1)), adj=c(0.5,1.7), xpd=NA)
|
|
199 legend("topright", fill=rev(colori), legend=rev(names(colori)), bg="white")
|
|
200 }
|
|
201 plotMA(res, main= paste("MA-plot for",title_suffix), ylim=range(res$log2FoldChange, na.rm=TRUE))
|
|
202 }
|
|
203
|
|
204 if (verbose) {
|
|
205 cat(paste("primary factor:",primaryFactor,"\n"))
|
|
206 if (length(factors) > 1) {
|
|
207 cat(paste("other factors in design:",paste(factors[-length(factors)],collapse=","),"\n"))
|
|
208 }
|
|
209 cat("\n---------------------\n")
|
|
210 }
|
|
211
|
|
212 # if JSON input from Galaxy, path is absolute
|
|
213 # otherwise, from sample_table, assume it is relative
|
|
214 dir <- if (is.null(opt$factors)) {
|
|
215 "."
|
|
216 } else {
|
|
217 ""
|
|
218 }
|
|
219
|
|
220 if (!useTXI) {
|
|
221 # construct the object from HTSeq files
|
|
222 dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
|
|
223 directory = dir,
|
|
224 design = designFormula)
|
|
225 } else {
|
|
226 # construct the object using tximport
|
|
227 # first need to make the tx2gene table
|
|
228 # this takes ~2-3 minutes using Bioconductor functions
|
|
229 if (!is.null(gtfFile)) {
|
|
230 suppressPackageStartupMessages({
|
|
231 library("GenomicFeatures")
|
|
232 })
|
|
233 txdb <- makeTxDbFromGFF(gtfFile, format="gtf")
|
|
234 k <- keys(txdb, keytype = "GENEID")
|
|
235 df <- select(txdb, keys = k, keytype = "GENEID", columns = "TXNAME")
|
|
236 tx2gene <- df[, 2:1] # tx ID, then gene ID
|
|
237 }
|
|
238 library("tximport")
|
|
239 txiFiles <- as.character(sampleTable[,2])
|
|
240 names(txiFiles) <- as.character(sampleTable[,1])
|
|
241 txi <- tximport(txiFiles, type="sailfish", tx2gene=tx2gene)
|
|
242 dds <- DESeqDataSetFromTximport(txi,
|
|
243 sampleTable[,3:ncol(sampleTable),drop=FALSE],
|
|
244 designFormula)
|
|
245 }
|
|
246
|
|
247 if (verbose) cat(paste(ncol(dds), "samples with counts over", nrow(dds), "genes\n"))
|
|
248 # optional outlier behavior
|
|
249 if (is.null(opt$outlier_replace_off)) {
|
|
250 minRep <- 7
|
|
251 } else {
|
|
252 minRep <- Inf
|
|
253 if (verbose) cat("outlier replacement off\n")
|
|
254 }
|
|
255 if (is.null(opt$outlier_filter_off)) {
|
|
256 cooksCutoff <- TRUE
|
|
257 } else {
|
|
258 cooksCutoff <- FALSE
|
|
259 if (verbose) cat("outlier filtering off\n")
|
|
260 }
|
|
261
|
|
262 # optional automatic mean filtering
|
|
263 if (is.null(opt$auto_mean_filter_off)) {
|
|
264 independentFiltering <- TRUE
|
|
265 } else {
|
|
266 independentFiltering <- FALSE
|
|
267 if (verbose) cat("automatic filtering on the mean off\n")
|
|
268 }
|
|
269
|
|
270 # shrinkage of LFCs
|
|
271 if (is.null(opt$beta_prior_off)) {
|
|
272 betaPrior <- TRUE
|
|
273 } else {
|
|
274 betaPrior <- FALSE
|
|
275 if (verbose) cat("beta prior off\n")
|
|
276 }
|
|
277
|
|
278 # dispersion fit type
|
|
279 if (is.null(opt$fit_type)) {
|
|
280 fitType <- "parametric"
|
|
281 } else {
|
|
282 fitType <- c("parametric","local","mean")[opt$fit_type]
|
|
283 }
|
|
284
|
|
285 if (verbose) cat(paste("using disperion fit type:",fitType,"\n"))
|
|
286
|
|
287 # run the analysis
|
|
288 dds <- DESeq(dds, fitType=fitType, betaPrior=betaPrior, minReplicatesForReplace=minRep)
|
|
289
|
|
290 # create the generic plots and leave the device open
|
|
291 if (!is.null(opt$plots)) {
|
|
292 if (verbose) cat("creating plots\n")
|
|
293 pdf(opt$plots)
|
|
294 generateGenericPlots(dds, factors)
|
|
295 }
|
|
296
|
|
297 n <- nlevels(colData(dds)[[primaryFactor]])
|
|
298 allLevels <- levels(colData(dds)[[primaryFactor]])
|
|
299
|
|
300 if (!is.null(opt$countsfile)) {
|
|
301 labs <- paste0(seq_len(ncol(dds)), ": ", do.call(paste, as.list(colData(dds)[factors])))
|
|
302 normalizedCounts<-counts(dds,normalized=TRUE)
|
|
303 colnames(normalizedCounts)<-labs
|
|
304 write.table(normalizedCounts, file=opt$countsfile, sep="\t", col.names=NA, quote=FALSE)
|
|
305 }
|
|
306
|
|
307 if (is.null(opt$many_contrasts)) {
|
|
308 # only contrast the first and second level of the primary factor
|
|
309 ref <- allLevels[1]
|
|
310 lvl <- allLevels[2]
|
|
311 res <- results(dds, contrast=c(primaryFactor, lvl, ref),
|
|
312 cooksCutoff=cooksCutoff,
|
|
313 independentFiltering=independentFiltering)
|
|
314 if (verbose) {
|
|
315 cat("summary of results\n")
|
|
316 cat(paste0(primaryFactor,": ",lvl," vs ",ref,"\n"))
|
|
317 print(summary(res))
|
|
318 }
|
|
319 resSorted <- res[order(res$padj),]
|
|
320 outDF <- as.data.frame(resSorted)
|
|
321 outDF$geneID <- rownames(outDF)
|
|
322 outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
|
|
323 filename <- opt$outfile
|
|
324 write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
|
|
325 if (independentFiltering) {
|
|
326 threshold <- unname(attr(res, "filterThreshold"))
|
|
327 } else {
|
|
328 threshold <- 0
|
|
329 }
|
|
330 title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref)
|
|
331 if (!is.null(opt$plots)) {
|
|
332 generateSpecificPlots(res, threshold, title_suffix)
|
|
333 }
|
|
334 } else {
|
|
335 # rotate through the possible contrasts of the primary factor
|
|
336 # write out a sorted table of results with the contrast as a suffix
|
|
337 # add contrast specific plots to the device
|
|
338 for (i in seq_len(n-1)) {
|
|
339 ref <- allLevels[i]
|
|
340 contrastLevels <- allLevels[(i+1):n]
|
|
341 for (lvl in contrastLevels) {
|
|
342 res <- results(dds, contrast=c(primaryFactor, lvl, ref),
|
|
343 cooksCutoff=cooksCutoff,
|
|
344 independentFiltering=independentFiltering)
|
|
345 resSorted <- res[order(res$padj),]
|
|
346 outDF <- as.data.frame(resSorted)
|
|
347 outDF$geneID <- rownames(outDF)
|
|
348 outDF <- outDF[,c("geneID", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
|
|
349 filename <- paste0(opt$outfile,".",primaryFactor,"_",lvl,"_vs_",ref)
|
|
350 write.table(outDF, file=filename, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
|
|
351 if (independentFiltering) {
|
|
352 threshold <- unname(attr(res, "filterThreshold"))
|
|
353 } else {
|
|
354 threshold <- 0
|
|
355 }
|
|
356 title_suffix <- paste0(primaryFactor,": ",lvl," vs ",ref)
|
|
357 if (!is.null(opt$plots)) {
|
|
358 generateSpecificPlots(res, threshold, title_suffix)
|
|
359 }
|
|
360 }
|
|
361 }
|
|
362 }
|
|
363
|
|
364 # close the plot device
|
|
365 if (!is.null(opt$plots)) {
|
|
366 cat("closing plot device\n")
|
|
367 dev.off()
|
|
368 }
|
|
369
|
|
370 cat("Session information:\n\n")
|
|
371
|
|
372 sessionInfo()
|
|
373
|