Mercurial > repos > bgruening > deseq2
view deseq2.R @ 17:a05999fd6e26 draft
Uploaded
author | bgruening |
---|---|
date | Mon, 30 Sep 2013 10:51:41 -0400 |
parents | 1d2a02bc2208 |
children | aad8927093ac |
line wrap: on
line source
## Setup R error handling to go to stderr options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) # we need that to not crash galaxy with an UTF8 error on German LC settings. Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") library('getopt'); options(stringAsfactors = FALSE, useFancyQuotes = FALSE) args <- commandArgs(trailingOnly = TRUE) #get options, using the spec as defined by the enclosed list. #we read the options from the default: commandArgs(TRUE). spec = matrix(c( 'verbose', 'v', 2, "integer", 'help' , 'h', 0, "logical", 'outfile' , 'o', 1, "character", 'outfilefiltered' , 'f', 1, "character", 'plots' , 'p', 2, "character", 'input' , 'i', 1, "character", 'factors', 'm', 2, "character", 'fittype', 't', 2, "character", 'threshold', 'c', 2, "double" # 'organism', 'g', 2, "character" ), byrow=TRUE, ncol=4); opt = getopt(spec); # if help was asked for print a friendly message # and exit with a non-zero error code if ( !is.null(opt$help) ) { cat(getopt(spec, usage=TRUE)); q(status=1); } if( is.null(opt$fittype)) opt$fittype = "parametric" trim <- function (x) gsub("^\\s+|\\s+$", "", x) opt$samples <- trim(opt$samples) opt$factors <- trim(opt$factors) htseqCountTable = read.table(opt$input, sep="\t", comment="", as.is=T) colnames(htseqCountTable) <- htseqCountTable[2,] names(htseqCountTable)<-colnames(htseqCountTable) conditions <- htseqCountTable[1,] conditions <- unlist(conditions[,-1]) featurenames <- htseqCountTable[-(1:2), 1] htseqCountTable <- htseqCountTable[-(1:2),-1] htseqCountTable[,1] <- gsub(",", ".", htseqCountTable[,1]) l <- unique(c(conditions)) library('rjson') library('DESeq2') #library('biomaRt') if ( !is.null(opt$plots) ) { pdf(opt$plots) } ## The following function takes deseq data object, computes, writes and plots the results. computeAndWriteResults <- function(dds, sampleCols, outputcsv, featurenames_filtered) { dds <- DESeq(dds, fitType= opt$fittype) sizeFactors(dds) res <- results(dds) resCols <- colnames(res) cond <- c(rep(paste(unique(conditions[sampleCols]),collapse="_"), nrow(res))) if(missing(featurenames_filtered)){ res[, "geneIds"] <- featurenames # rownames(res) <- featurenames title_prefix = "Complete: " }else{ res[, "geneIds"] <- featurenames_filtered # rownames(res) <- featurenames title_prefix = "Filtered: " } print(sum(res$padj < .1, na.rm=TRUE)) print(opt$organism) # if(opt$organism != "other"){ # dataset = "" # if(opt$organism == "mouse") # dataset = "mmusculus_gene_ensembl" # if(opt$organism == "human") # dataset = "hsapiens_gene_ensembl" # if(opt$organism == "fly") # dataset = "dmelanogaster_gene_ensembl" # ensembldb = useMart("ensembl",dataset=dataset) # # annot <- getBM(attributes = c("ensembl_gene_id", "external_gene_id","description"), # filters = "ensembl_gene_id", # values=res[, "geneIds"], # mart=ensembldb) # res <- merge(res, annot, # by.x = "geneIds", # by.y = "ensembl_gene_id", # all.x=TRUE) # resCols <- colnames(res) # resSorted <- res[order(res$padj),] # write.table(as.data.frame(resSorted[,c(resCols)]), file = outputcsv, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE) # }else{ resSorted <- res[order(res$padj),] write.table(as.data.frame(resSorted[,c("geneIds", resCols)]), file = outputcsv, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE) # } if ( !is.null(opt$plots) ) { plotDispEsts(dds, main= paste(title_prefix, "Dispersion estimate plot")) plotMA(dds, main= paste(title_prefix, "MA-plot")) library("RColorBrewer") library("gplots") select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:30] hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100) rld <- rlogTransformation(dds, blind=TRUE) vsd <- varianceStabilizingTransformation(dds, blind=TRUE) distsRL <- dist(t(assay(rld))) mat <- as.matrix(distsRL) heatmap.2(mat, trace="none", col = rev(hmcol), main = paste(title_prefix, "Sample-to-sample distances")) } return(res) } ## This functions filters out the genes with low counts and plots p-value distribution independentFilter <- function(deseqRes, countTable, sampleColumns, designFormula){ # use = (deseqRes$baseMean > quantile(deseqRes$baseMean, probs = opt$threshold) & (!is.na(deseqRes$pvalue))) use = (deseqRes$baseMean > opt$threshold & !is.na(deseqRes$pvalue)) ddsFilt <- DESeqDataSetFromMatrix(countData = countTable[use, ], colData = sampleTable, design = designFormula) computeAndWriteResults(ddsFilt, sampleColumns, opt$outfilefiltered, featurenames[use]) ## p-value distribution h1 <- hist(deseqRes$pvalue[!use], breaks=50, plot=FALSE) h2 <- hist(deseqRes$pvalue[use], breaks=50, plot=FALSE) colori <- c(`filtered out`="khaki", `complete`="powderblue") barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, col = colori, space = 0, main = "Distribution of p-values", ylab="frequency") text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA) legend("topright", fill=rev(colori), legend=rev(names(colori))) } parser <- newJSONParser() parser$addData( opt$factors ) factorsList <- parser$getObject() sampleColumns <- c() for (j in 2:length(factorsList[[1]])){ for (k in 1:length(names(factorsList[[1]][[j]]))){ for (l in 1:length(factorsList[[1]][[j]][[k]])){ sampleColumns[[length(sampleColumns) + 1]] <- as.integer(factorsList[[1]][[j]][[k]][l]) - 1 } } } sampleColumns<-sort(unique(sampleColumns)) htseqSubCountTable <- htseqCountTable[,sampleColumns] ntabcols <- length(htseqSubCountTable) htseqSubCountTable <- as.integer(unlist(htseqSubCountTable)) htseqSubCountTable <- matrix(unlist(htseqSubCountTable), ncol = ntabcols, byrow = FALSE) colnames(htseqSubCountTable) <- names(htseqCountTable)[sampleColumns] sampleTable = data.frame(row.names=colnames(htseqSubCountTable)) for(i in 1:length(factorsList)){ factorName<-factorsList[[i]][[1]] for (j in 2:length(factorsList[[i]])){ effected_cols <- c() for (k in 1:length(names(factorsList[[i]][[j]]))){ for (l in 1:length(factorsList[[i]][[j]][[k]])){ if(colnames(htseqCountTable)[factorsList[[i]][[j]][[k]][l]-1] %in% rownames(sampleTable)){ sampleTable[colnames(htseqCountTable)[factorsList[[i]][[j]][[k]][l]-1],factorName] <- names(factorsList[[i]][[j]])[k] } } } } } sampleTable[is.na(sampleTable)] <- "undefined" sampleTable factorNames <- c(names(sampleTable)[-1], names(sampleTable)[1]) designFormula <- as.formula(paste("", paste(factorNames, collapse=" + "), sep=" ~ ")) dds = DESeqDataSetFromMatrix(countData = htseqSubCountTable, colData = sampleTable, design = designFormula) deseqres <- computeAndWriteResults(dds, sampleColumns, opt$outfile) ## independent filtering independentFilter(deseqres, htseqSubCountTable, sampleColumns, designFormula) dev.off() sessionInfo()