# HG changeset patch # User bgruening # Date 1380552701 14400 # Node ID a05999fd6e268bc20fb05441f25e8520bdc2d831 # Parent 1d2a02bc220843dbe04bcdbead96f6008d8e7f58 Uploaded diff -r 1d2a02bc2208 -r a05999fd6e26 deseq2.R --- a/deseq2.R Mon Sep 09 07:09:38 2013 -0400 +++ b/deseq2.R Mon Sep 30 10:51:41 2013 -0400 @@ -10,23 +10,24 @@ #get options, using the spec as defined by the enclosed list. #we read the options from the default: commandArgs(TRUE). spec = matrix(c( - 'verbose', 'v', 2, "integer", - 'help' , 'h', 0, "logical", - 'outputfile' , 'o', 1, "character", - 'plots' , 'p', 2, "character", - 'input' , 'i', 1, "character", - 'samples', 's', 1, "character", - 'factors', 'f', 2, "character", - 'fittype', 't', 2, "character" + 'verbose', 'v', 2, "integer", + 'help' , 'h', 0, "logical", + 'outfile' , 'o', 1, "character", + 'outfilefiltered' , 'f', 1, "character", + 'plots' , 'p', 2, "character", + 'input' , 'i', 1, "character", + 'factors', 'm', 2, "character", + 'fittype', 't', 2, "character", + 'threshold', 'c', 2, "double" +# 'organism', 'g', 2, "character" ), byrow=TRUE, ncol=4); opt = getopt(spec); - # if help was asked for print a friendly message # and exit with a non-zero error code if ( !is.null(opt$help) ) { - cat(getopt(spec, usage=TRUE)); - q(status=1); + cat(getopt(spec, usage=TRUE)); + q(status=1); } if( is.null(opt$fittype)) @@ -42,148 +43,146 @@ names(htseqCountTable)<-colnames(htseqCountTable) conditions <- htseqCountTable[1,] conditions <- unlist(conditions[,-1]) -tagnames <- htseqCountTable[-(1:2), 1] +featurenames <- htseqCountTable[-(1:2), 1] htseqCountTable <- htseqCountTable[-(1:2),-1] htseqCountTable[,1] <- gsub(",", ".", htseqCountTable[,1]) l <- unique(c(conditions)) -library( "DESeq2" ) +library('rjson') +library('DESeq2') +#library('biomaRt') if ( !is.null(opt$plots) ) { - pdf(opt$plots) + pdf(opt$plots) } -if (opt$samples=="all_vs_all"){ - # all versus all - for (i in seq(1, length(l), by=1)) { - k=i+1 - if(k<=length(l)){ - for (j in seq(k, length(l), by=1)) { - currentColmuns <- which(conditions %in% c(l[[i]], l[[j]])) - sampleNames <- names(htseqCountTable[currentColmuns]) - - currentPairTable <- htseqCountTable[currentColmuns] - ncondition1cols <- length(which(conditions %in% c(l[[i]]))) - ncondition2cols <- length(which(conditions %in% c(l[[i]]))) - - ntabcols <- length(currentPairTable) - currentPairTable <- as.integer(unlist(currentPairTable)) - currentPairTable <- matrix(unlist(currentPairTable), ncol = ntabcols, byrow = FALSE) - colnames(currentPairTable) <- sampleNames - - pdata = data.frame(condition=factor(conditions[currentColmuns]),row.names=colnames(currentPairTable)) - print(pdata) - - dds = DESeqDataSetFromMatrix(countData = currentPairTable, colData = pdata, design = ~ condition) - - dds <- DESeq(dds, fitType= opt$fittype) - sizeFactors(dds) - res <- results(dds) - resCols <- colnames(res) - - cond <- c(rep(paste(unique(conditions[currentColmuns]),collapse="_"), nrow(res))) - res[, "condition"] <- cond - res[, "geneIds"] <- tagnames - - resSorted <- res[order(res$padj),] - write.table(as.data.frame(resSorted[,c("condition", "geneIds", resCols)]), file = opt$outputfile, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE) - - if ( !is.null(opt$plots) ) { - plotDispEsts(dds) - plotMA(dds) - - library("RColorBrewer") - library("gplots") - select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:30] - hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100) - - rld <- rlogTransformation(dds, blind=TRUE) - vsd <- varianceStabilizingTransformation(dds, blind=TRUE) - distsRL <- dist(t(assay(rld))) - mat <- as.matrix(distsRL) - heatmap.2(mat, trace="none", col = rev(hmcol), main = "Sample-to-sample distances") - } - } - } - } -}else{ - sampleSets <- unlist(strsplit(opt$samples, " ")) - samplesControl <- {} - samplesExperiment <- {} - samplesControl <- unlist(strsplit(sampleSets[1], ",")) - samplesExperiment <- unlist(strsplit(sampleSets[2], ",")) - - # the minus one is needed because we get column indizes including the first column - sampleColumns <- c() - for (i in samplesControl) sampleColumns[[length(sampleColumns) + 1]] <- as.integer(i) - 1 - for (i in samplesExperiment) sampleColumns[[length(sampleColumns) + 1]] <- as.integer(i) - 1 - htseqSubCountTable <- htseqCountTable[,sampleColumns] - - ntabcols <- length(htseqSubCountTable) - htseqSubCountTable <- as.integer(unlist(htseqSubCountTable)) - htseqSubCountTable <- matrix(unlist(htseqSubCountTable), ncol = ntabcols, byrow = FALSE) - colnames(htseqSubCountTable) <- names(htseqCountTable)[sampleColumns] - - pdata = data.frame(row.names=colnames(htseqSubCountTable)) +## The following function takes deseq data object, computes, writes and plots the results. +computeAndWriteResults <- function(dds, sampleCols, outputcsv, featurenames_filtered) { + dds <- DESeq(dds, fitType= opt$fittype) + sizeFactors(dds) + res <- results(dds) + resCols <- colnames(res) + cond <- c(rep(paste(unique(conditions[sampleCols]),collapse="_"), nrow(res))) + if(missing(featurenames_filtered)){ + res[, "geneIds"] <- featurenames +# rownames(res) <- featurenames + title_prefix = "Complete: " + }else{ + res[, "geneIds"] <- featurenames_filtered +# rownames(res) <- featurenames + title_prefix = "Filtered: " + } + print(sum(res$padj < .1, na.rm=TRUE)) + print(opt$organism) +# if(opt$organism != "other"){ +# dataset = "" +# if(opt$organism == "mouse") +# dataset = "mmusculus_gene_ensembl" +# if(opt$organism == "human") +# dataset = "hsapiens_gene_ensembl" +# if(opt$organism == "fly") +# dataset = "dmelanogaster_gene_ensembl" +# ensembldb = useMart("ensembl",dataset=dataset) +# +# annot <- getBM(attributes = c("ensembl_gene_id", "external_gene_id","description"), +# filters = "ensembl_gene_id", +# values=res[, "geneIds"], +# mart=ensembldb) - if(!is.null(opt$factors)){ - factorSets <- unlist(strsplit(opt$factors, " ")) - for (factorSet in factorSets) { - currentFactor <- unlist(strsplit(factorSet, ":")) - for (factor in currentFactor[2]) { - factoredCols <- unlist(strsplit(factor, ",")) - factoredCols <- as.numeric(factoredCols) - factors <- c() - for (i in sampleColumns){ - j=i+1 - if(j %in% factoredCols) - factors[[length(factors) + 1]] <- "yes" - else - factors[[length(factors) + 1]] <- "no" - } - # only add factor if factor is applied to the selected samples - if((length(unique(factors))) >= 2){ - pdata[[currentFactor[1]]]<-factor(factors) - }else{ - write("Input-Error 01: You can only apply factors to selected samples.", stderr()) - } - } - } - } - pdata$condition<-factor(conditions[sampleColumns]) - print(pdata) - - designFormula <- as.formula(paste("", paste(names(pdata), collapse=" + "), sep=" ~ ")) - dds = DESeqDataSetFromMatrix(countData = htseqSubCountTable, colData = pdata, design = designFormula) - - dds <- DESeq(dds, fitType= opt$fittype) - sizeFactors(dds) - res <- results(dds) - resCols <- colnames(res) - - cond <- c(rep(paste(unique(conditions[sampleColumns]),collapse="_"), nrow(res))) - res[, "condition"] <- cond - res[, "geneIds"] <- tagnames - - resSorted <- res[order(res$padj),] - write.table(as.data.frame(resSorted[,c("condition", "geneIds", resCols)]), file = opt$outputfile, sep="\t", quote = FALSE, row.names = FALSE, col.names = FALSE) - - if ( !is.null(opt$plots) ) { - plotDispEsts(dds) - plotMA(dds) +# res <- merge(res, annot, +# by.x = "geneIds", +# by.y = "ensembl_gene_id", +# all.x=TRUE) + +# resCols <- colnames(res) +# resSorted <- res[order(res$padj),] +# write.table(as.data.frame(resSorted[,c(resCols)]), file = outputcsv, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE) +# }else{ + resSorted <- res[order(res$padj),] + write.table(as.data.frame(resSorted[,c("geneIds", resCols)]), file = outputcsv, sep="\t", quote = FALSE, append=TRUE, row.names = FALSE, col.names = FALSE) +# } + + if ( !is.null(opt$plots) ) { + plotDispEsts(dds, main= paste(title_prefix, "Dispersion estimate plot")) + plotMA(dds, main= paste(title_prefix, "MA-plot")) library("RColorBrewer") library("gplots") select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:30] hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100) - + rld <- rlogTransformation(dds, blind=TRUE) vsd <- varianceStabilizingTransformation(dds, blind=TRUE) distsRL <- dist(t(assay(rld))) mat <- as.matrix(distsRL) - heatmap.2(mat, trace="none", col = rev(hmcol), main = "Sample-to-sample distances") - } + heatmap.2(mat, trace="none", col = rev(hmcol), main = paste(title_prefix, "Sample-to-sample distances")) + } + return(res) +} + +## This functions filters out the genes with low counts and plots p-value distribution +independentFilter <- function(deseqRes, countTable, sampleColumns, designFormula){ +# use = (deseqRes$baseMean > quantile(deseqRes$baseMean, probs = opt$threshold) & (!is.na(deseqRes$pvalue))) + use = (deseqRes$baseMean > opt$threshold & !is.na(deseqRes$pvalue)) + ddsFilt <- DESeqDataSetFromMatrix(countData = countTable[use, ], colData = sampleTable, design = designFormula) + computeAndWriteResults(ddsFilt, sampleColumns, opt$outfilefiltered, featurenames[use]) + + ## p-value distribution + h1 <- hist(deseqRes$pvalue[!use], breaks=50, plot=FALSE) + h2 <- hist(deseqRes$pvalue[use], breaks=50, plot=FALSE) + colori <- c(`filtered out`="khaki", `complete`="powderblue") + barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, + col = colori, space = 0, main = "Distribution of p-values", ylab="frequency") + text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA) + legend("topright", fill=rev(colori), legend=rev(names(colori))) +} + +parser <- newJSONParser() +parser$addData( opt$factors ) +factorsList <- parser$getObject() + +sampleColumns <- c() + +for (j in 2:length(factorsList[[1]])){ + for (k in 1:length(names(factorsList[[1]][[j]]))){ + for (l in 1:length(factorsList[[1]][[j]][[k]])){ + sampleColumns[[length(sampleColumns) + 1]] <- as.integer(factorsList[[1]][[j]][[k]][l]) - 1 + } + } } + +sampleColumns<-sort(unique(sampleColumns)) +htseqSubCountTable <- htseqCountTable[,sampleColumns] +ntabcols <- length(htseqSubCountTable) +htseqSubCountTable <- as.integer(unlist(htseqSubCountTable)) +htseqSubCountTable <- matrix(unlist(htseqSubCountTable), ncol = ntabcols, byrow = FALSE) +colnames(htseqSubCountTable) <- names(htseqCountTable)[sampleColumns] +sampleTable = data.frame(row.names=colnames(htseqSubCountTable)) + +for(i in 1:length(factorsList)){ + factorName<-factorsList[[i]][[1]] + for (j in 2:length(factorsList[[i]])){ + effected_cols <- c() + for (k in 1:length(names(factorsList[[i]][[j]]))){ + for (l in 1:length(factorsList[[i]][[j]][[k]])){ + if(colnames(htseqCountTable)[factorsList[[i]][[j]][[k]][l]-1] %in% rownames(sampleTable)){ + sampleTable[colnames(htseqCountTable)[factorsList[[i]][[j]][[k]][l]-1],factorName] <- names(factorsList[[i]][[j]])[k] + } + } + } + } +} +sampleTable[is.na(sampleTable)] <- "undefined" +sampleTable + +factorNames <- c(names(sampleTable)[-1], names(sampleTable)[1]) +designFormula <- as.formula(paste("", paste(factorNames, collapse=" + "), sep=" ~ ")) +dds = DESeqDataSetFromMatrix(countData = htseqSubCountTable, colData = sampleTable, design = designFormula) +deseqres <- computeAndWriteResults(dds, sampleColumns, opt$outfile) + +## independent filtering +independentFilter(deseqres, htseqSubCountTable, sampleColumns, designFormula) + dev.off() - +sessionInfo() diff -r 1d2a02bc2208 -r a05999fd6e26 deseq2.py --- a/deseq2.py Mon Sep 09 07:09:38 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ -#!/usr/bin/env python -import os, sys -import rpy2.robjects as robjects -from rpy2.robjects.packages import importr -deseq2 = importr("DESeq2") - -sys.stdout.write('Works!') - diff -r 1d2a02bc2208 -r a05999fd6e26 deseq2.xml --- a/deseq2.xml Mon Sep 09 07:09:38 2013 -0400 +++ b/deseq2.xml Mon Sep 30 10:51:41 2013 -0400 @@ -1,5 +1,5 @@ - Determines differentially expressed transcripts from read alignments + Determines differentially expressed features from count data Rscript DESeq2 @@ -7,33 +7,33 @@ DESEQ2_SCRIPT_PATH + #import simplejson deseq2.R -o "$deseq_out" + --outfilefiltered "$deseq_out_filtered" #if $pdf: - -p "$plots" + -p "$plots" #end if - + -i "$input_matrix" - #if $filter_sel.filter_sel_opts == 'all_vs_all': - -s 'all_vs_all' - #else: - -s ## build a string like '1,2 5,6' - "${filter_sel.control_cols} ${filter_sel.experiement_cols}" + #set $temp_factor_name = list() + #for $factor in $rep_factorName: + #set $temp_factor = dict() + #for $level in $factor.rep_factorLevel: + ##$temp_factor_list.append( '%s::%s:%s' % ($factor.factorName.replace(' ','_'), $level.factorLevel, $level.factorIndex) ) + $temp_factor.update({str($level.factorLevel): map(int, str($level.factorIndex).split(','))}) + #end for + $temp_factor_name.append([str($factor.factorName), $temp_factor]) - #set $temp_factor_list = list() - #set $is_multi_factor_analysis = False - #for $factor in $filter_sel.factor: - #set $is_multi_factor_analysis = True - $temp_factor_list.append( '%s:%s' % ($factor.factor_name.replace(' ','_'), $factor.factor_index) ) #end for - #if $is_multi_factor_analysis: - -f "#echo ' '.join( $temp_factor_list )#" - #end if - #end if - -t $fittype + ##-m "#echo ' '.join( $temp_factor_list )#" + -m '#echo simplejson.dumps(temp_factor_name)#' + ##--organism "$organism" + ##-t "$fittype" + -c $countthreshold + 'Count reads in features with htseq-count'"/> + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + - - - - - + + + + + - - - - pdf == True - - + + + + + pdf == True + + + - + .. class:: infomark @@ -122,13 +130,12 @@ ====== ========================================================== Column Description ------ ---------------------------------------------------------- - 1 Condition tested (corresponds to the conitions from the first line of your count matrix) - 2 Gene Identifiers - 3 mean normalised counts, averaged over all samples from both conditions - 4 the logarithm (to basis 2) of the fold change - 5 standard error estimate for the log2 fold change estimate - 6 p value for the statistical significance of this change - 7 p value adjusted for multiple testing with the Benjamini-Hochberg procedure + 1 Gene Identifiers + 2 mean normalised counts, averaged over all samples from both conditions + 3 the logarithm (to basis 2) of the fold change + 4 standard error estimate for the log2 fold change estimate + 5 p value for the statistical significance of this change + 6 p value adjusted for multiple testing with the Benjamini-Hochberg procedure which controls false discovery rate (FDR) ====== ========================================================== @@ -150,5 +157,5 @@ .. _DESeq2: http://master.bioconductor.org/packages/release/bioc/html/DESeq2.html - + diff -r 1d2a02bc2208 -r a05999fd6e26 matrix_helper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/matrix_helper.py Mon Sep 30 10:51:41 2013 -0400 @@ -0,0 +1,66 @@ + +from galaxy.tools.parameters import DataToolParameter + +def get_matrix_header( input_dataset ): + """ + Not used currently, because the reload of the ckeckboxes did not work. + """ + input_handle = open( input_dataset.file_name ) + first_header = input_handle.readline() + second_header = input_handle.readline() + return [('%s::%s' % (cname2,cname1), str(int(col_num) + 1), False) for col_num, (cname2, cname1) in enumerate(zip(second_header.split()[1:],first_header.split()[1:])) ] + + +def validate_input( trans, error_map, param_values, page_param_map ): + """ + Validates the user input, before execution. + """ + factors = param_values['rep_factorName'] + factor_name_list = [] + factor_duplication = False + level_duplication = False + overlapping_selection = False + + + for factor in factors: + # factor names should be unique + fn = factor['factorName'] + if fn in factor_name_list: + factor_duplication = True + break + factor_name_list.append( fn ) + + level_name_list = list() + factor_index_list = list() + + for level in factor['rep_factorLevel']: + # level names under one factor should be unique + fl = level['factorLevel'] + if fl in level_name_list: + level_duplication = True + level_name_list.append( fl ) + + fi = level['factorIndex'] + if fi: + # the checkboxes should not have an overlap + for check in fi: + if check in factor_index_list: + overlapping_selection = True + factor_index_list.append( check ) + + if level_duplication: + error_map['rep_factorName'] = [ dict() for t in factors ] + for i in range( len( factors ) ): + error_map['rep_factorName'][i]['rep_factorLevel'] = [ {'factorLevel': 'Factor levels for each factor need to be unique'} for t in factor['rep_factorLevel'] ] + break + if overlapping_selection: + error_map['rep_factorName'] = [ dict() for t in factors ] + for i in range( len( factors ) ): + error_map['rep_factorName'][i]['rep_factorLevel'] = [ {'factorIndex': 'The samples from different factors are not allowed to overlap'} for t in factor['rep_factorLevel'] ] + break + + if factor_duplication: + error_map['rep_factorName'] = [ dict() for t in factors ] + for i in range( len( factors ) ): + error_map['rep_factorName'][i]['factorName'] = 'Factor names need to be unique' + diff -r 1d2a02bc2208 -r a05999fd6e26 tool_dependencies.xml --- a/tool_dependencies.xml Mon Sep 09 07:09:38 2013 -0400 +++ b/tool_dependencies.xml Mon Sep 30 10:51:41 2013 -0400 @@ -4,7 +4,7 @@ - + $REPOSITORY_INSTALL_DIR/scripts