# HG changeset patch # User pavanvidem # Date 1440765451 14400 # Node ID 7604d324c5aa2594c35e1b8734c4fc00ab2c6d69 Uploaded diff -r 000000000000 -r 7604d324c5aa dexseq/dexseq.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dexseq/dexseq.R Fri Aug 28 08:37:31 2015 -0400 @@ -0,0 +1,108 @@ +## Setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) +# we need that to not crash galaxy with an UTF8 error on German LC settings. +Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +library("DEXSeq") +library('getopt') +library('rjson') + + +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs(trailingOnly = TRUE) + +#get options, using the spec as defined by the enclosed list. +#we read the options from the default: commandArgs(TRUE). +spec = matrix(c( + 'verbose', 'v', 2, "integer", + 'help', 'h', 0, "logical", + 'gtf', 'a', 1, "character", + 'outfile', 'o', 1, "character", + 'report', 'r', 1, "character", + 'factors', 'f', 1, "character", + 'threads', 'p', 1, "integer", + 'fdr', 'c', 1, "double", +), byrow=TRUE, ncol=4); +opt = getopt(spec); + +setwd(opt$report) + +# if help was asked for print a friendly message +# and exit with a non-zero error code +if ( !is.null(opt$help) ) { + cat(getopt(spec, usage=TRUE)); + q(status=1); +} + +trim <- function (x) gsub("^\\s+|\\s+$", "", x) +opt$samples <- trim(opt$samples) +opt$factors <- trim(opt$factors) + +parser <- newJSONParser() +parser$addData( opt$factors ) +factorsList <- parser$getObject() + +sampleTable<-data.frame() +countFiles<-c() +factorNames<-c() +primaryFactor<-"" +for(factor in factorsList){ + factorName<-factor[[1]] + factorNames<-append(factorNames, paste(factorName,"exon",sep=":")) + factorValuesMapList<-factor[[2]] + c = length(factorValuesMapList) + for (factorValuesMap in factorValuesMapList){ + for(files in factorValuesMap){ + for(file in files){ + if(primaryFactor == "") { + countFiles<-append(countFiles,file) + } + sampleTable[basename(file),factorName]<-paste(c,names(factorValuesMap),sep="_") + } + } + c = c-1 + } + if(primaryFactor == ""){ + primaryFactor <- factorName + } +} + +factorNames<-append(factorNames,"exon") +factorNames<-append(factorNames,"sample") +factorNames<-rev(factorNames) +formulaFullModel <- as.formula(paste("", paste(factorNames, collapse=" + "), sep=" ~ ")) +factorNames <- head(factorNames,-1) +formulaReducedModel <- as.formula(paste("", paste(factorNames, collapse=" + "), sep=" ~ ")) + +sampleTable +formulaFullModel +formulaReducedModel +primaryFactor +countFiles +opt$report +file.path(opt$report,"DEXSeq_analysis.RData") +getwd() + +dxd = DEXSeqDataSetFromHTSeq(countFiles, sampleData=sampleTable, design= formulaFullModel, flattenedfile=opt$gtf) + +colData(dxd) +dxd <- estimateSizeFactors(dxd) +sizeFactors(dxd) +BPPARAM=MulticoreParam(workers=opt$threads) +dxd <- estimateDispersions(dxd, formula=formulaFullModel, BPPARAM=BPPARAM) +dxd <- testForDEU(dxd, reducedModel=formulaReducedModel, fullModel=formulaFullModel, BPPARAM=BPPARAM) +dxd <- estimateExonFoldChanges(dxd, fitExpToVar=primaryFactor, BPPARAM=BPPARAM) +res <- DEXSeqResults(dxd) +table(res$padj <= opt$fdr) +resSorted <- res[order(res$padj),] +head(resSorted) + +write.csv(as.data.frame(resSorted), file=opt$outfile) + +if ( !is.null(opt$report) ) { + save(dxd, resSorted, file = file.path(opt$report,"DEXSeq_analysis.RData")) + save.image() + DEXSeqHTML(res, FDR=opt$fdr, color=c("#C3EEE7","#B7FEA0","#F1E7A1","#CEAEFF","#FF8F43","#EDC3C5","#AAA8AA","#FF0000","#637EE9","#FBFBFB"), BPPARAM=BPPARAM) + unlink(file.path(opt$report,"DEXSeq_analysis.RData")) +} +sessionInfo() diff -r 000000000000 -r 7604d324c5aa dexseq/dexseq.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dexseq/dexseq.xml Fri Aug 28 08:37:31 2015 -0400 @@ -0,0 +1,140 @@ + + Determines differentially expressed features from count tables + + R + Rscript + deseq2 + DESeq2 + + + + #import json + Rscript /usr/local/galaxy/galaxy-dist/tools/test/deseq2/dexseq.R + #set $reportdir = $deseq_out.files_path + -o "$deseq_out" + -p 12 + #set $temp_factor_names = list() + #for $factor in $rep_factorName: + #set $temp_factor = list() + #for $level in $factor.rep_factorLevel: + #set $count_files = list() + #for $file in $level.countsFile: + $count_files.append(str($file)) + #end for + $temp_factor.append( {str($level.factorLevel): $count_files} ) + #end for + $temp_factor_names.append([str($factor.factorName), $temp_factor]) + #end for + -f '#echo json.dumps(temp_factor_names)#' + -a $gtf + #if $report: + -p "$reportdir" + #end if + -c $fdr_cutoff + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + pdf == True + + + + + +.. class:: infomark + +**What it does** + +Estimate variance-mean dependence in count data from high-throughput sequencing assays and test for differential expression based on a model using the negative binomial distribution + + +**Inputs** + +DESeq2_ takes count tables that generated from the htseq-count as input. Count tables must be generated for each sample individually. DESeq2 is capable of handling multiple factors that effect your experiment. The first factor you input is considered as the primary factor that affects gene expressions. You also input several secondary factors that might influence your experiment. But the final output will be changes in genes due to primary factor in presence of secondary factors. Each factor has two levels/states. You need to select appropriate count table from your history for each factor level. + +The following table gives some examples of factors and their levels: + +========= ============== =============== +Factor Factor level 1 Factor level 2 +--------- -------------- --------------- +Treatment Treated Untreated +--------- -------------- --------------- +Condition Knockdown Wildtype +--------- -------------- --------------- +TimePoint Day4 Day1 +--------- -------------- --------------- +SeqType SingleEnd PairedEnd +--------- -------------- --------------- +Gender Female Male +========= ============== =============== + +*Note*: Output log2 fold changes are based on primary factor level 1 vs. factor level2. Here the order of factor levels is important. For example, for the factor 'Treatment' given in above table, DESeq2 computes fold changes of 'Treated' samples against 'Untreated', i.e. the values correspond to up or down regulations of genes in Treated samples. + +**Output** + +DESeq2_ generates a tabular file containing the different columns and optional visualized results as PDF. + +====== ========================================================== +Column Description +------ ---------------------------------------------------------- + 1 Gene Identifiers + 2 mean normalised counts, averaged over all samples from both conditions + 3 the logarithm (to basis 2) of the fold change (See the note in inputs section) + 4 standard error estimate for the log2 fold change estimate + 5 Wald statistic + 6 p value for the statistical significance of this change + 7 p value adjusted for multiple testing with the Benjamini-Hochberg procedure + which controls false discovery rate (FDR) +====== ========================================================== + + + + + +.. _DESeq2: http://master.bioconductor.org/packages/release/bioc/html/DESeq2.html + + + + + 10.1101/002832 + + diff -r 000000000000 -r 7604d324c5aa dexseq/dexseq_helper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dexseq/dexseq_helper.py Fri Aug 28 08:37:31 2015 -0400 @@ -0,0 +1,98 @@ + +from galaxy.tools.parameters import DataToolParameter + +def get_matrix_header( input_dataset ): + """ + Not used currently, because the reload of the ckeckboxes did not work. + """ + input_handle = open( input_dataset.file_name ) + first_header = input_handle.readline() + second_header = input_handle.readline() + return [('%s::%s' % (cname2,cname1), str(int(col_num) + 1), False) for col_num, (cname2, cname1) in enumerate(zip(second_header.split()[1:],first_header.split()[1:])) ] + + + +def _construct_error_map( error_map, rep_dict, rep_parent, child, error_value ): + """ + Its no so easy to create a propper error_map for repetitions in Galaxy. + This is a helper function. + """ + + error_map[ rep_parent ] = [ dict() for t in rep_dict ] + for i in range( len( rep_dict ) ): + error_map[ rep_parent ][i][ child ] = error_value + + + +def validate_input( trans, error_map, param_values, page_param_map ): + """ + Validates the user input, before execution. + """ + factors = param_values['rep_factorName'] + factor_name_list = [] + factor_duplication = False + level_duplication = False + overlapping_selection = False + + first_condition = True + factor_indieces = list() + + for factor in factors: + # factor names should be unique + fn = factor['factorName'] + if fn in factor_name_list: + factor_duplication = True + break + factor_name_list.append( fn ) + + level_name_list = list() + factor_index_list = list() + + if first_condition and len( factor['rep_factorLevel'] ) < 2: + # first condition needs to have at least 2 levels + _construct_error_map( error_map, factors, 'rep_factorName', 'rep_factorLevel', [ {'factorLevel': 'The first condition should have at least 2 factor'} for t in factor['rep_factorLevel'] ] ) + + for level in factor['rep_factorLevel']: + # level names under one factor should be unique + fl = level['factorLevel'] + if fl in level_name_list: + level_duplication = True + level_name_list.append( fl ) + + fi = level['factorIndex'] + if fi: + # the checkboxes should not have an overlap + for check in fi: + if check in factor_index_list: + overlapping_selection = True + factor_index_list.append( check ) + + print set(factor_index_list) + print factor_indieces + if set(factor_index_list) in factor_indieces: + _construct_error_map( error_map, factors, 'rep_factorName', 'rep_factorLevel', [ {'factorLevel': 'It is not allowed to have two identical factors, that means two factors with the same toggeled checked boxes. '} for t in factor['rep_factorLevel'] ] ) + else: + factor_indieces.append( set(factor_index_list) ) + + + + if level_duplication: + error_map['rep_factorName'] = [ dict() for t in factors ] + for i in range( len( factors ) ): + error_map['rep_factorName'][i]['rep_factorLevel'] = [ {'factorLevel': 'Factor levels for each factor need to be unique'} for t in factor['rep_factorLevel'] ] + break + if overlapping_selection: + error_map['rep_factorName'] = [ dict() for t in factors ] + for i in range( len( factors ) ): + error_map['rep_factorName'][i]['rep_factorLevel'] = [ {'factorIndex': 'The samples from different factors are not allowed to overlap'} for t in factor['rep_factorLevel'] ] + break + + first_condition = False + + if factor_duplication: + _construct_error_map( error_map, factors, 'rep_factorName', 'factorName', 'Factor names need to be unique' ) + """ + error_map['rep_factorName'] = [ dict() for t in factors ] + for i in range( len( factors ) ): + error_map['rep_factorName'][i]['factorName'] = 'Factor names need to be unique' + """ diff -r 000000000000 -r 7604d324c5aa dexseq/tool_dependancy.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dexseq/tool_dependancy.xml Fri Aug 28 08:37:31 2015 -0400 @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + https://github.com/bgruening/download_store/raw/master/DEXSeq_1.14.1/Biostrings_2.36.4.tar.gz + https://github.com/bgruening/download_store/raw/master/DEXSeq_1.14.1/hwriter_1.3.2.tar.gz + https://github.com/bgruening/download_store/raw/master/DEXSeq_1.14.1/statmod_1.4.21.tar.gz + https://github.com/bgruening/download_store/raw/master/DEXSeq_1.14.1/biomaRt_2.24.0.tar.gz + https://github.com/bgruening/download_store/raw/master/DEXSeq_1.14.1/Rsamtools_1.20.4.tar.gz + + + + + Differential exon usage based on the negative binomial distribution. + + +