Mercurial > repos > bgruening > deseq2
changeset 20:bbea9c694b34 draft
Uploaded
author | bgruening |
---|---|
date | Wed, 19 Feb 2014 06:11:20 -0500 |
parents | a1aa11e02862 |
children | d32de046ba31 |
files | deseq2.xml deseq_helper.py foldchanges_heatmap.r foldchanges_heatmap.xml matrix_helper.py tool_dependencies.xml |
diffstat | 6 files changed, 246 insertions(+), 83 deletions(-) [+] |
line wrap: on
line diff
--- a/deseq2.xml Mon Sep 30 12:28:02 2013 -0400 +++ b/deseq2.xml Wed Feb 19 06:11:20 2014 -0500 @@ -8,7 +8,7 @@ <!--<requirement type="set_environment">DESEQ2_SCRIPT_PATH</requirement>--> </requirements> <command interpreter="Rscript"> - #import simplejson + #import json deseq2.R -o "$deseq_out" --outfilefiltered "$deseq_out_filtered" @@ -16,7 +16,7 @@ #if $pdf: -p "$plots" #end if - + -i "$input_matrix" #set $temp_factor_name = list() @@ -31,7 +31,7 @@ #end for ##-m "#echo ' '.join( $temp_factor_list )#" - -m '#echo simplejson.dumps(temp_factor_name)#' + -m '#echo json.dumps(temp_factor_name)#' ##--organism "$organism" ##-t "$fittype" -c $countthreshold @@ -53,15 +53,15 @@ <inputs> <param format="tabular" name="input_matrix" type="data" label="Countmatrix" help="You can create a count matrix with the tool 'Count reads in features with htseq-count'"/> - + <repeat name="rep_factorName" title="Factor/Condition" min="1"> - <param name="factorName" type="text" value="FactorName" label="Specify a factor name" help=""/> + <param name="factorName" type="text" value="FactorName" label="Specify a factor name" help="" /> <repeat name="rep_factorLevel" title="Factor level" min="1"> - <param name="factorLevel" type="text" value="FactorLevel" label="Specify a factor level" help=""/> + <param name="factorLevel" type="text" value="FactorLevel" label="Specify a factor level" help="" /> <param name="factorIndex" label="Select columns that are associated with this factor level" type="data_column" data_ref="input_matrix" numerical="True" multiple="true" use_header_names="true" size="120" display="checkboxes"> - <validator type="no_options" message="Please select at least one column."/> + <validator type="no_options" message="Please select at least one column." /> </param> </repeat> </repeat> @@ -90,24 +90,25 @@ <option value="fly">fly</option> <option value="other">other</option> </param--> - <param name="countthreshold" size="10" type="float" value="10" label="Filter out features with mean normalized counts lower than this value"/> + <param name="countthreshold" size="10" type="float" value="10.0" label="Filter out features with mean normalized counts lower than this value"/> <param name="fittype" type="select" label="Type of fitting of dispersions to the mean intensity"> <option value="parametric">parametric</option> <option value="local">local</option> <option value="mean">mean</option> - </param> - <param name="pdf" type="boolean" truevalue="" falsevalue="" checked="true" label="Visualising the analysis results" - help="output an additional PDF files" /> + </param> + <param name="pdf" type="boolean" truevalue="" falsevalue="" checked="true" + label="Visualising the analysis results" + help="output an additional PDF files" /> </inputs> <outputs> - <data format="tabular" name="deseq_out" label="DESeq2 result file on ${on_string}"/> - <data format="tabular" name="deseq_out_filtered" label="Independent filtering result file on ${on_string}"/> + <data format="tabular" name="deseq_out" label="DESeq2 result file on ${on_string}" /> + <data format="tabular" name="deseq_out_filtered" label="Independent filtering result file on ${on_string}" /> <data format="pdf" name="plots" label="DESeq2 plots on ${on_string}"> <filter>pdf == True</filter> </data> </outputs> - <code file="matrix_helper.py" /> + <code file="deseq_helper.py" /> <help> @@ -147,7 +148,7 @@ DESeq2_ Authors: Michael Love (MPIMG Berlin), Simon Anders, Wolfgang Huber (EMBL Heidelberg) -If _DESeq2_ is used to obtain results for scientific publications it +If DESeq2_ is used to obtain results for scientific publications it should be cited as [1]_. A paper describing DESeq2_ is in preparation.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/deseq_helper.py Wed Feb 19 06:11:20 2014 -0500 @@ -0,0 +1,98 @@ + +from galaxy.tools.parameters import DataToolParameter + +def get_matrix_header( input_dataset ): + """ + Not used currently, because the reload of the ckeckboxes did not work. + """ + input_handle = open( input_dataset.file_name ) + first_header = input_handle.readline() + second_header = input_handle.readline() + return [('%s::%s' % (cname2,cname1), str(int(col_num) + 1), False) for col_num, (cname2, cname1) in enumerate(zip(second_header.split()[1:],first_header.split()[1:])) ] + + + +def _construct_error_map( error_map, rep_dict, rep_parent, child, error_value ): + """ + Its no so easy to create a propper error_map for repetitions in Galaxy. + This is a helper function. + """ + + error_map[ rep_parent ] = [ dict() for t in rep_dict ] + for i in range( len( rep_dict ) ): + error_map[ rep_parent ][i][ child ] = error_value + + + +def validate_input( trans, error_map, param_values, page_param_map ): + """ + Validates the user input, before execution. + """ + factors = param_values['rep_factorName'] + factor_name_list = [] + factor_duplication = False + level_duplication = False + overlapping_selection = False + + first_condition = True + factor_indieces = list() + + for factor in factors: + # factor names should be unique + fn = factor['factorName'] + if fn in factor_name_list: + factor_duplication = True + break + factor_name_list.append( fn ) + + level_name_list = list() + factor_index_list = list() + + if first_condition and len( factor['rep_factorLevel'] ) < 2: + # first condition needs to have at least 2 levels + _construct_error_map( error_map, factors, 'rep_factorName', 'rep_factorLevel', [ {'factorLevel': 'The first condition should have at least 2 factor'} for t in factor['rep_factorLevel'] ] ) + + for level in factor['rep_factorLevel']: + # level names under one factor should be unique + fl = level['factorLevel'] + if fl in level_name_list: + level_duplication = True + level_name_list.append( fl ) + + fi = level['factorIndex'] + if fi: + # the checkboxes should not have an overlap + for check in fi: + if check in factor_index_list: + overlapping_selection = True + factor_index_list.append( check ) + + print set(factor_index_list) + print factor_indieces + if set(factor_index_list) in factor_indieces: + _construct_error_map( error_map, factors, 'rep_factorName', 'rep_factorLevel', [ {'factorLevel': 'It is not allowed to have two identical factors, that means two factors with the same toggeled checked boxes. '} for t in factor['rep_factorLevel'] ] ) + else: + factor_indieces.append( set(factor_index_list) ) + + + + if level_duplication: + error_map['rep_factorName'] = [ dict() for t in factors ] + for i in range( len( factors ) ): + error_map['rep_factorName'][i]['rep_factorLevel'] = [ {'factorLevel': 'Factor levels for each factor need to be unique'} for t in factor['rep_factorLevel'] ] + break + if overlapping_selection: + error_map['rep_factorName'] = [ dict() for t in factors ] + for i in range( len( factors ) ): + error_map['rep_factorName'][i]['rep_factorLevel'] = [ {'factorIndex': 'The samples from different factors are not allowed to overlap'} for t in factor['rep_factorLevel'] ] + break + + first_condition = False + + if factor_duplication: + _construct_error_map( error_map, factors, 'rep_factorName', 'factorName', 'Factor names need to be unique' ) + """ + error_map['rep_factorName'] = [ dict() for t in factors ] + for i in range( len( factors ) ): + error_map['rep_factorName'][i]['factorName'] = 'Factor names need to be unique' + """
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/foldchanges_heatmap.r Wed Feb 19 06:11:20 2014 -0500 @@ -0,0 +1,44 @@ +## Setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) +# we need that to not crash galaxy with an UTF8 error on German LC settings. +Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +args <- commandArgs(trailingOnly = TRUE) + +library("gplots") +library("RColorBrewer") + +tables <- {} +labels <- {} + +labels <- unlist(strsplit(args[1], ",")) +tables <- unlist(strsplit(args[2], ",")) +ids_file=args[3] +output_pdf_file=args[4] + +reds <- colorRampPalette(brewer.pal(9,"Reds"))(100) +reds <- rev(reds) +greens <- colorRampPalette(brewer.pal(9,"Greens"))(100) +cols <- c(reds, greens) + +ids <- read.table(ids_file) + +mat <- c() + +for(i in 1:length(tables)) { + # get data frame + curr_table <- read.table(tables[[i]], sep="\t", header=FALSE) + log2FCvect <- c() + for(j in 1:length(ids$V1)) { + log2FCvect <- c(log2FCvect, curr_table$V3[which(curr_table$V1 %in% ids$V1[j])]) + } + # build foldChange data frame for heatmap + mat <- cbind(mat, log2FCvect) +} +pdf(output_pdf_file) +hm <- heatmap.2(mat, col = cols, trace="none", labCol=labels, labRow=NULL,scale="none",symm=F,symkey=T,symbreaks=T, cexCol=0.8, cexRow=0.8) +dev.off() + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/foldchanges_heatmap.xml Wed Feb 19 06:11:20 2014 -0500 @@ -0,0 +1,86 @@ +<tool id="foldchange_heatmap" name="fold change heatmap" version="1.0"> + <description>Plot heat map of fold changes from multiple DESeq2 outputs</description> + <requirements> + <requirement type="binary">Rscript</requirement> + </requirements> + <command interpreter="Rscript"> + #set $label_list = list() + #set $table_list = list() + #for $input in $deseqout: + $label_list.append('%s' %($input.label)) + #end for + #for $input in $deseqout: + $table_list.append('%s' %($input.table)) + #end for + foldchanges_heatmap.r "#echo ','.join( $label_list )#" "#echo ','.join( $table_list )#" $gene_ids_file $plots + ## TODO: instead of throwing away stderr, try Bjoern's fix + 2> /dev/null + </command> + <stdio> + <regex match="Execution halted" + source="both" + level="fatal" + description="Execution halted." /> + </stdio> + <inputs> + <repeat name="deseqout" title="DESeq2 output" min="1"> + <param name="label" type="text" value="Label" label="Specify a label" help="" /> + <param name="table" label="Select DESeq2 output" type="data" format="tabular"/> + </repeat> + <param name="gene_ids_file" label="Select file containing gene ids" type="data" format="tabular"/> + </inputs> + + <outputs> + <data format="pdf" name="plots" label="Heatmap plot on ${on_string}"/> + </outputs> + <help> + +.. class:: infomark + +**What it does** + +Estimate variance-mean dependence in count data from high-throughput sequencing assays and test for differential expression based on a model using the negative binomial distribution + + +**Inputs** + +DESeq2_ requires one count matrix as input file. You can use the tool + + + +**Output** + +DESeq2_ generates a tabular file containing the different columns and optional visualized results as PDF. + +====== ========================================================== +Column Description +------ ---------------------------------------------------------- + 1 Gene Identifiers + 2 mean normalised counts, averaged over all samples from both conditions + 3 the logarithm (to basis 2) of the fold change + 4 standard error estimate for the log2 fold change estimate + 5 p value for the statistical significance of this change + 6 p value adjusted for multiple testing with the Benjamini-Hochberg procedure + which controls false discovery rate (FDR) +====== ========================================================== + + +------ + +**References** + +DESeq2_ Authors: Michael Love (MPIMG Berlin), Simon Anders, Wolfgang Huber (EMBL Heidelberg) + +If DESeq2_ is used to obtain results for scientific publications it +should be cited as [1]_. A paper describing DESeq2_ is in preparation. + + + +.. [1] Anders, S and Huber, W (2010): `Differential expression analysis for sequence count data`_. + +.. _Differential expression analysis for sequence count data: http://dx.doi.org/10.1186/gb-2010-11-10-r106 +.. _DESeq2: http://master.bioconductor.org/packages/release/bioc/html/DESeq2.html + + + </help> +</tool>
--- a/matrix_helper.py Mon Sep 30 12:28:02 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,66 +0,0 @@ - -from galaxy.tools.parameters import DataToolParameter - -def get_matrix_header( input_dataset ): - """ - Not used currently, because the reload of the ckeckboxes did not work. - """ - input_handle = open( input_dataset.file_name ) - first_header = input_handle.readline() - second_header = input_handle.readline() - return [('%s::%s' % (cname2,cname1), str(int(col_num) + 1), False) for col_num, (cname2, cname1) in enumerate(zip(second_header.split()[1:],first_header.split()[1:])) ] - - -def validate_input( trans, error_map, param_values, page_param_map ): - """ - Validates the user input, before execution. - """ - factors = param_values['rep_factorName'] - factor_name_list = [] - factor_duplication = False - level_duplication = False - overlapping_selection = False - - - for factor in factors: - # factor names should be unique - fn = factor['factorName'] - if fn in factor_name_list: - factor_duplication = True - break - factor_name_list.append( fn ) - - level_name_list = list() - factor_index_list = list() - - for level in factor['rep_factorLevel']: - # level names under one factor should be unique - fl = level['factorLevel'] - if fl in level_name_list: - level_duplication = True - level_name_list.append( fl ) - - fi = level['factorIndex'] - if fi: - # the checkboxes should not have an overlap - for check in fi: - if check in factor_index_list: - overlapping_selection = True - factor_index_list.append( check ) - - if level_duplication: - error_map['rep_factorName'] = [ dict() for t in factors ] - for i in range( len( factors ) ): - error_map['rep_factorName'][i]['rep_factorLevel'] = [ {'factorLevel': 'Factor levels for each factor need to be unique'} for t in factor['rep_factorLevel'] ] - break - if overlapping_selection: - error_map['rep_factorName'] = [ dict() for t in factors ] - for i in range( len( factors ) ): - error_map['rep_factorName'][i]['rep_factorLevel'] = [ {'factorIndex': 'The samples from different factors are not allowed to overlap'} for t in factor['rep_factorLevel'] ] - break - - if factor_duplication: - error_map['rep_factorName'] = [ dict() for t in factors ] - for i in range( len( factors ) ): - error_map['rep_factorName'][i]['factorName'] = 'Factor names need to be unique' -
--- a/tool_dependencies.xml Mon Sep 30 12:28:02 2013 -0400 +++ b/tool_dependencies.xml Wed Feb 19 06:11:20 2014 -0500 @@ -1,10 +1,10 @@ <?xml version="1.0"?> <tool_dependency> <package name="R_3_0_1" version="3.0.1"> - <repository changeset_revision="e16e08f41ed7" name="package_r_3_0_1" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + <repository changeset_revision="4666f68ad4d5" name="package_r_3_0_1" owner="iuc" toolshed="http://testtoolshed.g2.bx.psu.edu" /> </package> <package name="deseq2" version="1.0.17"> - <repository changeset_revision="3e3d1a50a347" name="package_deseq2_1_0_17" owner="bgruening" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + <repository changeset_revision="3b6a8c377895" name="package_deseq2_1_0_17" owner="bgruening" toolshed="http://testtoolshed.g2.bx.psu.edu" /> </package> <!--<set_environment version="1.0"> <environment_variable name="DESEQ2_SCRIPT_PATH" action="set_to">$REPOSITORY_INSTALL_DIR/scripts</environment_variable>