changeset 0:cada2d2f0a7c draft

Uploaded
author yhoogstrate
date Wed, 26 Mar 2014 09:12:30 -0400
parents
children 9fcffbe6c206
files featurecounts.xml featurecounts2bed.sh tool-data/gene_sets.loc.sample tool_data_table_conf.xml.sample tool_dependencies.xml
diffstat 5 files changed, 383 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/featurecounts.xml	Wed Mar 26 09:12:30 2014 -0400
@@ -0,0 +1,237 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tool id="featurecounts" name="featureCounts">
+	<description>Measure gene expression in RNA-Seq experiments from SAM or BAM files.</description>
+	<requirements>
+		<requirement type="package" version="1.4.4">featurecounts</requirement>
+		<requirement type="package" version="1.0.0">featurecounts2bed</requirement>
+	</requirements>
+	<command>
+		<!--
+			The following script is written in the "Cheetah" language:
+			http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html
+		-->
+		<!-- Check 01: do the alignments have a dbkey and is the option set to using it?  -->
+		#if $reference_gene_sets_source.source_select == "attribute" and len({ alignment.metadata.dbkey:True for alignment in $alignments }.keys()) != 1
+			echo "Invalid number of dbkeys are found: ${ len({ alignment.metadata.dbkey:True for alignment in $alignments }.keys()) }, while only one should be used. Make sure that the alignments are done on the same reference genome and that 'tool-data/gene_sets.loc' is configured properly!" >&amp;2
+		#else
+			<!-- Check 02: are all alignments from the same type (bam || sam)  -->
+			#if len({ alignment.extension:True for alignment in $alignments }.keys()) != 1
+				echo "Either all files must be SAM or all files must be BAM, no mixture is allowed." >&amp;2
+			#else
+				featureCounts
+					-a 
+					#if $reference_gene_sets_source.source_select == "indexed_filtered"
+						"$reference_gene_sets_source.reference_gene_sets"
+					#else if $reference_gene_sets_source.source_select == "indexed_all"
+						"$reference_gene_sets_source.reference_gene_sets"
+					#else if $reference_gene_sets_source.source_select == "history"
+						"$reference_gene_sets_source.reference_gene_sets"
+					#else
+						<!--
+							Workaround to obtain the GFF/GTF file that corresponds to the
+							dbkey of the alignments: select the 'gene_sets' row on its dbkey
+							column (index 1) and take the path from its value column (index 2).
+							Because this is "calculated" during run-time, it can be used in a workflow.
+						-->
+						"${ filter( lambda x: str( x[1] ) == str( { alignment.metadata.dbkey:True for alignment in $alignments }.keys()[0] ), $__app__.tool_data_tables[ 'gene_sets' ].get_fields() )[0][2] }"
+					#end if
+					
+					-o "$output"
+					-T $threads
+					
+					#if $extended_parameters.parameters == "extended"
+						-t $extended_parameters.gff_feature_type
+						-g $extended_parameters.gff_feature_attribute
+						$extended_parameters.summarization_level
+						$extended_parameters.contribute_to_multiple_features
+						$extended_parameters.protocol
+						$extended_parameters.multimapping_counts
+						-Q $extended_parameters.mapping_quality
+						$extended_parameters.fragment_counting
+						$extended_parameters.check_distance
+						-d $extended_parameters.minimum_fragment_length
+						-D $extended_parameters.maximum_fragment_length
+						$extended_parameters.only_both_ends
+						$extended_parameters.exclude_chimerics
+						$extended_parameters.namesort
+					#end if
+					
+					#for $alignment in $alignments
+						 ${alignment}
+					#end for
+					
+					2>&amp;1
+				
+				#if $format == "tabdel_default" or $format.value == "tabdel_default"
+					; cp $output tmp.txt
+					; egrep -v "^#" tmp.txt > tmp2.txt
+					; cut -f 1,7 tmp2.txt > tmp_left.txt
+					; cut -f 6 tmp2.txt > tmp_right.txt
+					; paste tmp_left.txt tmp_right.txt > $output
+				#elif $format == "tabdel_short" or $format.value == "tabdel_short"
+					; cp $output tmp.txt
+					; egrep -v "^#" tmp.txt | cut -f 1,7 > $output
+				#end if
+				## BED: convert exactly once. Running the converter once per alignment would feed its own BED output back into it.
+				#if $format.value == "bed"
+					; featurecounts2bed.sh -f "$output" > tmp.txt
+					; mv tmp.txt $output
+				#else
+					## For every alignment, replace its file path in the header by: "hid: sample name"
+					#for $alignment in $alignments
+						#set $alignment_escaped = str($alignment).replace('/', '\/').replace('.', '\.')
+						#set $alignment_name_escaped = str(alignment.hid)+": "+str($alignment.name).replace('\t',' ').replace('\\','\\\\').replace("'","\\'").replace('/','\/')
+						#if $format.value == "tabdel_default" or $format.value == "tabdel_short"
+							; sed -e '1 s/$alignment_escaped/${alignment_name_escaped}/g' $output > tmp.txt
+						#else
+							; sed -e '1,2 s/$alignment_escaped/${alignment_name_escaped}/g' $output > tmp.txt
+						#end if
+						; mv tmp.txt $output
+					#end for
+				#end if
+			#end if
+		#end if
+	</command>
+	
+	<inputs>
+		<param name="alignments" type="data" format="bam,sam" label="Alignment file" help="The input alignment file(s) where the gene expression has to be counted. The file can have a SAM or BAM format; but ALL files in the series must be in THE SAME format." multiple="true" />
+		
+		<!-- Select how the GFF/GTF annotation file that is passed to featureCounts (-a) is located -->
+		<conditional name="reference_gene_sets_source">
+			<param name="source_select" type="select" label="GFF/GTF Source">
+				<option value="indexed_filtered">Use a built-in index (which fits your reference)</option>
+				<option value="history">Use reference from the history</option>
+				<option value="indexed_all">Use a built-in index (entire list) - avoid this option if possible; only useful if you design a workflow</option>
+				<option value="attribute">Use a built-in index based on the 'metadata.dbkey' attribute; ideal in workflows</option>
+			</param>
+			<when value="indexed_filtered">
+				<param name="reference_gene_sets" type="select" label="Reference Gene Sets used during alignment (GFF/GTF)">
+					<options from_data_table="gene_sets"><!-- replaces 'from_file="gene_sets"' - more strict -->
+						<column name="name"  index="0"/>
+						<column name="dbkey" index="1"/>
+						<column name="value" index="2"/>
+						<filter type="data_meta" ref="alignments" multiple="false" key="dbkey" column="1" />
+						<validator type="no_options" message="No indexes are available for the selected input dataset" />
+					</options>
+				</param>
+			</when>
+			<when value="history">
+				<param name="reference_gene_sets" format="gff" type="data" label="Gene annotation file" help="The program assumes that the provided annotation file is in GTF format. Make sure that the gene annotation file corresponds to the same reference genome as used for the alignment." />
+			</when>
+			<when value="indexed_all">
+				<param name="reference_gene_sets" type="select" label="Reference Gene Sets used during alignment (GFF/GTF)">
+					<options from_data_table="gene_sets"><!-- replaces 'from_file="gene_sets"' - more strict -->
+						<column name="name"  index="0"/>
+						<column name="dbkey" index="1"/>
+						<column name="value" index="2"/>
+						<validator type="no_options" message="No indexes are available for the selected input dataset" />
+					</options>
+				</param>
+			</when>
+			<when value="attribute">
+				<!-- No parameter here: the command template looks the GTF/GFF path up in the 'gene_sets' data table at runtime, using the dbkey of the alignments -->
+			</when>
+		</conditional>
+		<!-- Output layout: "complex" keeps the featureCounts table (only the file paths in its header are replaced by sample names); the other values are derived from that table by the command template -->
+		<param name="format" type="select" label="Output format">
+			<option value="complex">featureCounts 1.4.0+ default (extensive; complex)</option>
+			<option value="tabdel_default" selected="true">Gene-name "\t" gene-count "\t" gene-length (tab-delimited)</option>
+			<option value="tabdel_short">Gene-name "\t" gene-count (tab-delimited)</option>
+			<option value="bed">BED format (line per exon): chr "\t" start "\t" stop "\t" description "\t" readcount (tab-delimited)</option>
+		</param>
+		<!-- Passed to featureCounts as -T -->
+		<param name="threads" type="integer" value="2" min="1" label="Number of the CPU threads. Higher numbers only make sense with a higher number of samples." />
+		<!-- Extra featureCounts options; they are only put on the command line when "extended" is selected -->
+		<conditional name="extended_parameters">
+			<param name="parameters" type="select" label="featureCounts parameters" help="For more advanced featureCounts settings.">
+				<option value="default">Default settings</option>
+				<option value="extended">Extended settings</option>
+			</param>
+			<when value="default">
+			</when>
+			<when value="extended">
+				<param name="gff_feature_type" type="text" value="exon" label="GFF feature type filter" help="Specify the feature type. Only rows which have the matched feature type in the provided GTF annotation file will be included for read counting. `exon' by default." />
+				
+				<param name="gff_feature_attribute" type="text" value="gene_id" label="GFF gene identifier" help="Specify the attribute type used to group features (eg. exons) into meta-features (eg. genes), when GTF annotation is provided. `gene_id' by default. This attribute type is usually the gene identifier. This argument is useful for the meta-feature level summarization." />
+				
+				<param name="summarization_level" type="boolean" truevalue=" -f" falsevalue="" label="On feature level" help="If specified, read summarization will be performed at the feature level. By default (-f is not specified), the read summarization is performed at the meta-feature level." />
+				
+				<param name="contribute_to_multiple_features" type="boolean" truevalue=" -O" falsevalue="" label="Allow read to contribute to multiple features" help="If specified, reads (or fragments if -p is specified) will be allowed to be assigned to more than one matched meta-feature (or matched feature if -f is specified)" />
+				
+				<param name="protocol" type="select" label="Strand specific protocol" help="Indicate if strand-specific read counting should be performed. It has three possible values: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). 0 by default.">
+					<option value=" -s 0" selected="true">Unstranded</option>
+					<option value=" -s 1">Stranded (forwards)</option>
+					<option value=" -s 2">Stranded (reverse)</option>
+				</param>
+				
+				<param name="multimapping_counts" type="boolean" truevalue=" -M" falsevalue="" label="Count multi-mapping reads/fragments" help="If specified, multi-mapping reads/fragments will be counted (ie. a multi-mapping read will be counted up to N times if it has N reported mapping locations). The program uses the `NH' tag to find multi-mapping reads." />
+				
+				<param name="mapping_quality" type="integer" value="0" label="Minimum read quality" help="The minimum mapping quality score a read must satisfy in order to be counted. For paired-end reads, at least one end should satisfy this criterion. 0 by default." />
+				
+				<param name="fragment_counting" type="boolean" truevalue=" -p" falsevalue="" label="PE: Count fragments instead of reads" help="Paired-end specific: If specified, fragments (or templates) will be counted instead of reads. The two reads from the same fragment must be adjacent to each other in the provided SAM/BAM file. If SAM/BAM input does not meet this requirement, the -S (sorting) option should be provided as well." />
+				
+				<param name="check_distance" type="boolean" truevalue=" -P" falsevalue="" label="PE: Check paired-end distance" help="Paired-end specific: If specified, paired-end distance will be checked when assigning fragments to meta-features or features. This option is only applicable when -p (Count fragments instead of reads) is specified. The distance thresholds should be specified using -d and -D (minimum and maximum fragment/template length) options." />
+				
+				<param name="minimum_fragment_length" type="integer" value="50" label="PE: Minimum fragment/template length." />
+				<param name="maximum_fragment_length" type="integer" value="600" label="PE: Maximum fragment/template length." />
+				
+				<param name="only_both_ends" type="boolean" truevalue=" -B" falsevalue="" label="PE: only allow fragments with both reads aligned" help="Paired-end specific: If specified, only fragments that have both ends successfully aligned will be considered for summarization. This option is only applicable for paired-end reads." />
+				
+				<param name="exclude_chimerics" type="boolean" truevalue=" -C" falsevalue="" label="PE: Exclude chimeric fragments" help="Paired-end specific: If specified, the chimeric fragments (those fragments that have their two ends aligned to different chromosomes) will NOT be included for summarization. This option is only applicable for paired-end read data." />
+				
+				<param name="namesort" type="boolean" truevalue=" -S" falsevalue="" label="PE: Name-sort reads (slow!)" help="Paired-end specific: If specified, the program will reorder input reads according to their names and make reads from the same pair be adjacent to each other. This option should be provided when reads from the same pair are not adjacent to each other in input SAM/BAM files (for instance sorting reads by chromosomal locations could decouple reads from the same pair)." />
+			</when>
+		</conditional>
+	</inputs>
+	
+	<outputs><!-- One dataset; its label lists every input alignment as "hid: name". NOTE(review): the declared format is "tabular" for every output format, including BED; confirm this is intended -->
+		<data format="tabular" name="output" label="${tool.name} on ${', '.join([ str(a.hid)+': '+a.name for a in $alignments ])}" />
+	</outputs>
+	
+	<help>
+featureCounts::
+**Overview**
+
+FeatureCounts is a light-weight read counting program written entirely in the C programming language. It can be used to count both gDNA-seq and RNA-seq reads for genomic features in SAM/BAM files.
+It has a variety of advanced parameters but its major strength is its outstanding performance: analysis of a 10GB BAM file takes about 7 minutes on a single average CPU (Homo Sapiens genome)!
+Liao Y, Smyth GK and Shi W. featureCounts: an efficient general-purpose program for assigning sequence reads to genomic features. Bioinformatics, Advance Access, accepted on Nov 7, 2013
+
+featureCounts is part of a bigger analysis suite called subread:
+http://subread.sourceforge.net/
+Liao Y, Smyth GK and Shi W. The Subread aligner: fast, accurate and scalable read mapping by seed-and-vote. Nucleic Acids Research, 41(10):e108, 2013
+
+**Input formats**
+
+Alignments should be provided in either:
+* SAM format - http://samtools.sourceforge.net/samtools.shtml#5
+* BAM format
+
+Gene regions should be provided in the GFF/GTF format:
+* http://genome.ucsc.edu/FAQ/FAQformat.html#format3
+* http://www.ensembl.org/info/website/upload/gff.html
+
+**Installation**
+
+1) Make sure you have proper GFF/GTF files (corresponding to your reference genome used for the alignment) uploaded to your history.
+
+2) Make sure that your gene_sets.loc is configured properly as data table. This is generally done by copying the right information into: tool_data_table_conf.xml.
+https://wiki.galaxyproject.org/Admin/Tools/Data%20Tables
+
+**Examples**
+
+**License**
+
+* featureCounts / subread: GNU General Public License version 3.0 (GPLv3)
+
+**Contact**
+
+The tool wrapper has been written by Youri Hoogstrate from the Erasmus Medical Center (Rotterdam, Netherlands) on behalf of the Translational Research IT (TraIT) project:
+http://www.ctmm.nl/en/programmas/infrastructuren/traitprojecttranslationeleresearch
+
+I want to thank Marius van den Beek for his contribution as well.
+
+More tools by the Translational Research IT (TraIT) project can be found in the following repository:
+http://toolshed.dtls.nl/
+
+</help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/featurecounts2bed.sh	Wed Mar 26 09:12:30 2014 -0400
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# This tool has been written by Youri Hoogstrate from the Erasmus
+# Medical Center (Rotterdam, Netherlands) on behalf of the Translational
+# Research IT (TraIT) project:
+# http://www.ctmm.nl/en/programmas/infrastructuren/traitprojecttranslationeleresearch
+# 
+# More tools by the Translational Research IT (TraIT) project can be
+# found in the following repository:
+# http://toolshed.dtls.nl/
+#
+# Converts a featureCounts table (-f FILE) into tab-delimited BED lines on stdout:
+#   chr, start, stop, "Geneid (stop-start/Length nt)", count, strand
+# The count is taken from column 7 only, i.e. the first count column of the table.
+
+exon_level="true"
+filename=""
+
+# Parse parameters
+while getopts e:f: option
+do
+	case "${option}" in
+		e) exon_level=${OPTARG};;
+		f) filename=${OPTARG};;
+	esac
+done
+
+# Without -f there is nothing to convert: print the usage and fail.
+if [ -z "$filename" ]; then
+	echo "Usage:" >&2
+	echo "  -e [true, false]   true = entry for every exon; false = line for genes first exon" >&2
+	echo "  -f                 FILENAME from featureCounts" >&2
+	exit 1
+fi
+
+# -r keeps backslashes literal; the -n test also processes a last line that lacks a newline.
+while read -r line || [ -n "$line" ]; do
+	read -r -a columns <<< "$line"
+	uid=${columns[0]}
+	# Skip blank lines, '#' comment lines and the "Geneid" header row.
+	if [ -z "$uid" ] || [ "${uid:0:1}" == "#" ] || [ "$uid" == "Geneid" ]; then
+		continue
+	fi
+	length=${columns[5]}
+	count=${columns[6]}
+
+	# Columns 2-5 hold one ';'-separated value per exon; split without globbing.
+	IFS=';' read -r -a chr_splitted <<< "${columns[1]}"
+	IFS=';' read -r -a start_splitted <<< "${columns[2]}"
+	IFS=';' read -r -a stop_splitted <<< "${columns[3]}"
+	IFS=';' read -r -a strand_splitted <<< "${columns[4]}"
+
+	if [ "$exon_level" == "true" ]; then
+		n=${#chr_splitted[@]}
+	else
+		n=1
+	fi
+
+	for (( i=0; i<n; i++ )); do
+		printf '%s\t%s\t%s\t%s (%s/%snt)\t%s\t%s\n' "${chr_splitted[$i]}" "${start_splitted[$i]}" "${stop_splitted[$i]}" "$uid" "$(( ${stop_splitted[$i]} - ${start_splitted[$i]} ))" "$length" "$count" "${strand_splitted[$i]}"
+	done
+done < "$filename"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gene_sets.loc.sample	Wed Mar 26 09:12:30 2014 -0400
@@ -0,0 +1,39 @@
+# This is a sample file distributed with Galaxy that enables tools
+# to use gene/exon annotations in the GFF/GTF format. You will need
+# to add all the gene set annotations in this configuration file.
+# Because of this file, galaxy tools are able to access gene annotations
+# provided as GFF / GTF files by selecting on:
+# - Priority (defined by the order in the file)
+# - Provider
+# - dbkey (reference genome id)
+# 
+# The gene_sets.loc file syntax is:
+# <name/UID> <dbkey> <path> <provider> <URL/reference*>
+#  *optional
+# 
+# Please use exactly one TAB (\t) between two columns!
+# ---------------------------------------------------------
+# 
+# In case you have TWO or MORE providers PER dbkey, the one mentioned
+# first in the file, should have the "default" priority.
+#
+
+
+
+# [UCSC - UCSC Genes: knownGene]
+
+#hg19.UCSC knownGene (mm/'yy)	hg19	/depot/data2/galaxy/hg19/gene_sets/Homo_sapiens.GRCh37.74.gtf	ucsc	http://genome.ucsc.edu/
+#hg18.UCSC knownGene (mm/'yy)	hg18	/depot/data2/galaxy/hg18/gene_sets/Homo_sapiens.NCBI36.54.gtf	ucsc	http://genome.ucsc.edu/
+
+
+
+# [RefSeq Genes: RefGene]
+# link: ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/
+
+
+
+# [ Ensembl]
+# link: http://www.ensembl.org/info/data/ftp/index.html
+
+#Homo_sapiens.GRCh37.74	hg19	/depot/data2/galaxy/hg19/gene_sets/Homo_sapiens.GRCh37.74.gtf	ensembl	ftp://ftp.ensembl.org/pub/release-74/gtf/homo_sapiens/Homo_sapiens.GRCh37.74.gtf.gz
+#Homo_sapiens.NCBI36.54	hg18	/depot/data2/galaxy/hg18/gene_sets/Homo_sapiens.NCBI36.54.gtf	ensembl	ftp://ftp.ensembl.org/pub/release-54/gtf/homo_sapiens/Homo_sapiens.NCBI36.54.gtf.gz
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Wed Mar 26 09:12:30 2014 -0400
@@ -0,0 +1,8 @@
+<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
+<tables>
+    <!-- Location of all GFF/GTF files; the column names below are listed in the order of the tab-separated fields in gene_sets.loc -->
+    <table name="gene_sets" comment_char="#">
+        <columns>name, dbkey, value, provider, reference</columns>
+        <file path="tool-data/gene_sets.loc" />
+    </table>
+</tables>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Wed Mar 26 09:12:30 2014 -0400
@@ -0,0 +1,37 @@
+<?xml version="1.0"?>
+<tool_dependency>
+	<package name="featurecounts" version="1.4.4">
+		<install version="1.0">
+			<actions><!-- Download the subread 1.4.4 sources, build them with Makefile.Linux, then move the featureCounts binary into $INSTALL_DIR/bin and put it on PATH -->
+				<action type="shell_command">wget http://sourceforge.net/projects/subread/files/subread-1.4.4/subread-1.4.4-source.tar.gz &amp;&amp; tar -zxvf subread-1.4.4-source.tar.gz &amp;&amp; cd subread-1.4.4-source/src/ &amp;&amp; make -f Makefile.Linux &amp;&amp; cd ../../</action>
+				<action type="move_file">
+					<source>../subread-1.4.4-source/bin/featureCounts</source>
+					<destination>$INSTALL_DIR/bin</destination>
+				</action>
+				<action type="set_environment">
+					<environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+					<environment_variable name="PATH" action="prepend_to">$REPOSITORY_INSTALL_DIR</environment_variable>
+				</action>
+			</actions>
+		</install>
+		<readme>
+			Downloads and installs featureCounts; requires WGET, TAR, MAKE and a C compiler (e.g. GCC) to be installed!
+		</readme>
+	</package>
+	
+	<package name="featurecounts2bed" version="1.0.0"><!-- Installs the featurecounts2bed.sh helper script. NOTE(review): the script is downloaded from the repository 'tip', so the installed content is not pinned to version 1.0.0 -->
+		<install version="1.0">
+			<actions>
+				<action type="shell_command">wget http://yhoogstrate@toolshed.dtls.nl/repos/yhoogstrate/featurecounts/raw-file/tip/featurecounts2bed.sh &amp;&amp; chmod 755 *.sh</action>
+				<action type="move_file">
+					<source>../featurecounts2bed.sh</source>
+					<destination>$INSTALL_DIR/bin</destination>
+				</action>
+				<action type="set_environment">
+					<environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+					<environment_variable name="PATH" action="prepend_to">$REPOSITORY_INSTALL_DIR</environment_variable>
+				</action>
+			</actions>
+		</install>
+	</package>
+</tool_dependency>