# HG changeset patch
# User yhoogstrate
# Date 1395839550 14400
# Node ID cada2d2f0a7c057becc8617120876b82295e9d0b
Uploaded

diff -r 000000000000 -r cada2d2f0a7c featurecounts.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/featurecounts.xml Wed Mar 26 09:12:30 2014 -0400
@@ -0,0 +1,243 @@
+
+
+    Measure gene expression in RNA-Seq experiments from SAM or BAM files.
+
+        featurecounts
+        featurecounts2bed
+
+
+
+
+
+        ## Guard: the "attribute" source looks the annotation up by dbkey, so the alignments must share exactly one dbkey.
+        #if $reference_gene_sets_source.source_select == "attribute" and len({ alignment.metadata.dbkey:True for alignment in $alignments }.keys()) != 1
+            echo "Invalid number of dbkeys are found: ${ len({ alignment.metadata.dbkey:True for alignment in $alignments }.keys()) }, while only one should be used. Make sure that the alignments are done on the same reference genome and that 'tool-data/gene_sets.loc' is configured properly!" >&2
+        #else
+
+            ## Guard: every alignment must have the same datatype extension.
+            #if len({ alignment.extension:True for alignment in $alignments }.keys()) != 1
+                echo "Either all files must be SAM or all files must be BAM, no mixture is allowed." >&2
+            #else
+                featureCounts
+                    -a
+                    #if $reference_gene_sets_source.source_select == "indexed_filtered"
+                        "$reference_gene_sets_source.reference_gene_sets"
+                    #else if $reference_gene_sets_source.source_select == "indexed_all"
+                        "$reference_gene_sets_source.reference_gene_sets"
+                    #else if $reference_gene_sets_source.source_select == "history"
+                        "$reference_gene_sets_source.reference_gene_sets"
+                    #else
+
+                        ## Take field 2 (value) of the first gene_sets row whose field 1 (dbkey) equals the shared dbkey; the table columns are declared as name, dbkey, value, provider, reference.
+                        "${ filter( lambda x: str( x[1] ) == str( { alignment.metadata.dbkey:True for alignment in $alignments }.keys()[0] ), $__app__.tool_data_tables[ 'gene_sets' ].get_fields() )[0][2] }"
+                    #end if
+
+                    -o "$output"
+                    -T $threads
+
+                    #if $extended_parameters.parameters == "extended"
+                        -t $extended_parameters.gff_feature_type
+                        -g $extended_parameters.gff_feature_attribute
+                        $extended_parameters.summarization_level
+                        $extended_parameters.contribute_to_multiple_features
+                        $extended_parameters.protocol
+                        $extended_parameters.multimapping_counts
+                        -Q $extended_parameters.mapping_quality
+                        $extended_parameters.fragment_counting
+                        $extended_parameters.check_distance
+                        -d $extended_parameters.minimum_fragment_length
+                        -D $extended_parameters.maximum_fragment_length
+                        $extended_parameters.only_both_ends
+                        $extended_parameters.exclude_chimerics
+                        $extended_parameters.namesort
+                    #end if
+
+                    #for $alignment in $alignments
+                        "${alignment}"
+                    #end for
+
+                    2>&1
+
+                #if $format == "tabdel_default" or $format.value == "tabdel_default"
+                    ; cp "$output" tmp.txt
+                    ; egrep -v "^#" tmp.txt > tmp2.txt
+                    ; cut -f 1,7 tmp2.txt > tmp_left.txt
+                    ; cut -f 6 tmp2.txt > tmp_right.txt
+                    ; paste tmp_left.txt tmp_right.txt > "$output"
+                #elif $format == "tabdel_short" or $format.value == "tabdel_short"
+                    ; cp "$output" tmp.txt
+                    ; egrep -v "^#" tmp.txt | cut -f 1,7 > "$output"
+                #end if
+
+                ## BED: convert exactly once, outside the per-alignment loop, so the converter is never fed its own BED output when several alignments are given.
+                #if $format.value == "bed"
+                    ; featurecounts2bed.sh -f "$output" > tmp.txt
+                    ; mv tmp.txt "$output"
+                #else
+                    ## For every alignment, replace its file path in the header row(s) with "hid: sample name" (one sed pass per alignment).
+                    #for $alignment in $alignments
+                        #set $alignment_escaped = str($alignment).replace('/', '\/').replace('.', '\.')
+                        ## Escape backslash, '&' (written as chr(38)) and '/' for the sed replacement, then rewrite each single quote so it cannot end the shell's single-quoted sed argument.
+                        #set $alignment_name_escaped = str(alignment.hid)+": "+str($alignment.name).replace('\t',' ').replace('\\','\\\\').replace(chr(38),'\\'+chr(38)).replace('/','\/').replace("'","'\\''")
+                        #if $format.value == "tabdel_default" or $format.value == "tabdel_short"
+                            ; sed -e '1 s/$alignment_escaped/${alignment_name_escaped}/g' "$output" > tmp.txt
+                        #else
+                            ; sed -e '1,2 s/$alignment_escaped/${alignment_name_escaped}/g' "$output" > tmp.txt
+                        #end if
+                        ; mv tmp.txt "$output"
+                    #end for
+                #end if
+            #end if
+        #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+featureCounts::
+**Overview**
+
+FeatureCounts is a light-weight read counting program written entirely in the C programming language. It can be used to count both gDNA-seq and RNA-seq reads for genomic features in SAM/BAM files.
+It has a variety of advanced parameters but its major strength is its outstanding performance: analysis of a 10GB BAM file takes about 7 minutes on a single average CPU (Homo Sapiens genome)!
+Liao Y, Smyth GK and Shi W. featureCounts: an efficient general-purpose program for assigning sequence reads to genomic features. Bioinformatics, Advance Access, accepted on Nov 7, 2013
+
+featureCounts is part of a bigger analysis suite called subread:
+http://subread.sourceforge.net/
+Liao Y, Smyth GK and Shi W. The Subread aligner: fast, accurate and scalable read mapping by seed-and-vote. 
Nucleic Acids Research, 41(10):e108, 2013 + +**Input formats** + +Alignments should be provided in either: +* SAM format - http://samtools.sourceforge.net/samtools.shtml#5 +* BAM format + +Gene regions should be provided in the GFF/GTF format: +* http://genome.ucsc.edu/FAQ/FAQformat.html#format3 +* http://www.ensembl.org/info/website/upload/gff.html + +**Installation** + +1) Make sure you have proper GFF/GTF files (corresponding to your reference genome used for the alignment) uploaded to your history. + +2) Make sure that your gene_sets.loc is configured properly as data table. This is generally done by copying the right information into: tool_data_table_conf.xml. +https://wiki.galaxyproject.org/Admin/Tools/Data%20Tables + +**Examples** + +**License** + +* featureCounts / subread: GNU General Public License version 3.0 (GPLv3) + +**Contact** + +The tool wrapper has been written by Youri Hoogstrate from the Erasmus Medical Center (Rotterdam, Netherlands) on behalf of the Translational Research IT (TraIT) project: +http://www.ctmm.nl/en/programmas/infrastructuren/traitprojecttranslationeleresearch + +I want to thank Marius van den Beek for his contribution as well. 
+
+More tools by the Translational Research IT (TraIT) project can be found in the following repository:
+http://toolshed.dtls.nl/
+
+
+
\ No newline at end of file
diff -r 000000000000 -r cada2d2f0a7c featurecounts2bed.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/featurecounts2bed.sh Wed Mar 26 09:12:30 2014 -0400
@@ -0,0 +1,84 @@
+#!/bin/bash
+
+# This tool has been written by Youri Hoogstrate from the Erasmus
+# Medical Center (Rotterdam, Netherlands) on behalf of the Translational
+# Research IT (TraIT) project:
+# http://www.ctmm.nl/en/programmas/infrastructuren/traitprojecttranslationeleresearch
+#
+# More tools by the Translational Research IT (TraIT) project can be
+# found in the following repository:
+# http://toolshed.dtls.nl/
+#
+# Purpose: print one BED-style row per exon (or per gene, see -e) for every
+# data row of a featureCounts table. Only the first seven whitespace-
+# separated columns are read: uid, chr, start, stop, direction, length, count.
+# Output row: chr, start, stop, "uid (stop-start/lengthnt)", count, strand.
+#
+# Options:
+#   -e true|false  true (default): one row per ';'-separated entry;
+#                  anything else: only the first entry of each row
+#   -f FILENAME    featureCounts output to convert; usage is printed without it
+
+exon_level="true"
+filename=""
+
+# Parse parameters
+while getopts e:f: option
+do
+    case "${option}"
+    in
+        e) exon_level=${OPTARG};;
+        f) filename=${OPTARG};;
+    esac
+done
+
+# Convert the file
+# Every expansion inside a test is quoted: left bare, an empty value makes
+# the test malformed (e.g. [ == "" ]) and the usage text is never reached.
+if [ -z "$filename" ]; then
+    echo "Usage:"
+    echo " -e [true, false] true = entry for every exon; false = line for genes first exon"
+    echo " -f FILENAME from featureCounts"
+else
+    # IFS= and -r keep the line verbatim; the || clause still processes a
+    # final line that lacks a trailing newline.
+    while IFS= read -r line || [ -n "$line" ]; do
+        # Skip empty lines and lines starting with '#'.
+        if [ -n "$line" ] && [ "${line:0:1}" != "#" ]; then
+            # Split on whitespace into an array without pathname expansion.
+            read -r -a columns <<< "$line"
+            uid=${columns[0]}
+            # Skip whitespace-only lines and the row whose first column is "Geneid".
+            if [ -n "$uid" ] && [ "$uid" != "Geneid" ]; then
+                chr=${columns[1]}
+                start=${columns[2]}
+                stop=${columns[3]}
+                direction=${columns[4]}
+                length=${columns[5]}
+                count=${columns[6]}
+
+                # chr/start/stop/direction hold ';'-separated lists; split each into an array.
+                IFS=';' read -r -a chr_splitted <<< "$chr"
+                IFS=';' read -r -a start_splitted <<< "$start"
+                IFS=';' read -r -a stop_splitted <<< "$stop"
+                IFS=';' read -r -a strand_splitted <<< "$direction"
+
+                if [ "$exon_level" = "true" ]; then
+                    n=${#chr_splitted[@]}
+                else
+                    n=1
+                fi
+
+                # Fields are joined with explicit tabs because the name field contains a space.
+                # NOTE(review): assumes the original separators were tabs - confirm.
+                for (( i=0; i<n; i++ ))
+                do
+                    printf '%s\t%s\t%s\t%s (%s/%snt)\t%s\t%s\n' \
+                        "${chr_splitted[i]}" "${start_splitted[i]}" "${stop_splitted[i]}" \
+                        "$uid" "$(( ${stop_splitted[i]} - ${start_splitted[i]} ))" "$length" \
+                        "$count" "${strand_splitted[i]}"
+                done
+            fi
+        fi
+    done < "$filename"
+fi
diff -r 
000000000000 -r cada2d2f0a7c tool-data/gene_sets.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/gene_sets.loc.sample Wed Mar 26 09:12:30 2014 -0400 @@ -0,0 +1,39 @@ +# This is a sample file distributed with Galaxy that enables tools +# to use gene/exon annotations in the GFF/GTF format. You will need +# to add all the gene set annotations in this configuration file. +# Because of this file, galaxy tools are able to access gene annotations +# provided as GFF / GTF files by selecting on: +# - Priority (defined by the order in the file) +# - Provider +# - dbkey (reference genome id) +# +# The gene_sets.loc file syntax is: +# +# *optional +# +# Please ensure exactly one TAB (\t) between two columns! +# --------------------------------------------------------- +# +# In case you have TWO or MORE providers PER dbkey, the one mentioned +# first in the file should have the "default" priority. +# + + + +# [UCSC - UCSC Genes: knownGene] + +#hg19.UCSC knownGene (mm/'yy) hg19 /depot/data2/galaxy/hg19/gene_sets/Homo_sapiens.GRCh37.74.gtf ucsc http://genome.ucsc.edu/ +#hg18.UCSC knownGene (mm/'yy) hg18 /depot/data2/galaxy/hg18/gene_sets/Homo_sapiens.NCBI36.54.gtf ucsc http://genome.ucsc.edu/ + + + +# [RefSeq Genes: RefGene] +# link: ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/ + + + +# [ Ensembl] +# link: http://www.ensembl.org/info/data/ftp/index.html + +#Homo_sapiens.GRCh37.74 hg19 /depot/data2/galaxy/hg19/gene_sets/Homo_sapiens.GRCh37.74.gtf ensembl ftp://ftp.ensembl.org/pub/release-74/gtf/homo_sapiens/Homo_sapiens.GRCh37.74.gtf.gz +#Homo_sapiens.NCBI36.54 hg18 /depot/data2/galaxy/hg18/gene_sets/Homo_sapiens.NCBI36.54.gtf ensembl ftp://ftp.ensembl.org/pub/release-54/gtf/homo_sapiens/Homo_sapiens.NCBI36.54.gtf.gz diff -r 000000000000 -r cada2d2f0a7c tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Wed Mar 26 09:12:30 2014 -0400 @@ -0,0 +1,8 @@ + + + + + name, dbkey, value, 
provider, reference + +
+
\ No newline at end of file diff -r 000000000000 -r cada2d2f0a7c tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Wed Mar 26 09:12:30 2014 -0400 @@ -0,0 +1,37 @@ + + + + + + wget http://sourceforge.net/projects/subread/files/subread-1.4.4/subread-1.4.4-source.tar.gz && tar -zxvf subread-1.4.4-source.tar.gz && cd subread-1.4.4-source/src/ && make -f Makefile.Linux && cd ../../ + + ../subread-1.4.4-source/bin/featureCounts + $INSTALL_DIR/bin + + + $INSTALL_DIR/bin + $REPOSITORY_INSTALL_DIR + + + + + Downloads and installs featureCounts; requires WGET, GNU AUTOTOOLS and TAR to be installed! + + + + + + + wget http://yhoogstrate@toolshed.dtls.nl/repos/yhoogstrate/featurecounts/raw-file/tip/featurecounts2bed.sh && chmod 755 *.sh + + ../featurecounts2bed.sh + $INSTALL_DIR/bin + + + $INSTALL_DIR/bin + $REPOSITORY_INSTALL_DIR + + + + +