htseq_count: htseq-count.xml annotate

annotate htseq-count.xml @ 5:0a835934d792

Version 0.3

author	lparsons
date	Tue, 05 Mar 2013 12:26:28 -0500
parents	359d40333595
children	08a11d1eaec6

rev	line source
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	1 <tool id="htseq_count" name="htseq-count" version="0.3">
0 b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	2 <description> - Count aligned reads in a BAM file that overlap features in a GFF file</description>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	3 <version_command>htseq-count -h \| grep version \| sed 's/^\(.*\)*\(version .*\)\./\2/'</version_command>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	4 <requirements>
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	5 <requirement type="package" version="1.6.2">numpy</requirement>
4 359d40333595 Added tool dependencies, updated tests to use small datasets Lance Parsons <lparsons@princeton.edu> parents: 3 diff changeset	6 <requirement type="package" version="0.5.3p9">htseq</requirement>
1 4de3f044aaeb Added version to requirements Lance Parsons <lparsons@princeton.edu> parents: 0 diff changeset	7 <requirement type="package" version="0.1.18">samtools</requirement>
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	8 <requirement type="package" version="1.56.0">picard</requirement>
0 b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	9 </requirements>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	10 <command>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	11 ##set up input files
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	12 #set $reference_fasta_filename = "localref.fa"
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	13 #if $samout_conditional.samout:
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	14 #if str( $samout_conditional.reference_source.reference_source_selector ) == "history":
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	15 ln -s "${samout_conditional.reference_source.ref_file}" "${reference_fasta_filename}" &&
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	16 samtools faidx "${reference_fasta_filename}" 2>&1 \|\| echo "Error running samtools faidx for htseq-count" >&2 &&
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	17 #else:
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	18 #set $reference_fasta_filename = str( $samout_conditional.reference_source.ref_file.fields.path )
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	19 #end if
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	20 #end if
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	21 #if str($singlepaired) == "paired":
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	22 ln -s $samfile local_input.sam &&
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	23 java -Xmx2G -jar "\$JAVA_JAR_PATH/SortSam.jar" VALIDATION_STRINGENCY=LENIENT SORT_ORDER=queryname O=prepared_input.sam I=local_input.sam TMP_DIR="${__new_file_path__}"
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	24 \|\| echo "Error running Picard MergeSamFiles" >&2 &&
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	25 #else:
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	26 #if $samfile.extension == "bam":
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	27 samtools view $samfile \|
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	28 #else
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	29 ln -s $samfile prepared_input.sam &&
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	30 #end if
0 b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	31 #end if
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	32 htseq-count
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	33 --mode=$mode
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	34 --stranded=$stranded
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	35 --minaqual=$minaqual
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	36 --type=$featuretype
0 b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	37 --idattr=$idattr
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	38 #if $samout_conditional.samout:
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	39 --samout=$__new_file_path__/${samoutfile.id}_tmp
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	40 #end if
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	41 #if str($singlepaired) == "paired":
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	42 prepared_input.sam
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	43 #else:
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	44 #if $samfile.extension == "bam":
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	45 -
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	46 #else:
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	47 prepared_input.sam
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	48 #end if
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	49 #end if
0 b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	50 $gfffile
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	51 \| awk '{if ($1 ~ "no_feature\|ambiguous\|too_low_aQual\|not_aligned\|alignment_not_unique") print $0 \| "cat 1>&2"; else print $0}' > $counts 2>$othercounts
0 b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	52 #if $samout_conditional.samout:
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	53 && samtools view -Su -t ${reference_fasta_filename}.fai $__new_file_path__/${samoutfile.id}_tmp \| samtools sort -o - sorted > $samoutfile
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	54 #end if</command>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	55 <inputs>
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	56 <param format="sam, bam" name="samfile" type="data" label="Aligned SAM/BAM File"/>
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	57 <param name="singlepaired" type="select" label="Is this library mate-paired?">
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	58 <help>Paired libraries will be sorted by read name prior to counting.</help>
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	59 <option value="single" selected="true">single-end</option>
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	60 <option value="paired">paired-end</option>
0 b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	61 </param>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	62 <param format="gff" name="gfffile" type="data" label="GFF File"/>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	63 <param name="mode" type="select" label="Mode">
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	64 <help>Mode to handle reads overlapping more than one feature.</help>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	65 <option value="union" selected="true">Union</option>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	66 <option value="intersection-strict">Intersection (strict)</option>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	67 <option value="intersection-nonempty">Intersection (nonempty)</option>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	68 </param>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	69 <param name="stranded" type="select" label="Stranded">
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	70 <help>Specify whether the data is from a strand-specific assay. 'Reverse' means yes with reversed strand interpretation.</help>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	71 <option value="yes" selected="true">Yes</option>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	72 <option value="no">No</option>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	73 <option value="reverse">Reverse</option>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	74 </param>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	75 <param name="minaqual" type="integer" value="0" label="Minimum alignment quality">
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	76 <help>Skip all reads with alignment quality lower than the given minimum value</help>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	77 </param>
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	78 <param name="featuretype" type="text" value="exon" label="Feature type">
0 b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	79 <help>Feature type (3rd column in GFF file) to be used. All features of other types are ignored. The default, suitable for RNA-Seq and Ensembl GTF files, is exon.</help>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	80 </param>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	81 <param name="idattr" type="text" value="gene_id" label="ID Attribute">
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	82 <help>GFF attribute to be used as feature ID. Several GFF lines with the same feature ID will be considered as parts of the same feature. The feature ID is used to identify the counts in the output table. All features of the specified type MUST have a value for this attribute. The default, suitable for RNA-Seq and Ensembl GTF files, is gene_id.</help>
0 b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	83 </param>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	84 <conditional name="samout_conditional">
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	85 <param name="samout" type="boolean" value="False" truevalue="True" falsevalue="False" label="Additional BAM Output">
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	86 <help>Write out all SAM alignment records into an output BAM file, annotating each line with its assignment to a feature or a special counter (as an optional field with tag ‘XF’).</help>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	87 </param>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	88 <when value="True">
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	89 <conditional name="reference_source">
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	90 <param name="reference_source_selector" type="select" label="Choose the source for the reference list">
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	91 <option value="cached">Locally cached</option>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	92 <option value="history">History</option>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	93 </param>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	94 <when value="cached">
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	95 <param name="ref_file" type="select" label="Using reference genome">
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	96 <options from_data_table="sam_fa_indexes">
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	97 <filter type="data_meta" key="dbkey" ref="samfile" column="1"/>
0 b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	98 </options>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	99 <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	100 </param>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	101 </when>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	102 <when value="history"> <!-- FIX ME!!!! -->
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	103 <param name="ref_file" type="data" format="fasta" label="Using reference file" />
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	104 </when>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	105 </conditional>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	106 </when>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	107 </conditional>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	108 </inputs>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	109
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	110 <outputs>
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	111 <data format="tabular" name="counts" metadata_source="samfile" label="${tool.name} on ${on_string}"/>
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	112 <data format="tabular" name="othercounts" metadata_source="samfile" label="${tool.name} on ${on_string} (no feature)"/>
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	113 <data format="bam" name="samoutfile" metadata_source="samfile" label="${tool.name} on ${on_string} (BAM)">
0 b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	114 <filter>samout_conditional['samout']</filter>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	115 </data>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	116 </outputs>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	117
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	118 <stdio>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	119 <exit_code range="1:" level="fatal" description="Unknown error occurred" />
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	120 <regex match="htseq-count: command not found" source="stderr" level="fatal" description="The HTSeq python package is not properly installed, contact Galaxy administrators" />
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	121 <regex match="samtools: command not found" source="stderr" level="fatal" description="The samtools package is not properly installed, contact Galaxy administrators" />
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	122 <regex match="Error: Feature (.+) does not contain a '(.+)' attribute" source="both" level="fatal" description="Error parsing the GFF file, at least one feature of the specified 'Feature type' does not have a value for the specified 'ID Attribute'" />
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	123 <regex match="Error occured in line (\d+) of file" source="stderr" level="fatal" description="Unknown error parsing the GFF file" />
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	124 <regex match="Error" source="stderr" level="fatal" description="Unknown error occurred" />
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	125 <regex match="Warning: Read (.+) claims to have an aligned mate which could not be found. \(Is the SAM file properly sorted\?\)" source="stderr" level="warning" description="PAIRED DATA MISSING OR NOT PROPERLY SORTED. Try rerunning and selecting the paired-end option. See stderr output of this dataset for more information." />
0 b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	126 </stdio>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	127
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	128 <tests>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	129 <test>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	130 <param name="samfile" value="htseq-test.sam" />
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	131 <param name="gfffile" value="htseq-test.gff" />
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	132 <param name="samout" value="False" />
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	133 <output name="counts" file="htseq-test_counts.tsv" />
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	134 <output name="othercounts" file="htseq-test_othercounts.tsv" />
0 b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	135 </test>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	136 <test>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	137 <param name="samfile" value="htseq-test.bam" />
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	138 <param name="gfffile" value="htseq-test.gff" />
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	139 <param name="samout" value="False" />
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	140 <output name="counts" file="htseq-test_counts.tsv" />
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	141 <output name="othercounts" file="htseq-test_othercounts.tsv" />
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	142 </test>
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	143 <test>
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	144 <param name="samfile" value="htseq-test-paired.bam" />
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	145 <param name="singlepaired" value="paired" />
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	146 <param name="gfffile" value="htseq-test.gff" />
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	147 <param name="samout" value="False" />
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	148 <output name="counts" file="htseq-test-paired_counts.tsv" />
0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	149 <output name="othercounts" file="htseq-test-paired_othercounts.tsv" />
0 b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	150 </test>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	151 <!-- Seems to be an issue setting the $reference_fasta_filename variable during test
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	152 <test>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	153 <param name="samfile" value="htseq-test.sam" />
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	154 <param name="gfffile" value="htseq-test.gff" />
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	155 <param name="samout" value="True" />
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	156 <param name="reference_source_selector" value="history" />
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	157 <param name="ref_file" value="htseq-test_reference.fasta" />
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	158 <output name="counts" file="htseq-test_counts.tsv" />
5 0a835934d792 Version 0.3 lparsons parents: 4 diff changeset	159 <output name="othercounts" file="htseq-test_othercounts.tsv" />
0 b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	160 <output name="samoutfile" file="htseq-test_samout.bam" />
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	161 </test>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	162 -->
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	163 </tests>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	164
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	165 <help>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	166 Overview
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	167 --------
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	168
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	169 This tool takes an alignment file in SAM or BAM format and feature file in GFF format
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	170 and calculates the number of reads mapping to each feature. It uses the htseq-count
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	171 script that is part of the HTSeq python module. See
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	172 http://www-huber.embl.de/users/anders/HTSeq/doc/count.html for details.
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	173
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	174 A feature is an interval (i.e., a range of positions) on a chromosome or a union of
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	175 such intervals. In the case of RNA-Seq, the features are typically genes, where
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	176 each gene is considered here as the union of all its exons. One may also consider
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	177 each exon as a feature, e.g., in order to check for alternative splicing. For
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	178 comparative ChIP-Seq, the features might be binding regions from a pre-determined
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	179 list.
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	180
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	181 Paired-end Data MUST be sorted by QUERY NAME first
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	182
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	183 This tool requires that paired-end data be sorted by query name, which is NOT the default for Galaxy. Using the Picard Paired Read Mate Fixer with Query name sort FIRST is required for paired end data.
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	184
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	185
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	186 Overlap Modes
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	187 -------------
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	188
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	189 Special care must be taken to decide how to deal with reads that overlap more than one feature.
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	190
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	191 The htseq-count script allows you to choose between three modes: union, intersection-strict, and intersection-nonempty.
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	192
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	193 The following figure illustrates the effect of these three modes:
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	194
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	195 .. image:: /static/images/count_modes.png
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	196 :width: 500
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	197
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	198 Strandedness
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	199 ------------
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	200
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	201 Important: The default for strandedness is yes. If your RNA-Seq data has not been made with a strand-specific protocol, this causes half of the reads to be lost. Hence, make sure to set the option Stranded to 'No' unless you have strand-specific data!
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	202
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	203 Output
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	204 ------
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	205
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	206 The script outputs a table with counts for each feature, followed by the special counters, which count reads that were not counted for any feature for various reasons, namely
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	207
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	208 - no_feature: reads which could not be assigned to any feature (set S as described above was empty).
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	209
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	210 - ambiguous: reads which could have been assigned to more than one feature and hence were not counted for any of these (set S had more than one element).
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	211
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	212 - too_low_aQual: reads which were not counted due to the -a option, see below
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	213
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	214 - not_aligned: reads in the SAM file without alignment
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	215
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	216 - alignment_not_unique: reads with more than one reported alignment. These reads are recognized from the NH optional SAM field tag. (If the aligner does not set this field, multiply aligned reads will be counted multiple times.)
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	217
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	218
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	219 Options Summary
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	220 ---------------
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	221
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	222 Usage: htseq-count [options] sam_file gff_file
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	223
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	224 This script takes an alignment file in SAM format and a feature file in GFF
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	225 format and calculates for each feature the number of reads mapping to it. See
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	226 http://www-huber.embl.de/users/anders/HTSeq/doc/count.html for details.
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	227
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	228 Options:
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	229 -h, --help show this help message and exit
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	230 -m MODE, --mode=MODE mode to handle reads overlapping more than one
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	231 feature(choices: union, intersection-strict,
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	232 intersection-nonempty; default: union)
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	233 -s STRANDED, --stranded=STRANDED
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	234 whether the data is from a strand-specific assay.
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	235 Specify 'yes', 'no', or 'reverse' (default: yes).
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	236 'reverse' means 'yes' with reversed strand
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	237 interpretation
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	238 -a MINAQUAL, --minaqual=MINAQUAL
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	239 skip all reads with alignment quality lower than the
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	240 given minimum value (default: 0)
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	241 -t FEATURETYPE, --type=FEATURETYPE
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	242 feature type (3rd column in GFF file) to be used, all
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	243 features of other type are ignored (default, suitable
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	244 for Ensembl GTF files: exon)
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	245 -i IDATTR, --idattr=IDATTR
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	246 GFF attribute to be used as feature ID (default,
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	247 suitable for Ensembl GTF files: gene_id)
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	248 -o SAMOUT, --samout=SAMOUT
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	249 write out all SAM alignment records into an output SAM
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	250 file called SAMOUT, annotating each line with its
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	251 feature assignment (as an optional field with tag
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	252 'XF')
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	253 -q, --quiet suppress progress report and warnings
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	254
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	255 Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	256 Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	257 Public License v3. Part of the 'HTSeq' framework.
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	258 </help>
b8349d8458fa Initial commit Lance Parsons <lparsons@princeton.edu> parents: diff changeset	259 </tool>

Mercurial > repos > lparsons > htseq_count

annotate htseq-count.xml @ 5:0a835934d792