genetrack: genetrack.xml annotate

annotate genetrack.xml @ 19:f45571c6e3dd draft

Uploaded

author	greg
date	Wed, 16 Dec 2015 19:59:14 -0500
parents	e1d437bd7d36
children	2f0dede41f69

rev	line source
0 0368815ae4d5 Uploaded greg parents: diff changeset	1 <?xml version="1.0"?>
11 497e3274f70b Uploaded greg parents: 6 diff changeset	2 <tool id="genetrack" name="GeneTrack" version="@WRAPPER_VERSION@.0">
0 0368815ae4d5 Uploaded greg parents: diff changeset	3 <description>peak predictor</description>
0368815ae4d5 Uploaded greg parents: diff changeset	4 <macros>
0368815ae4d5 Uploaded greg parents: diff changeset	5 <import>genetrack_macros.xml</import>
0368815ae4d5 Uploaded greg parents: diff changeset	6 </macros>
0368815ae4d5 Uploaded greg parents: diff changeset	7 <expand macro="requirements" />
0368815ae4d5 Uploaded greg parents: diff changeset	8 <!-- Cheetah template for the genetrack.py call: passes the selected input format, then one input option (dataset path plus history id) per selected dataset of that format, then the five integer peak-calling parameters. The script path is single-quoted so a tool directory containing spaces is not split into several shell arguments. --> <command>
0368815ae4d5 Uploaded greg parents: diff changeset	9 python '$__tool_directory__/genetrack.py'
0368815ae4d5 Uploaded greg parents: diff changeset	10 --input_format $input_format_cond.input_format
6 fa85ca6c9cf8 Uploaded greg parents: 3 diff changeset	11 #if str($input_format_cond.input_format) == "scidx":
fa85ca6c9cf8 Uploaded greg parents: 3 diff changeset	12 #for $i in $input_format_cond.input_scidx:
0 0368815ae4d5 Uploaded greg parents: diff changeset	13 --input "${i}" "${i.hid}"
0368815ae4d5 Uploaded greg parents: diff changeset	14 #end for
0368815ae4d5 Uploaded greg parents: diff changeset	15 #elif str($input_format_cond.input_format) == "gff":
0368815ae4d5 Uploaded greg parents: diff changeset	16 #for $i in $input_format_cond.input_gff:
0368815ae4d5 Uploaded greg parents: diff changeset	17 --input "${i}" "${i.hid}"
0368815ae4d5 Uploaded greg parents: diff changeset	18 #end for
0368815ae4d5 Uploaded greg parents: diff changeset	19 #end if
0368815ae4d5 Uploaded greg parents: diff changeset	20 --sigma $sigma
0368815ae4d5 Uploaded greg parents: diff changeset	21 --exclusion $exclusion
0368815ae4d5 Uploaded greg parents: diff changeset	22 --up_width $up_width
0368815ae4d5 Uploaded greg parents: diff changeset	23 --down_width $down_width
0368815ae4d5 Uploaded greg parents: diff changeset	24 --filter $filter
0368815ae4d5 Uploaded greg parents: diff changeset	25 </command>
0368815ae4d5 Uploaded greg parents: diff changeset	26 <!-- Tool inputs: a conditional that selects the input format (scidx, the default, or gff) and exposes a matching multiple-dataset parameter for that format, followed by five integer parameters (sigma, exclusion, up_width, down_width, filter) that the command template passes to the options of the same names. --> <inputs>
0368815ae4d5 Uploaded greg parents: diff changeset	27 <conditional name="input_format_cond">
0368815ae4d5 Uploaded greg parents: diff changeset	28 <param name="input_format" type="select" label="Format of files for conversion">
6 fa85ca6c9cf8 Uploaded greg parents: 3 diff changeset	29 <option value="scidx" selected="True">ScIdx</option>
fa85ca6c9cf8 Uploaded greg parents: 3 diff changeset	30 <option value="gff">Gff</option>
0 0368815ae4d5 Uploaded greg parents: diff changeset	31 </param>
6 fa85ca6c9cf8 Uploaded greg parents: 3 diff changeset	32 <when value="scidx">
fa85ca6c9cf8 Uploaded greg parents: 3 diff changeset	33 <param name="input_scidx" type="data" format="scidx" multiple="True" label="Predict peaks on" />
0 0368815ae4d5 Uploaded greg parents: diff changeset	34 </when>
0368815ae4d5 Uploaded greg parents: diff changeset	35 <when value="gff">
0368815ae4d5 Uploaded greg parents: diff changeset	36 <param name="input_gff" type="data" format="gff" multiple="True" label="Predict peaks on" />
0368815ae4d5 Uploaded greg parents: diff changeset	37 </when>
0368815ae4d5 Uploaded greg parents: diff changeset	38 </conditional>
0368815ae4d5 Uploaded greg parents: diff changeset	39 <param name="sigma" type="integer" value="5" min="1" label="Sigma to use when smoothing reads" help="Higher values increase computation but produce more smoothing." />
0368815ae4d5 Uploaded greg parents: diff changeset	40 <param name="exclusion" type="integer" value="20" min="1" label="Peak exclusion zone" help="Exclusion zone around each peak that prevents others from being called." />
12 cd105fdfb0da Uploaded greg parents: 11 diff changeset	41 <param name="up_width" type="integer" value="10" min="0" label="Exclusion zone of upstream called peaks" />
cd105fdfb0da Uploaded greg parents: 11 diff changeset	42 <param name="down_width" type="integer" value="10" min="0" label="Exclusion zone of downstream called peaks" />
15 ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	43 <param name="filter" type="integer" value="1" min="0" label="Absolute read filter" help="Removes peaks with lower peak height." />
0 0368815ae4d5 Uploaded greg parents: diff changeset	44 </inputs>
0368815ae4d5 Uploaded greg parents: diff changeset	45 <!-- Tool output: a list collection filled by dataset discovery. Every file found in the job's "output" directory is added as a hidden gff element, with the "designation" named group matching the whole file name. The angle brackets of the named group are written as entities because a literal less-than sign is not allowed inside an XML attribute value. --> <outputs>
0368815ae4d5 Uploaded greg parents: diff changeset	46 <collection name="genetrack_output" type="list" label="Genetrack results on ${on_string}">
0368815ae4d5 Uploaded greg parents: diff changeset	47 <discover_datasets pattern="(?P&lt;designation&gt;.*)" directory="output" ext="gff" visible="false" />
0368815ae4d5 Uploaded greg parents: diff changeset	48 </collection>
0368815ae4d5 Uploaded greg parents: diff changeset	49 </outputs>
0368815ae4d5 Uploaded greg parents: diff changeset	50 <!-- Three functional tests: a gff input, a scidx input, and a second gff input whose file name marks it as unsorted. Each runs with sigma=5, exclusion=20, up_width=10, down_width=10 and filter=3, and compares the collection element named s5e20u10d10F3_on_data_1 against an expected gff file. --> <tests>
0368815ae4d5 Uploaded greg parents: diff changeset	51 <test>
0368815ae4d5 Uploaded greg parents: diff changeset	52 <param name="input_gff" value="genetrack_input2.gff" ftype="gff" />
0368815ae4d5 Uploaded greg parents: diff changeset	53 <param name="input_format" value="gff" />
0368815ae4d5 Uploaded greg parents: diff changeset	54 <param name="sigma" value="5" />
0368815ae4d5 Uploaded greg parents: diff changeset	55 <param name="exclusion" value="20" />
0368815ae4d5 Uploaded greg parents: diff changeset	56 <param name="up_width" value="10" />
0368815ae4d5 Uploaded greg parents: diff changeset	57 <param name="down_width" value="10" />
0368815ae4d5 Uploaded greg parents: diff changeset	58 <param name="filter" value="3" />
0368815ae4d5 Uploaded greg parents: diff changeset	59 <output_collection name="genetrack_output" type="list">
0368815ae4d5 Uploaded greg parents: diff changeset	60 <element name="s5e20u10d10F3_on_data_1" file="genetrack_output2.gff" ftype="gff" />
0368815ae4d5 Uploaded greg parents: diff changeset	61 </output_collection>
0368815ae4d5 Uploaded greg parents: diff changeset	62 </test>
0368815ae4d5 Uploaded greg parents: diff changeset	63 <test>
6 fa85ca6c9cf8 Uploaded greg parents: 3 diff changeset	64 <param name="input_scidx" value="genetrack_input3.scidx" ftype="scidx" />
fa85ca6c9cf8 Uploaded greg parents: 3 diff changeset	65 <param name="input_format" value="scidx" />
0 0368815ae4d5 Uploaded greg parents: diff changeset	66 <param name="sigma" value="5" />
0368815ae4d5 Uploaded greg parents: diff changeset	67 <param name="exclusion" value="20" />
0368815ae4d5 Uploaded greg parents: diff changeset	68 <param name="up_width" value="10" />
0368815ae4d5 Uploaded greg parents: diff changeset	69 <param name="down_width" value="10" />
0368815ae4d5 Uploaded greg parents: diff changeset	70 <param name="filter" value="3" />
0368815ae4d5 Uploaded greg parents: diff changeset	71 <output_collection name="genetrack_output" type="list">
0368815ae4d5 Uploaded greg parents: diff changeset	72 <element name="s5e20u10d10F3_on_data_1" file="genetrack_output3.gff" ftype="gff" />
0368815ae4d5 Uploaded greg parents: diff changeset	73 </output_collection>
0368815ae4d5 Uploaded greg parents: diff changeset	74 </test>
0368815ae4d5 Uploaded greg parents: diff changeset	75 <test>
0368815ae4d5 Uploaded greg parents: diff changeset	76 <param name="input_gff" value="genetrack_input_unsorted4.gff" ftype="gff" />
0368815ae4d5 Uploaded greg parents: diff changeset	77 <param name="input_format" value="gff" />
0368815ae4d5 Uploaded greg parents: diff changeset	78 <param name="sigma" value="5" />
0368815ae4d5 Uploaded greg parents: diff changeset	79 <param name="exclusion" value="20" />
0368815ae4d5 Uploaded greg parents: diff changeset	80 <param name="up_width" value="10" />
0368815ae4d5 Uploaded greg parents: diff changeset	81 <param name="down_width" value="10" />
0368815ae4d5 Uploaded greg parents: diff changeset	82 <param name="filter" value="3" />
0368815ae4d5 Uploaded greg parents: diff changeset	83 <output_collection name="genetrack_output" type="list">
0368815ae4d5 Uploaded greg parents: diff changeset	84 <element name="s5e20u10d10F3_on_data_1" file="genetrack_output4.gff" ftype="gff" />
0368815ae4d5 Uploaded greg parents: diff changeset	85 </output_collection>
0368815ae4d5 Uploaded greg parents: diff changeset	86 </test>
0368815ae4d5 Uploaded greg parents: diff changeset	87 </tests>
0368815ae4d5 Uploaded greg parents: diff changeset	88 <help>
0368815ae4d5 Uploaded greg parents: diff changeset	89 What it does
0368815ae4d5 Uploaded greg parents: diff changeset	90
15 ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	91 GeneTrack separately identifies peaks on the forward “+” (W) and reverse “-” (C) strands. The way that GeneTrack
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	92 works is to replace each tag with a probabilistic distribution of occurrences for that tag at and around its mapped
12 cd105fdfb0da Uploaded greg parents: 11 diff changeset	93 genomic coordinate. The distance decay of the probabilistic distribution is set by adjusting the value of the
cd105fdfb0da Uploaded greg parents: 11 diff changeset	94 tool's **Sigma to use when smoothing reads** parameter. GeneTrack then sums the distribution over all mapped
cd105fdfb0da Uploaded greg parents: 11 diff changeset	95 tags. This results in a smooth continuous trace that can be globally broadened or tightened by adjusting the
cd105fdfb0da Uploaded greg parents: 11 diff changeset	96 sigma value. GeneTrack starts with the highest smoothed peak first, treating each strand separately if indicated
cd105fdfb0da Uploaded greg parents: 11 diff changeset	97 by the data, then sets up an exclusion zone (centered over the peak) defined by the value of the **Peak exclusion
cd105fdfb0da Uploaded greg parents: 11 diff changeset	98 zone** parameter (see figure). The exclusion zone prevents any secondary peaks from being called on the same strand
cd105fdfb0da Uploaded greg parents: 11 diff changeset	99 within that exclusion zone. In rare cases, it may be desirable to set different exclusion zones upstream (more 5’)
cd105fdfb0da Uploaded greg parents: 11 diff changeset	100 versus downstream (more 3’) of the peak.
cd105fdfb0da Uploaded greg parents: 11 diff changeset	101
15 ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	102 .. image:: $PATH_TO_IMAGES/genetrack.png
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	103
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	104 GeneTrack continues through the data in order of peak height, until no other peaks are found, and in principle will
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	105 call a peak at a single isolated tag, if no filter is set using the tool's **Absolute read filter** parameter. A
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	106 filter value of 1 means that it will stop calling peaks when the tag count in the peak hits 1 (so single tag peaks
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	107 will be excluded in this case). GeneTrack outputs chrom (chromosome number), strand (+/W or -/C strand),
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	108 start (lower coordinate of exclusion zone), end (higher coordinate of exclusion zone), and value (peak
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	109 height). GeneTrack's GFF output reports the start (lower coordinate) and end (higher coordinate) of the exclusion
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	110 zone.
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	111
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	112 In principle, the width of the exclusion zone may be as large as the DNA region occupied by the native protein plus
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	113 a steric exclusion zone between the protein and the exonuclease. On the other hand the site might be considerably
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	114 smaller if the protein is in a denatured state during exonuclease digestion (since it is pre-treated with SDS).
12 cd105fdfb0da Uploaded greg parents: 11 diff changeset	115
15 ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	116 In general, higher resolution data or smaller binding site size data should use smaller sigma values. Large binding
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	117 site size data such as 147 bp nucleosomal DNA use a larger sigma value like 20 (-s 20). For transcription factors
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	118 mapped by ChIP-exo, sigma may initially be set at 5, and the exclusion zone set at 20 (-s 5 -e 20). Sigma is typically
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	119 varied between ~3 and ~20. Too high of a sigma value may merge two independent nearby binding events. This may be
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	120 desirable if closely bound factors are not distinguishable. Too low of a sigma value will cause some tags that
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	121 contribute to a binding event to be excluded, because they may not be located sufficiently close to the main peak.
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	122 If alternative (mutually exclusive) binding is expected for two overlapping sites, and these sites are to be
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	123 independently recorded, then an empirically determined smaller exclusion zone width is set. Thus the value of sigma
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	124 is set empirically for each mapped factor, depending upon the resolution and binding site size of the binding event.
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	125
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	126 It might make sense to exclude peaks that have only a single tag, where -F 1 is used, or have their tags located on
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	127 only a single coordinate (called Singletons, where stddev=0 in the output file). However, low coverage datasets might
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	128 be improved by including them, if additional analysis (e.g., motif discovery) validates them. In addition, idealized
ebafcd6c3e0e Uploaded greg parents: 12 diff changeset	129 action of the exonuclease in ChIP-exo might place all tags for a peak on a single coordinate.
0 0368815ae4d5 Uploaded greg parents: diff changeset	130
6 fa85ca6c9cf8 Uploaded greg parents: 3 diff changeset	131 -----
fa85ca6c9cf8 Uploaded greg parents: 3 diff changeset	132
0 0368815ae4d5 Uploaded greg parents: diff changeset	133 Options
0368815ae4d5 Uploaded greg parents: diff changeset	134
16 b40ad4bee6cb Uploaded greg parents: 15 diff changeset	135 * Sigma to use when smoothing reads - Smooths clusters of tags via a Gaussian distribution.
b40ad4bee6cb Uploaded greg parents: 15 diff changeset	136 * Peak exclusion zone - Exclusion zone around each peak, eliminating all other peaks on the same strand that are within a ± bp distance of the peak.
b40ad4bee6cb Uploaded greg parents: 15 diff changeset	137 * Exclusion zone of upstream called peaks - Defines the exclusion zone centered over peaks upstream of a peak.
b40ad4bee6cb Uploaded greg parents: 15 diff changeset	138 * Exclusion zone of downstream called peaks - Defines the exclusion zone centered over peaks downstream of a peak.
b40ad4bee6cb Uploaded greg parents: 15 diff changeset	139 * Absolute read filter - Restricts output to only peaks with a peak height larger than the filter value.
17 5a6ea187933b Uploaded greg parents: 16 diff changeset	140
5a6ea187933b Uploaded greg parents: 16 diff changeset	141 -----
5a6ea187933b Uploaded greg parents: 16 diff changeset	142
5a6ea187933b Uploaded greg parents: 16 diff changeset	143 Output gff Columns
5a6ea187933b Uploaded greg parents: 16 diff changeset	144
19 f45571c6e3dd Uploaded greg parents: 18 diff changeset	145 * Chromosome
f45571c6e3dd Uploaded greg parents: 18 diff changeset	146 * Script
f45571c6e3dd Uploaded greg parents: 18 diff changeset	147 * Placeholder (no meaning)
f45571c6e3dd Uploaded greg parents: 18 diff changeset	148 * Start of peak exclusion zone (-e 20)
f45571c6e3dd Uploaded greg parents: 18 diff changeset	149 * End of peak exclusion zone
f45571c6e3dd Uploaded greg parents: 18 diff changeset	150 * Tag sum (not peak height or area under curve, which LionDB provides)
f45571c6e3dd Uploaded greg parents: 18 diff changeset	151 * Strand
f45571c6e3dd Uploaded greg parents: 18 diff changeset	152 * Placeholder (no meaning)
f45571c6e3dd Uploaded greg parents: 18 diff changeset	153 * Attributes (standard deviation of reads located within exclusion zone) = fuzziness of peak
17 5a6ea187933b Uploaded greg parents: 16 diff changeset	154
5a6ea187933b Uploaded greg parents: 16 diff changeset	155 -----
5a6ea187933b Uploaded greg parents: 16 diff changeset	156
5a6ea187933b Uploaded greg parents: 16 diff changeset	157 Considerations
5a6ea187933b Uploaded greg parents: 16 diff changeset	158
5a6ea187933b Uploaded greg parents: 16 diff changeset	159 In principle, the width of the exclusion zone may be as large as the DNA region occupied by the native protein
5a6ea187933b Uploaded greg parents: 16 diff changeset	160 plus a steric exclusion zone between the protein and the exonuclease. On the other hand the site might be considerably
5a6ea187933b Uploaded greg parents: 16 diff changeset	161 smaller if the protein is in a denatured state during exonuclease digestion (since it is pre-treated with SDS).
5a6ea187933b Uploaded greg parents: 16 diff changeset	162
5a6ea187933b Uploaded greg parents: 16 diff changeset	163 In general, higher resolution data or smaller binding site size data should use smaller sigma values. Large binding site
5a6ea187933b Uploaded greg parents: 16 diff changeset	164 size data such as 147 bp nucleosomal DNA use a larger sigma value like 20 (-s 20). For transcription factors mapped by
5a6ea187933b Uploaded greg parents: 16 diff changeset	165 ChIP-exo, sigma may initially be set at 5, and the exclusion zone set at 20 (-s 5 -e 20). Sigma is typically varied
5a6ea187933b Uploaded greg parents: 16 diff changeset	166 between ~3 and ~20. Too high of a sigma value may merge two independent nearby binding events. This may be desirable if
5a6ea187933b Uploaded greg parents: 16 diff changeset	167 closely bound factors are not distinguishable. Too low of a sigma value will cause some tags that contribute to a binding
5a6ea187933b Uploaded greg parents: 16 diff changeset	168 event to be excluded, because they may not be located sufficiently close to the main peak. If alternative (mutually
5a6ea187933b Uploaded greg parents: 16 diff changeset	169 exclusive) binding is expected for two overlapping sites, and these sites are to be independently recorded, then an
5a6ea187933b Uploaded greg parents: 16 diff changeset	170 empirically determined smaller exclusion zone width is set. Thus, the value of sigma is set empirically for each mapped
5a6ea187933b Uploaded greg parents: 16 diff changeset	171 factor depending upon the resolution and binding site size of the binding event.
5a6ea187933b Uploaded greg parents: 16 diff changeset	172
5a6ea187933b Uploaded greg parents: 16 diff changeset	173 It might make sense to exclude peaks that have only a single tag, where -F 1 is used, or have their tags located on only
5a6ea187933b Uploaded greg parents: 16 diff changeset	174 a single coordinate (called Singletons, where stddev=0 in the output file). However, low coverage datasets might be
5a6ea187933b Uploaded greg parents: 16 diff changeset	175 improved by including them, if additional analysis (e.g., motif discovery) validates them. In addition, idealized action
5a6ea187933b Uploaded greg parents: 16 diff changeset	176 of the exonuclease in ChIP-exo might place all tags for a peak on a single coordinate.
5a6ea187933b Uploaded greg parents: 16 diff changeset	177
0 0368815ae4d5 Uploaded greg parents: diff changeset	178 </help>
0368815ae4d5 Uploaded greg parents: diff changeset	179 <expand macro="citations" />
0368815ae4d5 Uploaded greg parents: diff changeset	180 </tool>

Mercurial > repos > greg > genetrack

annotate genetrack.xml @ 19:f45571c6e3dd draft