<?xml version="1.0" encoding="UTF-8"?>
<!-- Galaxy tool wrapper for GeneTrack ("peak predictor"). The help text below contains
     non-ASCII punctuation, so the encoding is declared explicitly. -->
<tool id="genetrack" name="GeneTrack" version="@WRAPPER_VERSION@.0">
    <description>peak predictor</description>
    <!-- NOTE(review): genetrack_macros.xml is not visible here; it presumably defines the
         @WRAPPER_VERSION@ token and the "requirements" macro expanded below - confirm. -->
    <macros>
        <import>genetrack_macros.xml</import>
    </macros>
    <expand macro="requirements" />
 | 
| 
 | 
     8     <command>
 | 
| 
 | 
     9         python $__tool_directory__/genetrack.py
 | 
| 
 | 
    10         --input_format $input_format_cond.input_format
 | 
| 
6
 | 
    11         #if str($input_format_cond.input_format) == "scidx":
 | 
| 
 | 
    12             #for $i in $input_format_cond.input_scidx:
 | 
| 
0
 | 
    13                  --input "${i}" "${i.hid}"
 | 
| 
 | 
    14             #end for
 | 
| 
 | 
    15         #elif str($input_format_cond.input_format) == "gff":
 | 
| 
 | 
    16             #for $i in $input_format_cond.input_gff:
 | 
| 
 | 
    17                  --input "${i}" "${i.hid}"
 | 
| 
 | 
    18             #end for
 | 
| 
 | 
    19         #end if
 | 
| 
 | 
    20         --sigma $sigma
 | 
| 
 | 
    21         --exclusion $exclusion
 | 
| 
 | 
    22         --up_width $up_width
 | 
| 
 | 
    23         --down_width $down_width
 | 
| 
 | 
    24         --filter $filter
 | 
| 
 | 
    25     </command>
 | 
| 
 | 
    <!-- User-facing parameters.
         input_format_cond selects the input format and exposes one multi-dataset parameter
         per format (input_scidx or input_gff). The five integer parameters are handed to
         the same-named options of genetrack.py by the command template. -->
    <inputs>
        <conditional name="input_format_cond">
            <param name="input_format" type="select" label="Format of files for conversion">
                <option value="scidx" selected="True">ScIdx</option>
                <option value="gff">Gff</option>
            </param>
            <when value="scidx">
                <param name="input_scidx" type="data" format="scidx" multiple="True"
                       label="Predict peaks on" />
            </when>
            <when value="gff">
                <param name="input_gff" type="data" format="gff" multiple="True"
                       label="Predict peaks on" />
            </when>
        </conditional>
        <param name="sigma" type="integer" value="5" min="1"
               label="Sigma to use when smoothing reads"
               help="Higher values increase computation but produce more smoothing." />
        <param name="exclusion" type="integer" value="20" min="1"
               label="Peak exclusion zone"
               help="Exclusion zone around each peak that prevents others from being called." />
        <param name="up_width" type="integer" value="10" min="0"
               label="Exclusion zone of upstream called peaks" />
        <param name="down_width" type="integer" value="10" min="0"
               label="Exclusion zone of downstream called peaks" />
        <param name="filter" type="integer" value="1" min="0"
               label="Absolute read filter"
               help="Removes peaks with lower peak height." />
    </inputs>
 | 
| 
 | 
    <!-- Results are collected into a single list collection. Files are discovered in the
         job's "output" directory; the named group "designation" in the pattern captures the
         whole file name. The angle brackets of the named group are written as &lt; / &gt;
         because a literal "<" is not allowed inside an XML attribute value. -->
    <outputs>
        <collection name="genetrack_output" type="list" label="Genetrack results on ${on_string}">
            <discover_datasets pattern="(?P&lt;designation&gt;.*)" directory="output" ext="gff" visible="false" />
        </collection>
    </outputs>
 | 
| 
 | 
    <!-- Three functional tests sharing the same settings (sigma 5, exclusion 20, up/down
         width 10, filter 3); only the input file and its format differ. Each test expects
         one collection element named s5e20u10d10F3_on_data_1. -->
    <tests>
        <!-- GFF input. -->
        <test>
            <param name="input_gff" value="genetrack_input2.gff" ftype="gff" />
            <param name="input_format" value="gff" />
            <param name="sigma" value="5" />
            <param name="exclusion" value="20" />
            <param name="up_width" value="10" />
            <param name="down_width" value="10" />
            <param name="filter" value="3" />
            <output_collection name="genetrack_output" type="list">
                <element name="s5e20u10d10F3_on_data_1" file="genetrack_output2.gff" ftype="gff" />
            </output_collection>
        </test>
        <!-- ScIdx input. -->
        <test>
            <param name="input_scidx" value="genetrack_input3.scidx" ftype="scidx" />
            <param name="input_format" value="scidx" />
            <param name="sigma" value="5" />
            <param name="exclusion" value="20" />
            <param name="up_width" value="10" />
            <param name="down_width" value="10" />
            <param name="filter" value="3" />
            <output_collection name="genetrack_output" type="list">
                <element name="s5e20u10d10F3_on_data_1" file="genetrack_output3.gff" ftype="gff" />
            </output_collection>
        </test>
        <!-- GFF input again; NOTE(review): the file name suggests unsorted records - confirm. -->
        <test>
            <param name="input_gff" value="genetrack_input_unsorted4.gff" ftype="gff" />
            <param name="input_format" value="gff" />
            <param name="sigma" value="5" />
            <param name="exclusion" value="20" />
            <param name="up_width" value="10" />
            <param name="down_width" value="10" />
            <param name="filter" value="3" />
            <output_collection name="genetrack_output" type="list">
                <element name="s5e20u10d10F3_on_data_1" file="genetrack_output4.gff" ftype="gff" />
            </output_collection>
        </test>
    </tests>
 | 
| 
 | 
    <help>
**What it does**

GeneTrack separately identifies peaks on the forward “+” (W) and reverse “-” (C) strand.  The way that GeneTrack
works is to replace each tag with a probabilistic distribution of occurrences for that tag at and around its mapped
genomic coordinate.  The distance decay of the probabilistic distribution is set by adjusting the value of the
tool's **Sigma to use when smoothing reads** parameter.  GeneTrack then sums the distribution over all mapped
tags.  This results in a smooth continuous trace that can be globally broadened or tightened by adjusting the
sigma value.  GeneTrack starts with the highest smoothed peak first, treating each strand separately if indicated
by the data, then sets up an exclusion zone (centered over the peak) defined by the value of the **Peak exclusion
zone** parameter (see figure).  The exclusion zone prevents any secondary peaks from being called on the same strand
within that exclusion zone.  In rare cases, it may be desirable to set different exclusion zones upstream (more 5’)
versus downstream (more 3’) of the peak.
 | 
| 
 | 
   101 
 | 
| 
15
 | 
.. image:: $PATH_TO_IMAGES/genetrack.png
 | 
| 
 | 
   103 
 | 
| 
 | 
GeneTrack continues through the data in order of peak height, until no other peaks are found, and in principle will
call a peak at a single isolated tag, if no filter is set using the tool's **Absolute read filter** parameter.  A
filter value of 1 means that it will stop calling peaks when the tag count in the peak hits 1 (so single tag peaks
will be excluded in this case).  GeneTrack outputs **chrom** (chromosome number), **strand** (+/W or -/C strand),
**start** (lower coordinate of exclusion zone), **end** (higher coordinate of exclusion zone), and **value** (peak
height).  GeneTrack's GFF output reports the start (lower coordinate) and end (higher coordinate) of the exclusion
zone.
 | 
| 
 | 
   111 
 | 
| 
 | 
In principle, the width of the exclusion zone may be as large as the DNA region occupied by the native protein plus
a steric exclusion zone between the protein and the exonuclease.  On the other hand the site might be considerably
smaller if the protein is in a denatured state during exonuclease digestion (since it is pre-treated with SDS).
 | 
| 
12
 | 
   115 
 | 
| 
15
 | 
In general, higher resolution data or smaller binding site size data should use smaller sigma values. Large binding
site size data such as 147 bp nucleosomal DNA use a larger sigma value like 20 (-s 20).  For transcription factors
mapped by ChIP-exo, sigma may initially be set at 5, and the exclusion zone set at 20 (-s 5 -e 20).  Sigma is typically
varied between ~3 and ~20. Too high of a sigma value may merge two independent nearby binding events.  This may be
desirable if closely bound factors are not distinguishable.  Too low of a sigma value will cause some tags that
contribute to a binding event to be excluded, because they may not be located sufficiently close to the main peak.
If alternative (mutually exclusive) binding is expected for two overlapping sites, and these sites are to be
independently recorded, then an empirically determined smaller exclusion zone width is set.  Thus, the value of sigma
is set empirically for each mapped factor, depending upon the resolution and binding site size of the binding event.
 | 
| 
 | 
   125 
 | 
| 
 | 
It might make sense to exclude peaks that have only a single tag, where -F 1 is used, or have their tags located on
only a single coordinate (called Singletons, where stddev=0 in the output file).  However, low coverage datasets might
be improved by including them, if additional analysis (e.g., motif discovery) validates them. In addition, idealized
action of the exonuclease in ChIP-exo might place all tags for a peak on a single coordinate.
 | 
| 
0
 | 
   130 
 | 
| 
6
 | 
-----

**Options**

 * **Sigma to use when smoothing reads** - Smooths clusters of tags via a Gaussian distribution.
 * **Peak exclusion zone** - Exclusion zone around each peak, eliminating all other peaks on the same strand that are within a ± bp distance of the peak.
 * **Exclusion zone of upstream called peaks** - Defines the exclusion zone centered over peaks upstream of a peak.
 * **Exclusion zone of downstream called peaks** - Defines the exclusion zone centered over peaks downstream of a peak.
 * **Absolute read filter** - Restricts output to only peaks with larger peak height.
 | 
| 
17
 | 
   140  
 | 
| 
 | 
-----

**Output gff Columns**

 * Chromosome
 * Script
 * Placeholder (no meaning)
 * Start of peak exclusion zone (-e 20)
 * End of peak exclusion zone
 * Tag sum (not peak height or area under curve, which LionDB provides)
 * Strand
 * Placeholder (no meaning)
 * Attributes (standard deviation of reads located within exclusion zone) = fuzziness of peak
 | 
| 
17
 | 
   154 
 | 
| 
 | 
-----

**Considerations**

In principle, the width of the exclusion zone may be as large as the DNA region occupied by the native protein
plus a steric exclusion zone between the protein and the exonuclease.  On the other hand the site might be considerably
smaller if the protein is in a denatured state during exonuclease digestion (since it is pre-treated with SDS).

In general, higher resolution data or smaller binding site size data should use smaller sigma values.  Large binding site
size data such as 147 bp nucleosomal DNA use a larger sigma value like 20 (-s 20).  For transcription factors mapped by
ChIP-exo, sigma may initially be set at 5, and the exclusion zone set at 20 (-s 5 -e 20).  Sigma is typically varied
between ~3 and ~20.  Too high of a sigma value may merge two independent nearby binding events.  This may be desirable if
closely bound factors are not distinguishable.  Too low of a sigma value will cause some tags that contribute to a binding
event to be excluded, because they may not be located sufficiently close to the main peak.  If alternative (mutually
exclusive) binding is expected for two overlapping sites, and these sites are to be independently recorded, then an
empirically determined smaller exclusion zone width is set.  Thus, the value of sigma is set empirically for each mapped
factor depending upon the resolution and binding site size of the binding event.
 | 
| 
 | 
   172 
 | 
| 
 | 
It might make sense to exclude peaks that have only a single tag, where -F 1 is used, or have their tags located on only
a single coordinate (called Singletons, where stddev=0 in the output file).  However, low coverage datasets might be
improved by including them, if additional analysis (e.g., motif discovery) validates them.  In addition, idealized action
of the exonuclease in ChIP-exo might place all tags for a peak on a single coordinate.

    </help>
 | 
| 
 | 
   179     <expand macro="citations" />
 | 
| 
 | 
   180 </tool>
 |