Mercurial > repos > rlegendre > ribo_tools
changeset 13:7c944fd9907e draft
release 2
author | rlegendre |
---|---|
date | Thu, 09 Apr 2015 11:35:48 -0400 |
parents | ee3a3435ce43 |
children | 344bacf6acb8 |
files | 313b8f7d2a92 get_codon_frequency.py get_codon_frequency.xml kmer_analysis.py kmer_analysis.xml metagene_frameshift_analysis.py metagene_frameshift_analysis.xml metagene_readthrough.py metagene_readthrough.xml ribo_functions.py |
diffstat | 10 files changed, 112 insertions(+), 586 deletions(-) [+] |
line wrap: on
line diff
--- a/313b8f7d2a92 Fri Jan 23 03:31:37 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,395 +0,0 @@ -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> -<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US"> -<head> -<link rel="icon" href="/repos/rlegendre/ribo_tools/static/hgicon.png" type="image/png" /> -<meta name="robots" content="index, nofollow" /> -<link rel="stylesheet" href="/repos/rlegendre/ribo_tools/static/style-paper.css" type="text/css" /> -<script type="text/javascript" src="/repos/rlegendre/ribo_tools/static/mercurial.js"></script> - -<title>ribo_tools: 313b8f7d2a92</title> -</head> -<body> -<div class="container"> -<div class="menu"> -<div class="logo"> -<a href="http://mercurial.selenic.com/"> -<img src="/repos/rlegendre/ribo_tools/static/hglogo.png" alt="mercurial" /></a> -</div> -<ul> - <li><a href="/repos/rlegendre/ribo_tools/shortlog/313b8f7d2a92">log</a></li> - <li><a href="/repos/rlegendre/ribo_tools/graph/313b8f7d2a92">graph</a></li> - <li><a href="/repos/rlegendre/ribo_tools/tags">tags</a></li> - <li><a href="/repos/rlegendre/ribo_tools/bookmarks">bookmarks</a></li> - <li><a href="/repos/rlegendre/ribo_tools/branches">branches</a></li> -</ul> -<ul> - <li class="active">changeset</li> - <li><a href="/repos/rlegendre/ribo_tools/raw-rev/313b8f7d2a92">raw</a></li> - <li><a href="/repos/rlegendre/ribo_tools/file/313b8f7d2a92">browse</a></li> -</ul> -<ul> - -<li> -<a href="/repos/rlegendre/ribo_tools/archive/313b8f7d2a92.tar.bz2">bz2</a> -</li> -<li> -<a href="/repos/rlegendre/ribo_tools/archive/313b8f7d2a92.zip">zip</a> -</li> -<li> -<a href="/repos/rlegendre/ribo_tools/archive/313b8f7d2a92.tar.gz">gz</a> -</li> -</ul> -<ul> - <li><a href="/repos/rlegendre/ribo_tools/help">help</a></li> -</ul> -</div> - -<div class="main"> - -<h2 class="breadcrumb"><a href="/">Mercurial</a> > <a href="/repos">repos</a> > <a href="/repos/rlegendre">rlegendre</a> > <a href="/repos/rlegendre/ribo_tools">ribo_tools</a> </h2> -<h3>changeset 11:313b8f7d2a92 <span class="tag">tip</span> </h3> - -<form class="search" action="/repos/rlegendre/ribo_tools/log"> - -<p><input name="rev" id="search1" type="text" size="30" /></p> -<div id="hint">Find changesets by keywords (author, files, the commit message), revision -number or hash, or <a href="/repos/rlegendre/ribo_tools/help/revsets">revset expression</a>.</div> -</form> - -<div class="description">(none)</div> - -<table id="changesetEntry"> -<tr> - <th class="author">author</th> - <td class="author">rlegendre</td> -</tr> -<tr> - <th class="date">date</th> - <td class="date age">Thu, 22 Jan 2015 14:34:53 +0100</td></tr> -<tr> - <th class="author">parents</th> - <td class="author"><a href="/repos/rlegendre/ribo_tools/rev/707807fee542">707807fee542</a> </td> -</tr> -<tr> - <th class="author">children</th> - <td class="author"></td> -</tr> -<tr> - <th class="files">files</th> - <td class="files"><a href="/repos/rlegendre/ribo_tools/file/313b8f7d2a92/get_codon_frequency.xml">get_codon_frequency.xml</a> <a href="/repos/rlegendre/ribo_tools/file/313b8f7d2a92/kmer_analysis.xml">kmer_analysis.xml</a> <a href="/repos/rlegendre/ribo_tools/file/313b8f7d2a92/metagene_frameshift_analysis.xml">metagene_frameshift_analysis.xml</a> <a href="/repos/rlegendre/ribo_tools/file/313b8f7d2a92/metagene_readthrough.xml">metagene_readthrough.xml</a> </td> -</tr> -<tr> - <th class="diffstat">diffstat</th> - <td class="diffstat"> - 4 files changed, 56 insertions(+), 55 deletions(-) - - <a id="diffstatexpand" href="javascript:toggleDiffstat()"/>[<tt>+</tt>]</a> - <div id="diffstatdetails" style="display:none;"> - <a href="javascript:toggleDiffstat()"/>[<tt>-</tt>]</a> - <p> - <table class="stripes2"> <tr> - <td class="diffstat-file"><a href="#l1.1">get_codon_frequency.xml</a></td> - <td class="diffstat-total" align="right">28</td> - <td class="diffstat-graph"> - <span class="diffstat-add" style="width:30.9523809524%;"> </span> - <span class="diffstat-remove" style="width:35.7142857143%;"> </span> - </td> - </tr> - <tr> - <td class="diffstat-file"><a href="#l2.1">kmer_analysis.xml</a></td> - <td class="diffstat-total" align="right">8</td> - <td class="diffstat-graph"> - <span class="diffstat-add" style="width:9.52380952381%;"> </span> - <span class="diffstat-remove" style="width:9.52380952381%;"> </span> - </td> - </tr> - <tr> - <td class="diffstat-file"><a href="#l3.1">metagene_frameshift_analysis.xml</a></td> - <td class="diffstat-total" align="right">42</td> - <td class="diffstat-graph"> - <span class="diffstat-add" style="width:52.380952381%;"> </span> - <span class="diffstat-remove" style="width:47.619047619%;"> </span> - </td> - </tr> - <tr> - <td class="diffstat-file"><a href="#l4.1">metagene_readthrough.xml</a></td> - <td class="diffstat-total" align="right">33</td> - <td class="diffstat-graph"> - <span class="diffstat-add" style="width:40.4761904762%;"> </span> - <span class="diffstat-remove" style="width:38.0952380952%;"> </span> - </td> - </tr> -</table> - </div> - </td> -</tr> -</table> - -<div class="overflow"> -<div class="sourcefirst linewraptoggle">line wrap: <a class="linewraplink" href="javascript:toggleLinewrap()">on</a></div> -<div class="sourcefirst"> line diff</div> -<div class="stripes2 diffblocks"> -<div class="bottomline inc-lineno"><pre class="sourcelines wrap"> -<span id="l1.1" class="minusline">--- a/get_codon_frequency.xml Thu Jan 22 14:34:38 2015 +0100</span><a href="#l1.1"></a> -<span id="l1.2" class="plusline">+++ b/get_codon_frequency.xml Thu Jan 22 14:34:53 2015 +0100</span><a href="#l1.2"></a> -<span id="l1.3" class="atline">@@ -1,5 +1,5 @@</span><a href="#l1.3"></a> -<span id="l1.4"> <tool id="Codon_analysis" name="Codon_density"></span><a href="#l1.4"></a> -<span id="l1.5" class="minusline">- <description> Analyse Ribo-seq alignments between two conditions to extract codon occupancy </description></span><a href="#l1.5"></a> -<span id="l1.6" class="plusline">+ <description>To compare Ribo-seq alignments between two sets of conditions, to determine codon occupancy </description></span><a href="#l1.6"></a> -<span id="l1.7"> <requirements></span><a href="#l1.7"></a> -<span id="l1.8"> <requirement type="package">samtools</requirement></span><a href="#l1.8"></a> -<span id="l1.9"> <requirement type="python-module">matplotlib</requirement></span><a href="#l1.9"></a> -<span id="l1.10" class="atline">@@ -32,7 +32,7 @@</span><a href="#l1.10"></a> -<span id="l1.11"> </command></span><a href="#l1.11"></a> -<span id="l1.12"> </span><a href="#l1.12"></a> -<span id="l1.13"> <inputs></span><a href="#l1.13"></a> -<span id="l1.14" class="minusline">- <param name="annot" type="data" label="References Input Annotation File (gff)" format="gff" /></span><a href="#l1.14"></a> -<span id="l1.15" class="plusline">+ <param name="annot" type="data" label="Reference annotation file (GFF)" format="gff" /></span><a href="#l1.15"></a> -<span id="l1.16"> <conditional name="replicat_opt"></span><a href="#l1.16"></a> -<span id="l1.17"> <param name="rep" type="select" label="Replicate option"></span><a href="#l1.17"></a> -<span id="l1.18"> <option value="yes">Yes (only two replicates by condition)</option></span><a href="#l1.18"></a> -<span id="l1.19" class="atline">@@ -40,23 +40,23 @@</span><a href="#l1.19"></a> -<span id="l1.20"> </param></span><a href="#l1.20"></a> -<span id="l1.21"> <when value="yes"></span><a href="#l1.21"></a> -<span id="l1.22"> ## Use conditional balise : if rep =yes : 4 files, else 2 files </span><a href="#l1.22"></a> -<span id="l1.23" class="minusline">- <param name="file1" type="data" label="First replicate of first condition (bam file)" format="bam" /></span><a href="#l1.23"></a> -<span id="l1.24" class="minusline">- <param name="file11" type="data" label="Second replicate of first condition (bam file)" format="bam" /></span><a href="#l1.24"></a> -<span id="l1.25" class="minusline">- <param name="file2" type="data" label="First replicate of second condition (bam file)" format="bam" /></span><a href="#l1.25"></a> -<span id="l1.26" class="minusline">- <param name="file22" type="data" label="First replicate of second condition (bam file)" format="bam" /></span><a href="#l1.26"></a> -<span id="l1.27" class="plusline">+ <param name="file1" type="data" label="First replicate of first condition (Bam file)" format="bam" /></span><a href="#l1.27"></a> -<span id="l1.28" class="plusline">+ <param name="file11" type="data" label="Second replicate of first condition (Bam file)" format="bam" /></span><a href="#l1.28"></a> -<span id="l1.29" class="plusline">+ <param name="file2" type="data" label="First replicate of second condition (Bam file)" format="bam" /></span><a href="#l1.29"></a> -<span id="l1.30" class="plusline">+ <param name="file22" type="data" label="First replicate of second condition (Bam file)" format="bam" /></span><a href="#l1.30"></a> -<span id="l1.31"> </when></span><a href="#l1.31"></a> -<span id="l1.32"> <when value="no"></span><a href="#l1.32"></a> -<span id="l1.33" class="minusline">- <param name="file1" type="data" label="First bam file" format="bam" /></span><a href="#l1.33"></a> -<span id="l1.34" class="minusline">- <param name="file2" type="data" label="Second bam File" format="bam" /></span><a href="#l1.34"></a> -<span id="l1.35" class="plusline">+ <param name="file1" type="data" label="First Bam file" format="bam" /></span><a href="#l1.35"></a> -<span id="l1.36" class="plusline">+ <param name="file2" type="data" label="Second Bam File" format="bam" /></span><a href="#l1.36"></a> -<span id="l1.37"> </when></span><a href="#l1.37"></a> -<span id="l1.38"> </conditional></span><a href="#l1.38"></a> -<span id="l1.39" class="minusline">- <param name="site" type="select" label="Please choose a ribosome site for codon analysis"></span><a href="#l1.39"></a> -<span id="l1.40" class="plusline">+ <param name="site" type="select" label="Please choose a ribosomal site for codon analysis"></span><a href="#l1.40"></a> -<span id="l1.41"> <option value="A">A</option></span><a href="#l1.41"></a> -<span id="l1.42"> <option value="P">P</option></span><a href="#l1.42"></a> -<span id="l1.43"> <option value="E">E</option></span><a href="#l1.43"></a> -<span id="l1.44"> </param></span><a href="#l1.44"></a> -<span id="l1.45"> <param name="asite" type="integer" label="Off-set from the 5'end of the footprint to the A-site" value ="15" /></span><a href="#l1.45"></a> -<span id="l1.46" class="minusline">- <param name="kmer" type="integer" label="Size of the best phasing reads" value ="28" /></span><a href="#l1.46"></a> -<span id="l1.47" class="plusline">+ <param name="kmer" type="integer" label="Lenght of the best phasing footprints" value ="28" /></span><a href="#l1.47"></a> -<span id="l1.48"> <param name="cond1" type="text" size="25" label="Condition one" help="Required even if no replicate" /></span><a href="#l1.48"></a> -<span id="l1.49"> <param name="cond2" type="text" size="25" label="Condition two" help="Required even if no replicate" /></span><a href="#l1.49"></a> -<span id="l1.50"> <param name="color1" type="text" size="50" label="Color for first condition" value ="SkyBlue" help="Enter standard name, hex color string, or rgb code. You cand find all colors here : http://pythonhosted.org/ete2/reference/reference_svgcolors.html" /></span><a href="#l1.50"></a> -<span id="l1.51" class="atline">@@ -74,14 +74,12 @@</span><a href="#l1.51"></a> -<span id="l1.52"> </span><a href="#l1.52"></a> -<span id="l1.53"> Summary</span><a href="#l1.53"></a> -<span id="l1.54"> ------- </span><a href="#l1.54"></a> -<span id="l1.55" class="minusline">-This tool uses Ribo-seq (bam file) to compare codon translation between two conditions.</span><a href="#l1.55"></a> -<span id="l1.56" class="minusline">-For each footprint, codons at choosen site are saved and an histogram with all normalized codon numbers is plotted in both conditions. </span><a href="#l1.56"></a> -<span id="l1.57" class="minusline">-A second histogram groups all codons corresponding to an amino acid.</span><a href="#l1.57"></a> -<span id="l1.58" class="minusline">-A chisquare test is used for testing if distribution of used codons is the same in both conditions.</span><a href="#l1.58"></a> -<span id="l1.59" class="plusline">+This tool uses Ribo-seq (BAM file) to determine whether codon occupancy differs between two sets of conditions. For each footprint, the codons at the chosen site are recorded and a histogram displaying all the normalised codon numbers is plotted for both sets of conditions. A second histogram groups together all the codons corresponding to a given amino acid. A chi-squared test is then carried out to determine whether the distribution of the codons used is the same in both sets of conditions.</span><a href="#l1.59"></a> -<span id="l1.60" class="plusline">+</span><a href="#l1.60"></a> -<span id="l1.61"> </span><a href="#l1.61"></a> -<span id="l1.62"> Output </span><a href="#l1.62"></a> -<span id="l1.63"> ------- </span><a href="#l1.63"></a> -<span id="l1.64" class="minusline">-This tool provides an html output with graphical outputs and a statistical test result. An additionnal csv file with codon numbers is provided. </span><a href="#l1.64"></a> -<span id="l1.65" class="plusline">+This tool provides an html file containing graphs and a statistical test result. An additional csv file with codon numbers is provided. </span><a href="#l1.65"></a> -<span id="l1.66"> </span><a href="#l1.66"></a> -<span id="l1.67"> </span><a href="#l1.67"></a> -<span id="l1.68"> Dependances</span><a href="#l1.68"></a></pre></div><div class="bottomline inc-lineno"><pre class="sourcelines wrap"> -<span id="l2.1" class="minusline">--- a/kmer_analysis.xml Thu Jan 22 14:34:38 2015 +0100</span><a href="#l2.1"></a> -<span id="l2.2" class="plusline">+++ b/kmer_analysis.xml Thu Jan 22 14:34:53 2015 +0100</span><a href="#l2.2"></a> -<span id="l2.3" class="atline">@@ -1,5 +1,5 @@</span><a href="#l2.3"></a> -<span id="l2.4"> <tool id="kmer_analysis" name="Kmer"></span><a href="#l2.4"></a> -<span id="l2.5" class="minusline">- <description>Compute proportion of each kmer and phasing</description></span><a href="#l2.5"></a> -<span id="l2.6" class="plusline">+ <description>To calculate the proportion and phasing of each kmer</description></span><a href="#l2.6"></a> -<span id="l2.7"> <requirements></span><a href="#l2.7"></a> -<span id="l2.8"> <requirement type="package">samtools</requirement></span><a href="#l2.8"></a> -<span id="l2.9"> <requirement type="package">numpy</requirement></span><a href="#l2.9"></a> -<span id="l2.10" class="atline">@@ -12,7 +12,7 @@</span><a href="#l2.10"></a> -<span id="l2.11"> </command></span><a href="#l2.11"></a> -<span id="l2.12"> </span><a href="#l2.12"></a> -<span id="l2.13"> <inputs></span><a href="#l2.13"></a> -<span id="l2.14" class="minusline">- <param name="gff" type="data" label="References Input Annotation File (gff)" format="gff" /></span><a href="#l2.14"></a> -<span id="l2.15" class="plusline">+ <param name="gff" type="data" label="Reference annotation file (GFF))" format="gff" /></span><a href="#l2.15"></a> -<span id="l2.16"> <param name="bamfile" type="data" label="Bam file" format="bam" /></span><a href="#l2.16"></a> -<span id="l2.17"> </inputs></span><a href="#l2.17"></a> -<span id="l2.18"> </span><a href="#l2.18"></a> -<span id="l2.19" class="atline">@@ -25,11 +25,11 @@</span><a href="#l2.19"></a> -<span id="l2.20"> </span><a href="#l2.20"></a> -<span id="l2.21"> Summary</span><a href="#l2.21"></a> -<span id="l2.22"> -------</span><a href="#l2.22"></a> -<span id="l2.23" class="minusline">-This tool uses Ribo-seq data (bam file) to compute proportion of each kmer (lenght of footprints) and phasing. </span><a href="#l2.23"></a> -<span id="l2.24" class="plusline">+The kmer tool computes the distribution of footprints length from Bam file and determines the proportion of footprints beginning in each frame, for all annotated genes in the GFF file.</span><a href="#l2.24"></a> -<span id="l2.25"> </span><a href="#l2.25"></a> -<span id="l2.26"> Output </span><a href="#l2.26"></a> -<span id="l2.27"> ------- </span><a href="#l2.27"></a> -<span id="l2.28" class="minusline">-This tool provides an html report with all kmer proportion and phasing. </span><a href="#l2.28"></a> -<span id="l2.29" class="plusline">+This tool provides an html report detailing the proportions and phasing of the kmers. </span><a href="#l2.29"></a> -<span id="l2.30"> </span><a href="#l2.30"></a> -<span id="l2.31"> </span><a href="#l2.31"></a> -<span id="l2.32"> Dependances</span><a href="#l2.32"></a></pre></div><div class="bottomline inc-lineno"><pre class="sourcelines wrap"> -<span id="l3.1" class="minusline">--- a/metagene_frameshift_analysis.xml Thu Jan 22 14:34:38 2015 +0100</span><a href="#l3.1"></a> -<span id="l3.2" class="plusline">+++ b/metagene_frameshift_analysis.xml Thu Jan 22 14:34:53 2015 +0100</span><a href="#l3.2"></a> -<span id="l3.3" class="atline">@@ -1,5 +1,5 @@</span><a href="#l3.3"></a> -<span id="l3.4"> <tool id="frameshift_analysis" name="Frame"></span><a href="#l3.4"></a> -<span id="l3.5" class="minusline">- <description> Analyse Ribo-seq alignment to extract translational ambiguities events</description></span><a href="#l3.5"></a> -<span id="l3.6" class="plusline">+ <description>To analyse Ribo-seq alignments for the extraction of translational ambiguities</description></span><a href="#l3.6"></a> -<span id="l3.7"> <requirements></span><a href="#l3.7"></a> -<span id="l3.8"> <requirement type="package">samtools</requirement></span><a href="#l3.8"></a> -<span id="l3.9"> <requirement type="python-module">matplotlib</requirement></span><a href="#l3.9"></a> -<span id="l3.10" class="atline">@@ -8,16 +8,18 @@</span><a href="#l3.10"></a> -<span id="l3.11"> <requirement type="python-module">Bio</requirement></span><a href="#l3.11"></a> -<span id="l3.12"> </requirements></span><a href="#l3.12"></a> -<span id="l3.13"> <command interpreter="python"> </span><a href="#l3.13"></a> -<span id="l3.14" class="minusline">- metagene_frameshift_analysis.py --input $reference --bam $mapping --cutoff $cutoff --kmer $kmer --fasta $fasta --dirout $output,$output.files_path --box $boxplot> $log</span><a href="#l3.14"></a> -<span id="l3.15" class="plusline">+ metagene_frameshift_analysis.py --gff $reference --bam $mapping --cutoff $cutoff --kmer $kmer --fasta $fasta --dirout $output,$output.files_path --box $boxplot --orf_length $orf --frame $frame > $log</span><a href="#l3.15"></a> -<span id="l3.16"> </span><a href="#l3.16"></a> -<span id="l3.17"> </command></span><a href="#l3.17"></a> -<span id="l3.18"> </span><a href="#l3.18"></a> -<span id="l3.19"> <inputs></span><a href="#l3.19"></a> -<span id="l3.20" class="minusline">- <param name="reference" type="data" label="References Input Annotation File (gff)" format="gff" /></span><a href="#l3.20"></a> -<span id="l3.21" class="minusline">- <param name="mapping" type="data" label="Bam Input File" format="bam" /></span><a href="#l3.21"></a> -<span id="l3.22" class="minusline">- <param name="fasta" type="data" label="Reference in fasta format" format="fasta" /></span><a href="#l3.22"></a> -<span id="l3.23" class="minusline">- <param name="kmer" type="integer" label="Longer of the best phasing reads" value ="28" /></span><a href="#l3.23"></a> -<span id="l3.24" class="plusline">+ <param name="reference" type="data" label="Reference annotation file (GFF)" format="gff" /></span><a href="#l3.24"></a> -<span id="l3.25" class="plusline">+ <param name="mapping" type="data" label="Bam file" format="bam" /></span><a href="#l3.25"></a> -<span id="l3.26" class="plusline">+ <param name="fasta" type="data" label="Reference genome in Fasta format" format="fasta" /></span><a href="#l3.26"></a> -<span id="l3.27" class="plusline">+ <param name="kmer" type="integer" label="Lenght of the best phasing footprints" value ="28" /></span><a href="#l3.27"></a> -<span id="l3.28" class="plusline">+ <param name="frame" type="integer" label="Frame where footprints show best phasing. Must be 1, 2 or 3" value ="1" /></span><a href="#l3.28"></a> -<span id="l3.29"> <param name="cutoff" type="integer" label="Cutoff for frame proportion in coding phase (default = 60 %)" value ="60" /></span><a href="#l3.29"></a> -<span id="l3.30" class="plusline">+ <param name="orf" type="integer" label="Approximate size of the segment (in bp)" value ="300" /> </span><a href="#l3.30"></a> -<span id="l3.31"> </inputs></span><a href="#l3.31"></a> -<span id="l3.32"> </span><a href="#l3.32"></a> -<span id="l3.33"> <outputs></span><a href="#l3.33"></a> -<span id="l3.34" class="atline">@@ -30,35 +32,35 @@</span><a href="#l3.34"></a> -<span id="l3.35"> <help></span><a href="#l3.35"></a> -<span id="l3.36"> Summary</span><a href="#l3.36"></a> -<span id="l3.37"> ------- </span><a href="#l3.37"></a> -<span id="l3.38" class="minusline">-This tool uses Ribo-seq data (bam file) to extract out-of-frame footprints in all genes from a reference annotation file (GFF3). Subprofile are plotted for each gene with dual coding events.</span><a href="#l3.38"></a> -<span id="l3.39" class="plusline">+This tool uses Ribo-seq data (BAM file) to extract out-of-frame footprints in all genes from a reference annotation file (GFF3). Subprofiles are plotted for each gene with dual coding events.</span><a href="#l3.39"></a> -<span id="l3.40"> </span><a href="#l3.40"></a> -<span id="l3.41"> </span><a href="#l3.41"></a> -<span id="l3.42" class="minusline">-*- GFF3 file* : It must contain 9 tabulate-delimited columns : Chromosome, source, feature, start, stop, score, strand, phasing, note. The gene ID was retrieved in note field by "ID=" tag.</span><a href="#l3.42"></a> -<span id="l3.43" class="plusline">+*- GFF3 file*: This file must have nine tabulated-delimited columns: Chromosome, source, feature, start, stop, score, strand, phasing, note. The gene ID is retrieved from the note field, using the "ID=" tag.</span><a href="#l3.43"></a> -<span id="l3.44"> </span><a href="#l3.44"></a> -<span id="l3.45" class="minusline">-*- Fasta file* : Reference fasta file. Be careful about the chromosome nomenclature used : it must be compatible with your GFF3 annotation file.</span><a href="#l3.45"></a> -<span id="l3.46" class="plusline">+*- Fasta file*: Reference fasta file. Care should be taken with the chromosome nomenclature used, which must be compatible with the GFF3 annotation file.</span><a href="#l3.46"></a> -<span id="l3.47"> </span><a href="#l3.47"></a> -<span id="l3.48" class="minusline">-*- BAM file* : It must be sorted. It can contain either multiples or unaligned footprints</span><a href="#l3.48"></a> -<span id="l3.49" class="plusline">+*- BAM file*: This file should be sorted and may contain either multiple or unaligned footprints</span><a href="#l3.49"></a> -<span id="l3.50"> </span><a href="#l3.50"></a> -<span id="l3.51" class="minusline">-*- Kmer* : Lenght of the best phasing footprint. You can compute it running kmer_analysis </span><a href="#l3.51"></a> -<span id="l3.52" class="plusline">+*- Kmer*: Length of the best-phased footprints. It can be calculated by running kmer_analysis </span><a href="#l3.52"></a> -<span id="l3.53" class="plusline">+</span><a href="#l3.53"></a> -<span id="l3.54" class="plusline">+*- Frame*: Frame for which the phasing of the footprints is best. It can be calculated by running kmer_analysis </span><a href="#l3.54"></a> -<span id="l3.55"> </span><a href="#l3.55"></a> -<span id="l3.56" class="minusline">-*- Cutoff* : Integer value for selecting all genes that have less than 60 % (default) of footprints in coding frame.</span><a href="#l3.56"></a> -<span id="l3.57" class="plusline">+*- Cutoff*: An integer value for selecting all genes for which fewer than 60 % (default) of the footprints are in the coding frame.</span><a href="#l3.57"></a> -<span id="l3.58"> </span><a href="#l3.58"></a> -<span id="l3.59" class="minusline">-</span><a href="#l3.59"></a> -<span id="l3.60" class="minusline">-</span><a href="#l3.60"></a> -<span id="l3.61" class="minusline">-.................................................................................................................................................................................................</span><a href="#l3.61"></a> -<span id="l3.62" class="plusline">+*- Orf*: Approximate size of the segment.</span><a href="#l3.62"></a> -<span id="l3.63"> </span><a href="#l3.63"></a> -<span id="l3.64"> </span><a href="#l3.64"></a> -<span id="l3.65"> </span><a href="#l3.65"></a> -<span id="l3.66"> Output </span><a href="#l3.66"></a> -<span id="l3.67"> ------- </span><a href="#l3.67"></a> -<span id="l3.68" class="minusline">-This tool generates 2 output files :</span><a href="#l3.68"></a> -<span id="l3.69" class="plusline">+This tool generates 3 output files:</span><a href="#l3.69"></a> -<span id="l3.70"> </span><a href="#l3.70"></a> -<span id="l3.71" class="minusline">-*- html file* : relative to translational ambiguities detection and visualization.</span><a href="#l3.71"></a> -<span id="l3.72" class="plusline">+*- html file*: for the detection and visualisation of translational ambiguities.</span><a href="#l3.72"></a> -<span id="l3.73"> </span><a href="#l3.73"></a> -<span id="l3.74" class="minusline">-*- Stat file* : statistiques about treated footprints and phasing.</span><a href="#l3.74"></a> -<span id="l3.75" class="plusline">+*- Stat file*: this file provides statistics for the treated footprints and phasing.</span><a href="#l3.75"></a> -<span id="l3.76"> </span><a href="#l3.76"></a> -<span id="l3.77" class="minusline">-*- Boxplot* : Proportion of footprints in the three frames for all genes.</span><a href="#l3.77"></a> -<span id="l3.78" class="minusline">-</span><a href="#l3.78"></a> -<span id="l3.79" class="plusline">+*- Boxplot*: Proportion of footprints in the three frames, for all genes.</span><a href="#l3.79"></a> -<span id="l3.80" class="plusline">+ </span><a href="#l3.80"></a> -<span id="l3.81"> </span><a href="#l3.81"></a> -<span id="l3.82"> Dependances</span><a href="#l3.82"></a> -<span id="l3.83"> ------------</span><a href="#l3.83"></a></pre></div><div class="bottomline inc-lineno"><pre class="sourcelines wrap"> -<span id="l4.1" class="minusline">--- a/metagene_readthrough.xml Thu Jan 22 14:34:38 2015 +0100</span><a href="#l4.1"></a> -<span id="l4.2" class="plusline">+++ b/metagene_readthrough.xml Thu Jan 22 14:34:53 2015 +0100</span><a href="#l4.2"></a> -<span id="l4.3" class="atline">@@ -1,5 +1,5 @@</span><a href="#l4.3"></a> -<span id="l4.4"> <tool id="readthrough_analysis" name="Stop_supp"></span><a href="#l4.4"></a> -<span id="l4.5" class="minusline">- <description> Analyse Ribo-seq alignment to detect readthrough events</description></span><a href="#l4.5"></a> -<span id="l4.6" class="plusline">+ <description>To analyse Ribo-seq alignments for the detection of stop codon readthrough events</description></span><a href="#l4.6"></a> -<span id="l4.7"> <requirements></span><a href="#l4.7"></a> -<span id="l4.8"> <requirement type="package">samtools</requirement></span><a href="#l4.8"></a> -<span id="l4.9"> <requirement type="python-module">HTseq</requirement></span><a href="#l4.9"></a> -<span id="l4.10" class="atline">@@ -8,14 +8,15 @@</span><a href="#l4.10"></a> -<span id="l4.11"> <requirement type="python-module">Bio</requirement></span><a href="#l4.11"></a> -<span id="l4.12"> </requirements></span><a href="#l4.12"></a> -<span id="l4.13"> <command interpreter="python"> </span><a href="#l4.13"></a> -<span id="l4.14" class="minusline">- metagene_readthrough.py --gff $gff --fasta $fasta --bam $mapping --dirout=$output,$output.files_path</span><a href="#l4.14"></a> -<span id="l4.15" class="plusline">+ metagene_readthrough.py --gff $gff --fasta $fasta --bam $mapping --dirout=$output,$output.files_path --extend $ext</span><a href="#l4.15"></a> -<span id="l4.16"> </span><a href="#l4.16"></a> -<span id="l4.17"> </command></span><a href="#l4.17"></a> -<span id="l4.18"> </span><a href="#l4.18"></a> -<span id="l4.19"> <inputs></span><a href="#l4.19"></a> -<span id="l4.20" class="minusline">- <param name="gff" type="data" label="References Input Annotation File (gff)" format="gff" /></span><a href="#l4.20"></a> -<span id="l4.21" class="minusline">- <param name="fasta" type="data" label="Reference in fasta format" format="fasta" /></span><a href="#l4.21"></a> -<span id="l4.22" class="minusline">- <param name="mapping" type="data" label="Bam Input File" format="bam" /></span><a href="#l4.22"></a> -<span id="l4.23" class="plusline">+ <param name="gff" type="data" label="Reference annotation file (GFF)" format="gff" /></span><a href="#l4.23"></a> -<span id="l4.24" class="plusline">+ <param name="fasta" type="data" label="Reference genome in Fasta format" format="fasta" /></span><a href="#l4.24"></a> -<span id="l4.25" class="plusline">+ <param name="mapping" type="data" label="Bam File" format="bam" /></span><a href="#l4.25"></a> -<span id="l4.26" class="plusline">+ <param name="ext" type="integer" label="Length of 3’ UTR extension downstream the annotated stop codon (in bp)" value="300" /></span><a href="#l4.26"></a> -<span id="l4.27"> </inputs></span><a href="#l4.27"></a> -<span id="l4.28"> </span><a href="#l4.28"></a> -<span id="l4.29"> <outputs></span><a href="#l4.29"></a> -<span id="l4.30" class="atline">@@ -25,28 +26,28 @@</span><a href="#l4.30"></a> -<span id="l4.31"> <help></span><a href="#l4.31"></a> -<span id="l4.32"> Summary</span><a href="#l4.32"></a> -<span id="l4.33"> ------- </span><a href="#l4.33"></a> -<span id="l4.34" class="minusline">-This tool uses Ribo-seq data (bam file) to extract potential genes with readthrough events from a reference annotation file (GFF3).</span><a href="#l4.34"></a> -<span id="l4.35" class="plusline">+This tool uses Ribo-seq data (BAM file) to extract genes displaying potential stop codon readthrough events from a reference annotation file (GFF3).</span><a href="#l4.35"></a> -<span id="l4.36"> </span><a href="#l4.36"></a> -<span id="l4.37" class="minusline">-C-terminal protein extensions were identified as previously described (Dunn J.G. and al, 2013). Only uniquely mapped footprints whose size is in the range 25 to 34 are considered.</span><a href="#l4.37"></a> -<span id="l4.38" class="minusline">-A gene is read-though if :</span><a href="#l4.38"></a> -<span id="l4.39" class="plusline">+C-terminal protein extensions were identified as previously described (Dunn J.G. et al, 2013). Only uniquely mapped footprints with a size in the 25 to 34 range are considered.</span><a href="#l4.39"></a> -<span id="l4.40" class="plusline">+A gene is considered to display readthrough if:</span><a href="#l4.40"></a> -<span id="l4.41"> </span><a href="#l4.41"></a> -<span id="l4.42"> i) It is covered by more than 128 footprints.</span><a href="#l4.42"></a> -<span id="l4.43"> </span><a href="#l4.43"></a> -<span id="l4.44" class="minusline">- ii) There are footprints after stop codon.</span><a href="#l4.44"></a> -<span id="l4.45" class="plusline">+ ii) There are footprints after the stop codon.</span><a href="#l4.45"></a> -<span id="l4.46"> </span><a href="#l4.46"></a> -<span id="l4.47" class="minusline">- iii) There are footprints overlapping the next in frame stop codon.</span><a href="#l4.47"></a> -<span id="l4.48" class="plusline">+ iii) There are footprints overlapping the next in-frame stop codon.</span><a href="#l4.48"></a> -<span id="l4.49"> </span><a href="#l4.49"></a> -<span id="l4.50" class="minusline">- iv) There is not Methionine in the next five codons downstream the official stop codon of CDS.</span><a href="#l4.50"></a> -<span id="l4.51" class="plusline">+ iv) There is no methionine codon in the next five codons downstream from the official stop codon of the CDS.</span><a href="#l4.51"></a> -<span id="l4.52"> </span><a href="#l4.52"></a> -<span id="l4.53" class="minusline">- v) The coverage is homogeneous within the extension.</span><a href="#l4.53"></a> -<span id="l4.54" class="plusline">+ v) Coverage is homogeneous within the extension.</span><a href="#l4.54"></a> -<span id="l4.55"> </span><a href="#l4.55"></a> -<span id="l4.56" class="minusline">-Stop codon readthrough was estimated by calculating a ratio between footprints in the C-terminal extension and in the CDS. Ribosome density footprints were estimated in RPKM (reads per kilobase per million). </span><a href="#l4.56"></a> -<span id="l4.57" class="minusline">-To control variability due to stop codon peaks, footprints mapping to stop codons are excluded to RPKM computing.</span><a href="#l4.57"></a> -<span id="l4.58" class="plusline">+Stop codon readthrough was estimated by calculating the ratio of the number of footprints in the C-terminal extension to that in the CDS. Ribosome density footprints were estimated in RPKM (reads per kilobase per million). We controlled for variability due to stop codon peaks, by excluding footprints mapping to stop codons from the calculation of RPKM.</span><a href="#l4.58"></a> -<span id="l4.59"> </span><a href="#l4.59"></a> -<span id="l4.60" class="plusline">+ </span><a href="#l4.60"></a> -<span id="l4.61"> Output </span><a href="#l4.61"></a> -<span id="l4.62"> ------- </span><a href="#l4.62"></a> -<span id="l4.63" class="minusline">-This tool produces html file with plots for each readthrough gene.</span><a href="#l4.63"></a> -<span id="l4.64" class="minusline">-</span><a href="#l4.64"></a> -<span id="l4.65" class="plusline">+This tool produces an html file with plots for each gene displaying readthrough.</span><a href="#l4.65"></a> -<span id="l4.66" class="plusline">+ </span><a href="#l4.66"></a> -<span id="l4.67"> </span><a href="#l4.67"></a> -<span id="l4.68"> Dependances</span><a href="#l4.68"></a> -<span id="l4.69"> ------------</span><a href="#l4.69"></a></pre></div> -</div> -</div> - -</div> -</div> -<script type="text/javascript">process_dates()</script> - - -</body> -</html> -
--- a/get_codon_frequency.py Fri Jan 23 03:31:37 2015 -0500 +++ b/get_codon_frequency.py Thu Apr 09 11:35:48 2015 -0400 @@ -28,7 +28,7 @@ import ribo_functions import HTSeq # #libraries for debugg -import pdb +#import pdb # import cPickle def stop_err(msg): @@ -36,70 +36,6 @@ sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) sys.exit() -def store_gff(gff): - ''' - parse and store gff file in a dictionnary - ''' - try: - GFF = OrderedDict({}) - with open(gff, 'r') as f_gff : - # GFF['order'] = [] - for line in f_gff: - # # switch commented lines - head = line.split("#")[0] - if head != "" : - feature = (line.split('\t')[8]).split(';') - chrom = line.split('\t')[0] - if chrom not in GFF : - GFF[chrom] = {} - # first line is already gene line : - if line.split('\t')[2] == 'gene' : - gene = feature[0].replace("ID=", "") - if re.search('gene', feature[2]) : - Name = feature[2].replace("gene=", "") - else : - Name = "Unknown" - # #get annotation - note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) - note = urllib.unquote(str(note)).replace("\n", "") - # # store gene information - # GFF['order'].append(gene) - GFF[chrom][gene] = {} - GFF[chrom][gene]['chrom'] = line.split('\t')[0] - GFF[chrom][gene]['start'] = int(line.split('\t')[3]) - GFF[chrom][gene]['stop'] = int(line.split('\t')[4]) - GFF[chrom][gene]['strand'] = line.split('\t')[6] - GFF[chrom][gene]['name'] = Name - GFF[chrom][gene]['note'] = note - GFF[chrom][gene]['exon'] = {} - GFF[chrom][gene]['exon_number'] = 0 - # print Name - elif line.split('\t')[2] == 'CDS' : - gene = re.sub(r"Parent\=(.+)_mRNA", r"\1", feature[0]) - if GFF[chrom].has_key(gene) : - GFF[chrom][gene]['exon_number'] += 1 - exon_number = GFF[chrom][gene]['exon_number'] - GFF[chrom][gene]['exon'][exon_number] = {} - GFF[chrom][gene]['exon'][exon_number]['frame'] = line.split('\t')[7] - GFF[chrom][gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) - GFF[chrom][gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) - # # if there is a five prim UTR intron, we change start of gene - elif line.split('\t')[2] == 'five_prime_UTR_intron' : - if GFF[chrom][gene]['strand'] == "+" : - GFF[chrom][gene]['start'] = GFF[chrom][gene]['exon'][1]['start'] - else : - GFF[chrom][gene]['stop'] = GFF[chrom][gene]['exon'][exon_number]['stop'] - return GFF - except Exception, e: - stop_err('Error during gff storage : ' + str(e)) - -#chrI SGD gene 87286 87752 . + . ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689 -#7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra -#ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr -#om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified -#chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified -#chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified - def init_codon_dict(): @@ -115,13 +51,19 @@ ''' try: codon = init_codon_dict() + multi_tag = "XS:i:" ## bowtie Tag + tag = "IH:i:1" ## RUM tag + for feature in GFF : if feature.type == 'gene' : codon_dict = init_codon_dict() chrom = feature.iv.chrom start = feature.iv.start stop = feature.iv.end - region = chrom + ':' + str(start) + '-' + str(stop) + if start+50 < stop-50 : + region = chrom + ':' + str(start+50) + '-' + str(stop-50) + else : + break ## DEPRECATED #for chrom in GFF.iterkeys(): @@ -143,13 +85,16 @@ multi_tag = "XS:i:" elif 'bwa' in head: multi_tag = "XT:A:R" + elif 'TopHat' in head: + tag = "NH:i:1" else : - stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") + stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") + if len(read) == 0: continue len_read = len(read.split('\t')[9]) # if it's read of good length - if len_read == kmer and multi_tag not in read: + if len_read == kmer and (tag in line or multi_tag not in line): feat = read.split('\t') seq = feat[9] # if it's a reverse read @@ -179,6 +124,7 @@ cod = seq[a_pos-6:a_pos-3] if codon_dict.has_key(cod) : codon_dict[cod] += 1 + del(read) # # add in global dict for cod, count in codon_dict.iteritems() : codon[cod] += count @@ -400,7 +346,7 @@ cond2_aa.append(z[1]) max_valaa.append(max(z)) # # plot amino acid profile : - fig = pl.figure(figsize=(24, 10), num=1) + fig = pl.figure(figsize=(30, 10), num=1) width = .50 ax = fig.add_subplot(111) ax.xaxis.set_ticks([]) @@ -440,8 +386,8 @@ # plot result - fig = pl.figure(figsize=(24, 10), num=1) - width = .50 + fig = pl.figure(figsize=(30, 10), num=1) + width = .40 ind = arange(len(codon_sorted)) ax = fig.add_subplot(111) pl.xlim(0, len(codon_sorted) + 1) @@ -546,7 +492,7 @@ max_val.append(max(cond1_norm[i], cond2_norm[i])) # plot result - fig = pl.figure(figsize=(30, 10), num=1) + fig = pl.figure(figsize=(40, 10), num=1) #fig = pl.figure(num=1) width = .40 ind = arange(len(codon_sorted)) @@ -560,7 +506,7 @@ ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1) ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2) for x, y, z in zip(ind, max_val, codon_sorted): - ax.text(x + width, y + 0.02, '%s' % z, ha='center', va='bottom', fontsize=8) + ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8) ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)') ax.set_xlabel('Codons') handles, labels = ax.get_legend_handles_labels() @@ -657,7 +603,7 @@ parser.add_option("-C", "--cond2", dest="c2", type="string", help="Name of second condition", metavar="STR") - parser.add_option("-k", "--kmer", dest="kmer", type="int", + parser.add_option("-k", "--kmer", dest="kmer", type="int", default = 28 , help="Length of your phasing reads", metavar="INT") # parser.add_option("-l", "--list", dest="list_cod", type= "string", @@ -669,19 +615,19 @@ parser.add_option("-d", "--dirout", dest="dirout", type="string", help="write report to PNG files", metavar="FILE") - parser.add_option("-a", "--asite", dest="asite", type="int", - help="Off-set from the 5'end of the footprint to the A-site", metavar="INT") + parser.add_option("-a", "--asite", dest="asite", type="int", default = 15 , + help="Off-set from the 5'end of the footprint to the A-site (default is 15)", metavar="INT") - parser.add_option("-s", "--site", dest="site", type="string", - help="Script can compute in site A, P or E", metavar="A|P|E") + parser.add_option("-s", "--site", dest="site", type="string", default = "A" , + help="Script can compute in site A, P or E (default is A-site)", metavar="A|P|E") - parser.add_option("-r", "--rep", dest="rep", type="string", + parser.add_option("-r", "--rep", dest="rep", type="string", default = "no" , help="if replicate or not", metavar="yes|no") - parser.add_option("-x", "--hex_col1", dest="color1", type= "string", + parser.add_option("-x", "--hex_col1", dest="color1", type= "string", default = "SkyBlue" , help="Color for first condition", metavar="STR") - parser.add_option("-X", "--hex_col2", dest="color2", type= "string", + parser.add_option("-X", "--hex_col2", dest="color2", type= "string", default = "Plum" , help="Color for second condition", metavar="STR") parser.add_option("-q", "--quiet", @@ -702,28 +648,6 @@ if not colors.is_color_like(options.color2) : stop_err( options.color2+' is not a proper color' ) - ## identify GFF or GTF format from 9th column - #with open (options.gff,"r") as gffile : - # for line in gffile : - # if '#' in line : - # ## skip header - # gffile.next() - # elif 'gene_id' in line : - # ## launch gtf reader : - # GFF = ribo_functions.store_gtf(options.gff) - # break - # elif 'ID=' in line : - # ## launch gff reader - # GFF = ribo_functions.store_gff(options.gff) - # break - # else : - # stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) - - #GFF = store_gff(options.gff) - #GFF = ribo_functions.store_gtf(options.gff) - ## check gff reading - #if not GFF['order'] : - # stop_err( 'Incorrect GFF file' + str( e ) ) #### NOT USE IN FINAL VERSION # # get codon list
--- a/get_codon_frequency.xml Fri Jan 23 03:31:37 2015 -0500 +++ b/get_codon_frequency.xml Thu Apr 09 11:35:48 2015 -0400 @@ -1,5 +1,5 @@ <tool id="Codon_analysis" name="Codon_density"> - <description>To compare Ribo-seq alignments between two sets of conditions, to determine codon occupancy </description> + <description>To compare Ribo-seq alignments between two sets of conditions, to determine codon occupancy</description> <requirements> <requirement type="package">samtools</requirement> <requirement type="python-module">matplotlib</requirement> @@ -57,8 +57,8 @@ </param> <param name="asite" type="integer" label="Off-set from the 5'end of the footprint to the A-site" value ="15" /> <param name="kmer" type="integer" label="Lenght of the best phasing footprints" value ="28" /> - <param name="cond1" type="text" size="25" label="Condition one" help="Required even if no replicate" /> - <param name="cond2" type="text" size="25" label="Condition two" help="Required even if no replicate" /> + <param name="cond1" type="text" size="25" label="Condition one" help="Required even if no replicate" value="cond1"/> + <param name="cond2" type="text" size="25" label="Condition two" help="Required even if no replicate" value="cond2"/> <param name="color1" type="text" size="50" label="Color for first condition" value ="SkyBlue" help="Enter standard name, hex color string, or rgb code. You cand find all colors here : http://pythonhosted.org/ete2/reference/reference_svgcolors.html" /> <param name="color2" type="text" size="50" label="Color for second condition" value ="Plum" help="Enter standard name, hex color string, or rgb code. You cand find all colors here : http://pythonhosted.org/ete2/reference/reference_svgcolors.html" />
--- a/kmer_analysis.py Fri Jan 23 03:31:37 2015 -0500 +++ b/kmer_analysis.py Thu Apr 09 11:35:48 2015 -0400 @@ -80,6 +80,10 @@ ''' global total_mapped_read + ## tags by default + multi_tag = "XS:i:" + tag = "IH:i:1" + ## init kmer dict KMER = OrderedDict({}) @@ -103,10 +107,10 @@ multi_tag = "XS:i:" elif 'bwa' in line: multi_tag = "XT:A:R" - #elif 'TopHat' in line: - # multi_tag = "NH:i:1" + elif 'TopHat' in line: + tag = "NH:i:1" else : - stop_err("No PG tag find in "+samfile+". Please use bowtie or bwa for mapping") + stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") # get footprint elif re.search('^[^@].+', line) : @@ -121,7 +125,7 @@ read_pos = int(line.split('\t')[3]) read_sens = int(line.split('\t')[1]) #len_read = len(line.split('\t')[9]) - if len_read == kmer and multi_tag not in line: + if len_read == kmer and (tag in line or multi_tag not in line): ###if line.split('\t')[5] == '28M' : total_mapped_read +=1 #if it's a forward read @@ -399,7 +403,7 @@ #GFF = ribo_functions.store_gtf(options.gff) ## check gff reading if not GFF['order'] : - stop_err( 'Incorrect GFF file' + str( e ) ) + stop_err( 'Incorrect GFF file' ) ## split bam split_bam(options.bamfile,tmpdir) @@ -416,12 +420,12 @@ ## compute analysis with other kmer for keys in kmer.iterkeys() : if keys != 28 : - ## remove all txt files in tmp directory - if os.system("rm "+tmpdir+"/*.txt") != 0 : - stop_err( 'Error during tmp directory cleaning : ' + str( e ) ) - + ## If not enought reads in this kmer : if kmer[keys] > 100 : + ## remove all txt files in tmp directory + if os.system("rm "+tmpdir+"/*.txt") != 0 : + stop_err( 'Error during tmp directory cleaning') ## compute coverage and distribution kmer tmp = get_first_base(tmpdir, keys) ## compute phasing
--- a/kmer_analysis.xml Fri Jan 23 03:31:37 2015 -0500 +++ b/kmer_analysis.xml Thu Apr 09 11:35:48 2015 -0400 @@ -17,7 +17,7 @@ </inputs> <outputs> - <data format="html" name="output" label="Kmer report"/> + <data format="html" name="output" label="Kmer report on ${on_string}"/> </outputs> @@ -25,8 +25,11 @@ Summary ------- -The kmer tool computes the distribution of footprints length from Bam file and determines the proportion of footprints beginning in each frame, for all annotated genes in the GFF file. +The kmer tool computes the distribution of footprints length from Bam file and determines the proportion of footprints beginning in each frame, for all annotated genes in the `GFF3`_ file. + +.. _GFF3: http://gmod.org/wiki/GFF3 + Output ------- This tool provides an html report detailing the proportions and phasing of the kmers.
--- a/metagene_frameshift_analysis.py Fri Jan 23 03:31:37 2015 -0500 +++ b/metagene_frameshift_analysis.py Thu Apr 09 11:35:48 2015 -0400 @@ -11,10 +11,10 @@ import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time, csv import math from numpy import arange -from matplotlib import pyplot as pl -#import matplotlib -#matplotlib.use('Agg') -#import matplotlib.pyplot as pl +#from matplotlib import pyplot as pl +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as pl import itertools from Bio import SeqIO from Bio.Seq import Seq @@ -87,6 +87,10 @@ write footprint coverage file for each sam file in tmpdir ''' global total_mapped_read + ## tags by default + multi_tag = "XS:i:" + tag = "IH:i:1" + try: ### compute position of P-site according to frame (P-site -> +15 starting from 5' end of footprint) p_site_pos = 16-frame @@ -105,13 +109,15 @@ genomeF = [0]*size genomeR = [0]*size # define multiple reads keys from mapper - elif '@PG' in line : + elif '@PG\tID' in line : if 'bowtie' in line: multi_tag = "XS:i:" elif 'bwa' in line: multi_tag = "XT:A:R" + elif 'TopHat' in line: + tag = "NH:i:1" else : - stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") + stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") # get footprint elif re.search('^[^@].+', line) : @@ -120,7 +126,7 @@ read_sens = int(line.split('\t')[1]) len_read = len(line.split('\t')[9]) read_qual = int(line.split('\t')[4]) - if len_read == kmer and multi_tag not in line and read_qual > 20 : + if len_read == kmer and (tag in line or multi_tag not in line) and read_qual > 20 : ###if line.split('\t')[5] == '28M' : # print line.rstrip() total_mapped_read +=1 @@ -315,7 +321,7 @@ pl.ylabel('Total number of footprints (percent)') pl.draw() pl.savefig(subfolder+'/boxplot.png',format='png', dpi=640) - #pl.savefig(box_file, format='png', dpi=640) + pl.savefig(box_file, format='png', dpi=640) return GFF @@ -760,16 +766,16 @@ parser.add_option("-x", "--boxplot", dest="box", type= "string", help="report boxplot in this file", metavar="STR") - parser.add_option("-c", "--cutoff", dest="cutoff", type= "int", + parser.add_option("-c", "--cutoff", dest="cutoff", type= "int", default = 60 , help="Integer value for selecting all genes that have less than 60 % (default) of reads in coding frame ", metavar="INT") - parser.add_option("-k", "--kmer", dest="kmer", type= "int", - help="Longer of your phasing reads", metavar="INT") + parser.add_option("-k", "--kmer", dest="kmer", type= "int",default = 28 , + help="Longer of your phasing reads (Default is 28)", metavar="INT") - parser.add_option("-o", "--orf_length", dest="orf_length", type= "int", - help="Mean of ORF's length for segmentation", metavar="INT") + parser.add_option("-o", "--orf_length", dest="orf_length", type= "int", default = 300 , + help="Mean of ORF's length for segmentation (Default is 300pb, recommended for yeast)", metavar="INT") - parser.add_option("-p", "--frame", dest="frame", type="int", + parser.add_option("-p", "--frame", dest="frame", type="int", default = 1 , help="Script can compute in frame 1, 2 or 3", metavar="1|2|3") parser.add_option("-q", "--quiet", @@ -821,10 +827,10 @@ break else : stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) - #GFF = store_gff(options.gfffile) + ## check gff reading if not GFF['order'] : - stop_err( 'Incorrect GFF file' + str( e ) ) + stop_err( 'Incorrect GFF file') # #### cPickles for Test #### #if os.path.isfile("/home/rlegendre/Documents/FrameShift/OutOfFrameWithIntron/pyt_dict") :
--- a/metagene_frameshift_analysis.xml Fri Jan 23 03:31:37 2015 -0500 +++ b/metagene_frameshift_analysis.xml Thu Apr 09 11:35:48 2015 -0400 @@ -32,7 +32,7 @@ <help> Summary ------- -This tool uses Ribo-seq data (BAM file) to extract out-of-frame footprints in all genes from a reference annotation file (GFF3). Subprofiles are plotted for each gene with dual coding events. +This tool uses Ribo-seq data (BAM file) to extract out-of-frame footprints in all genes from a reference annotation file (`GFF3`_). Subprofiles are plotted for each gene with dual coding events. *- GFF3 file*: This file must have nine tabulated-delimited columns: Chromosome, source, feature, start, stop, score, strand, phasing, note. The gene ID is retrieved from the note field, using the "ID=" tag. @@ -49,8 +49,10 @@ *- Orf*: Approximate size of the segment. + +.. _GFF3: http://gmod.org/wiki/GFF3 - + Output ------- This tool generates 3 output files:
--- a/metagene_readthrough.py Fri Jan 23 03:31:37 2015 -0500 +++ b/metagene_readthrough.py Thu Apr 09 11:35:48 2015 -0400 @@ -665,44 +665,6 @@ stop_err( 'Error during computing analysis : ' + str( e ) ) -def cleaning_bam(bam): - ''' - Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12 - ''' - try : - header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE) - #header = results.split('\n') - - # check mapper for define multiple tag - if re.search('bowtie', header): - multi_tag = "XS:i:" - elif re.search('bwa', header): - multi_tag = "XT:A:M" - else : - stop_err("No PG tag find in"+bam+". Please use bowtie or bwa for mapping") - - tmp_sam = tempfile.mktemp() - cmd = "samtools view %s > %s" % (bam, tmp_sam) - proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) - returncode = proc.wait() - - - with open(tempfile.mktemp(),'w') as out : - out.write(header) - with open(tmp_sam,'r') as sam : - for line in sam : - if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : - if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 : - out.write(line) - bamfile = tempfile.mktemp()+'.bam' - cmd = "samtools view -hSb %s > %s" % (out.name,bamfile) - proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) - returncode = proc.wait() - - return bamfile - - except Exception,e: - stop_err( 'Error during cleaning bam : ' + str( e ) ) def write_html_page(html,subfolder) : @@ -1004,8 +966,8 @@ #GFF = ribo_functions.store_gtf(options.gff) ## check gff reading if not GFF['order'] : - stop_err( 'Incorrect GFF file' + str( e ) ) - clean_file = cleaning_bam(options.bamfile) + stop_err( 'Incorrect GFF file' ) + clean_file = ribo_functions.cleaning_bam(options.bamfile) compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend) if os.path.exists( clean_file ): os.remove( clean_file )
--- a/metagene_readthrough.xml Fri Jan 23 03:31:37 2015 -0500 +++ b/metagene_readthrough.xml Thu Apr 09 11:35:48 2015 -0400 @@ -26,7 +26,7 @@ <help> Summary ------- -This tool uses Ribo-seq data (BAM file) to extract genes displaying potential stop codon readthrough events from a reference annotation file (GFF3). +This tool uses Ribo-seq data (BAM file) to extract genes displaying potential stop codon readthrough events from a reference annotation file (`GFF3`_). C-terminal protein extensions were identified as previously described (Dunn J.G. et al, 2013). Only uniquely mapped footprints with a size in the 25 to 34 range are considered. A gene is considered to display readthrough if: @@ -43,6 +43,8 @@ Stop codon readthrough was estimated by calculating the ratio of the number of footprints in the C-terminal extension to that in the CDS. Ribosome density footprints were estimated in RPKM (reads per kilobase per million). We controlled for variability due to stop codon peaks, by excluding footprints mapping to stop codons from the calculation of RPKM. +.. _GFF3: http://gmod.org/wiki/GFF3 + Output -------
--- a/ribo_functions.py Fri Jan 23 03:31:37 2015 -0500 +++ b/ribo_functions.py Thu Apr 09 11:35:48 2015 -0400 @@ -7,7 +7,7 @@ @license: GPL v3 ''' -import sys, subprocess, re, commands, time, urllib +import sys, subprocess, re, commands, time, urllib, tempfile from copy import copy @@ -127,7 +127,9 @@ try: GFF = {} with open(gff, 'r') as f_gff : + GFF['order'] = [] + #for line in itertools.islice( f_gff, 25 ): for line in f_gff: ## switch commented lines line = line.split("#")[0] @@ -136,13 +138,21 @@ # first line is already gene line : if line.split('\t')[2] == 'gene' : gene = feature[0].replace("ID=","") - if re.search('gene',feature[2]) : - Name = feature[2].replace("gene=","") + if 'Name' in line : + regex = re.compile('(Name=)([^;]*);') + res = regex.search(line.split('\t')[8]) + Name = res.group(2) + Name = Name.rstrip() else : Name = "Unknown" ##get annotation - note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) - note = urllib.unquote(str(note)).replace("\n","") + if 'Note' in line : + regex = re.compile('(Note=)([^;]*);') + res = regex.search(line.split('\t')[8]) + note = res.group(2) + note = urllib.unquote(str(note)).replace("\n","") + else : + note = "" ## store gene information GFF['order'].append(gene) GFF[gene] = {} @@ -156,7 +166,11 @@ GFF[gene]['exon_number'] = 0 #print Name elif line.split('\t')[2] == 'CDS' : - gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0]) + regex = re.compile('(Parent=)([^;]*);') + res = regex.search(line.split('\t')[8]) + gene = res.group(2) + if 'mRNA' in gene: + gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene) if GFF.has_key(gene) : GFF[gene]['exon_number'] += 1 exon_number = GFF[gene]['exon_number'] @@ -166,7 +180,7 @@ GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) ## if there is a five prim UTR intron, we change start of gene - elif line.split('\t')[2] == 'five_prime_UTR_intron' : + elif line.split('\t')[2] == 'five_prime_UTR_intron' : if GFF[gene]['strand'] == "+" : GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] else : @@ -199,7 +213,7 @@ # first line is already gene line : if 'protein_coding' in line : ##get name - gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip() + gene = re.sub(r".+transcript_id \"([\w|-|\.]+)\";.*", r"\1", line).rstrip() Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip() if line.split('\t')[2] == 'CDS' : ##if its first time we get this gene @@ -245,7 +259,7 @@ return __reverse_coordinates__(GFF) except Exception,e: - stop_err( 'Error during gff storage : ' + str( e ) ) + stop_err( 'Error during gtf storage : ' + str( e ) ) ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; @@ -288,15 +302,19 @@ try : header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE) #header = results.split('\n') - + ## tags by default + multi_tag = "XS:i:" + tag = "IH:i:1" # check mapper for define multiple tag - if re.search('bowtie', header): + if 'bowtie' in header: multi_tag = "XS:i:" - elif re.search('bwa', header): + elif 'bwa' in header: multi_tag = "XT:A:R" + elif 'TopHat' in header: + tag = "NH:i:1" else : - stop_err("No PG tag find in"+bam+". Please use bowtie or bwa for mapping") - + stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") + tmp_sam = tempfile.mktemp() cmd = "samtools view %s > %s" % (bam, tmp_sam) proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) @@ -307,7 +325,7 @@ out.write(header) with open(tmp_sam,'r') as sam : for line in sam : - if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : + if (multi_tag not in line or tag in line) and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 : out.write(line) bamfile = tempfile.mktemp()+'.bam'