Mercurial > repos > rlegendre > ribo_tools

--- a/get_codon_frequency.py	Fri Apr 10 03:18:56 2015 -0400
+++ b/get_codon_frequency.py	Mon May 11 09:53:08 2015 -0400
@@ -11,7 +11,7 @@
 from __future__ import division
 import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time
 import itertools
-import math
+from math import log10
 from decimal import Decimal
 from Bio import SeqIO
 from Bio.Seq import Seq
@@ -29,7 +29,7 @@
 import HTSeq
 # #libraries for debugg
 #import pdb
-# import cPickle
+import cPickle

 def stop_err(msg):
     sys.stderr.write("%s\n" % msg)
@@ -302,39 +302,42 @@
         cond2_2 = result[3].copy()
         # get codon order in one of list
         codon_sorted = sorted(cond1_1.iterkeys(), reverse=False)
-        # get max of each list
-        sum11 = sum(list(cond1_1.itervalues()))
-        sum12 = sum(list(cond1_2.itervalues()))
-        sum21 = sum(list(cond2_1.itervalues()))
-        sum22 = sum(list(cond2_2.itervalues()))
-        # for each codon, get values and sd in each condition
-        cond1_val = {}
-        cond1 = {}
-        cond2_val = {}
-        cond2 = {}
-        std_cond1 = []
-        std_cond2 = []
-        max_val = []  # # max value for graph
-        for i in codon_sorted:
-            # # cond1 = moyenne of replicats cond1 divided by max
-            cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2)
-            cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2)
-            # # standard deviation = absolute value of diffence between replicats of cond1
-            std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)])))
-            # # cond2 = moyenne of replicats cond1divided by max
-            cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)
-            cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2)
-            # # standard deviation = absolute value of diffence between replicats of cond2
-            std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)])))
-            # # max value for each codon
-            max_val.append(max((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2, (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2))
-
-        # for graph design
-        cond1_norm = OrderedDict(sorted(cond1_val.items(), key=lambda t: t[0]))
-        cond1_norm.update ((x, y * 100) for x, y in cond1_norm.items())
-        cond2_norm = OrderedDict(sorted(cond2_val.items(), key=lambda t: t[0]))
-        cond2_norm.update ((x, y * 100) for x, y in cond2_norm.items())
-        max_val = [x * 100 for x in max_val]
+        try:
+            # get max of each list
+            sum11 = sum(list(cond1_1.itervalues()))
+            sum12 = sum(list(cond1_2.itervalues()))
+            sum21 = sum(list(cond2_1.itervalues()))
+            sum22 = sum(list(cond2_2.itervalues()))
+            # for each codon, get values and sd in each condition
+            cond1_val = {}
+            cond1 = {}
+            cond2_val = {}
+            cond2 = {}
+            std_cond1 = []
+            std_cond2 = []
+            max_val = []  # # max value for graph
+            for i in codon_sorted:
+                # # cond1 = moyenne of replicats cond1 divided by max
+                cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2)
+                cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2)
+                # # standard deviation = absolute value of diffence between replicats of cond1
+                std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)])))
+                # # cond2 = moyenne of replicats cond1divided by max
+                cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)
+                cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2)
+                # # standard deviation = absolute value of diffence between replicats of cond2
+                std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)])))
+                # # max value for each codon
+                max_val.append(max((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2, (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2))
+
+            # for graph design
+            cond1_norm = OrderedDict(sorted(cond1_val.items(), key=lambda t: t[0]))
+            cond1_norm.update ((x, y * 100) for x, y in cond1_norm.items())
+            cond2_norm = OrderedDict(sorted(cond2_val.items(), key=lambda t: t[0]))
+            cond2_norm.update ((x, y * 100) for x, y in cond2_norm.items())
+            max_val = [x * 100 for x in max_val]
+        except ZeroDivisionError:
+            stop_err("Not enough reads to compute the codon occupancy")

         AA = get_aa_dict(cond1_norm, cond2_norm)
         max_valaa = []
@@ -346,7 +349,7 @@
             cond2_aa.append(z[1])
             max_valaa.append(max(z))
         # # plot amino acid profile :
-        fig = pl.figure(figsize=(30, 10), num=1)
+        fig = pl.figure(figsize=(15,10), num=1)
         width = .50
         ax = fig.add_subplot(111)
         ax.xaxis.set_ticks([])
@@ -386,7 +389,7 @@


         # plot result
-        fig = pl.figure(figsize=(30, 10), num=1)
+        fig = pl.figure(figsize=(20,10), num=1)
         width = .40
         ind = arange(len(codon_sorted))
         ax = fig.add_subplot(111)
@@ -406,7 +409,6 @@
         ax.legend(handles, labels)
         pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
         pl.clf()
-


     elif len(result) == 2 :
@@ -419,14 +421,16 @@
         # pdb.set_trace()
         # get codon order in one of list
         codon_sorted = sorted(cond1.iterkeys(), reverse=False)
-
-        # get sum of each list
-        sum1 = sum(list(cond1.itervalues()))
-        sum2 = sum(list(cond2.itervalues()))
-        # #Normalize values by sum of each libraries
-        cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items())
-        cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items())
-
+        try:
+            # get sum of each list
+            sum1 = sum(list(cond1.itervalues()))
+            sum2 = sum(list(cond2.itervalues()))
+            # #Normalize values by sum of each libraries
+            cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items())
+            cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items())
+        except ZeroDivisionError:
+            stop_err("Not enough reads to compute the codon occupancy")
+
         # # compute theorical count in COND2
         cond2_count = []
         for z in cond1_norm.itervalues() :
@@ -448,7 +452,7 @@
             max_val.append(max(z))

         # # plot amino acid profile :
-        fig = pl.figure(num=1)
+        fig = pl.figure(figsize=(15,10), num=1)
         width = .45
         ax = fig.add_subplot(111)
         ind = arange(21)
@@ -492,7 +496,7 @@
             max_val.append(max(cond1_norm[i], cond2_norm[i]))

         # plot result
-        fig = pl.figure(figsize=(40, 10), num=1)
+        fig = pl.figure(figsize=(20,10), num=1)
         #fig = pl.figure(num=1)
         width = .40
         ind = arange(len(codon_sorted))
@@ -537,7 +541,7 @@
     <body>
         <h3>Global visualization</h3>
         <p>
-            <h5>Visualization of density footprint in each codon.</h5><br> If user has selected analyse with replicats, standart error deviation between each replicate as plotting as error bar in histogram.<br>
+            <h5>Visualization of density footprint in each codon.</h5><br> If user has selected "Yes" for the replicate option the standard deviation between each replicate is plotted as an error bar in histogram.<br>
             <img border="0" src="hist_codons.png"  width="1040"/>
         </p>
         <p>
@@ -582,6 +586,29 @@
         returncode = proc.wait()
         # if returncode != 0:
         #    raise Exception
+
+def plot_fc (cond1, cond2, site, dirout):
+
+    fc = cond1.copy()
+
+    for key, value in fc.iteritems():
+        fc[key] = cond2[key]/cond1[key]
+
+    index = arange(len(fc.keys()))
+    label = fc.keys()
+    label = [w.replace('T','U') for w in label]
+    pl.figure(figsize=(15,10), num=1)
+    ax = pl.subplot(1,1,1)
+    pl.xticks([])
+    pl.scatter(index, fc.values(), color='b')
+    pl.axhline(y=1,color='r')
+    pl.xticks(index, label, rotation=90)
+    pl.ylabel('Foldchange of codon occupancy')
+    ax.yaxis.set_ticks_position('left')
+    ax.xaxis.set_ticks_position('bottom')
+    pl.title(site+" site")
+    pl.savefig(dirout + '/fc_codons.png', format="png", dpi=340)
+

 def __main__():

@@ -675,7 +702,7 @@
                 result.append(get_codon_usage(fh, GFF, options.site, options.kmer, options.asite))
             (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
             # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
-
+            plot_fc (cond1, cond2, options.site, html_dir)

         # #If there are no replicat
         elif options.rep == "no" :
@@ -686,7 +713,7 @@
                 result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite))
             (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
             # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
-
+            plot_fc (cond1, cond2, options.site, html_dir)
         else :
             sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time())))
             sys.exit()
--- a/metagene_readthrough.py	Fri Apr 10 03:18:56 2015 -0400
+++ b/metagene_readthrough.py	Mon May 11 09:53:08 2015 -0400
@@ -38,7 +38,7 @@
     try :
         rpkm = "{0:.4f}".format(count_gene*1000000.0/(count_tot*length))
     except ArithmeticError :
-        stop_err( 'Illegal division by zero')
+        stop_err( 'Illegal division by zero during computing RPKM')
     return float(rpkm)


@@ -682,12 +682,15 @@
             spamreader = csv.reader(csvfile, delimiter='\t')
             ## skip the header
             next(spamreader, None)
-            for row in spamreader:
-                if row[2] == '-' :
-                    gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s:%s-%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(row[0], row[0], row[0], row[0], row[11], row[1], row[3], row[4], row[5], row[6], row[8], row[9], row[10], row[12])
-
-        gene_table += '</tbody></table>'
-
+            ##test if file is empty or not
+            if next(spamreader, None):
+                for row in spamreader:
+                    if row[2] == '-' :
+                        gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s:%s-%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(row[0], row[0], row[0], row[0], row[11], row[1], row[3], row[4], row[5], row[6], row[8], row[9], row[10], row[12])
+
+                gene_table += '</tbody></table>'
+            else :
+                gene_table = 'Sorry, there are no stop codon readthrough genes in your data\n'


         html_str = """
@@ -930,7 +933,7 @@
     parser.add_option("-d", "--dirout", dest="dirout", type= "string",
                   help="write report in this html file and in associated directory", metavar="STR,STR")

-    parser.add_option("-e", "--extend", dest="extend", type= "int",
+    parser.add_option("-e", "--extend", dest="extend", type= "int",default = 300 ,
                   help="Lenght of extension after stop in number of base pairs (depends on your organisme)", metavar="INT")

     parser.add_option("-q", "--quiet",