ribo_tools: get_codon_frequency.py comparison

comparison get_codon_frequency.py @ 15:702e60e819c2 draft

Uploaded

author	rlegendre
date	Mon, 11 May 2015 09:53:08 -0400
parents	344bacf6acb8
children	fcfdb2607cb8

comparison

equal deleted inserted replaced

-:344bacf6acb8
+:702e60e819c2
 '''
 from __future__ import division
 import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time
 import itertools
-import math
+from math import log10
 from decimal import Decimal
 from Bio import SeqIO
 from Bio.Seq import Seq
 from numpy import arange, std, array, linspace, average
 #from matplotlib import pyplot as pl
 from collections import OrderedDict
 import ribo_functions
 import HTSeq
 # #libraries for debugging
 #import pdb
-# import cPickle
+import cPickle
 def stop_err(msg):
     '''
     Report a fatal error and terminate the program.

     Writes msg to stderr, followed by a line giving the local time of the
     abort, then raises SystemExit with status 1 so that the calling shell
     or workflow engine sees the run as failed.

     msg : text of the error, written on its own line.
     '''
     sys.stderr.write("%s\n" % msg)
     sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
     # a bare sys.exit() would report success (status 0) after an error
     sys.exit(1)
 cond1_2 = result[1].copy()
 cond2_1 = result[2].copy()
 cond2_2 = result[3].copy()
 # get codon order in one of list
 codon_sorted = sorted(cond1_1.iterkeys(), reverse=False)
-# get max of each list
+try:
-sum11 = sum(list(cond1_1.itervalues()))
+# get max of each list
-sum12 = sum(list(cond1_2.itervalues()))
+sum11 = sum(list(cond1_1.itervalues()))
-sum21 = sum(list(cond2_1.itervalues()))
+sum12 = sum(list(cond1_2.itervalues()))
-sum22 = sum(list(cond2_2.itervalues()))
+sum21 = sum(list(cond2_1.itervalues()))
-# for each codon, get values and sd in each condition
+sum22 = sum(list(cond2_2.itervalues()))
-cond1_val = {}
+# for each codon, get values and sd in each condition
-cond1 = {}
+cond1_val = {}
-cond2_val = {}
+cond1 = {}
-cond2 = {}
+cond2_val = {}
-std_cond1 = []
+cond2 = {}
-std_cond2 = []
+std_cond1 = []
-max_val = []  # # max value for graph
+std_cond2 = []
-for i in codon_sorted:
+max_val = []  # # max value for graph
-# # cond1 = moyenne of replicats cond1 divided by max
+for i in codon_sorted:
-cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2)
+# # cond1 = moyenne of replicats cond1 divided by max
-cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2)
+cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2)
-# # standard deviation = absolute value of diffence between replicats of cond1
+cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2)
-std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)])))
+# # standard deviation = absolute value of diffence between replicats of cond1
-# # cond2 = moyenne of replicats cond1divided by max
+std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)])))
-cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)
+# # cond2 = moyenne of replicats cond1divided by max
-cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2)
+cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)
-# # standard deviation = absolute value of diffence between replicats of cond2
+cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2)
-std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)])))
+# # standard deviation = absolute value of diffence between replicats of cond2
-# # max value for each codon
+std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)])))
-max_val.append(max((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2, (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2))
+# # max value for each codon
+max_val.append(max((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2, (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2))
-# for graph design
-cond1_norm = OrderedDict(sorted(cond1_val.items(), key=lambda t: t[0]))
+# for graph design
-cond1_norm.update ((x, y * 100) for x, y in cond1_norm.items())
+cond1_norm = OrderedDict(sorted(cond1_val.items(), key=lambda t: t[0]))
-cond2_norm = OrderedDict(sorted(cond2_val.items(), key=lambda t: t[0]))
+cond1_norm.update ((x, y * 100) for x, y in cond1_norm.items())
-cond2_norm.update ((x, y * 100) for x, y in cond2_norm.items())
+cond2_norm = OrderedDict(sorted(cond2_val.items(), key=lambda t: t[0]))
-max_val = [x * 100 for x in max_val]
+cond2_norm.update ((x, y * 100) for x, y in cond2_norm.items())
+max_val = [x * 100 for x in max_val]
+except ZeroDivisionError:
+stop_err("Not enough reads to compute the codon occupancy")
 AA = get_aa_dict(cond1_norm, cond2_norm)
 max_valaa = []
 cond1_aa = []
 cond2_aa = []
 for z in AA.itervalues():
 cond1_aa.append(z[0])
 cond2_aa.append(z[1])
 max_valaa.append(max(z))
 # # plot amino acid profile :
-fig = pl.figure(figsize=(30, 10), num=1)
+fig = pl.figure(figsize=(15,10), num=1)
 width = .50
 ax = fig.add_subplot(111)
 ax.xaxis.set_ticks([])
 ind = arange(21)
 pl.xlim(0, 21)
 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
 # plot result
-fig = pl.figure(figsize=(30, 10), num=1)
+fig = pl.figure(figsize=(20,10), num=1)
 width = .40
 ind = arange(len(codon_sorted))
 ax = fig.add_subplot(111)
 pl.xlim(0, len(codon_sorted) + 1)
 ax.spines['right'].set_color('none')
 ax.set_xlabel('Codons')
 handles, labels = ax.get_legend_handles_labels()
 ax.legend(handles, labels)
 pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
 pl.clf()
 elif len(result) == 2 :
 # store each dict in OrderedDict sorted by key to make code more readable
 cond1_norm = result[0].copy()
 cond2_norm = result[1].copy()
 # pdb.set_trace()
 # get codon order in one of list
 codon_sorted = sorted(cond1.iterkeys(), reverse=False)
+try:
 # get sum of each list
 sum1 = sum(list(cond1.itervalues()))
 sum2 = sum(list(cond2.itervalues()))
 # #Normalize values by sum of each libraries
 cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items())
 cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items())
+except ZeroDivisionError:
+stop_err("Not enough reads to compute the codon occupancy")
 # # compute theoretical count in COND2
 cond2_count = []
 for z in cond1_norm.itervalues() :
 count = int(z * sum2 / 100.0)
 cond2_count.append(count)
 cond1_aa.append(z[0])
 cond2_aa.append(z[1])
 max_val.append(max(z))
 # # plot amino acid profile :
-fig = pl.figure(num=1)
+fig = pl.figure(figsize=(15,10), num=1)
 width = .45
 ax = fig.add_subplot(111)
 ind = arange(21)
 pl.xlim(0, 21)
 #kwargs = {"hatch":'x'}
 for i in cond1:
 # # max value for each codon
 max_val.append(max(cond1_norm[i], cond2_norm[i]))
 # plot result
-fig = pl.figure(figsize=(40, 10), num=1)
+fig = pl.figure(figsize=(20,10), num=1)
 #fig = pl.figure(num=1)
 width = .40
 ind = arange(len(codon_sorted))
 ax = fig.add_subplot(111)
 pl.xlim(0, len(codon_sorted) + 1)
 <link href="/static/june_2007_style/blue/base.css" media="screen" rel="Stylesheet" type="text/css" />
 </head>
 <body>
 <h3>Global visualization</h3>
 <p>
-<h5>Visualization of density footprint in each codon.</h5><br> If user has selected analyse with replicats, standart error deviation between each replicate as plotting as error bar in histogram.<br>
+<h5>Visualization of density footprint in each codon.</h5><br> If user has selected "Yes" for the replicate option the standard deviation between each replicate is plotted as an error bar in histogram.<br>
 <img border="0" src="hist_codons.png"  width="1040"/>
 </p>
 <p>
 <h5>Test for homogeneity distribution between each condition</h5><br>
 H0 : %s and %s are same distribution <br>
 cmd = "samtools index %s " % (bamfile)
 proc = subprocess.Popen(args=cmd, shell=True, stderr=subprocess.PIPE)
 returncode = proc.wait()
 # if returncode != 0:
 #    raise Exception
+def plot_fc (cond1, cond2, site, dirout):
+fc = cond1.copy()
+for key, value in fc.iteritems():
+fc[key] = cond2[key]/cond1[key]
+index = arange(len(fc.keys()))
+label = fc.keys()
+label = [w.replace('T','U') for w in label]
+pl.figure(figsize=(15,10), num=1)
+ax = pl.subplot(1,1,1)
+pl.xticks([])
+pl.scatter(index, fc.values(), color='b')
+pl.axhline(y=1,color='r')
+pl.xticks(index, label, rotation=90)
+pl.ylabel('Foldchange of codon occupancy')
+ax.yaxis.set_ticks_position('left')
+ax.xaxis.set_ticks_position('bottom')
+pl.title(site+" site")
+pl.savefig(dirout + '/fc_codons.png', format="png", dpi=340)
 def __main__():
 # Parse command line options
 for fh in itertools.chain(cond1, cond2):
 check_index_bam (fh)
 result.append(get_codon_usage(fh, GFF, options.site, options.kmer, options.asite))
 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
+plot_fc (cond1, cond2, options.site, html_dir)
 # #If there are no replicates
 elif options.rep == "no" :
 result = []
 # #computation for each condition
 for fh in (options.file1, options.file2):
 check_index_bam (fh)
 result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite))
 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
+plot_fc (cond1, cond2, options.site, html_dir)
 else :
 sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time())))
 sys.exit()
 # write_html_file(html,chi_pval,t_pval,codons,options.c1, options.c2)

Mercurial > repos > rlegendre > ribo_tools

comparison get_codon_frequency.py @ 15:702e60e819c2 draft