changeset 0:b8c070add3b7

Uploaded
author rlegendre
date Mon, 20 Oct 2014 11:06:17 -0400
parents
children 1fbddace2db6
files get_codon_frequency.py
diffstat 1 files changed, 748 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_codon_frequency.py	Mon Oct 20 11:06:17 2014 -0400
@@ -0,0 +1,748 @@
+#!/usr/bin/env python2.7
+# -*- coding: utf-8 -*-
+
+'''
+    Created on sep. 2013
+    @author: rachel legendre
+    @copyright: rachel.legendre@igmors.u-psud.fr
+    @license: GPL v3
+'''
+
+from __future__ import division
+import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time
+import itertools
+import math
+from decimal import Decimal
+from Bio import SeqIO
+from Bio.Seq import Seq
+from numpy import arange, std, array, linspace, average
+#from matplotlib import pyplot as pl
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as pl
+from matplotlib import font_manager
+from matplotlib import colors
+import csv
+from scipy import stats
+from collections import OrderedDict
+# # libraries for debugging
+# import pdb
+# import cPickle
+
+def stop_err(msg):
+    sys.stderr.write("%s\n" % msg)
+    sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
+    sys.exit()
+
+    
+def store_gff(gff):
+    '''
+        parse the GFF file and store it in a nested dictionary (chromosome -> gene -> features)
+    '''
+    try:
+        GFF = OrderedDict({})
+        with open(gff, 'r') as f_gff : 
+            # GFF['order'] = []
+            for line in f_gff:
+                # # skip commented lines (keep only the part before '#')
+                head = line.split("#")[0]
+                if head != "" :
+                    feature = (line.split('\t')[8]).split(';')
+                    chrom = line.split('\t')[0]
+                    if chrom not in GFF :
+                        GFF[chrom] = {}
+                # a 'gene' line opens a new gene entry :
+                    if line.split('\t')[2] == 'gene' :
+                        gene = feature[0].replace("ID=", "")
+                        if re.search('gene', feature[2]) :
+                            Name = feature[2].replace("gene=", "")
+                        else :
+                            Name = "Unknown"
+                        # #get annotation
+                        note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
+                        note = urllib.unquote(str(note)).replace("\n", "")
+                        # # store gene information
+                        # GFF['order'].append(gene)
+                        GFF[chrom][gene] = {}
+                        GFF[chrom][gene]['chrom'] = line.split('\t')[0]
+                        GFF[chrom][gene]['start'] = int(line.split('\t')[3])
+                        GFF[chrom][gene]['stop'] = int(line.split('\t')[4])
+                        GFF[chrom][gene]['strand'] = line.split('\t')[6]
+                        GFF[chrom][gene]['name'] = Name
+                        GFF[chrom][gene]['note'] = note
+                        GFF[chrom][gene]['exon'] = {}
+                        GFF[chrom][gene]['exon_number'] = 0
+                        # print Name
+                    elif line.split('\t')[2] == 'CDS' :
+                        gene = re.sub(r"Parent\=(.+)_mRNA", r"\1", feature[0])
+                        if GFF[chrom].has_key(gene) :
+                            GFF[chrom][gene]['exon_number'] += 1
+                            exon_number = GFF[chrom][gene]['exon_number'] 
+                            GFF[chrom][gene]['exon'][exon_number] = {}
+                            GFF[chrom][gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
+                            GFF[chrom][gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
+                            GFF[chrom][gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
+                    # # if there is a five prime UTR intron, adjust the gene start (or stop on the minus strand)
+                    elif line.split('\t')[2] == 'five_prime_UTR_intron' :
+                        if GFF[chrom][gene]['strand'] == "+" :
+                            GFF[chrom][gene]['start'] = GFF[chrom][gene]['exon'][1]['start']
+                        else :
+                            GFF[chrom][gene]['stop'] = GFF[chrom][gene]['exon'][GFF[chrom][gene]['exon_number']]['stop']
+        return GFF
+    except Exception, e:
+        stop_err('Error during gff storage : ' + str(e))
+  
+#chrI    SGD     gene    87286   87752   .       +       .       ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689
+#7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra
+#ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr
+#om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
+#chrI    SGD     CDS     87286   87387   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
+#chrI    SGD     CDS     87501   87752   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified    
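+# With the example lines above, store_gff() roughly builds (illustrative, values taken from those lines):
+#   GFF['chrI']['YAL030W'] = {'chrom': 'chrI', 'start': 87286, 'stop': 87752, 'strand': '+',
+#                             'name': 'SNC1', 'note': 'Vesicle membrane receptor protein (v-SNARE); ...',
+#                             'exon_number': 2,
+#                             'exon': {1: {'frame': '0', 'start': 87286, 'stop': 87387},
+#                                      2: {'frame': '0', 'start': 87501, 'stop': 87752}}}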
+
+
+
+def init_codon_dict():
+    '''Return an OrderedDict with all 64 codons initialised to a count of zero.'''
+    Codon_dict = OrderedDict([('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0), ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0), ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0), ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0), ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0), ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0), ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0), ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)])
+    return Codon_dict
+
+
+
+def get_codon_usage(bamfile, GFF, site, kmer, a_pos):
+    '''
+        Read the GFF dict, fetch the reads overlapping each gene with samtools
+        and count the codon found at the requested ribosome site (A, P or E).
+        Return a dict of codon counts.
+    '''
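+    # Worked example of the site slicing below (illustrative values only; the real
+    # footprint length and A-site offset come from the -k and -a options, e.g. kmer=28, a_pos=15):
+    #   forward read (flag 0)  : A = seq[15:18], P = seq[12:15], E = seq[9:12]
+    #   reverse read (flag 16) : A = revcomp(seq[10:13]), P = revcomp(seq[13:16]), E = revcomp(seq[16:19])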
+    try:
+        codon = init_codon_dict()
+                    
+        for chrom in GFF.iterkeys():
+            for gene in GFF[chrom] :
+                codon_dict = init_codon_dict()
+                start = GFF[chrom][gene]['start']
+                stop = GFF[chrom][gene]['stop']
+                region = chrom + ':' + str(start) + '-' + str(stop)
+                # #get all reads in this gene
+                reads = subprocess.check_output(["samtools", "view", bamfile, region])
+                head = subprocess.check_output(["samtools", "view", "-H", bamfile])
+                read_tab = reads.split('\n')
+                for read in read_tab:
+                    # # identify the mapper from the BAM header to know which tag flags multi-mapped reads
+                    if 'bowtie' in head:
+                        multi_tag = "XS:i:"
+                    elif 'bwa' in  head:
+                        multi_tag = "XT:A:R"
+                    else :
+                        stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")  
+                    if len(read) == 0:
+                        continue
+                    
+                    len_read = len(read.split('\t')[9])        
+                    # keep only reads of the expected length that are not multi-mapped
+                    if len_read == kmer and multi_tag not in read:
+                        feat = read.split('\t')
+                        seq = feat[9]
+                        # if it's a reverse read
+                        if feat[1] == '16' :
+                            if site == "A" :
+                            # #get A-site
+                                cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement())
+                            elif site == "P" :
+                            # #get P-site
+                                cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement())
+                            else :
+                            # #get E-site
+                                cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement())
+                            # # count only genuine codons (skip ambiguous ones such as CNG)
+                            if codon_dict.has_key(cod) :
+                                codon_dict[cod] += 1
+                        # if it's a forward read
+                        elif feat[1] == '0' :
+                            if site == "A" :
+                            # #get A-site
+                                cod = seq[a_pos:a_pos+3]
+                            elif site == "P" :
+                            # #get P-site
+                                cod = seq[a_pos-3:a_pos]
+                            else :
+                            # #get E-site
+                                cod = seq[a_pos-6:a_pos-3]
+                            if codon_dict.has_key(cod) :
+                                codon_dict[cod] += 1
+                # # add this gene's counts to the global dict
+                for cod, count in codon_dict.iteritems() :
+                    codon[cod] += count
+                        
+        return codon
+        
+    except Exception, e:
+        stop_err('Error during codon usage calculation: ' + str(e))
+
+
+'''
+http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html
+'''
+def violin_plot(ax, data, pos, bp=False, color1='y', color2='y'):
+    '''
+    create violin plots on an axis
+    '''
+    dist = max(pos) - min(pos)
+    w = min(0.15 * max(dist, 1.0), 0.5)
+    for d, p in zip(data, pos):
+        k = stats.gaussian_kde(d)  # calculates the kernel density
+        m = k.dataset.min()  # lower bound of violin
+        M = k.dataset.max()  # upper bound of violin
+        x = arange(m, M, (M - m) / 100.)  # support for violin
+        v = k.evaluate(x)  # violin profile (density curve)
+        v = v / v.max() * w  # scaling the violin to the available space
+        ax.fill_betweenx(x, p, v + p, facecolor=color1, alpha=0.3)
+        ax.fill_betweenx(x, p, -v + p, facecolor=color2, alpha=0.3)
+    if bp:
+        ax.boxplot(data, notch=1, positions=pos, vert=1)
+
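+# Usage sketch, mirroring the call in compute_FC_plot below:
+#   violin_plot(ax, [array(a), array(b)], range(2), bp=1)
+# draws one kernel-density violin per data series at the given x positions, with boxplots overlaid.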
+
+
+'''
+http://log.ooz.ie/2013/02/matplotlib-comparative-histogram-recipe.html
+'''
+def comphist(x1, x2, orientation='vertical', **kwargs):
+    """Draw a comparative histogram."""
+    # Split keyword args:
+    kwargs1 = {}
+    kwargs2 = {}
+    kwcommon = {}
+    for arg in kwargs:
+        tgt_arg = arg[:-1]
+        if arg.endswith('1'):
+            arg_dict = kwargs1
+        elif arg.endswith('2'):
+            arg_dict = kwargs2
+        else:
+            arg_dict = kwcommon
+            tgt_arg = arg
+        arg_dict[tgt_arg] = kwargs[arg]
+    kwargs1.update(kwcommon)
+    kwargs2.update(kwcommon)
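+    # Example of the suffix convention (illustrative, close to the call in compute_FC_plot):
+    #   comphist(x1, x2, label1='All codons', label2='Stop codons', color2='green', bins=30)
+    #   -> kwargs1 = {'label': 'All codons', 'bins': 30}
+    #      kwargs2 = {'label': 'Stop codons', 'color': 'green', 'bins': 30}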
+
+    fig = pl.figure()
+
+    # Have both histograms share one axis.
+    if orientation == 'vertical':
+        ax1 = pl.subplot(211)
+        ax2 = pl.subplot(212, sharex=ax1)
+        # Flip the ax2 histogram horizontally.
+        ax2.set_ylim(ax1.get_ylim()[::-1])
+        pl.setp(ax1.get_xticklabels(), visible=False)
+        legend_loc = (1, 4)
+    else:
+        ax1 = pl.subplot(122)
+        ax2 = pl.subplot(121, sharey=ax1)
+        # Flip the ax2 histogram vertically.
+        ax2.set_xlim(ax2.get_xlim()[::-1])
+        pl.setp(ax1.get_yticklabels(), visible=False)
+        legend_loc = (1, 2)
+
+    ax1.hist(x1, orientation=orientation, **kwargs1)
+    ax2.hist(x2, orientation=orientation, **kwargs2)
+    ax2.set_ylim(ax1.get_ylim()[::-1])
+    ax1.legend(loc=legend_loc[0])
+    ax2.legend(loc=legend_loc[1])
+    # Tighten up the layout.    
+    pl.subplots_adjust(wspace=0.0, hspace=0.0)
+    return fig
+
+ 
+def compute_FC_plot(cond1_norm, cond2_norm, cod_name, codon_to_test, dirout):
+    
+    FC_tab = []
+    for z, y in zip(cond1_norm.itervalues(), cond2_norm.itervalues()):
+        fc = z - y
+        FC_tab.append(fc)
+    # #codon_to_test = ['TGA','TAG','TAA']
+
+    a = []
+    b = []
+    cod = []
+    for codon in cond1_norm.iterkeys():
+        if codon in codon_to_test :
+            fc = cond1_norm[codon] - cond2_norm[codon]
+            b.append(fc)
+            cod.append(codon)
+        else :
+            fc = cond1_norm[codon] - cond2_norm[codon]
+            a.append(fc)
+      
+   
+    fig = pl.figure(num=1)
+    comphist(array(a), array(b), label1='All codons', label2=cod_name, color2='green', bins=30, rwidth=1)
+    # pl.show()
+    pl.savefig(dirout + '/hist_codon_fc.png', format="png", dpi=340)
+    pl.clf()    
+    
+        
+    # #violin plot
+    pos = range(2)
+    dat = array([array(a), array(b)])
+    fig = pl.figure()
+    pl.title("Distribution of codons FoldChange between two conditions")
+    ax = fig.add_subplot(1, 1, 1)
+    lab = array(['All codons', cod_name])
+    violin_plot(ax, dat, pos, bp=1)
+    for x, z in zip(dat, pos):
+        ax.plot(z, average(x), color='r', marker='*', markeredgecolor='r')
+    xtickNames = pl.setp(ax, xticklabels=lab)
+    pl.savefig(dirout + '/violinplot_codon.png', format="png", dpi=340)
+    pl.clf()
+
+    # (Fval,pval) = stats.ttest_ind(a, b, axis=0, equal_var=True)
+    (Fval, pval) = stats.mannwhitneyu(a, b)
+    return pval
+    
+
+def get_aa_dict(cond1_norm, cond2_norm):
+    
+    # ## create amino acid dictionary:
+    AA = OrderedDict({})
+    AA['Phe'] = [cond1_norm['TTT'] + cond1_norm['TTC'], cond2_norm['TTT'] + cond2_norm['TTC']]
+    AA['Leu'] = [cond1_norm['TTA'] + cond1_norm['TTG'] + cond1_norm['CTT'] + cond1_norm['CTC'] + cond1_norm['CTA'] + cond1_norm['CTG'], cond2_norm['TTA'] + cond2_norm['TTG'] + cond2_norm['CTT'] + cond2_norm['CTC'] + cond2_norm['CTA'] + cond2_norm['CTG']]
+    AA['Ile'] = [cond1_norm['ATT'] + cond1_norm['ATC'] + cond1_norm['ATA'], cond2_norm['ATT'] + cond2_norm['ATC'] + cond2_norm['ATA']]
+    AA['Met'] = [cond1_norm['ATG'], cond2_norm['ATG']]
+    AA['Val'] = [cond1_norm['GTT'] + cond1_norm['GTC'] + cond1_norm['GTA'] + cond1_norm['GTG'], cond2_norm['GTT'] + cond2_norm['GTC'] + cond2_norm['GTA'] + cond2_norm['GTG']]
+    AA['Ser'] = [cond1_norm['TCT'] + cond1_norm['TCC'] + cond1_norm['TCA'] + cond1_norm['TCG'] + cond1_norm['AGT'] + cond1_norm['AGC'], cond2_norm['TCT'] + cond2_norm['TCC'] + cond2_norm['TCA'] + cond2_norm['TCG'] + cond2_norm['AGT'] + cond2_norm['AGC']]
+    AA['Pro'] = [cond1_norm['CCT'] + cond1_norm['CCC'] + cond1_norm['CCA'] + cond1_norm['CCG'], cond2_norm['CCT'] + cond2_norm['CCC'] + cond2_norm['CCA'] + cond2_norm['CCG']]
+    AA['Thr'] = [cond1_norm['ACT'] + cond1_norm['ACC'] + cond1_norm['ACA'] + cond1_norm['ACG'], cond2_norm['ACT'] + cond2_norm['ACC'] + cond2_norm['ACA'] + cond2_norm['ACG']]
+    AA['Ala'] = [cond1_norm['GCT'] + cond1_norm['GCC'] + cond1_norm['GCA'] + cond1_norm['GCG'], cond2_norm['GCT'] + cond2_norm['GCC'] + cond2_norm['GCA'] + cond2_norm['GCG']]
+    AA['Tyr'] = [cond1_norm['TAT'] + cond1_norm['TAC'], cond2_norm['TAT'] + cond2_norm['TAC']]
+    AA['Stop'] = [cond1_norm['TAA'] + cond1_norm['TAG'] + cond1_norm['TGA'], cond2_norm['TAA'] + cond2_norm['TAG'] + cond2_norm['TGA']]
+    AA['His'] = [cond1_norm['CAT'] + cond1_norm['CAC'], cond2_norm['CAT'] + cond2_norm['CAC']]
+    AA['Gln'] = [cond1_norm['CAA'] + cond1_norm['CAG'], cond2_norm['CAA'] + cond2_norm['CAG']]
+    AA['Asn'] = [cond1_norm['AAT'] + cond1_norm['AAC'], cond2_norm['AAT'] + cond2_norm['AAC']]
+    AA['Lys'] = [cond1_norm['AAA'] + cond1_norm['AAG'], cond2_norm['AAA'] + cond2_norm['AAG']]
+    AA['Asp'] = [cond1_norm['GAT'] + cond1_norm['GAC'], cond2_norm['GAT'] + cond2_norm['GAC']]
+    AA['Glu'] = [cond1_norm['GAA'] + cond1_norm['GAG'], cond2_norm['GAA'] + cond2_norm['GAG']]
+    AA['Cys'] = [cond1_norm['TGT'] + cond1_norm['TGC'], cond2_norm['TGT'] + cond2_norm['TGC']]
+    AA['Trp'] = [cond1_norm['TGG'], cond2_norm['TGG']]
+    AA['Arg'] = [cond1_norm['CGT'] + cond1_norm['CGC'] + cond1_norm['CGA'] + cond1_norm['CGG'] + cond1_norm['AGA'] + cond1_norm['AGG'], cond2_norm['CGT'] + cond2_norm['CGC'] + cond2_norm['CGA'] + cond2_norm['CGG'] + cond2_norm['AGA'] + cond2_norm['AGG']]
+    AA['Gly'] = [cond1_norm['GGT'] + cond1_norm['GGC'] + cond1_norm['GGA'] + cond1_norm['GGG'], cond2_norm['GGT'] + cond2_norm['GGC'] + cond2_norm['GGA'] + cond2_norm['GGG']]
+    
+    
+    return AA
+
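+# For any pair of normalized codon dicts, each entry holds the two summed values per amino acid,
+# e.g. AA['Met'] == [cond1_norm['ATG'], cond2_norm['ATG']] and AA['Stop'] sums the three stop codons.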
+
+    
+def plot_codon_usage(result, dirout, c1, c2, outfile, color1, color2):
+    '''
+        Take a list of codon-usage dicts and plot codon and amino-acid usage with matplotlib
+    '''
+
+    # # if there are replicates (two BAM files per condition)
+    if len(result) == 4 :
+        # store each dict in variables to make code more readable
+        cond1_1 = result[0].copy()
+        cond1_2 = result[1].copy()
+        cond2_1 = result[2].copy()
+        cond2_2 = result[3].copy()
+        # get codon order in one of list
+        codon_sorted = sorted(cond1_1.iterkeys(), reverse=False)
+        # get the total count of each library
+        sum11 = sum(list(cond1_1.itervalues()))
+        sum12 = sum(list(cond1_2.itervalues()))
+        sum21 = sum(list(cond2_1.itervalues()))
+        sum22 = sum(list(cond2_2.itervalues()))
+        # for each codon, get values and sd in each condition
+        cond1_val = {}
+        cond1 = {}
+        cond2_val = {}
+        cond2 = {}
+        std_cond1 = []
+        std_cond2 = []
+        max_val = []  # # max value for graph
+        for i in codon_sorted:
+            # # cond1 = mean of the cond1 replicates, each normalized by its library total
+            cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2)
+            cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2)
+            # # standard deviation between the normalized cond1 replicates
+            std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)])))
+            # # cond2 = mean of the cond2 replicates, each normalized by its library total
+            cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)
+            cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2)
+            # # standard deviation between the normalized cond2 replicates
+            std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)])))
+            # # max value for each codon
+            max_val.append(max((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2, (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2))
+            
+        # for graph design
+        cond1_norm = OrderedDict(sorted(cond1_val.items(), key=lambda t: t[0]))
+        cond1_norm.update ((x, y * 100) for x, y in cond1_norm.items())
+        cond2_norm = OrderedDict(sorted(cond2_val.items(), key=lambda t: t[0]))
+        cond2_norm.update ((x, y * 100) for x, y in cond2_norm.items())
+        max_val = [x * 100 for x in max_val]
+            
+        AA = get_aa_dict(cond1_norm, cond2_norm)
+        max_valaa = []
+        cond1_aa = []
+        cond2_aa = []
+        aa_name = list(AA.iterkeys())
+        for z in AA.itervalues():
+            cond1_aa.append(z[0])
+            cond2_aa.append(z[1])
+            max_valaa.append(max(z))
+        # # plot amino acid profile :
+        fig = pl.figure(figsize=(24, 10), num=1)
+        width = .50
+        ax = fig.add_subplot(111)
+        ax.xaxis.set_ticks([])
+        ind = arange(21)
+        pl.xlim(0, 21) 
+        ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
+        ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)
+        for x, y, z in zip(ind, max_valaa, aa_name):
+            ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
+        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
+        ax.set_xlabel('Amino Acid')
+        handles, labels = ax.get_legend_handles_labels()
+        ax.legend(handles, labels)
+        pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
+        pl.clf()
+        
+        
+        # # compute the theoretical (expected) counts in cond2, assuming the cond1 distribution
+        sum2 = (sum21 + sum22) / 2
+        cond2_count = []
+        for z in cond1_norm.itervalues() :
+            count = int(z * sum2 / 100)
+            cond2_count.append(count)
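+        # # e.g. (illustrative numbers) a codon at 2.5 % of the normalized cond1 reads with
+        # # sum2 = 800000 gives an expected cond2 count of int(2.5 * 800000 / 100) = 20000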
+           
+        expected = array(cond2_count)
+        observed = array(list(cond2.itervalues()))
+        
+        # write result
+        with open(outfile, 'w') as out :
+            out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC\tFC_rep1\tFC_rep2\n')
+            for i in codon_sorted:
+                out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\t' + str((cond2_1[i] / sum21) / (cond1_1[i] / sum11)) + '\t' + str((cond2_2[i] / sum22) / (cond1_2[i] / sum12)) + '\n')
+            chi = stats.chisquare(observed, expected)
+            out.write('Chi2 test\n')
+            out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
+
+            
+            
+        # plot result
+        fig = pl.figure(figsize=(24, 10), num=1)
+        width = .50
+        ind = arange(len(codon_sorted))
+        ax = fig.add_subplot(111)
+        pl.xlim(0, len(codon_sorted) + 1) 
+        ax.spines['right'].set_color('none')
+        ax.spines['top'].set_color('none')
+        ax.xaxis.set_ticks([])
+        ax.spines['left'].set_smart_bounds(True)
+        ax.yaxis.set_ticks_position('left')
+        ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, yerr=std_cond1, error_kw={'elinewidth':1, 'ecolor':'black'}, label=c1)
+        ax.bar(ind + width, list(cond2_norm.itervalues()), width, yerr=std_cond2, facecolor=color2, error_kw={'elinewidth':1, 'ecolor':'black'}, label=c2)
+        for x, y, z in zip(ind, max_val, codon_sorted):
+            ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8)   
+        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
+        ax.set_xlabel('Codons')
+        handles, labels = ax.get_legend_handles_labels()
+        ax.legend(handles, labels)
+        pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
+        pl.clf()
+            
+
+
+    elif len(result) == 2 :
+        
+        # store the raw dicts and copies that will hold the normalized values
+        cond1 = result[0]
+        cond2 = result[1]
+        cond1_norm = result[0].copy()
+        cond2_norm = result[1].copy()
+        # pdb.set_trace() 
+        # get codon order in one of list
+        codon_sorted = sorted(cond1.iterkeys(), reverse=False)
+
+        # get sum of each list
+        sum1 = sum(list(cond1.itervalues()))
+        sum2 = sum(list(cond2.itervalues()))
+        # # normalize values by the total of each library (percent)
+        cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items())
+        cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items())
+        
+        # # compute the theoretical (expected) counts in cond2, assuming the cond1 distribution
+        cond2_count = []
+        for z in cond1_norm.itervalues() :
+            count = int(z * sum2 / 100.0)
+            cond2_count.append(count)
+           
+        expected = array(cond2_count)
+        observed = array(list(cond2.itervalues()))    
+            
+        AA = get_aa_dict(cond1_norm, cond2_norm)
+          
+        max_val = []
+        cond1_aa = []
+        cond2_aa = []
+        aa_name = list(AA.iterkeys())
+        for z in AA.itervalues():
+            cond1_aa.append(z[0])
+            cond2_aa.append(z[1])
+            max_val.append(max(z))
+        
+        # # plot amino acid profile :
+        fig = pl.figure(num=1)
+        width = .35
+        ax = fig.add_subplot(111)
+        ind = arange(21)
+        pl.xlim(0, 21) 
+        #kwargs = {"hatch":'x'}
+        #ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1, **kwargs)
+        #kwargs = {"hatch":'.'}
+        #ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2, **kwargs)
+        ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
+        ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)        
+        #for x, y, z in zip(ind, max_val, aa_name):
+        #    ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
+        axis_font = {'size':'16'}
+        pl.xticks(ind + width, aa_name,**axis_font)
+        ax.spines['right'].set_visible(False)
+        ax.spines['top'].set_visible(False)
+        ax.yaxis.set_ticks_position('left')
+        ax.xaxis.set_ticks_position('bottom')
+        #ax.xaxis.set_ticks([])
+        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)',**axis_font)
+        ax.set_xlabel('Amino Acids', **axis_font)
+        handles, labels = ax.get_legend_handles_labels()
+        font_prop = font_manager.FontProperties(size=12)
+        ax.legend(handles, labels, prop=font_prop)
+        pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
+        pl.clf()
+        
+        # write result
+        with open(outfile, 'w') as out :
+            out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC(' + c2 + '/' + c1 + ')\n')
+            for i in codon_sorted:
+                out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
+            out.write('Chi2 test\n')
+            chi = stats.chisquare(observed, expected)
+            out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
+            
+        # # get max value for each codon for histogram   
+        max_val = []  # # max value for graph
+        for i in codon_sorted:
+            # # max value for each codon
+            max_val.append(max(cond1_norm[i], cond2_norm[i]))
+            
+        # plot result
+        fig = pl.figure(figsize=(24, 10), num=1)
+        width = .50
+        ind = arange(len(codon_sorted))
+        ax = fig.add_subplot(111)
+        pl.xlim(0, len(codon_sorted) + 1) 
+        ax.spines['right'].set_color('none')
+        ax.spines['top'].set_color('none')
+        ax.xaxis.set_ticks([])
+        ax.spines['left'].set_smart_bounds(True)
+        ax.yaxis.set_ticks_position('left')
+        ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1)
+        ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2)
+        for x, y, z in zip(ind, max_val, codon_sorted):
+            ax.text(x + width, y + 0.02, '%s' % z, ha='center', va='bottom', fontsize=8)   
+        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
+        ax.set_xlabel('Codons')
+        handles, labels = ax.get_legend_handles_labels()
+        ax.legend(handles, labels)
+        pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
+        pl.clf()
+        
+     
+    else :
+        stop_err('Error running codon usage plotting: expected 2 or 4 codon-usage dicts, got ' + str(len(result)))
+        
+        
+    return (cond1_norm, cond2_norm, chi[1])
+
+def write_html_file(html, chi_pval, cond1, cond2):
+    try :
+        
+       
+        html_str = """
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+    
+<html xmlns="http://www.w3.org/1999/xhtml">
+    <head>
+        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> 
+        <link href="/static/june_2007_style/blue/base.css" media="screen" rel="Stylesheet" type="text/css" />
+    </head>
+    <body>
+        <h3>Global visualization</h3>
+        <p>
+            <h5>Visualization of footprint density on each codon.</h5><br> If the analysis was run with replicates, the standard deviation between replicates is plotted as error bars on the histogram.<br>
+            <img border="0" src="hist_codons.png"  width="1040"/>
+        </p>
+        <p>
+            <h5>Test for homogeneity of distributions between the two conditions</h5><br>
+            H0 : %s and %s follow the same distribution <br>
+            Chi2 test p-value: %s<br><br>
+            If the p-value is less than 0.05, homogeneity is rejected and the two distributions can be considered different. Otherwise, H0 is not rejected.<br>
+            
+        </p>
+        <p>
+            <h5>Visualization of footprint density for codons grouped by amino acid</h5><br>
+            <img border="0" src="hist_amino_acid.png"  width="1040"/>
+        </p>
+    </body>
+</html> """ % (cond1,cond2,chi_pval)
+         
+            
+        html_file = open(html, "w")
+        html_file.write(html_str)
+        html_file.close()
+        
+    except Exception, e :
+        stop_err('Error during html page creation : ' + str(e))
+    
+
+
+    
+def check_codons_list (codons) :
+    
+    for codon in codons :
+        if codon not in init_codon_dict().iterkeys() :
+            stop_err('Please enter a valid codon: ' + codon + ' was not found\n')
+
+ 
+def check_index_bam (bamfile) :
+    # # index the BAM file with samtools if no .bai index is present
+    if not os.path.isfile(bamfile + ".bai") :
+        cmd = "samtools index %s " % (bamfile)
+        proc = subprocess.Popen(args=cmd, shell=True, stderr=subprocess.PIPE)
+        returncode = proc.wait()    
+        # if returncode != 0:
+        #    raise Exception        
+        
+def __main__():
+    '''   
+        python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -1 psiM1_sorted.bam,psiM2_sorted.bam -2 psiP1_sorted.bam,psiP2_sorted.bam -c psiM -C psiP -l TAG,TAA,TGA -r yes -o psi_count -d psi.html,html_dir > log2
+        python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -c psiM -C psiP -1 RPF_psi-_28sorted.bam -2 RPF_psi+_28sorted.bam -l TAG,TAA,TGA -n Stop Codon -r no -o psi_count -d psi.html,html_dir > log2
+    '''     
+    
+    # Parse command line options
+    parser = optparse.OptionParser()
+    parser.add_option("-g", "--gff", dest="gff", type="string",
+                  help="gff file", metavar="FILE")
+    
+    parser.add_option("-1", "--file1", dest="file1", type="string",
+                  help="Bam Ribo-Seq alignments cond 1, if rep option, separate files by commas ", metavar="FILE")
+    
+    parser.add_option("-2", "--file2", dest="file2", type="string",
+                  help="Bam Ribo-Seq alignments cond 2, if rep option, separate files by commas", metavar="FILE")    
+    
+    parser.add_option("-c", "--cond1", dest="c1", type="string",
+                  help="Name for first condition", metavar="STR")
+    
+    parser.add_option("-C", "--cond2", dest="c2", type="string",
+                  help="Name of second condition", metavar="STR")
+   
+    parser.add_option("-k", "--kmer", dest="kmer", type="int",
+                  help="Longer of your phasing reads", metavar="INT") 
+        
+#     parser.add_option("-l", "--list", dest="list_cod", type= "string",
+#                   help="list of codons to compare to other", metavar="STR")
+     
+    parser.add_option("-o", "--out", dest="outfile", type="string",
+                  help="write report to FILE", metavar="FILE")
+    
+    parser.add_option("-d", "--dirout", dest="dirout", type="string",
+                  help="write report to PNG files", metavar="FILE")   
+
+    parser.add_option("-a", "--asite", dest="asite", type="int",
+                  help="Off-set from the 5'end of the footprint to the A-site", metavar="INT")  
+    
+    parser.add_option("-s", "--site", dest="site", type="string",
+                  help="Script can compute in site A, P or E", metavar="A|P|E")      
+
+    parser.add_option("-r", "--rep", dest="rep", type="string",
+                  help="if replicate or not", metavar="yes|no")   
+     
+    parser.add_option("-x", "--hex_col1", dest="color1", type= "string",
+                  help="Color for first condition", metavar="STR")
+    
+    parser.add_option("-X", "--hex_col2", dest="color2", type= "string",
+                  help="Color for second condition", metavar="STR")
+
+    parser.add_option("-q", "--quiet",
+                  action="store_false", dest="verbose", default=True,
+                  help="don't print status messages to stdout")
+
+    (options, args) = parser.parse_args()
+    print "Begin codon frequency analysis at", time.asctime(time.localtime(time.time()))
+
+    try:
+        authorized_site = ["A", "P", "E"]
+        if options.site not in authorized_site :
+            stop_err(options.site + ' is not an authorized ribosome site')
+        
+        ## Check if colors exist
+        if not colors.is_color_like(options.color1) :
+            stop_err( options.color1+' is not a proper color' )
+        if not colors.is_color_like(options.color2) :
+            stop_err( options.color2+' is not a proper color' )
+        
+        GFF = store_gff(options.gff)
+        
+        #### NOT USED IN FINAL VERSION
+        # # get codon list
+        # codons = options.list_cod.upper().split(',')
+        # check_codons_list(codons)
+        
+        # # get html file and directory :
+        (html, html_dir) = options.dirout.split(',')
+        if os.path.exists(html_dir):
+            raise Exception(html_dir + ' already exists')
+        try:
+            os.mkdir(html_dir)
+        except:
+            raise Exception('Cannot create directory ' + html_dir)
+        # #RUN analysis
+        # # if there are replicates
+        if options.rep == "yes" :
+            result = []
+            # split the comma-separated list of BAM files for each condition
+            cond1 = options.file1.split(',')
+            cond2 = options.file2.split(',')
+            # # compute codon usage for each BAM file
+            for fh in itertools.chain(cond1, cond2):
+                check_index_bam (fh)
+                result.append(get_codon_usage(fh, GFF, options.site, options.kmer, options.asite))
+            (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
+            # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
+                
+                
+        # # if there are no replicates
+        elif options.rep == "no" :
+            result = []
+            # # compute codon usage for each condition
+            for fh in (options.file1, options.file2):
+                check_index_bam (fh)
+                result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite))
+            (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
+            # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
+                
+        else :
+            sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time())))
+            sys.exit()
+   
+        # write_html_file(html,chi_pval,t_pval,codons,options.c1, options.c2)
+        write_html_file(html, chi_pval, options.c1, options.c2)
+            
+        print "Finish codon frequency analysis at", time.asctime(time.localtime(time.time()))        
+    except Exception, e:
+        stop_err('Error running codon frequency analysis (main program) : ' + str(e))
+    
+
+if __name__=="__main__":
+    __main__()