Mercurial > repos > rlegendre > ribo_tools

#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-

'''
    Created on sep. 2013
    @author: rachel legendre
    @copyright: rachel.legendre@igmors.u-psud.fr
    @license: GPL v3
'''

from __future__ import division
import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time
import itertools
from math import log10
from decimal import Decimal
from Bio import SeqIO
from Bio.Seq import Seq
from numpy import arange, std, array, linspace, average
#from matplotlib import pyplot as pl
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as pl
from matplotlib import font_manager
from matplotlib import colors
import csv
from scipy import stats, errstate
from collections import OrderedDict
import ribo_functions
import HTSeq
# #libraries for debugg
#import pdb
import cPickle

def stop_err(msg):
    sys.stderr.write("%s\n" % msg)
    sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
    sys.exit()


def init_codon_dict():

    Codon_dict = OrderedDict([('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0), ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0), ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0), ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0), ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0), ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0), ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0), ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)])
    return Codon_dict


def get_codon_usage(bamfile, GFF, site, kmer, a_pos):
    '''
        Read GFF dict and get gene codon usage.
        Return dict of codons usage
    '''
    try:
        codon = init_codon_dict()
        multi_tag = "XS:i:" ## bowtie Tag
        tag = "IH:i:1" ## RUM tag

        for feature in GFF :
            if feature.type == 'gene' :
                codon_dict = init_codon_dict()
                chrom = feature.iv.chrom
                start = feature.iv.start
                stop = feature.iv.end
                if start+50 < stop-50 :
                    region = chrom + ':' + str(start+50) + '-' + str(stop-50)
                    # #get all reads in this gene
                    reads = subprocess.check_output(["samtools", "view", bamfile, region])
                    head = subprocess.check_output(["samtools", "view", "-H", bamfile])
                    read_tab = reads.split('\n')
                    for read in read_tab:
                        # # search mapper for eliminate multiple alignements
                        if 'bowtie' in head:
                            multi_tag = "XS:i:"
                        elif 'bwa' in  head:
                            multi_tag = "XT:A:R"
                        elif 'TopHat' in  head:
                            tag = "NH:i:1"
                        else :
                            stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")

                        if len(read) == 0:
                            continue
                        len_read = len(read.split('\t')[9])
                        # if it's read of good length
                        if len_read == kmer and (tag in read or multi_tag not in read):
                            feat = read.split('\t')
                            seq = feat[9]
                            # if it's a reverse read
                            if feat[1] == '16' :
                                if site == "A" :
                                # #get A-site
                                    cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement())
                                elif site == "P" :
                                # #get P-site
                                    cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement())
                                else :
                                # #get site-E
                                    cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement())
                                # # test if it's a true codon not a CNG codon for example
                                if codon_dict.has_key(cod) :
                                    codon_dict[cod] += 1
                            # if it's a forward read
                            elif feat[1] == '0' :
                                if site == "A" :
                                # #get A-site
                                    cod = seq[a_pos:a_pos+3]
                                elif site == "P" :
                                # #get P-site
                                    cod = seq[a_pos-3:a_pos]
                                else :
                                # #get site-E
                                    cod = seq[a_pos-6:a_pos-3]
                                if codon_dict.has_key(cod) :
                                    codon_dict[cod] += 1
                    del(read)
                # # add in global dict
                for cod, count in codon_dict.iteritems() :
                    codon[cod] += count
        if sum(codon.values()) == 0 :
            stop_err('There are no reads aligning on annotated genes in your GFF file')
        else :
            return codon

    except Exception, e:
        stop_err('Error during codon usage calcul: ' + str(e))


'''
http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html
'''
def violin_plot(ax, data, pos, bp=False):
    '''
    create violin plots on an axis
    '''
    dist = max(pos) - min(pos)
    w = min(0.15 * max(dist, 1.0), 0.5)
    for d, p in zip(data, pos):
        k = stats.gaussian_kde(d)  # calculates the kernel density
        m = k.dataset.min()  # lower bound of violin
        M = k.dataset.max()  # upper bound of violin
        x = arange(m, M, (M - m) / 100.)  # support for violin
        v = k.evaluate(x)  # violin profile (density curve)
        v = v / v.max() * w  # scaling the violin to the available space
        ax.fill_betweenx(x, p, v + p, facecolor=color1, alpha=0.3)
        ax.fill_betweenx(x, p, -v + p, facecolor=color2, alpha=0.3)
    if bp:
        ax.boxplot(data, notch=1, positions=pos, vert=1)


'''
http://log.ooz.ie/2013/02/matplotlib-comparative-histogram-recipe.html
'''
def comphist(x1, x2, orientation='vertical', **kwargs):
    """Draw a comparative histogram."""
    # Split keyword args:
    kwargs1 = {}
    kwargs2 = {}
    kwcommon = {}
    for arg in kwargs:
        tgt_arg = arg[:-1]
        if arg.endswith('1'):
            arg_dict = kwargs1
        elif arg.endswith('2'):
            arg_dict = kwargs2
        else:
            arg_dict = kwcommon
            tgt_arg = arg
        arg_dict[tgt_arg] = kwargs[arg]
    kwargs1.update(kwcommon)
    kwargs2.update(kwcommon)

    fig = pl.figure()

    # Have both histograms share one axis.
    if orientation == 'vertical':
        ax1 = pl.subplot(211)
        ax2 = pl.subplot(212, sharex=ax1)
        # Flip the ax2 histogram horizontally.
        ax2.set_ylim(ax1.get_ylim()[::-1])
        pl.setp(ax1.get_xticklabels(), visible=False)
        legend_loc = (1, 4)
    else:
        ax1 = pl.subplot(122)
        ax2 = pl.subplot(121, sharey=ax1)
        # Flip the ax2 histogram vertically.
        ax2.set_xlim(ax2.get_xlim()[::-1])
        pl.setp(ax1.get_yticklabels(), visible=False)
        legend_loc = (1, 2)

    ax1.hist(x1, orientation=orientation, **kwargs1)
    ax2.hist(x2, orientation=orientation, **kwargs2)
    ax2.set_ylim(ax1.get_ylim()[::-1])
    ax1.legend(loc=legend_loc[0])
    ax2.legend(loc=legend_loc[1])
    # Tighten up the layout.
    pl.subplots_adjust(wspace=0.0, hspace=0.0)
    return fig


def compute_FC_plot(cond1_norm, cond2_norm, cod_name, codon_to_test, dirout):

    FC_tab = []
    for z, y in zip(cond1_norm.itervalues(), cond2_norm.itervalues()):
        fc = z - y
        FC_tab.append(fc)
    # #codon_to_test = ['TGA','TAG','TAA']

    a = []
    b = []
    cod = []
    for codon in cond1_norm.iterkeys():
        if codon in codon_to_test :
            fc = cond1_norm[codon] - cond2_norm[codon]
            b.append(fc)
            cod.append(codon)
        else :
            fc = cond1_norm[codon] - cond2_norm[codon]
            a.append(fc)


    fig = pl.figure(num=1)
    comphist(array(a), array(b), label1='All codon', label2=cod_name, color2='green', bins=30, rwidth=1)
    # pl.show()
    pl.savefig(dirout + '/hist_codon_fc.png', format="png", dpi=340)
    pl.clf()


    # #violin plot
    pos = range(2)
    dat = array([array(a), array(b)])
    fig = pl.figure()
    pl.title("Distribution of codons FoldChange between two conditions")
    ax = fig.add_subplot(1, 1, 1)
    lab = array(['All codons', cod_name])
    violin_plot(ax, dat, pos, bp=1)
    for x, z in zip(dat, pos):
        ax.plot(z, average(x), color='r', marker='*', markeredgecolor='r')
    xtickNames = pl.setp(ax, xticklabels=lab)
    pl.savefig(dirout + '/violinplot_codon.png', format="png", dpi=340)
    pl.clf()

    # (Fval,pval) = stats.ttest_ind(a, b, axis=0, equal_var=True)
    (Fval, pval) = stats.mannwhitneyu(a, b)
    return pval


def get_aa_dict(cond1_norm, cond2_norm):

    # ## create amino acid dictionnary:
    AA = OrderedDict({})
    AA['Phe'] = [cond1_norm['TTT'] + cond1_norm['TTC'], cond2_norm['TTT'] + cond2_norm['TTC']]
    AA['Leu'] = [cond1_norm['TTA'] + cond1_norm['TTG'] + cond1_norm['CTT'] + cond1_norm['CTC'] + cond1_norm['CTA'] + cond1_norm['CTG'], cond2_norm['TTA'] + cond2_norm['TTG'] + cond2_norm['CTT'] + cond2_norm['CTC'] + cond2_norm['CTA'] + cond2_norm['CTG']]
    AA['Ile'] = [cond1_norm['ATT'] + cond1_norm['ATC'] + cond1_norm['ATA'], cond2_norm['ATT'] + cond2_norm['ATC'] + cond2_norm['ATA']]
    AA['Met'] = [cond1_norm['ATG'], cond2_norm['ATG']]
    AA['Val'] = [cond1_norm['GTT'] + cond1_norm['GTC'] + cond1_norm['GTA'] + cond1_norm['GTG'] + cond1_norm['AGT'] + cond1_norm['AGC'], cond2_norm['GTT'] + cond2_norm['GTC'] + cond2_norm['GTA'] + cond2_norm['GTG'] + cond2_norm['AGT'] + cond2_norm['AGC']]
    AA['Ser'] = [cond1_norm['TCT'] + cond1_norm['TCC'] + cond1_norm['TCA'] + cond1_norm['TCG'], cond2_norm['TCT'] + cond2_norm['TCC'] + cond2_norm['TCA'] + cond2_norm['TCG']]
    AA['Pro'] = [cond1_norm['CCT'] + cond1_norm['CCC'] + cond1_norm['CCA'] + cond1_norm['CCG'], cond2_norm['CCT'] + cond2_norm['CCC'] + cond2_norm['CCA'] + cond2_norm['CCG']]
    AA['Thr'] = [cond1_norm['ACT'] + cond1_norm['ACC'] + cond1_norm['ACA'] + cond1_norm['ACG'], cond2_norm['ACT'] + cond2_norm['ACC'] + cond2_norm['ACA'] + cond2_norm['ACG']]
    AA['Ala'] = [cond1_norm['GCT'] + cond1_norm['GCC'] + cond1_norm['GCA'] + cond1_norm['GCG'], cond2_norm['GCT'] + cond2_norm['GCC'] + cond2_norm['GCA'] + cond2_norm['GCG']]
    AA['Tyr'] = [cond1_norm['TAT'] + cond1_norm['TAC'], cond2_norm['TAT'] + cond2_norm['TAC']]
    AA['Stop'] = [cond1_norm['TAA'] + cond1_norm['TAG'] + cond1_norm['TGA'], cond2_norm['TAA'] + cond2_norm['TAG'] + cond2_norm['TGA']]
    AA['His'] = [cond1_norm['CAT'] + cond1_norm['CAC'], cond2_norm['CAT'] + cond2_norm['CAC']]
    AA['Gln'] = [cond1_norm['CAA'] + cond1_norm['CAG'], cond2_norm['CAA'] + cond2_norm['CAG']]
    AA['Asn'] = [cond1_norm['AAT'] + cond1_norm['AAC'], cond2_norm['AAT'] + cond2_norm['AAC']]
    AA['Lys'] = [cond1_norm['AAA'] + cond1_norm['AAG'], cond2_norm['AAA'] + cond2_norm['AAG']]
    AA['Asp'] = [cond1_norm['GAT'] + cond1_norm['GAC'], cond2_norm['GAT'] + cond2_norm['GAC']]
    AA['Glu'] = [cond1_norm['GAA'] + cond1_norm['GAG'], cond2_norm['GAA'] + cond2_norm['GAG']]
    AA['Cys'] = [cond1_norm['TGT'] + cond1_norm['TGC'], cond2_norm['TGT'] + cond2_norm['TGC']]
    AA['Trp'] = [cond1_norm['TGG'], cond2_norm['TGG']]
    AA['Arg'] = [cond1_norm['CGT'] + cond1_norm['CGC'] + cond1_norm['CGA'] + cond1_norm['CGG'] + cond1_norm['AGA'] + cond1_norm['AGG'], cond2_norm['CGT'] + cond2_norm['CGC'] + cond2_norm['CGA'] + cond2_norm['CGG'] + cond2_norm['AGA'] + cond2_norm['AGG']]
    AA['Gly'] = [cond1_norm['GGT'] + cond1_norm['GGC'] + cond1_norm['GGA'] + cond1_norm['GGG'], cond2_norm['GGT'] + cond2_norm['GGC'] + cond2_norm['GGA'] + cond2_norm['GGG']]


    return AA


def plot_codon_usage(result, dirout, c1, c2, outfile, color1, color2):
    '''
        Take list of dict of codon usage and use matplotlib for do graph
    '''

    # #if there are replicat
    if len(result) == 4 :
        # store each dict in variables to make code more readable
        cond1_1 = result[0].copy()
        cond1_2 = result[1].copy()
        cond2_1 = result[2].copy()
        cond2_2 = result[3].copy()
        # get codon order in one of list
        codon_sorted = sorted(cond1_1.iterkeys(), reverse=False)
        try:
            # get max of each list
            sum11 = sum(list(cond1_1.itervalues()))
            sum12 = sum(list(cond1_2.itervalues()))
            sum21 = sum(list(cond2_1.itervalues()))
            sum22 = sum(list(cond2_2.itervalues()))
            # for each codon, get values and sd in each condition
            cond1_val = {}
            cond1 = {}
            cond2_val = {}
            cond2 = {}
            std_cond1 = []
            std_cond2 = []
            max_val = []  # # max value for graph
            for i in codon_sorted:
                # # cond1 = mean of replicats cond1 divided by max
                cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2)
                cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2)
                # # standard deviation = absolute value of difference between replicats of cond1
                std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)])))
                # # cond2 = mean of replicats cond1divided by max
                cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)
                cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2)
                # # standard deviation = absolute value of difference between replicats of cond2
                std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)])))
                # # max value for each codon
                max_val.append(max((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2, (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2))

            # for graph design
            cond1_norm = OrderedDict(sorted(cond1_val.items(), key=lambda t: t[0]))
            cond1_norm.update ((x, y * 100) for x, y in cond1_norm.items())
            cond2_norm = OrderedDict(sorted(cond2_val.items(), key=lambda t: t[0]))
            cond2_norm.update ((x, y * 100) for x, y in cond2_norm.items())
            max_val = [x * 100 for x in max_val]
        except ZeroDivisionError:
            stop_err("Not enough reads to compute the codon occupancy")

        AA = get_aa_dict(cond1_norm, cond2_norm)
        max_valaa = []
        cond1_aa = []
        cond2_aa = []
        aa_name = list(AA.iterkeys())
        for z in AA.itervalues():
            cond1_aa.append(z[0])
            cond2_aa.append(z[1])
            max_valaa.append(max(z))
        # # plot amino acid profile :
        fig = pl.figure(figsize=(15,10), num=1)
        width = .50
        ax = fig.add_subplot(111)
        ax.xaxis.set_ticks([])
        ind = arange(21)
        pl.xlim(0, 21)
        ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
        ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)
        for x, y, z in zip(ind, max_valaa, aa_name):
            ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
        ax.set_xlabel('Amino Acid')
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, labels)
        pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
        pl.clf()


        # # compute theorical count in COND2
        sum2 = (sum21 + sum22) / 2
        cond2_count = []
        for z in cond1_norm.itervalues() :
            count = int(z * sum2 / 100)
            cond2_count.append(count)

        expected = array(cond2_count)
        observed = array(list(cond2.itervalues()))

        # write result
        with open(outfile, 'w') as out :
            out.write('Codon,Raw_' + c1 + ',Raw_' + c2 + ',Norm_' + c1 + ',Norm_' + c2 + ',FC(Mut/WT)\n')
            for i in codon_sorted:
                ## if global foldchange is equal to zero
                if cond1_norm[i] == 0 and cond2_norm[i] == 0:
                    out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',1.0\n')
                elif cond1_norm[i] == 0 :
                    out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',0.0\n')
                else:
                    out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
            with errstate(all='ignore'):
                chi = stats.chisquare(observed, expected)
            out.write('Khi2 test\n')
            out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')


        # plot result
        fig = pl.figure(figsize=(20,10), num=1)
        width = .40
        ind = arange(len(codon_sorted))
        ax = fig.add_subplot(111)
        pl.xlim(0, len(codon_sorted) + 1)
        ax.spines['right'].set_color('none')
        ax.spines['top'].set_color('none')
        ax.xaxis.set_ticks([])
        ax.spines['left'].set_smart_bounds(True)
        ax.yaxis.set_ticks_position('left')
        ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, yerr=std_cond1, error_kw={'elinewidth':1, 'ecolor':'black'}, label=c1)
        ax.bar(ind + width, list(cond2_norm.itervalues()), width, yerr=std_cond2, facecolor=color2, error_kw={'elinewidth':1, 'ecolor':'black'}, label=c2)
        for x, y, z in zip(ind, max_val, codon_sorted):
            ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8)
        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
        ax.set_xlabel('Codons')
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, labels)
        pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
        pl.clf()


    elif len(result) == 2 :

        # store each dict in OrderedDict sorted by key to make code more readable
        cond1 = result[0]
        cond2 = result[1]
        cond1_norm = result[0].copy()
        cond2_norm = result[1].copy()
        # pdb.set_trace()
        # get codon order in one of list
        codon_sorted = sorted(cond1.iterkeys(), reverse=False)
        try:
            # get sum of each list
            sum1 = sum(list(cond1.itervalues()))
            sum2 = sum(list(cond2.itervalues()))
            # #Normalize values by sum of each libraries
            cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items())
            cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items())
        except ZeroDivisionError:
            stop_err("Not enough reads to compute the codon occupancy. "+str(sum1)+" and "+str(sum2)+" reads are used for each condition, respectively.\n")

        # # compute theorical count in COND2
        cond2_count = []
        for z in cond1_norm.itervalues() :
            count = int(z * sum2 / 100.0)
            cond2_count.append(count)

        expected = array(cond2_count)
        observed = array(list(cond2.itervalues()))

        AA = get_aa_dict(cond1_norm, cond2_norm)

        max_val = []
        cond1_aa = []
        cond2_aa = []
        aa_name = list(AA.iterkeys())
        for z in AA.itervalues():
            cond1_aa.append(z[0])
            cond2_aa.append(z[1])
            max_val.append(max(z))

        # # plot amino acid profile :
        fig = pl.figure(figsize=(15,10), num=1)
        width = .45
        ax = fig.add_subplot(111)
        ind = arange(21)
        pl.xlim(0, 21)
        #kwargs = {"hatch":'x'}
        #ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1, **kwargs)
        #kwargs = {"hatch":'.'}
        #ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2, **kwargs)
        ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
        ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)
        #for x, y, z in zip(ind, max_val, aa_name):
        #    ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
        axis_font = {'size':'10'}
        pl.xticks(ind + width, aa_name,**axis_font)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        #ax.xaxis.set_ticks([])
        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)',**axis_font)
        ax.set_xlabel('Amino Acids', **axis_font)
        handles, labels = ax.get_legend_handles_labels()
        font_prop = font_manager.FontProperties(size=8)
        ax.legend(handles, labels, prop=font_prop)
        pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
        pl.clf()

        # write result
        with open(outfile, 'w') as out :
            out.write('Codon,Raw_' + c1 + ',Raw_' + c2 + ',Norm_' + c1 + ',Norm_' + c2 + ',FC(Mut/WT)\n')
            for i in codon_sorted:
                if cond1_norm[i] == 0 and cond2_norm[i] == 0:
                    out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',1.0\n')
                elif cond1_norm[i] == 0 :
                    out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',0.0\n')
                else:
                    out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
            out.write('Khi2 test\n')
            with errstate(all='ignore'):
                chi = stats.chisquare(observed, expected)
            out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')

        # # get max value for each codon for histogram
        max_val = []  # # max value for graph
        for i in cond1:
            # # max value for each codon
            max_val.append(max(cond1_norm[i], cond2_norm[i]))

        # plot result
        fig = pl.figure(figsize=(20,10), num=1)
        #fig = pl.figure(num=1)
        width = .40
        ind = arange(len(codon_sorted))
        ax = fig.add_subplot(111)
        pl.xlim(0, len(codon_sorted) + 1)
        ax.spines['right'].set_color('none')
        ax.spines['top'].set_color('none')
        ax.xaxis.set_ticks([])
        ax.spines['left'].set_smart_bounds(True)
        ax.yaxis.set_ticks_position('left')
        ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1)
        ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2)
        for x, y, z in zip(ind, max_val, codon_sorted):
            ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8)
        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
        ax.set_xlabel('Codons')
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, labels)
        pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
        pl.clf()


    else :
         stop_err('Error running codon usage plotting : ' + str(e))


    return (cond1_norm, cond2_norm, chi[1])

def write_html_file(html, chi_pval, cond1, cond2):
    try :


        html_str = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
    <head>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
        <link href="/static/june_2007_style/blue/base.css" media="screen" rel="Stylesheet" type="text/css" />
    </head>
    <body>
        <h3>Global visualization</h3>
        <p>
            <h5>Visualization of density footprint in each codon.</h5><br> If user has selected "Yes" for the replicate option the standard deviation between each replicate is plotted as an error bar in histogram.<br>
            <img border="0" src="hist_codons.png"  width="1040"/>
        </p>
        <p>
            <h5>Test for homogeneity distribution between each condition</h5><br>
            H0 : %s and %s are same distribution <br>
            Khi2 test p-value: %s<br><br>
            If p-value less than 0.05, we can reject homogeneity distribution so we can hypothesize that distributions are not the same. Otherwise, we accept H0<br>

        </p>
        <p>
            <h5>Visualization of density footprint in each codon groupe by amino acid</h5><br>
            <img border="0" src="hist_amino_acid.png"  width="1040"/>
        </p>
    </body>
</html> """ % (cond1,cond2,chi_pval)


        html_file = open(html, "w")
        html_file.write(html_str)
        html_file.close()

    except Exception, e :
        stop_err('Error during html page creation : ' + str(e))


def check_codons_list (codons) :

    for codon in codons :
        if codon not in init_codon_dict().iterkeys() :
            stop_err('Please to enter a valid codon : ' + codon + ' is not find\n')


def check_index_bam (bamfile) :
    # #testing indexed bam file
    if os.path.isfile(bamfile + ".bai") :
        pass
    else :
        cmd = "samtools index %s " % (bamfile)
        proc = subprocess.Popen(args=cmd, shell=True, stderr=subprocess.PIPE)
        returncode = proc.wait()
        # if returncode != 0:
        #    raise Exception

def plot_fc (cond1, cond2, site, dirout):

    fc = cond1.copy()

    for key, value in fc.iteritems():
        if cond1[key] == 0:
            fc[key] = 1
        else:
            fc[key] = cond2[key]/cond1[key]

    index = arange(len(fc.keys()))
    label = fc.keys()
    label = [w.replace('T','U') for w in label]
    pl.figure(figsize=(15,10), num=1)
    ax = pl.subplot(1,1,1)
    pl.xticks([])
    pl.scatter(index, fc.values(), color='b')
    pl.axhline(y=1,color='r')
    pl.xticks(index, label, rotation=90)
    pl.ylabel('Foldchange of codon occupancy')
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    pl.ylim(-1,3)
    pl.title(site+" site")
    pl.savefig(dirout + '/fc_codons.png', format="png", dpi=340)


def __main__():


    # Parse command line options
    parser = optparse.OptionParser()
    parser.add_option("-g", "--gff", dest="gff", type="string",
                  help="gff file", metavar="FILE")

    parser.add_option("-1", "--file1", dest="file1", type="string",
                  help="Bam Ribo-Seq alignments cond 1, if rep option, separate files by commas ", metavar="FILE")

    parser.add_option("-2", "--file2", dest="file2", type="string",
                  help="Bam Ribo-Seq alignments cond 2, if rep option, separate files by commas", metavar="FILE")

    parser.add_option("-c", "--cond1", dest="c1", type="string",
                  help="Name for first condition", metavar="STR")

    parser.add_option("-C", "--cond2", dest="c2", type="string",
                  help="Name of second condition", metavar="STR")

    parser.add_option("-k", "--kmer", dest="kmer", type="int",  default = 28 ,
                  help="Length of your phasing reads", metavar="INT")

#     parser.add_option("-l", "--list", dest="list_cod", type= "string",
#                   help="list of codons to compare to other", metavar="STR")

    parser.add_option("-o", "--out", dest="outfile", type="string",
                  help="write report to FILE", metavar="FILE")

    parser.add_option("-d", "--dirout", dest="dirout", type="string",
                  help="write report to PNG files", metavar="FILE")

    parser.add_option("-a", "--asite", dest="asite", type="int",  default = 15 ,
                  help="Off-set from the 5'end of the footprint to the A-site (default is 15)", metavar="INT")

    parser.add_option("-s", "--site", dest="site", type="string",  default = "A" ,
                  help="Script can compute in site A, P or E (default is A-site)", metavar="A|P|E")

    parser.add_option("-r", "--rep", dest="rep", type="string", default = "no" ,
                  help="if replicate or not", metavar="yes|no")

    parser.add_option("-x", "--hex_col1", dest="color1", type= "string",  default = "SkyBlue" ,
                  help="Color for first condition", metavar="STR")

    parser.add_option("-X", "--hex_col2", dest="color2", type= "string",  default = "Plum" ,
                  help="Color for second condition", metavar="STR")

    parser.add_option("-q", "--quiet",
                  action="store_false", dest="verbose", default=True,
                  help="don't print status messages to stdout")

    (options, args) = parser.parse_args()
    print "Begin codon frequency analysis at", time.asctime(time.localtime(time.time()))

    try:
        authorized_site = ["A", "P", "E"]
        if options.site not in authorized_site :
            stop_err(options.site + ' is not a authorized ribosome site')

        ## Check if colors exist
        if not colors.is_color_like(options.color1) :
            stop_err( options.color1+' is not a proper color' )
        if not colors.is_color_like(options.color2) :
            stop_err( options.color2+' is not a proper color' )


        #### NOT USE IN FINAL VERSION
        # # get codon list
        # codons = options.list_cod.upper().split(',')
        # check_codons_list(codons)
        GFF = HTSeq.GFF_Reader(options.gff)
        # # get html file and directory :
        (html, html_dir) = options.dirout.split(',')
        if not os.path.exists(html_dir):
            try:
                os.mkdir(html_dir)
            except Exception, e :
                stop_err('Error running make directory : ' + str(e))
        # #RUN analysis
        # #If there are replicats
        if options.rep == "yes" :
            result = []
            # split name of each file options by ","
            cond1 = options.file1.split(',')
            cond2 = options.file2.split(',')
            # # calcul for each file
            for fh in itertools.chain(cond1, cond2):
                check_index_bam (fh)
                result.append(get_codon_usage(fh, GFF, options.site, options.kmer, options.asite))
            (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
            # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
            plot_fc (cond1, cond2, options.site, html_dir)

        # #If there are no replicat
        elif options.rep == "no" :
            result = []
            # #calcul for each cond
            for fh in (options.file1, options.file2):
                check_index_bam (fh)
                result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite))
            (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)

            # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
            plot_fc (cond1, cond2, options.site, html_dir)
        else :
            sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time())))
            sys.exit()

        # write_html_file(html,chi_pval,t_pval,codons,options.c1, options.c2)
        write_html_file(html, chi_pval, options.c1, options.c2)

        print "Finish codon frequency analysis at", time.asctime(time.localtime(time.time()))
    except Exception, e:
        stop_err('Error running codon frequency analysis (main program) : ' + str(e))


if __name__=="__main__":
    __main__()
author	rlegendre
date	Fri, 12 Jun 2015 11:32:59 -0400
parents	c87c40e642af
children