Mercurial > repos > rlegendre > ribo_tools

#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-

'''
    Created on sep. 2013
    @author: rachel legendre
    @copyright: rachel.legendre@igmors.u-psud.fr
    @license: GPL v3
'''

from __future__ import division
import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time
import itertools
import math
from decimal import Decimal
from Bio import SeqIO
from Bio.Seq import Seq
from numpy import arange, std, array, linspace, average
#from matplotlib import pyplot as pl
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as pl
from matplotlib import font_manager
from matplotlib import colors
import csv
from scipy import stats
from collections import OrderedDict
import ribo_functions
import HTSeq
# #libraries for debugg
import pdb
# import cPickle

def stop_err(msg):
    sys.stderr.write("%s\n" % msg)
    sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
    sys.exit()

def store_gff(gff):
    '''
        parse and store gff file in a dictionnary
    '''
    try:
        GFF = OrderedDict({})
        with open(gff, 'r') as f_gff :
            # GFF['order'] = []
            for line in f_gff:
                # # switch commented lines
                head = line.split("#")[0]
                if head != "" :
                    feature = (line.split('\t')[8]).split(';')
                    chrom = line.split('\t')[0]
                    if chrom not in GFF :
                        GFF[chrom] = {}
                # first line is already gene line :
                    if line.split('\t')[2] == 'gene' :
                        gene = feature[0].replace("ID=", "")
                        if re.search('gene', feature[2]) :
                            Name = feature[2].replace("gene=", "")
                        else :
                            Name = "Unknown"
                        # #get annotation
                        note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
                        note = urllib.unquote(str(note)).replace("\n", "")
                        # # store gene information
                        # GFF['order'].append(gene)
                        GFF[chrom][gene] = {}
                        GFF[chrom][gene]['chrom'] = line.split('\t')[0]
                        GFF[chrom][gene]['start'] = int(line.split('\t')[3])
                        GFF[chrom][gene]['stop'] = int(line.split('\t')[4])
                        GFF[chrom][gene]['strand'] = line.split('\t')[6]
                        GFF[chrom][gene]['name'] = Name
                        GFF[chrom][gene]['note'] = note
                        GFF[chrom][gene]['exon'] = {}
                        GFF[chrom][gene]['exon_number'] = 0
                        # print Name
                    elif line.split('\t')[2] == 'CDS' :
                        gene = re.sub(r"Parent\=(.+)_mRNA", r"\1", feature[0])
                        if GFF[chrom].has_key(gene) :
                            GFF[chrom][gene]['exon_number'] += 1
                            exon_number = GFF[chrom][gene]['exon_number']
                            GFF[chrom][gene]['exon'][exon_number] = {}
                            GFF[chrom][gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
                            GFF[chrom][gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
                            GFF[chrom][gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
                    # # if there is a five prim UTR intron, we change start of gene
                    elif line.split('\t')[2] == 'five_prime_UTR_intron' :
                        if GFF[chrom][gene]['strand'] == "+" :
                            GFF[chrom][gene]['start'] = GFF[chrom][gene]['exon'][1]['start']
                        else :
                            GFF[chrom][gene]['stop'] = GFF[chrom][gene]['exon'][exon_number]['stop']
        return GFF
    except Exception, e:
        stop_err('Error during gff storage : ' + str(e))

#chrI    SGD     gene    87286   87752   .       +       .       ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689
#7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra
#ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr
#om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
#chrI    SGD     CDS     87286   87387   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
#chrI    SGD     CDS     87501   87752   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified


def init_codon_dict():

    Codon_dict = OrderedDict([('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0), ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0), ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0), ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0), ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0), ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0), ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0), ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)])
    return Codon_dict


def get_codon_usage(bamfile, GFF, site, kmer, a_pos):
    '''
        Read GFF dict and get gene codon usage.
        Return dict of codons usage
    '''
    try:
        codon = init_codon_dict()
        for feature in GFF :
            if feature.type == 'gene' :
                codon_dict = init_codon_dict()
                chrom = feature.iv.chrom
                start = feature.iv.start
                stop = feature.iv.end
                region = chrom + ':' + str(start) + '-' + str(stop)

        ## DEPRECATED
        #for chrom in GFF.iterkeys():
            #for gene in GFF[chrom] :
               # codon_dict = init_codon_dict()
                #start = GFF[chrom][gene]['start']
                #print start
                #stop = GFF[chrom][gene]['stop']
                #print stop
                #region = chrom + ':' + str(start) + '-' + str(stop)
        #######
                # #get all reads in this gene
                reads = subprocess.check_output(["samtools", "view", bamfile, region])
                head = subprocess.check_output(["samtools", "view", "-H", bamfile])
                read_tab = reads.split('\n')
                for read in read_tab:
                    # # search mapper for eliminate multiple alignements
                    if 'bowtie' in head:
                        multi_tag = "XS:i:"
                    elif 'bwa' in  head:
                        multi_tag = "XT:A:R"
                    else :
                        stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")
                    if len(read) == 0:
                        continue
                    len_read = len(read.split('\t')[9])
                    # if it's read of good length
                    if len_read == kmer and multi_tag not in read:
                        feat = read.split('\t')
                        seq = feat[9]
                        # if it's a reverse read
                        if feat[1] == '16' :
                            if site == "A" :
                            # #get A-site
                                cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement())
                            elif site == "P" :
                            # #get P-site
                                cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement())
                            else :
                            # #get site-E
                                cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement())
                            # # test if it's a true codon not a CNG codon for example
                            if codon_dict.has_key(cod) :
                                codon_dict[cod] += 1
                        # if it's a forward read
                        elif feat[1] == '0' :
                            if site == "A" :
                            # #get A-site
                                cod = seq[a_pos:a_pos+3]
                            elif site == "P" :
                            # #get P-site
                                cod = seq[a_pos-3:a_pos]
                            else :
                            # #get site-E
                                cod = seq[a_pos-6:a_pos-3]
                            if codon_dict.has_key(cod) :
                                codon_dict[cod] += 1
                # # add in global dict
                for cod, count in codon_dict.iteritems() :
                    codon[cod] += count

        return codon

    except Exception, e:
        stop_err('Error during codon usage calcul: ' + str(e))


'''
http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html
'''
def violin_plot(ax, data, pos, bp=False):
    '''
    create violin plots on an axis
    '''
    dist = max(pos) - min(pos)
    w = min(0.15 * max(dist, 1.0), 0.5)
    for d, p in zip(data, pos):
        k = stats.gaussian_kde(d)  # calculates the kernel density
        m = k.dataset.min()  # lower bound of violin
        M = k.dataset.max()  # upper bound of violin
        x = arange(m, M, (M - m) / 100.)  # support for violin
        v = k.evaluate(x)  # violin profile (density curve)
        v = v / v.max() * w  # scaling the violin to the available space
        ax.fill_betweenx(x, p, v + p, facecolor=color1, alpha=0.3)
        ax.fill_betweenx(x, p, -v + p, facecolor=color2, alpha=0.3)
    if bp:
        ax.boxplot(data, notch=1, positions=pos, vert=1)


'''
http://log.ooz.ie/2013/02/matplotlib-comparative-histogram-recipe.html
'''
def comphist(x1, x2, orientation='vertical', **kwargs):
    """Draw a comparative histogram."""
    # Split keyword args:
    kwargs1 = {}
    kwargs2 = {}
    kwcommon = {}
    for arg in kwargs:
        tgt_arg = arg[:-1]
        if arg.endswith('1'):
            arg_dict = kwargs1
        elif arg.endswith('2'):
            arg_dict = kwargs2
        else:
            arg_dict = kwcommon
            tgt_arg = arg
        arg_dict[tgt_arg] = kwargs[arg]
    kwargs1.update(kwcommon)
    kwargs2.update(kwcommon)

    fig = pl.figure()

    # Have both histograms share one axis.
    if orientation == 'vertical':
        ax1 = pl.subplot(211)
        ax2 = pl.subplot(212, sharex=ax1)
        # Flip the ax2 histogram horizontally.
        ax2.set_ylim(ax1.get_ylim()[::-1])
        pl.setp(ax1.get_xticklabels(), visible=False)
        legend_loc = (1, 4)
    else:
        ax1 = pl.subplot(122)
        ax2 = pl.subplot(121, sharey=ax1)
        # Flip the ax2 histogram vertically.
        ax2.set_xlim(ax2.get_xlim()[::-1])
        pl.setp(ax1.get_yticklabels(), visible=False)
        legend_loc = (1, 2)

    ax1.hist(x1, orientation=orientation, **kwargs1)
    ax2.hist(x2, orientation=orientation, **kwargs2)
    ax2.set_ylim(ax1.get_ylim()[::-1])
    ax1.legend(loc=legend_loc[0])
    ax2.legend(loc=legend_loc[1])
    # Tighten up the layout.
    pl.subplots_adjust(wspace=0.0, hspace=0.0)
    return fig


def compute_FC_plot(cond1_norm, cond2_norm, cod_name, codon_to_test, dirout):

    FC_tab = []
    for z, y in zip(cond1_norm.itervalues(), cond2_norm.itervalues()):
        fc = z - y
        FC_tab.append(fc)
    # #codon_to_test = ['TGA','TAG','TAA']

    a = []
    b = []
    cod = []
    for codon in cond1_norm.iterkeys():
        if codon in codon_to_test :
            fc = cond1_norm[codon] - cond2_norm[codon]
            b.append(fc)
            cod.append(codon)
        else :
            fc = cond1_norm[codon] - cond2_norm[codon]
            a.append(fc)


    fig = pl.figure(num=1)
    comphist(array(a), array(b), label1='All codon', label2=cod_name, color2='green', bins=30, rwidth=1)
    # pl.show()
    pl.savefig(dirout + '/hist_codon_fc.png', format="png", dpi=340)
    pl.clf()


    # #violin plot
    pos = range(2)
    dat = array([array(a), array(b)])
    fig = pl.figure()
    pl.title("Distribution of codons FoldChange between two conditions")
    ax = fig.add_subplot(1, 1, 1)
    lab = array(['All codons', cod_name])
    violin_plot(ax, dat, pos, bp=1)
    for x, z in zip(dat, pos):
        ax.plot(z, average(x), color='r', marker='*', markeredgecolor='r')
    xtickNames = pl.setp(ax, xticklabels=lab)
    pl.savefig(dirout + '/violinplot_codon.png', format="png", dpi=340)
    pl.clf()

    # (Fval,pval) = stats.ttest_ind(a, b, axis=0, equal_var=True)
    (Fval, pval) = stats.mannwhitneyu(a, b)
    return pval


def get_aa_dict(cond1_norm, cond2_norm):

    # ## create amino acid dictionnary:
    AA = OrderedDict({})
    AA['Phe'] = [cond1_norm['TTT'] + cond1_norm['TTC'], cond2_norm['TTT'] + cond2_norm['TTC']]
    AA['Leu'] = [cond1_norm['TTA'] + cond1_norm['TTG'] + cond1_norm['CTT'] + cond1_norm['CTC'] + cond1_norm['CTA'] + cond1_norm['CTG'], cond2_norm['TTA'] + cond2_norm['TTG'] + cond2_norm['CTT'] + cond2_norm['CTC'] + cond2_norm['CTA'] + cond2_norm['CTG']]
    AA['Ile'] = [cond1_norm['ATT'] + cond1_norm['ATC'] + cond1_norm['ATA'], cond2_norm['ATT'] + cond2_norm['ATC'] + cond2_norm['ATA']]
    AA['Met'] = [cond1_norm['ATG'], cond2_norm['ATG']]
    AA['Val'] = [cond1_norm['GTT'] + cond1_norm['GTC'] + cond1_norm['GTA'] + cond1_norm['GTG'] + cond1_norm['AGT'] + cond1_norm['AGC'], cond2_norm['GTT'] + cond2_norm['GTC'] + cond2_norm['GTA'] + cond2_norm['GTG'] + cond2_norm['AGT'] + cond2_norm['AGC']]
    AA['Ser'] = [cond1_norm['TCT'] + cond1_norm['TCC'] + cond1_norm['TCA'] + cond1_norm['TCG'], cond2_norm['TCT'] + cond2_norm['TCC'] + cond2_norm['TCA'] + cond2_norm['TCG']]
    AA['Pro'] = [cond1_norm['CCT'] + cond1_norm['CCC'] + cond1_norm['CCA'] + cond1_norm['CCG'], cond2_norm['CCT'] + cond2_norm['CCC'] + cond2_norm['CCA'] + cond2_norm['CCG']]
    AA['Thr'] = [cond1_norm['ACT'] + cond1_norm['ACC'] + cond1_norm['ACA'] + cond1_norm['ACG'], cond2_norm['ACT'] + cond2_norm['ACC'] + cond2_norm['ACA'] + cond2_norm['ACG']]
    AA['Ala'] = [cond1_norm['GCT'] + cond1_norm['GCC'] + cond1_norm['GCA'] + cond1_norm['GCG'], cond2_norm['GCT'] + cond2_norm['GCC'] + cond2_norm['GCA'] + cond2_norm['GCG']]
    AA['Tyr'] = [cond1_norm['TAT'] + cond1_norm['TAC'], cond2_norm['TAT'] + cond2_norm['TAC']]
    AA['Stop'] = [cond1_norm['TAA'] + cond1_norm['TAG'] + cond1_norm['TGA'], cond2_norm['TAA'] + cond2_norm['TAG'] + cond2_norm['TGA']]
    AA['His'] = [cond1_norm['CAT'] + cond1_norm['CAC'], cond2_norm['CAT'] + cond2_norm['CAC']]
    AA['Gln'] = [cond1_norm['CAA'] + cond1_norm['CAG'], cond2_norm['CAA'] + cond2_norm['CAG']]
    AA['Asn'] = [cond1_norm['AAT'] + cond1_norm['AAC'], cond2_norm['AAT'] + cond2_norm['AAC']]
    AA['Lys'] = [cond1_norm['AAA'] + cond1_norm['AAG'], cond2_norm['AAA'] + cond2_norm['AAG']]
    AA['Asp'] = [cond1_norm['GAT'] + cond1_norm['GAC'], cond2_norm['GAT'] + cond2_norm['GAC']]
    AA['Glu'] = [cond1_norm['GAA'] + cond1_norm['GAG'], cond2_norm['GAA'] + cond2_norm['GAG']]
    AA['Cys'] = [cond1_norm['TGT'] + cond1_norm['TGC'], cond2_norm['TGT'] + cond2_norm['TGC']]
    AA['Trp'] = [cond1_norm['TGG'], cond2_norm['TGG']]
    AA['Arg'] = [cond1_norm['CGT'] + cond1_norm['CGC'] + cond1_norm['CGA'] + cond1_norm['CGG'] + cond1_norm['AGA'] + cond1_norm['AGG'], cond2_norm['CGT'] + cond2_norm['CGC'] + cond2_norm['CGA'] + cond2_norm['CGG'] + cond2_norm['AGA'] + cond2_norm['AGG']]
    AA['Gly'] = [cond1_norm['GGT'] + cond1_norm['GGC'] + cond1_norm['GGA'] + cond1_norm['GGG'], cond2_norm['GGT'] + cond2_norm['GGC'] + cond2_norm['GGA'] + cond2_norm['GGG']]


    return AA


def plot_codon_usage(result, dirout, c1, c2, outfile, color1, color2):
    '''
        Take list of dict of codon usage and use matplotlib for do graph
    '''

    # #if there are replicat
    if len(result) == 4 :
        # store each dict in variables to make code more readable
        cond1_1 = result[0].copy()
        cond1_2 = result[1].copy()
        cond2_1 = result[2].copy()
        cond2_2 = result[3].copy()
        # get codon order in one of list
        codon_sorted = sorted(cond1_1.iterkeys(), reverse=False)
        # get max of each list
        sum11 = sum(list(cond1_1.itervalues()))
        sum12 = sum(list(cond1_2.itervalues()))
        sum21 = sum(list(cond2_1.itervalues()))
        sum22 = sum(list(cond2_2.itervalues()))
        # for each codon, get values and sd in each condition
        cond1_val = {}
        cond1 = {}
        cond2_val = {}
        cond2 = {}
        std_cond1 = []
        std_cond2 = []
        max_val = []  # # max value for graph
        for i in codon_sorted:
            # # cond1 = moyenne of replicats cond1 divided by max
            cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2)
            cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2)
            # # standard deviation = absolute value of diffence between replicats of cond1
            std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)])))
            # # cond2 = moyenne of replicats cond1divided by max
            cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)
            cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2)
            # # standard deviation = absolute value of diffence between replicats of cond2
            std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)])))
            # # max value for each codon
            max_val.append(max((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2, (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2))

        # for graph design
        cond1_norm = OrderedDict(sorted(cond1_val.items(), key=lambda t: t[0]))
        cond1_norm.update ((x, y * 100) for x, y in cond1_norm.items())
        cond2_norm = OrderedDict(sorted(cond2_val.items(), key=lambda t: t[0]))
        cond2_norm.update ((x, y * 100) for x, y in cond2_norm.items())
        max_val = [x * 100 for x in max_val]

        AA = get_aa_dict(cond1_norm, cond2_norm)
        max_valaa = []
        cond1_aa = []
        cond2_aa = []
        aa_name = list(AA.iterkeys())
        for z in AA.itervalues():
            cond1_aa.append(z[0])
            cond2_aa.append(z[1])
            max_valaa.append(max(z))
        # # plot amino acid profile :
        fig = pl.figure(figsize=(24, 10), num=1)
        width = .50
        ax = fig.add_subplot(111)
        ax.xaxis.set_ticks([])
        ind = arange(21)
        pl.xlim(0, 21)
        ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
        ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)
        for x, y, z in zip(ind, max_valaa, aa_name):
            ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
        ax.set_xlabel('Amino Acid')
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, labels)
        pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
        pl.clf()


        # # compute theorical count in COND2
        sum2 = (sum21 + sum22) / 2
        cond2_count = []
        for z in cond1_norm.itervalues() :
            count = int(z * sum2 / 100)
            cond2_count.append(count)

        expected = array(cond2_count)
        observed = array(list(cond2.itervalues()))

        # write result
        with open(outfile, 'w') as out :
            out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC\tFC_' + c1 + '\tFC_' + c2 + '\n')
            for i in codon_sorted:
                out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\t' + str((cond2_1[i] / sum21) / (cond1_1[i] / sum11)) + '\t' + str((cond2_2[i] / sum22) / (cond1_1[i] / sum11)) + '\n')
            chi = stats.chisquare(observed, expected)
            out.write('Khi2 test\n')
            out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')


        # plot result
        fig = pl.figure(figsize=(24, 10), num=1)
        width = .50
        ind = arange(len(codon_sorted))
        ax = fig.add_subplot(111)
        pl.xlim(0, len(codon_sorted) + 1)
        ax.spines['right'].set_color('none')
        ax.spines['top'].set_color('none')
        ax.xaxis.set_ticks([])
        ax.spines['left'].set_smart_bounds(True)
        ax.yaxis.set_ticks_position('left')
        ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, yerr=std_cond1, error_kw={'elinewidth':1, 'ecolor':'black'}, label=c1)
        ax.bar(ind + width, list(cond2_norm.itervalues()), width, yerr=std_cond2, facecolor=color2, error_kw={'elinewidth':1, 'ecolor':'black'}, label=c2)
        for x, y, z in zip(ind, max_val, codon_sorted):
            ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8)
        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
        ax.set_xlabel('Codons')
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, labels)
        pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
        pl.clf()


    elif len(result) == 2 :

        # store each dict in OrderedDict sorted by key to make code more readable
        cond1 = result[0]
        cond2 = result[1]
        cond1_norm = result[0].copy()
        cond2_norm = result[1].copy()
        # pdb.set_trace()
        # get codon order in one of list
        codon_sorted = sorted(cond1.iterkeys(), reverse=False)

        # get sum of each list
        sum1 = sum(list(cond1.itervalues()))
        sum2 = sum(list(cond2.itervalues()))
        # #Normalize values by sum of each libraries
        cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items())
        cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items())

        # # compute theorical count in COND2
        cond2_count = []
        for z in cond1_norm.itervalues() :
            count = int(z * sum2 / 100.0)
            cond2_count.append(count)

        expected = array(cond2_count)
        observed = array(list(cond2.itervalues()))

        AA = get_aa_dict(cond1_norm, cond2_norm)

        max_val = []
        cond1_aa = []
        cond2_aa = []
        aa_name = list(AA.iterkeys())
        for z in AA.itervalues():
            cond1_aa.append(z[0])
            cond2_aa.append(z[1])
            max_val.append(max(z))

        # # plot amino acid profile :
        fig = pl.figure(num=1)
        width = .45
        ax = fig.add_subplot(111)
        ind = arange(21)
        pl.xlim(0, 21)
        #kwargs = {"hatch":'x'}
        #ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1, **kwargs)
        #kwargs = {"hatch":'.'}
        #ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2, **kwargs)
        ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
        ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)
        #for x, y, z in zip(ind, max_val, aa_name):
        #    ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
        axis_font = {'size':'10'}
        pl.xticks(ind + width, aa_name,**axis_font)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        #ax.xaxis.set_ticks([])
        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)',**axis_font)
        ax.set_xlabel('Amino Acids', **axis_font)
        handles, labels = ax.get_legend_handles_labels()
        font_prop = font_manager.FontProperties(size=8)
        ax.legend(handles, labels, prop=font_prop)
        pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
        pl.clf()

        # write result
        with open(outfile, 'w') as out :
            out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC(Mut/WT)\n')
            for i in codon_sorted:
                out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
            out.write('Khi2 test\n')
            chi = stats.chisquare(observed, expected)
            out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')

        # # get max value for each codon for histogram
        max_val = []  # # max value for graph
        for i in cond1:
            # # max value for each codon
            max_val.append(max(cond1_norm[i], cond2_norm[i]))

        # plot result
        fig = pl.figure(figsize=(30, 10), num=1)
        #fig = pl.figure(num=1)
        width = .40
        ind = arange(len(codon_sorted))
        ax = fig.add_subplot(111)
        pl.xlim(0, len(codon_sorted) + 1)
        ax.spines['right'].set_color('none')
        ax.spines['top'].set_color('none')
        ax.xaxis.set_ticks([])
        ax.spines['left'].set_smart_bounds(True)
        ax.yaxis.set_ticks_position('left')
        ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1)
        ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2)
        for x, y, z in zip(ind, max_val, codon_sorted):
            ax.text(x + width, y + 0.02, '%s' % z, ha='center', va='bottom', fontsize=8)
        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
        ax.set_xlabel('Codons')
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, labels)
        pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
        pl.clf()


    else :
         stop_err('Error running codon usage plotting : ' + str(e))


    return (cond1_norm, cond2_norm, chi[1])

def write_html_file(html, chi_pval, cond1, cond2):
    try :


        html_str = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
    <head>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
        <link href="/static/june_2007_style/blue/base.css" media="screen" rel="Stylesheet" type="text/css" />
    </head>
    <body>
        <h3>Global visualization</h3>
        <p>
            <h5>Visualization of density footprint in each codon.</h5><br> If user has selected analyse with replicats, standart error deviation between each replicate as plotting as error bar in histogram.<br>
            <img border="0" src="hist_codons.png"  width="1040"/>
        </p>
        <p>
            <h5>Test for homogeneity distribution between each condition</h5><br>
            H0 : %s and %s are same distribution <br>
            Khi2 test p-value: %s<br><br>
            If p-value less than 0.05, we can reject homogeneity distribution so we can hypothesize that distributions are not the same. Otherwise, we accept H0<br>

        </p>
        <p>
            <h5>Visualization of density footprint in each codon groupe by amino acid</h5><br>
            <img border="0" src="hist_amino_acid.png"  width="1040"/>
        </p>
    </body>
</html> """ % (cond1,cond2,chi_pval)


        html_file = open(html, "w")
        html_file.write(html_str)
        html_file.close()

    except Exception, e :
        stop_err('Error during html page creation : ' + str(e))


def check_codons_list (codons) :

    for codon in codons :
        if codon not in init_codon_dict().iterkeys() :
            stop_err('Please to enter a valid codon : ' + codon + ' is not find\n')


def check_index_bam (bamfile) :
    # #testing indexed bam file
    if os.path.isfile(bamfile + ".bai") :
        pass
    else :
        cmd = "samtools index %s " % (bamfile)
        proc = subprocess.Popen(args=cmd, shell=True, stderr=subprocess.PIPE)
        returncode = proc.wait()
        # if returncode != 0:
        #    raise Exception

def __main__():


    # Parse command line options
    parser = optparse.OptionParser()
    parser.add_option("-g", "--gff", dest="gff", type="string",
                  help="gff file", metavar="FILE")

    parser.add_option("-1", "--file1", dest="file1", type="string",
                  help="Bam Ribo-Seq alignments cond 1, if rep option, separate files by commas ", metavar="FILE")

    parser.add_option("-2", "--file2", dest="file2", type="string",
                  help="Bam Ribo-Seq alignments cond 2, if rep option, separate files by commas", metavar="FILE")

    parser.add_option("-c", "--cond1", dest="c1", type="string",
                  help="Name for first condition", metavar="STR")

    parser.add_option("-C", "--cond2", dest="c2", type="string",
                  help="Name of second condition", metavar="STR")

    parser.add_option("-k", "--kmer", dest="kmer", type="int",
                  help="Length of your phasing reads", metavar="INT")

#     parser.add_option("-l", "--list", dest="list_cod", type= "string",
#                   help="list of codons to compare to other", metavar="STR")

    parser.add_option("-o", "--out", dest="outfile", type="string",
                  help="write report to FILE", metavar="FILE")

    parser.add_option("-d", "--dirout", dest="dirout", type="string",
                  help="write report to PNG files", metavar="FILE")

    parser.add_option("-a", "--asite", dest="asite", type="int",
                  help="Off-set from the 5'end of the footprint to the A-site", metavar="INT")

    parser.add_option("-s", "--site", dest="site", type="string",
                  help="Script can compute in site A, P or E", metavar="A|P|E")

    parser.add_option("-r", "--rep", dest="rep", type="string",
                  help="if replicate or not", metavar="yes|no")

    parser.add_option("-x", "--hex_col1", dest="color1", type= "string",
                  help="Color for first condition", metavar="STR")

    parser.add_option("-X", "--hex_col2", dest="color2", type= "string",
                  help="Color for second condition", metavar="STR")

    parser.add_option("-q", "--quiet",
                  action="store_false", dest="verbose", default=True,
                  help="don't print status messages to stdout")

    (options, args) = parser.parse_args()
    print "Begin codon frequency analysis at", time.asctime(time.localtime(time.time()))

    try:
        authorized_site = ["A", "P", "E"]
        if options.site not in authorized_site :
            stop_err(options.site + ' is not a authorized ribosome site')

        ## Check if colors exist
        if not colors.is_color_like(options.color1) :
            stop_err( options.color1+' is not a proper color' )
        if not colors.is_color_like(options.color2) :
            stop_err( options.color2+' is not a proper color' )

        ## identify GFF or GTF format from 9th column
        #with open (options.gff,"r") as gffile :
        #    for line in gffile :
        #        if '#' in line :
        #            ## skip header
        #            gffile.next()
         #       elif 'gene_id' in line :
        #            ## launch gtf reader :
        #            GFF = ribo_functions.store_gtf(options.gff)
        #            break
        #        elif 'ID=' in line :
        #            ## launch gff reader
        #            GFF = ribo_functions.store_gff(options.gff)
        #            break
        #        else :
        #            stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )

        #GFF = store_gff(options.gff)
        #GFF = ribo_functions.store_gtf(options.gff)
        ## check gff reading
        #if not GFF['order'] :
         #   stop_err( 'Incorrect GFF file' + str( e ) )

        #### NOT USE IN FINAL VERSION
        # # get codon list
        # codons = options.list_cod.upper().split(',')
        # check_codons_list(codons)
        GFF = HTSeq.GFF_Reader(options.gff)
        # # get html file and directory :
        (html, html_dir) = options.dirout.split(',')
        if os.path.exists(html_dir):
            raise
        try:
            os.mkdir(html_dir)
        except:
            raise Exception(html_dir + ' mkdir')
        # #RUN analysis
        # #If there are replicats
        if options.rep == "yes" :
            result = []
            # split name of each file options by ","
            cond1 = options.file1.split(',')
            cond2 = options.file2.split(',')
            # # calcul for each file
            for fh in itertools.chain(cond1, cond2):
                check_index_bam (fh)
                result.append(get_codon_usage(fh, GFF, options.site, options.kmer, options.asite))
            (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
            # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)


        # #If there are no replicat
        elif options.rep == "no" :
            result = []
            # #calcul for each cond
            for fh in (options.file1, options.file2):
                check_index_bam (fh)
                result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite))
            (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
            # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)

        else :
            sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time())))
            sys.exit()

        # write_html_file(html,chi_pval,t_pval,codons,options.c1, options.c2)
        write_html_file(html, chi_pval, options.c1, options.c2)

        print "Finish codon frequency analysis at", time.asctime(time.localtime(time.time()))
    except Exception, e:
        stop_err('Error running codon frequency analysis (main program) : ' + str(e))


if __name__=="__main__":
    __main__()
author	rlegendre
date	Thu, 22 Jan 2015 14:34:38 +0100
parents	b8c070add3b7
children	7c944fd9907e