get_codon_frequency.py @ 10:707807fee542
author: rlegendre
date: Thu, 22 Jan 2015 14:34:38 +0100
parents: b8c070add3b7
children: 7c944fd9907e
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
'''
    Created on sep. 2013
    @author: rachel legendre
    @copyright: rachel.legendre@igmors.u-psud.fr
    @license: GPL v3
'''

from __future__ import division
import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time
import itertools
import math
from decimal import Decimal
from Bio import SeqIO
from Bio.Seq import Seq
from numpy import arange, std, array, linspace, average
# from matplotlib import pyplot as pl
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as pl
from matplotlib import font_manager
from matplotlib import colors
import csv
from scipy import stats
from collections import OrderedDict
import ribo_functions
import HTSeq
## libraries for debugging
import pdb
# import cPickle


def stop_err(msg):
    sys.stderr.write("%s\n" % msg)
    sys.stderr.write("Program aborted at %s\n" % time.asctime(time.localtime(time.time())))
    sys.exit()


def store_gff(gff):
    '''
    parse and store gff file in a dictionary
    '''
    try:
        GFF = OrderedDict({})
        with open(gff, 'r') as f_gff:
            # GFF['order'] = []
            for line in f_gff:
                ## skip commented lines
                head = line.split("#")[0]
                if head != "":
                    feature = (line.split('\t')[8]).split(';')
                    chrom = line.split('\t')[0]
                    if chrom not in GFF:
                        GFF[chrom] = {}
                    # first line is already a gene line:
                    if line.split('\t')[2] == 'gene':
                        gene = feature[0].replace("ID=", "")
                        if re.search('gene', feature[2]):
                            Name = feature[2].replace("gene=", "")
                        else:
                            Name = "Unknown"
                        ## get annotation
                        note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
                        note = urllib.unquote(str(note)).replace("\n", "")
                        ## store gene information
                        # GFF['order'].append(gene)
                        GFF[chrom][gene] = {}
                        GFF[chrom][gene]['chrom'] = line.split('\t')[0]
                        GFF[chrom][gene]['start'] = int(line.split('\t')[3])
                        GFF[chrom][gene]['stop'] = int(line.split('\t')[4])
                        GFF[chrom][gene]['strand'] = line.split('\t')[6]
                        GFF[chrom][gene]['name'] = Name
                        GFF[chrom][gene]['note'] = note
                        GFF[chrom][gene]['exon'] = {}
                        GFF[chrom][gene]['exon_number'] = 0
                        # print Name
                    elif line.split('\t')[2] == 'CDS':
                        gene = re.sub(r"Parent\=(.+)_mRNA", r"\1", feature[0])
                        if GFF[chrom].has_key(gene):
                            GFF[chrom][gene]['exon_number'] += 1
                            exon_number = GFF[chrom][gene]['exon_number']
                            GFF[chrom][gene]['exon'][exon_number] = {}
                            GFF[chrom][gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
                            GFF[chrom][gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
                            GFF[chrom][gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
                    ## if there is a five prime UTR intron, we change the start of the gene
                    elif line.split('\t')[2] == 'five_prime_UTR_intron':
                        if GFF[chrom][gene]['strand'] == "+":
                            GFF[chrom][gene]['start'] = GFF[chrom][gene]['exon'][1]['start']
                        else:
                            GFF[chrom][gene]['stop'] = GFF[chrom][gene]['exon'][exon_number]['stop']
        return GFF
    except Exception, e:
        stop_err('Error during gff storage : ' + str(e))

# Example of GFF lines handled by store_gff:
#chrI SGD gene 87286 87752 . + . ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:0006897,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membrane%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20from%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
#chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
#chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
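# Illustrative sketch (added for clarity; values taken from the commented GFF example
# above): store_gff() returns a nested dict organised by chromosome and gene, roughly
#   GFF['chrI']['YAL030W'] = {'chrom': 'chrI', 'start': 87286, 'stop': 87752,
#                             'strand': '+', 'name': 'SNC1', 'note': '...',
#                             'exon': {1: {'frame': '0', 'start': 87286, 'stop': 87387},
#                                      2: {'frame': '0', 'start': 87501, 'stop': 87752}},
#                             'exon_number': 2}
# Note that the main program below reads the annotation with HTSeq.GFF_Reader;
# store_gff() is an alternative parser kept in this file.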
def init_codon_dict():
    Codon_dict = OrderedDict([
        ('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0),
        ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0),
        ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0),
        ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0),
        ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0),
        ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0),
        ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0),
        ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)])
    return Codon_dict
def get_codon_usage(bamfile, GFF, site, kmer, a_pos):
    '''
    Read the GFF annotation and count codon usage over all genes.
    Return a dict of codon counts.
    '''
    try:
        codon = init_codon_dict()
        for feature in GFF:
            if feature.type == 'gene':
                codon_dict = init_codon_dict()
                chrom = feature.iv.chrom
                start = feature.iv.start
                stop = feature.iv.end
                region = chrom + ':' + str(start) + '-' + str(stop)
                ## DEPRECATED
                # for chrom in GFF.iterkeys():
                #     for gene in GFF[chrom]:
                #         codon_dict = init_codon_dict()
                #         start = GFF[chrom][gene]['start']
                #         # print start
                #         stop = GFF[chrom][gene]['stop']
                #         # print stop
                #         region = chrom + ':' + str(start) + '-' + str(stop)
                #######
                ## get all reads in this gene
                reads = subprocess.check_output(["samtools", "view", bamfile, region])
                head = subprocess.check_output(["samtools", "view", "-H", bamfile])
                read_tab = reads.split('\n')
                for read in read_tab:
                    ## identify the mapper to eliminate multiple alignments
                    if 'bowtie' in head:
                        multi_tag = "XS:i:"
                    elif 'bwa' in head:
                        multi_tag = "XT:A:R"
                    else:
                        stop_err("No PG tag found in " + bamfile + ". Please use bowtie or bwa for mapping")
                    if len(read) == 0:
                        continue
                    len_read = len(read.split('\t')[9])
                    # if the read has the expected length
                    if len_read == kmer and multi_tag not in read:
                        feat = read.split('\t')
                        seq = feat[9]
                        # if it is a reverse read
                        if feat[1] == '16':
                            if site == "A":
                                ## get A-site
                                cod = str(Seq(seq[a_pos - 5:a_pos - 2]).reverse_complement())
                            elif site == "P":
                                ## get P-site
                                cod = str(Seq(seq[a_pos - 2:a_pos + 1]).reverse_complement())
                            else:
                                ## get E-site
                                cod = str(Seq(seq[a_pos + 1:a_pos + 4]).reverse_complement())
                            ## check that it is a true codon and not, for example, a codon containing N
                            if codon_dict.has_key(cod):
                                codon_dict[cod] += 1
                        # if it is a forward read
                        elif feat[1] == '0':
                            if site == "A":
                                ## get A-site
                                cod = seq[a_pos:a_pos + 3]
                            elif site == "P":
                                ## get P-site
                                cod = seq[a_pos - 3:a_pos]
                            else:
                                ## get E-site
                                cod = seq[a_pos - 6:a_pos - 3]
                            if codon_dict.has_key(cod):
                                codon_dict[cod] += 1
                ## add to the global dict
                for cod, count in codon_dict.iteritems():
                    codon[cod] += count

        return codon

    except Exception, e:
        stop_err('Error during codon usage computation: ' + str(e))
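# Worked example (hypothetical values, for illustration only): with --kmer 28 and
# --asite 15, a forward read (SAM flag 0) with sequence seq gives
#   A-site codon = seq[15:18], P-site codon = seq[12:15], E-site codon = seq[9:12]
# while a reverse read (SAM flag 16) is sliced on the other side of the offset and
# reverse-complemented, e.g. A-site codon = Seq(seq[10:13]).reverse_complement().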
'''
http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html
'''
def violin_plot(ax, data, pos, bp=False):
    '''
    create violin plots on an axis
    '''
    dist = max(pos) - min(pos)
    w = min(0.15 * max(dist, 1.0), 0.5)
    for d, p in zip(data, pos):
        k = stats.gaussian_kde(d)  # calculates the kernel density
        m = k.dataset.min()  # lower bound of violin
        M = k.dataset.max()  # upper bound of violin
        x = arange(m, M, (M - m) / 100.)  # support for violin
        v = k.evaluate(x)  # violin profile (density curve)
        v = v / v.max() * w  # scaling the violin to the available space
        ax.fill_betweenx(x, p, v + p, facecolor=color1, alpha=0.3)
        ax.fill_betweenx(x, p, -v + p, facecolor=color2, alpha=0.3)
    if bp:
        ax.boxplot(data, notch=1, positions=pos, vert=1)


'''
http://log.ooz.ie/2013/02/matplotlib-comparative-histogram-recipe.html
'''
def comphist(x1, x2, orientation='vertical', **kwargs):
    """Draw a comparative histogram."""
    # Split keyword args:
    kwargs1 = {}
    kwargs2 = {}
    kwcommon = {}
    for arg in kwargs:
        tgt_arg = arg[:-1]
        if arg.endswith('1'):
            arg_dict = kwargs1
        elif arg.endswith('2'):
            arg_dict = kwargs2
        else:
            arg_dict = kwcommon
            tgt_arg = arg
        arg_dict[tgt_arg] = kwargs[arg]
    kwargs1.update(kwcommon)
    kwargs2.update(kwcommon)

    fig = pl.figure()
    # Have both histograms share one axis.
    if orientation == 'vertical':
        ax1 = pl.subplot(211)
        ax2 = pl.subplot(212, sharex=ax1)
        # Flip the ax2 histogram horizontally.
        ax2.set_ylim(ax1.get_ylim()[::-1])
        pl.setp(ax1.get_xticklabels(), visible=False)
        legend_loc = (1, 4)
    else:
        ax1 = pl.subplot(122)
        ax2 = pl.subplot(121, sharey=ax1)
        # Flip the ax2 histogram vertically.
        ax2.set_xlim(ax2.get_xlim()[::-1])
        pl.setp(ax1.get_yticklabels(), visible=False)
        legend_loc = (1, 2)
    ax1.hist(x1, orientation=orientation, **kwargs1)
    ax2.hist(x2, orientation=orientation, **kwargs2)
    ax2.set_ylim(ax1.get_ylim()[::-1])
    ax1.legend(loc=legend_loc[0])
    ax2.legend(loc=legend_loc[1])
    # Tighten up the layout.
    pl.subplots_adjust(wspace=0.0, hspace=0.0)
    return fig


def compute_FC_plot(cond1_norm, cond2_norm, cod_name, codon_to_test, dirout):
    FC_tab = []
    for z, y in zip(cond1_norm.itervalues(), cond2_norm.itervalues()):
        fc = z - y
        FC_tab.append(fc)
    # codon_to_test = ['TGA','TAG','TAA']
    a = []
    b = []
    cod = []
    for codon in cond1_norm.iterkeys():
        if codon in codon_to_test:
            fc = cond1_norm[codon] - cond2_norm[codon]
            b.append(fc)
            cod.append(codon)
        else:
            fc = cond1_norm[codon] - cond2_norm[codon]
            a.append(fc)

    fig = pl.figure(num=1)
    comphist(array(a), array(b), label1='All codon', label2=cod_name, color2='green', bins=30, rwidth=1)
    # pl.show()
    pl.savefig(dirout + '/hist_codon_fc.png', format="png", dpi=340)
    pl.clf()

    ## violin plot
    pos = range(2)
    dat = array([array(a), array(b)])
    fig = pl.figure()
    pl.title("Distribution of codons FoldChange between two conditions")
    ax = fig.add_subplot(1, 1, 1)
    lab = array(['All codons', cod_name])
    violin_plot(ax, dat, pos, bp=1)
    for x, z in zip(dat, pos):
        ax.plot(z, average(x), color='r', marker='*', markeredgecolor='r')
    xtickNames = pl.setp(ax, xticklabels=lab)
    pl.savefig(dirout + '/violinplot_codon.png', format="png", dpi=340)
    pl.clf()

    # (Fval, pval) = stats.ttest_ind(a, b, axis=0, equal_var=True)
    (Fval, pval) = stats.mannwhitneyu(a, b)
    return pval
def get_aa_dict(cond1_norm, cond2_norm):
    ## create amino acid dictionary:
    AA = OrderedDict({})
    AA['Phe'] = [cond1_norm['TTT'] + cond1_norm['TTC'], cond2_norm['TTT'] + cond2_norm['TTC']]
    AA['Leu'] = [cond1_norm['TTA'] + cond1_norm['TTG'] + cond1_norm['CTT'] + cond1_norm['CTC'] + cond1_norm['CTA'] + cond1_norm['CTG'],
                 cond2_norm['TTA'] + cond2_norm['TTG'] + cond2_norm['CTT'] + cond2_norm['CTC'] + cond2_norm['CTA'] + cond2_norm['CTG']]
    AA['Ile'] = [cond1_norm['ATT'] + cond1_norm['ATC'] + cond1_norm['ATA'], cond2_norm['ATT'] + cond2_norm['ATC'] + cond2_norm['ATA']]
    AA['Met'] = [cond1_norm['ATG'], cond2_norm['ATG']]
    # valine is encoded by GTN only; AGT/AGC encode serine and are counted with Ser below
    AA['Val'] = [cond1_norm['GTT'] + cond1_norm['GTC'] + cond1_norm['GTA'] + cond1_norm['GTG'],
                 cond2_norm['GTT'] + cond2_norm['GTC'] + cond2_norm['GTA'] + cond2_norm['GTG']]
    AA['Ser'] = [cond1_norm['TCT'] + cond1_norm['TCC'] + cond1_norm['TCA'] + cond1_norm['TCG'] + cond1_norm['AGT'] + cond1_norm['AGC'],
                 cond2_norm['TCT'] + cond2_norm['TCC'] + cond2_norm['TCA'] + cond2_norm['TCG'] + cond2_norm['AGT'] + cond2_norm['AGC']]
    AA['Pro'] = [cond1_norm['CCT'] + cond1_norm['CCC'] + cond1_norm['CCA'] + cond1_norm['CCG'], cond2_norm['CCT'] + cond2_norm['CCC'] + cond2_norm['CCA'] + cond2_norm['CCG']]
    AA['Thr'] = [cond1_norm['ACT'] + cond1_norm['ACC'] + cond1_norm['ACA'] + cond1_norm['ACG'], cond2_norm['ACT'] + cond2_norm['ACC'] + cond2_norm['ACA'] + cond2_norm['ACG']]
    AA['Ala'] = [cond1_norm['GCT'] + cond1_norm['GCC'] + cond1_norm['GCA'] + cond1_norm['GCG'], cond2_norm['GCT'] + cond2_norm['GCC'] + cond2_norm['GCA'] + cond2_norm['GCG']]
    AA['Tyr'] = [cond1_norm['TAT'] + cond1_norm['TAC'], cond2_norm['TAT'] + cond2_norm['TAC']]
    AA['Stop'] = [cond1_norm['TAA'] + cond1_norm['TAG'] + cond1_norm['TGA'], cond2_norm['TAA'] + cond2_norm['TAG'] + cond2_norm['TGA']]
    AA['His'] = [cond1_norm['CAT'] + cond1_norm['CAC'], cond2_norm['CAT'] + cond2_norm['CAC']]
    AA['Gln'] = [cond1_norm['CAA'] + cond1_norm['CAG'], cond2_norm['CAA'] + cond2_norm['CAG']]
    AA['Asn'] = [cond1_norm['AAT'] + cond1_norm['AAC'], cond2_norm['AAT'] + cond2_norm['AAC']]
    AA['Lys'] = [cond1_norm['AAA'] + cond1_norm['AAG'], cond2_norm['AAA'] + cond2_norm['AAG']]
    AA['Asp'] = [cond1_norm['GAT'] + cond1_norm['GAC'], cond2_norm['GAT'] + cond2_norm['GAC']]
    AA['Glu'] = [cond1_norm['GAA'] + cond1_norm['GAG'], cond2_norm['GAA'] + cond2_norm['GAG']]
    AA['Cys'] = [cond1_norm['TGT'] + cond1_norm['TGC'], cond2_norm['TGT'] + cond2_norm['TGC']]
    AA['Trp'] = [cond1_norm['TGG'], cond2_norm['TGG']]
    AA['Arg'] = [cond1_norm['CGT'] + cond1_norm['CGC'] + cond1_norm['CGA'] + cond1_norm['CGG'] + cond1_norm['AGA'] + cond1_norm['AGG'],
                 cond2_norm['CGT'] + cond2_norm['CGC'] + cond2_norm['CGA'] + cond2_norm['CGG'] + cond2_norm['AGA'] + cond2_norm['AGG']]
    AA['Gly'] = [cond1_norm['GGT'] + cond1_norm['GGC'] + cond1_norm['GGA'] + cond1_norm['GGG'], cond2_norm['GGT'] + cond2_norm['GGC'] + cond2_norm['GGA'] + cond2_norm['GGG']]
    return AA
def plot_codon_usage(result, dirout, c1, c2, outfile, color1, color2):
    '''
    Take a list of codon-usage dicts and draw the graphs with matplotlib
    '''
    ## if there are replicates
    if len(result) == 4:
        # store each dict in variables to make the code more readable
        cond1_1 = result[0].copy()
        cond1_2 = result[1].copy()
        cond2_1 = result[2].copy()
        cond2_2 = result[3].copy()
        # get codon order from one of the lists
        codon_sorted = sorted(cond1_1.iterkeys(), reverse=False)
        # get the sum of each list
        sum11 = sum(list(cond1_1.itervalues()))
        sum12 = sum(list(cond1_2.itervalues()))
        sum21 = sum(list(cond2_1.itervalues()))
        sum22 = sum(list(cond2_2.itervalues()))
        # for each codon, get values and sd in each condition
        cond1_val = {}
        cond1 = {}
        cond2_val = {}
        cond2 = {}
        std_cond1 = []
        std_cond2 = []
        max_val = []  # max value for the graph
        for i in codon_sorted:
            ## cond1 = mean of the cond1 replicates, each normalized by its library sum
            cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2)
            cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2)
            ## standard deviation of the normalized values between cond1 replicates
            std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)])))
            ## cond2 = mean of the cond2 replicates, each normalized by its library sum
            cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)
            cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2)
            ## standard deviation of the normalized values between cond2 replicates
            std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)])))
            ## max value for each codon
            max_val.append(max((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2, (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2))

        # for graph design
        cond1_norm = OrderedDict(sorted(cond1_val.items(), key=lambda t: t[0]))
        cond1_norm.update((x, y * 100) for x, y in cond1_norm.items())
        cond2_norm = OrderedDict(sorted(cond2_val.items(), key=lambda t: t[0]))
        cond2_norm.update((x, y * 100) for x, y in cond2_norm.items())
        max_val = [x * 100 for x in max_val]

        AA = get_aa_dict(cond1_norm, cond2_norm)
        max_valaa = []
        cond1_aa = []
        cond2_aa = []
        aa_name = list(AA.iterkeys())
        for z in AA.itervalues():
            cond1_aa.append(z[0])
            cond2_aa.append(z[1])
            max_valaa.append(max(z))

        ## plot amino acid profile:
        fig = pl.figure(figsize=(24, 10), num=1)
        width = .50
        ax = fig.add_subplot(111)
        ax.xaxis.set_ticks([])
        ind = arange(21)
        pl.xlim(0, 21)
        ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
        ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)
        for x, y, z in zip(ind, max_valaa, aa_name):
            ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
        ax.set_xlabel('Amino Acid')
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, labels)
        pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
        pl.clf()

        ## compute theoretical counts in COND2
        sum2 = (sum21 + sum22) / 2
        cond2_count = []
        for z in cond1_norm.itervalues():
            count = int(z * sum2 / 100)
            cond2_count.append(count)
        expected = array(cond2_count)
        observed = array(list(cond2.itervalues()))

        # write result
        with open(outfile, 'w') as out:
            out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC\tFC_' + c1 + '\tFC_' + c2 + '\n')
            for i in codon_sorted:
                out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\t' + str((cond2_1[i] / sum21) / (cond1_1[i] / sum11)) + '\t' + str((cond2_2[i] / sum22) / (cond1_1[i] / sum11)) + '\n')
            chi = stats.chisquare(observed, expected)
            out.write('Chi2 test\n')
            out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')

        # plot result
        fig = pl.figure(figsize=(24, 10), num=1)
        width = .50
        ind = arange(len(codon_sorted))
        ax = fig.add_subplot(111)
        pl.xlim(0, len(codon_sorted) + 1)
        ax.spines['right'].set_color('none')
        ax.spines['top'].set_color('none')
        ax.xaxis.set_ticks([])
        ax.spines['left'].set_smart_bounds(True)
        ax.yaxis.set_ticks_position('left')
        ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, yerr=std_cond1, error_kw={'elinewidth': 1, 'ecolor': 'black'}, label=c1)
        ax.bar(ind + width, list(cond2_norm.itervalues()), width, yerr=std_cond2, facecolor=color2, error_kw={'elinewidth': 1, 'ecolor': 'black'}, label=c2)
        for x, y, z in zip(ind, max_val, codon_sorted):
            ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8)
        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
        ax.set_xlabel('Codons')
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, labels)
        pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
        pl.clf()
    elif len(result) == 2:
        # store each dict in an OrderedDict sorted by key to make the code more readable
        cond1 = result[0]
        cond2 = result[1]
        cond1_norm = result[0].copy()
        cond2_norm = result[1].copy()
        # pdb.set_trace()
        # get codon order from one of the lists
        codon_sorted = sorted(cond1.iterkeys(), reverse=False)
        # get the sum of each list
        sum1 = sum(list(cond1.itervalues()))
        sum2 = sum(list(cond2.itervalues()))
        ## normalize values by the sum of each library
        cond1_norm.update((x, (y / sum1) * 100.0) for x, y in cond1_norm.items())
        cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items())
        ## compute theoretical counts in COND2
        cond2_count = []
        for z in cond1_norm.itervalues():
            count = int(z * sum2 / 100.0)
            cond2_count.append(count)
        expected = array(cond2_count)
        observed = array(list(cond2.itervalues()))

        AA = get_aa_dict(cond1_norm, cond2_norm)
        max_val = []
        cond1_aa = []
        cond2_aa = []
        aa_name = list(AA.iterkeys())
        for z in AA.itervalues():
            cond1_aa.append(z[0])
            cond2_aa.append(z[1])
            max_val.append(max(z))

        ## plot amino acid profile:
        fig = pl.figure(num=1)
        width = .45
        ax = fig.add_subplot(111)
        ind = arange(21)
        pl.xlim(0, 21)
        # kwargs = {"hatch": 'x'}
        # ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1, **kwargs)
        # kwargs = {"hatch": '.'}
        # ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2, **kwargs)
        ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
        ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)
        # for x, y, z in zip(ind, max_val, aa_name):
        #     ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
        axis_font = {'size': '10'}
        pl.xticks(ind + width, aa_name, **axis_font)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        # ax.xaxis.set_ticks([])
        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)', **axis_font)
        ax.set_xlabel('Amino Acids', **axis_font)
        handles, labels = ax.get_legend_handles_labels()
        font_prop = font_manager.FontProperties(size=8)
        ax.legend(handles, labels, prop=font_prop)
        pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
        pl.clf()

        # write result
        with open(outfile, 'w') as out:
            out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC(Mut/WT)\n')
            for i in codon_sorted:
                out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
            out.write('Chi2 test\n')
            chi = stats.chisquare(observed, expected)
            out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')

        ## get max value for each codon for the histogram
        max_val = []
        for i in cond1:
            max_val.append(max(cond1_norm[i], cond2_norm[i]))

        # plot result
        fig = pl.figure(figsize=(30, 10), num=1)
        # fig = pl.figure(num=1)
        width = .40
        ind = arange(len(codon_sorted))
        ax = fig.add_subplot(111)
        pl.xlim(0, len(codon_sorted) + 1)
        ax.spines['right'].set_color('none')
        ax.spines['top'].set_color('none')
        ax.xaxis.set_ticks([])
        ax.spines['left'].set_smart_bounds(True)
        ax.yaxis.set_ticks_position('left')
        ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1)
        ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2)
        for x, y, z in zip(ind, max_val, codon_sorted):
            ax.text(x + width, y + 0.02, '%s' % z, ha='center', va='bottom', fontsize=8)
        ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
        ax.set_xlabel('Codons')
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, labels)
        pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
        pl.clf()

    else:
        stop_err('Error running codon usage plotting: expected 2 or 4 codon usage dicts, got ' + str(len(result)))

    return (cond1_norm, cond2_norm, chi[1])
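# Worked example of the chi-square input (hypothetical numbers): if codon 'AAA'
# represents 2.0% of the counted codons in condition 1 (cond1_norm['AAA'] == 2.0)
# and condition 2 contains 1,000,000 counted codons in total (sum2), the expected
# count for 'AAA' in condition 2 is int(2.0 * 1000000 / 100.0) == 20000.
# stats.chisquare() then compares the observed condition-2 counts against these
# expected counts over all 64 codons.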
def write_html_file(html, chi_pval, cond1, cond2):
    try:
        html_str = """
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml">
    <head>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
        <link href="/static/june_2007_style/blue/base.css" media="screen" rel="Stylesheet" type="text/css" />
    </head>
    <body>
        <h3>Global visualization</h3>
        <p>
            <h5>Visualization of footprint density at each codon.</h5><br>
            If the analysis was run with replicates, the standard deviation between replicates is plotted as error bars in the histogram.<br>
            <img border="0" src="hist_codons.png" width="1040"/>
        </p>
        <p>
            <h5>Test for homogeneity of the distributions between the two conditions</h5><br>
            H0: %s and %s follow the same distribution<br>
            Chi2 test p-value: %s<br><br>
            If the p-value is less than 0.05, we reject H0 and conclude that the distributions are not the same; otherwise, we accept H0.<br>
        </p>
        <p>
            <h5>Visualization of footprint density at each codon, grouped by amino acid</h5><br>
            <img border="0" src="hist_amino_acid.png" width="1040"/>
        </p>
    </body>
    </html> """ % (cond1, cond2, chi_pval)
        html_file = open(html, "w")
        html_file.write(html_str)
        html_file.close()
    except Exception, e:
        stop_err('Error during html page creation : ' + str(e))


def check_codons_list(codons):
    for codon in codons:
        if codon not in init_codon_dict().iterkeys():
            stop_err('Please enter a valid codon: ' + codon + ' was not found\n')


def check_index_bam(bamfile):
    ## check for an indexed bam file
    if os.path.isfile(bamfile + ".bai"):
        pass
    else:
        cmd = "samtools index %s " % (bamfile)
        proc = subprocess.Popen(args=cmd, shell=True, stderr=subprocess.PIPE)
        returncode = proc.wait()
        # if returncode != 0:
        #     raise Exception
def __main__():
    # Parse command line options
    parser = optparse.OptionParser()
    parser.add_option("-g", "--gff", dest="gff", type="string",
                      help="gff file", metavar="FILE")
    parser.add_option("-1", "--file1", dest="file1", type="string",
                      help="Bam Ribo-Seq alignments cond 1, if rep option, separate files by commas", metavar="FILE")
    parser.add_option("-2", "--file2", dest="file2", type="string",
                      help="Bam Ribo-Seq alignments cond 2, if rep option, separate files by commas", metavar="FILE")
    parser.add_option("-c", "--cond1", dest="c1", type="string",
                      help="Name of the first condition", metavar="STR")
    parser.add_option("-C", "--cond2", dest="c2", type="string",
                      help="Name of the second condition", metavar="STR")
    parser.add_option("-k", "--kmer", dest="kmer", type="int",
                      help="Length of your phasing reads", metavar="INT")
    # parser.add_option("-l", "--list", dest="list_cod", type="string",
    #                   help="list of codons to compare to the others", metavar="STR")
    parser.add_option("-o", "--out", dest="outfile", type="string",
                      help="write report to FILE", metavar="FILE")
    parser.add_option("-d", "--dirout", dest="dirout", type="string",
                      help="write report to PNG files", metavar="FILE")
    parser.add_option("-a", "--asite", dest="asite", type="int",
                      help="Offset from the 5' end of the footprint to the A-site", metavar="INT")
    parser.add_option("-s", "--site", dest="site", type="string",
                      help="Script can compute on site A, P or E", metavar="A|P|E")
    parser.add_option("-r", "--rep", dest="rep", type="string",
                      help="whether replicates are provided or not", metavar="yes|no")
    parser.add_option("-x", "--hex_col1", dest="color1", type="string",
                      help="Color for the first condition", metavar="STR")
    parser.add_option("-X", "--hex_col2", dest="color2", type="string",
                      help="Color for the second condition", metavar="STR")
    parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True,
                      help="don't print status messages to stdout")

    (options, args) = parser.parse_args()
    print "Begin codon frequency analysis at", time.asctime(time.localtime(time.time()))

    try:
        authorized_site = ["A", "P", "E"]
        if options.site not in authorized_site:
            stop_err(options.site + ' is not an authorized ribosome site')

        ## Check that the colors exist
        if not colors.is_color_like(options.color1):
            stop_err(options.color1 + ' is not a proper color')
        if not colors.is_color_like(options.color2):
            stop_err(options.color2 + ' is not a proper color')

        ## identify GFF or GTF format from the 9th column
        # with open(options.gff, "r") as gffile:
        #     for line in gffile:
        #         if '#' in line:
        #             ## skip header
        #             gffile.next()
        #         elif 'gene_id' in line:
        #             ## launch gtf reader:
        #             GFF = ribo_functions.store_gtf(options.gff)
        #             break
        #         elif 'ID=' in line:
        #             ## launch gff reader
        #             GFF = ribo_functions.store_gff(options.gff)
        #             break
        #         else:
        #             stop_err('Please check your annotation file is in correct format, GFF or GTF')
        # GFF = store_gff(options.gff)
        # GFF = ribo_functions.store_gtf(options.gff)
        ## check gff reading
        # if not GFF['order']:
        #     stop_err('Incorrect GFF file' + str(e))

        #### NOT USED IN FINAL VERSION
        ## get codon list
        # codons = options.list_cod.upper().split(',')
        # check_codons_list(codons)

        GFF = HTSeq.GFF_Reader(options.gff)

        ## get html file and directory:
        (html, html_dir) = options.dirout.split(',')
        if os.path.exists(html_dir):
            raise Exception(html_dir + ' already exists')
        try:
            os.mkdir(html_dir)
        except:
            raise Exception(html_dir + ' mkdir')

        ## RUN analysis
        ## If there are replicates
        if options.rep == "yes":
            result = []
            # split the name of each file given in the options by ","
            cond1 = options.file1.split(',')
            cond2 = options.file2.split(',')
            ## computation for each file
            for fh in itertools.chain(cond1, cond2):
                check_index_bam(fh)
                result.append(get_codon_usage(fh, GFF, options.site, options.kmer, options.asite))
            (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile, options.color1, options.color2)
            # t_pval = compute_FC_plot(cond1, cond2, codons, html_dir)
        ## If there is no replicate
        elif options.rep == "no":
            result = []
            ## computation for each condition
            for fh in (options.file1, options.file2):
                check_index_bam(fh)
                result.append(get_codon_usage(fh, GFF, options.site, options.kmer, options.asite))
            (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile, options.color1, options.color2)
            # t_pval = compute_FC_plot(cond1, cond2, codons, html_dir)
        else:
            sys.stderr.write("Please enter yes or no for the --rep option. Program aborted at %s" % time.asctime(time.localtime(time.time())))
            sys.exit()

        # write_html_file(html, chi_pval, t_pval, codons, options.c1, options.c2)
        write_html_file(html, chi_pval, options.c1, options.c2)

        print "Finish codon frequency analysis at", time.asctime(time.localtime(time.time()))

    except Exception, e:
        stop_err('Error running codon frequency analysis (main program) : ' + str(e))


if __name__ == "__main__":
    __main__()
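# Example invocation (illustrative only; file names and values are placeholders):
#   python get_codon_frequency.py -g annotation.gff \
#       -1 wild_type.bam -2 mutant.bam -c WT -C Mut \
#       -k 28 -a 15 -s A -r no -x blue -X red \
#       -o codon_usage.tab -d report.html,report_files
# With -r yes, -1 and -2 each take a comma-separated list of replicate BAM files.
# The -d argument is split on ',' into the HTML report path and the directory
# (created by the script, so it must not already exist) that receives the PNG plots.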