Mercurial > repos > rlegendre > ribo_tools
comparison get_codon_frequency.py @ 0:b8c070add3b7
Uploaded
| author | rlegendre |
|---|---|
| date | Mon, 20 Oct 2014 11:06:17 -0400 |
| parents | |
| children | 707807fee542 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b8c070add3b7 |
|---|---|
| 1 #!/usr/bin/env python2.7 | |
| 2 # -*- coding: utf-8 -*- | |
| 3 | |
| 4 ''' | |
| 5 Created on sep. 2013 | |
| 6 @author: rachel legendre | |
| 7 @copyright: rachel.legendre@igmors.u-psud.fr | |
| 8 @license: GPL v3 | |
| 9 ''' | |
| 10 | |
| 11 from __future__ import division | |
| 12 import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time | |
| 13 import itertools | |
| 14 import math | |
| 15 from decimal import Decimal | |
| 16 from Bio import SeqIO | |
| 17 from Bio.Seq import Seq | |
| 18 from numpy import arange, std, array, linspace, average | |
| 19 #from matplotlib import pyplot as pl | |
| 20 import matplotlib | |
| 21 matplotlib.use('Agg') | |
| 22 import matplotlib.pyplot as pl | |
| 23 from matplotlib import font_manager | |
| 24 from matplotlib import colors | |
| 25 import csv | |
| 26 from scipy import stats | |
| 27 from collections import OrderedDict | |
| 28 # #libraries for debugg | |
| 29 # import pdb | |
| 30 # import cPickle | |
| 31 | |
| 32 def stop_err(msg): | |
| 33 sys.stderr.write("%s\n" % msg) | |
| 34 sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) | |
| 35 sys.exit() | |
| 36 | |
| 37 | |
| 38 def store_gff(gff): | |
| 39 ''' | |
| 40 parse and store gff file in a dictionnary | |
| 41 ''' | |
| 42 try: | |
| 43 GFF = OrderedDict({}) | |
| 44 with open(gff, 'r') as f_gff : | |
| 45 # GFF['order'] = [] | |
| 46 for line in f_gff: | |
| 47 # # switch commented lines | |
| 48 head = line.split("#")[0] | |
| 49 if head != "" : | |
| 50 feature = (line.split('\t')[8]).split(';') | |
| 51 chrom = line.split('\t')[0] | |
| 52 if chrom not in GFF : | |
| 53 GFF[chrom] = {} | |
| 54 # first line is already gene line : | |
| 55 if line.split('\t')[2] == 'gene' : | |
| 56 gene = feature[0].replace("ID=", "") | |
| 57 if re.search('gene', feature[2]) : | |
| 58 Name = feature[2].replace("gene=", "") | |
| 59 else : | |
| 60 Name = "Unknown" | |
| 61 # #get annotation | |
| 62 note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) | |
| 63 note = urllib.unquote(str(note)).replace("\n", "") | |
| 64 # # store gene information | |
| 65 # GFF['order'].append(gene) | |
| 66 GFF[chrom][gene] = {} | |
| 67 GFF[chrom][gene]['chrom'] = line.split('\t')[0] | |
| 68 GFF[chrom][gene]['start'] = int(line.split('\t')[3]) | |
| 69 GFF[chrom][gene]['stop'] = int(line.split('\t')[4]) | |
| 70 GFF[chrom][gene]['strand'] = line.split('\t')[6] | |
| 71 GFF[chrom][gene]['name'] = Name | |
| 72 GFF[chrom][gene]['note'] = note | |
| 73 GFF[chrom][gene]['exon'] = {} | |
| 74 GFF[chrom][gene]['exon_number'] = 0 | |
| 75 # print Name | |
| 76 elif line.split('\t')[2] == 'CDS' : | |
| 77 gene = re.sub(r"Parent\=(.+)_mRNA", r"\1", feature[0]) | |
| 78 if GFF[chrom].has_key(gene) : | |
| 79 GFF[chrom][gene]['exon_number'] += 1 | |
| 80 exon_number = GFF[chrom][gene]['exon_number'] | |
| 81 GFF[chrom][gene]['exon'][exon_number] = {} | |
| 82 GFF[chrom][gene]['exon'][exon_number]['frame'] = line.split('\t')[7] | |
| 83 GFF[chrom][gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) | |
| 84 GFF[chrom][gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) | |
| 85 # # if there is a five prim UTR intron, we change start of gene | |
| 86 elif line.split('\t')[2] == 'five_prime_UTR_intron' : | |
| 87 if GFF[chrom][gene]['strand'] == "+" : | |
| 88 GFF[chrom][gene]['start'] = GFF[chrom][gene]['exon'][1]['start'] | |
| 89 else : | |
| 90 GFF[chrom][gene]['stop'] = GFF[chrom][gene]['exon'][exon_number]['stop'] | |
| 91 return GFF | |
| 92 except Exception, e: | |
| 93 stop_err('Error during gff storage : ' + str(e)) | |
| 94 | |
| 95 #chrI SGD gene 87286 87752 . + . ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689 | |
| 96 #7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra | |
| 97 #ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr | |
| 98 #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified | |
| 99 #chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified | |
| 100 #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified | |
| 101 | |
| 102 | |
| 103 | |
| 104 def init_codon_dict(): | |
| 105 | |
| 106 Codon_dict = OrderedDict([('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0), ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0), ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0), ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0), ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0), ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0), ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0), ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)]) | |
| 107 return Codon_dict | |
| 108 | |
| 109 | |
| 110 | |
| 111 def get_codon_usage(bamfile, GFF, site, kmer, a_pos): | |
| 112 ''' | |
| 113 Read GFF dict and get gene codon usage. | |
| 114 Return dict of codons usage | |
| 115 ''' | |
| 116 try: | |
| 117 codon = init_codon_dict() | |
| 118 | |
| 119 for chrom in GFF.iterkeys(): | |
| 120 for gene in GFF[chrom] : | |
| 121 codon_dict = init_codon_dict() | |
| 122 start = GFF[chrom][gene]['start'] | |
| 123 stop = GFF[chrom][gene]['stop'] | |
| 124 region = chrom + ':' + str(start) + '-' + str(stop) | |
| 125 # #get all reads in this gene | |
| 126 reads = subprocess.check_output(["samtools", "view", bamfile, region]) | |
| 127 head = subprocess.check_output(["samtools", "view", "-H", bamfile]) | |
| 128 read_tab = reads.split('\n') | |
| 129 for read in read_tab: | |
| 130 # # search mapper for eliminate multiple alignements | |
| 131 if 'bowtie' in head: | |
| 132 multi_tag = "XS:i:" | |
| 133 elif 'bwa' in head: | |
| 134 multi_tag = "XT:A:R" | |
| 135 else : | |
| 136 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") | |
| 137 if len(read) == 0: | |
| 138 continue | |
| 139 | |
| 140 len_read = len(read.split('\t')[9]) | |
| 141 # if it's read of good length | |
| 142 if len_read == kmer and multi_tag not in read: | |
| 143 feat = read.split('\t') | |
| 144 seq = feat[9] | |
| 145 # if it's a reverse read | |
| 146 if feat[1] == '16' : | |
| 147 if site == "A" : | |
| 148 # #get A-site | |
| 149 cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement()) | |
| 150 elif site == "P" : | |
| 151 # #get P-site | |
| 152 cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement()) | |
| 153 else : | |
| 154 # #get site-E | |
| 155 cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement()) | |
| 156 # # test if it's a true codon not a CNG codon for example | |
| 157 if codon_dict.has_key(cod) : | |
| 158 codon_dict[cod] += 1 | |
| 159 # if it's a forward read | |
| 160 elif feat[1] == '0' : | |
| 161 if site == "A" : | |
| 162 # #get A-site | |
| 163 cod = seq[a_pos:a_pos+3] | |
| 164 elif site == "P" : | |
| 165 # #get P-site | |
| 166 cod = seq[a_pos-3:a_pos] | |
| 167 else : | |
| 168 # #get site-E | |
| 169 cod = seq[a_pos-6:a_pos-3] | |
| 170 if codon_dict.has_key(cod) : | |
| 171 codon_dict[cod] += 1 | |
| 172 # # add in global dict | |
| 173 for cod, count in codon_dict.iteritems() : | |
| 174 codon[cod] += count | |
| 175 | |
| 176 return codon | |
| 177 | |
| 178 except Exception, e: | |
| 179 stop_err('Error during codon usage calcul: ' + str(e)) | |
| 180 | |
| 181 | |
| 182 ''' | |
| 183 http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html | |
| 184 ''' | |
| 185 def violin_plot(ax, data, pos, bp=False): | |
| 186 ''' | |
| 187 create violin plots on an axis | |
| 188 ''' | |
| 189 dist = max(pos) - min(pos) | |
| 190 w = min(0.15 * max(dist, 1.0), 0.5) | |
| 191 for d, p in zip(data, pos): | |
| 192 k = stats.gaussian_kde(d) # calculates the kernel density | |
| 193 m = k.dataset.min() # lower bound of violin | |
| 194 M = k.dataset.max() # upper bound of violin | |
| 195 x = arange(m, M, (M - m) / 100.) # support for violin | |
| 196 v = k.evaluate(x) # violin profile (density curve) | |
| 197 v = v / v.max() * w # scaling the violin to the available space | |
| 198 ax.fill_betweenx(x, p, v + p, facecolor=color1, alpha=0.3) | |
| 199 ax.fill_betweenx(x, p, -v + p, facecolor=color2, alpha=0.3) | |
| 200 if bp: | |
| 201 ax.boxplot(data, notch=1, positions=pos, vert=1) | |
| 202 | |
| 203 | |
| 204 | |
| 205 ''' | |
| 206 http://log.ooz.ie/2013/02/matplotlib-comparative-histogram-recipe.html | |
| 207 ''' | |
| 208 def comphist(x1, x2, orientation='vertical', **kwargs): | |
| 209 """Draw a comparative histogram.""" | |
| 210 # Split keyword args: | |
| 211 kwargs1 = {} | |
| 212 kwargs2 = {} | |
| 213 kwcommon = {} | |
| 214 for arg in kwargs: | |
| 215 tgt_arg = arg[:-1] | |
| 216 if arg.endswith('1'): | |
| 217 arg_dict = kwargs1 | |
| 218 elif arg.endswith('2'): | |
| 219 arg_dict = kwargs2 | |
| 220 else: | |
| 221 arg_dict = kwcommon | |
| 222 tgt_arg = arg | |
| 223 arg_dict[tgt_arg] = kwargs[arg] | |
| 224 kwargs1.update(kwcommon) | |
| 225 kwargs2.update(kwcommon) | |
| 226 | |
| 227 fig = pl.figure() | |
| 228 | |
| 229 # Have both histograms share one axis. | |
| 230 if orientation == 'vertical': | |
| 231 ax1 = pl.subplot(211) | |
| 232 ax2 = pl.subplot(212, sharex=ax1) | |
| 233 # Flip the ax2 histogram horizontally. | |
| 234 ax2.set_ylim(ax1.get_ylim()[::-1]) | |
| 235 pl.setp(ax1.get_xticklabels(), visible=False) | |
| 236 legend_loc = (1, 4) | |
| 237 else: | |
| 238 ax1 = pl.subplot(122) | |
| 239 ax2 = pl.subplot(121, sharey=ax1) | |
| 240 # Flip the ax2 histogram vertically. | |
| 241 ax2.set_xlim(ax2.get_xlim()[::-1]) | |
| 242 pl.setp(ax1.get_yticklabels(), visible=False) | |
| 243 legend_loc = (1, 2) | |
| 244 | |
| 245 ax1.hist(x1, orientation=orientation, **kwargs1) | |
| 246 ax2.hist(x2, orientation=orientation, **kwargs2) | |
| 247 ax2.set_ylim(ax1.get_ylim()[::-1]) | |
| 248 ax1.legend(loc=legend_loc[0]) | |
| 249 ax2.legend(loc=legend_loc[1]) | |
| 250 # Tighten up the layout. | |
| 251 pl.subplots_adjust(wspace=0.0, hspace=0.0) | |
| 252 return fig | |
| 253 | |
| 254 | |
| 255 def compute_FC_plot(cond1_norm, cond2_norm, cod_name, codon_to_test, dirout): | |
| 256 | |
| 257 FC_tab = [] | |
| 258 for z, y in zip(cond1_norm.itervalues(), cond2_norm.itervalues()): | |
| 259 fc = z - y | |
| 260 FC_tab.append(fc) | |
| 261 # #codon_to_test = ['TGA','TAG','TAA'] | |
| 262 | |
| 263 a = [] | |
| 264 b = [] | |
| 265 cod = [] | |
| 266 for codon in cond1_norm.iterkeys(): | |
| 267 if codon in codon_to_test : | |
| 268 fc = cond1_norm[codon] - cond2_norm[codon] | |
| 269 b.append(fc) | |
| 270 cod.append(codon) | |
| 271 else : | |
| 272 fc = cond1_norm[codon] - cond2_norm[codon] | |
| 273 a.append(fc) | |
| 274 | |
| 275 | |
| 276 fig = pl.figure(num=1) | |
| 277 comphist(array(a), array(b), label1='All codon', label2=cod_name, color2='green', bins=30, rwidth=1) | |
| 278 # pl.show() | |
| 279 pl.savefig(dirout + '/hist_codon_fc.png', format="png", dpi=340) | |
| 280 pl.clf() | |
| 281 | |
| 282 | |
| 283 # #violin plot | |
| 284 pos = range(2) | |
| 285 dat = array([array(a), array(b)]) | |
| 286 fig = pl.figure() | |
| 287 pl.title("Distribution of codons FoldChange between two conditions") | |
| 288 ax = fig.add_subplot(1, 1, 1) | |
| 289 lab = array(['All codons', cod_name]) | |
| 290 violin_plot(ax, dat, pos, bp=1) | |
| 291 for x, z in zip(dat, pos): | |
| 292 ax.plot(z, average(x), color='r', marker='*', markeredgecolor='r') | |
| 293 xtickNames = pl.setp(ax, xticklabels=lab) | |
| 294 pl.savefig(dirout + '/violinplot_codon.png', format="png", dpi=340) | |
| 295 pl.clf() | |
| 296 | |
| 297 # (Fval,pval) = stats.ttest_ind(a, b, axis=0, equal_var=True) | |
| 298 (Fval, pval) = stats.mannwhitneyu(a, b) | |
| 299 return pval | |
| 300 | |
| 301 | |
| 302 def get_aa_dict(cond1_norm, cond2_norm): | |
| 303 | |
| 304 # ## create amino acid dictionnary: | |
| 305 AA = OrderedDict({}) | |
| 306 AA['Phe'] = [cond1_norm['TTT'] + cond1_norm['TTC'], cond2_norm['TTT'] + cond2_norm['TTC']] | |
| 307 AA['Leu'] = [cond1_norm['TTA'] + cond1_norm['TTG'] + cond1_norm['CTT'] + cond1_norm['CTC'] + cond1_norm['CTA'] + cond1_norm['CTG'], cond2_norm['TTA'] + cond2_norm['TTG'] + cond2_norm['CTT'] + cond2_norm['CTC'] + cond2_norm['CTA'] + cond2_norm['CTG']] | |
| 308 AA['Ile'] = [cond1_norm['ATT'] + cond1_norm['ATC'] + cond1_norm['ATA'], cond2_norm['ATT'] + cond2_norm['ATC'] + cond2_norm['ATA']] | |
| 309 AA['Met'] = [cond1_norm['ATG'], cond2_norm['ATG']] | |
| 310 AA['Val'] = [cond1_norm['GTT'] + cond1_norm['GTC'] + cond1_norm['GTA'] + cond1_norm['GTG'] + cond1_norm['AGT'] + cond1_norm['AGC'], cond2_norm['GTT'] + cond2_norm['GTC'] + cond2_norm['GTA'] + cond2_norm['GTG'] + cond2_norm['AGT'] + cond2_norm['AGC']] | |
| 311 AA['Ser'] = [cond1_norm['TCT'] + cond1_norm['TCC'] + cond1_norm['TCA'] + cond1_norm['TCG'], cond2_norm['TCT'] + cond2_norm['TCC'] + cond2_norm['TCA'] + cond2_norm['TCG']] | |
| 312 AA['Pro'] = [cond1_norm['CCT'] + cond1_norm['CCC'] + cond1_norm['CCA'] + cond1_norm['CCG'], cond2_norm['CCT'] + cond2_norm['CCC'] + cond2_norm['CCA'] + cond2_norm['CCG']] | |
| 313 AA['Thr'] = [cond1_norm['ACT'] + cond1_norm['ACC'] + cond1_norm['ACA'] + cond1_norm['ACG'], cond2_norm['ACT'] + cond2_norm['ACC'] + cond2_norm['ACA'] + cond2_norm['ACG']] | |
| 314 AA['Ala'] = [cond1_norm['GCT'] + cond1_norm['GCC'] + cond1_norm['GCA'] + cond1_norm['GCG'], cond2_norm['GCT'] + cond2_norm['GCC'] + cond2_norm['GCA'] + cond2_norm['GCG']] | |
| 315 AA['Tyr'] = [cond1_norm['TAT'] + cond1_norm['TAC'], cond2_norm['TAT'] + cond2_norm['TAC']] | |
| 316 AA['Stop'] = [cond1_norm['TAA'] + cond1_norm['TAG'] + cond1_norm['TGA'], cond2_norm['TAA'] + cond2_norm['TAG'] + cond2_norm['TGA']] | |
| 317 AA['His'] = [cond1_norm['CAT'] + cond1_norm['CAC'], cond2_norm['CAT'] + cond2_norm['CAC']] | |
| 318 AA['Gln'] = [cond1_norm['CAA'] + cond1_norm['CAG'], cond2_norm['CAA'] + cond2_norm['CAG']] | |
| 319 AA['Asn'] = [cond1_norm['AAT'] + cond1_norm['AAC'], cond2_norm['AAT'] + cond2_norm['AAC']] | |
| 320 AA['Lys'] = [cond1_norm['AAA'] + cond1_norm['AAG'], cond2_norm['AAA'] + cond2_norm['AAG']] | |
| 321 AA['Asp'] = [cond1_norm['GAT'] + cond1_norm['GAC'], cond2_norm['GAT'] + cond2_norm['GAC']] | |
| 322 AA['Glu'] = [cond1_norm['GAA'] + cond1_norm['GAG'], cond2_norm['GAA'] + cond2_norm['GAG']] | |
| 323 AA['Cys'] = [cond1_norm['TGT'] + cond1_norm['TGC'], cond2_norm['TGT'] + cond2_norm['TGC']] | |
| 324 AA['Trp'] = [cond1_norm['TGG'], cond2_norm['TGG']] | |
| 325 AA['Arg'] = [cond1_norm['CGT'] + cond1_norm['CGC'] + cond1_norm['CGA'] + cond1_norm['CGG'] + cond1_norm['AGA'] + cond1_norm['AGG'], cond2_norm['CGT'] + cond2_norm['CGC'] + cond2_norm['CGA'] + cond2_norm['CGG'] + cond2_norm['AGA'] + cond2_norm['AGG']] | |
| 326 AA['Gly'] = [cond1_norm['GGT'] + cond1_norm['GGC'] + cond1_norm['GGA'] + cond1_norm['GGG'], cond2_norm['GGT'] + cond2_norm['GGC'] + cond2_norm['GGA'] + cond2_norm['GGG']] | |
| 327 | |
| 328 | |
| 329 return AA | |
| 330 | |
| 331 | |
| 332 | |
| 333 def plot_codon_usage(result, dirout, c1, c2, outfile, color1, color2): | |
| 334 ''' | |
| 335 Take list of dict of codon usage and use matplotlib for do graph | |
| 336 ''' | |
| 337 | |
| 338 # #if there are replicat | |
| 339 if len(result) == 4 : | |
| 340 # store each dict in variables to make code more readable | |
| 341 cond1_1 = result[0].copy() | |
| 342 cond1_2 = result[1].copy() | |
| 343 cond2_1 = result[2].copy() | |
| 344 cond2_2 = result[3].copy() | |
| 345 # get codon order in one of list | |
| 346 codon_sorted = sorted(cond1_1.iterkeys(), reverse=False) | |
| 347 # get max of each list | |
| 348 sum11 = sum(list(cond1_1.itervalues())) | |
| 349 sum12 = sum(list(cond1_2.itervalues())) | |
| 350 sum21 = sum(list(cond2_1.itervalues())) | |
| 351 sum22 = sum(list(cond2_2.itervalues())) | |
| 352 # for each codon, get values and sd in each condition | |
| 353 cond1_val = {} | |
| 354 cond1 = {} | |
| 355 cond2_val = {} | |
| 356 cond2 = {} | |
| 357 std_cond1 = [] | |
| 358 std_cond2 = [] | |
| 359 max_val = [] # # max value for graph | |
| 360 for i in codon_sorted: | |
| 361 # # cond1 = moyenne of replicats cond1 divided by max | |
| 362 cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2) | |
| 363 cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2) | |
| 364 # # standard deviation = absolute value of diffence between replicats of cond1 | |
| 365 std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)]))) | |
| 366 # # cond2 = moyenne of replicats cond1divided by max | |
| 367 cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2) | |
| 368 cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2) | |
| 369 # # standard deviation = absolute value of diffence between replicats of cond2 | |
| 370 std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)]))) | |
| 371 # # max value for each codon | |
| 372 max_val.append(max((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2, (cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)) | |
| 373 | |
| 374 # for graph design | |
| 375 cond1_norm = OrderedDict(sorted(cond1_val.items(), key=lambda t: t[0])) | |
| 376 cond1_norm.update ((x, y * 100) for x, y in cond1_norm.items()) | |
| 377 cond2_norm = OrderedDict(sorted(cond2_val.items(), key=lambda t: t[0])) | |
| 378 cond2_norm.update ((x, y * 100) for x, y in cond2_norm.items()) | |
| 379 max_val = [x * 100 for x in max_val] | |
| 380 | |
| 381 AA = get_aa_dict(cond1_norm, cond2_norm) | |
| 382 max_valaa = [] | |
| 383 cond1_aa = [] | |
| 384 cond2_aa = [] | |
| 385 aa_name = list(AA.iterkeys()) | |
| 386 for z in AA.itervalues(): | |
| 387 cond1_aa.append(z[0]) | |
| 388 cond2_aa.append(z[1]) | |
| 389 max_valaa.append(max(z)) | |
| 390 # # plot amino acid profile : | |
| 391 fig = pl.figure(figsize=(24, 10), num=1) | |
| 392 width = .50 | |
| 393 ax = fig.add_subplot(111) | |
| 394 ax.xaxis.set_ticks([]) | |
| 395 ind = arange(21) | |
| 396 pl.xlim(0, 21) | |
| 397 ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1) | |
| 398 ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2) | |
| 399 for x, y, z in zip(ind, max_valaa, aa_name): | |
| 400 ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14) | |
| 401 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)') | |
| 402 ax.set_xlabel('Amino Acid') | |
| 403 handles, labels = ax.get_legend_handles_labels() | |
| 404 ax.legend(handles, labels) | |
| 405 pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340) | |
| 406 pl.clf() | |
| 407 | |
| 408 | |
| 409 # # compute theorical count in COND2 | |
| 410 sum2 = (sum21 + sum22) / 2 | |
| 411 cond2_count = [] | |
| 412 for z in cond1_norm.itervalues() : | |
| 413 count = int(z * sum2 / 100) | |
| 414 cond2_count.append(count) | |
| 415 | |
| 416 expected = array(cond2_count) | |
| 417 observed = array(list(cond2.itervalues())) | |
| 418 | |
| 419 # write result | |
| 420 with open(outfile, 'w') as out : | |
| 421 out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC\tFC_' + c1 + '\tFC_' + c2 + '\n') | |
| 422 for i in codon_sorted: | |
| 423 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\t' + str((cond2_1[i] / sum21) / (cond1_1[i] / sum11)) + '\t' + str((cond2_2[i] / sum22) / (cond1_1[i] / sum11)) + '\n') | |
| 424 chi = stats.chisquare(observed, expected) | |
| 425 out.write('Khi2 test\n') | |
| 426 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') | |
| 427 | |
| 428 | |
| 429 | |
| 430 # plot result | |
| 431 fig = pl.figure(figsize=(24, 10), num=1) | |
| 432 width = .50 | |
| 433 ind = arange(len(codon_sorted)) | |
| 434 ax = fig.add_subplot(111) | |
| 435 pl.xlim(0, len(codon_sorted) + 1) | |
| 436 ax.spines['right'].set_color('none') | |
| 437 ax.spines['top'].set_color('none') | |
| 438 ax.xaxis.set_ticks([]) | |
| 439 ax.spines['left'].set_smart_bounds(True) | |
| 440 ax.yaxis.set_ticks_position('left') | |
| 441 ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, yerr=std_cond1, error_kw={'elinewidth':1, 'ecolor':'black'}, label=c1) | |
| 442 ax.bar(ind + width, list(cond2_norm.itervalues()), width, yerr=std_cond2, facecolor=color2, error_kw={'elinewidth':1, 'ecolor':'black'}, label=c2) | |
| 443 for x, y, z in zip(ind, max_val, codon_sorted): | |
| 444 ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8) | |
| 445 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)') | |
| 446 ax.set_xlabel('Codons') | |
| 447 handles, labels = ax.get_legend_handles_labels() | |
| 448 ax.legend(handles, labels) | |
| 449 pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340) | |
| 450 pl.clf() | |
| 451 | |
| 452 | |
| 453 | |
| 454 elif len(result) == 2 : | |
| 455 | |
| 456 # store each dict in OrderedDict sorted by key to make code more readable | |
| 457 cond1 = result[0] | |
| 458 cond2 = result[1] | |
| 459 cond1_norm = result[0].copy() | |
| 460 cond2_norm = result[1].copy() | |
| 461 # pdb.set_trace() | |
| 462 # get codon order in one of list | |
| 463 codon_sorted = sorted(cond1.iterkeys(), reverse=False) | |
| 464 | |
| 465 # get sum of each list | |
| 466 sum1 = sum(list(cond1.itervalues())) | |
| 467 sum2 = sum(list(cond2.itervalues())) | |
| 468 # #Normalize values by sum of each libraries | |
| 469 cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items()) | |
| 470 cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items()) | |
| 471 | |
| 472 # # compute theorical count in COND2 | |
| 473 cond2_count = [] | |
| 474 for z in cond1_norm.itervalues() : | |
| 475 count = int(z * sum2 / 100.0) | |
| 476 cond2_count.append(count) | |
| 477 | |
| 478 expected = array(cond2_count) | |
| 479 observed = array(list(cond2.itervalues())) | |
| 480 | |
| 481 AA = get_aa_dict(cond1_norm, cond2_norm) | |
| 482 | |
| 483 max_val = [] | |
| 484 cond1_aa = [] | |
| 485 cond2_aa = [] | |
| 486 aa_name = list(AA.iterkeys()) | |
| 487 for z in AA.itervalues(): | |
| 488 cond1_aa.append(z[0]) | |
| 489 cond2_aa.append(z[1]) | |
| 490 max_val.append(max(z)) | |
| 491 | |
| 492 # # plot amino acid profile : | |
| 493 fig = pl.figure(num=1) | |
| 494 width = .35 | |
| 495 ax = fig.add_subplot(111) | |
| 496 ind = arange(21) | |
| 497 pl.xlim(0, 21) | |
| 498 #kwargs = {"hatch":'x'} | |
| 499 #ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1, **kwargs) | |
| 500 #kwargs = {"hatch":'.'} | |
| 501 #ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2, **kwargs) | |
| 502 ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1) | |
| 503 ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2) | |
| 504 #for x, y, z in zip(ind, max_val, aa_name): | |
| 505 # ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14) | |
| 506 axis_font = {'size':'16'} | |
| 507 pl.xticks(ind + width, aa_name,**axis_font) | |
| 508 ax.spines['right'].set_visible(False) | |
| 509 ax.spines['top'].set_visible(False) | |
| 510 ax.yaxis.set_ticks_position('left') | |
| 511 ax.xaxis.set_ticks_position('bottom') | |
| 512 #ax.xaxis.set_ticks([]) | |
| 513 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)',**axis_font) | |
| 514 ax.set_xlabel('Amino Acids', **axis_font) | |
| 515 handles, labels = ax.get_legend_handles_labels() | |
| 516 font_prop = font_manager.FontProperties(size=12) | |
| 517 ax.legend(handles, labels, prop=font_prop) | |
| 518 pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340) | |
| 519 pl.clf() | |
| 520 | |
| 521 # write result | |
| 522 with open(outfile, 'w') as out : | |
| 523 out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC(Mut/WT)\n') | |
| 524 for i in codon_sorted: | |
| 525 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n') | |
| 526 out.write('Khi2 test\n') | |
| 527 chi = stats.chisquare(observed, expected) | |
| 528 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') | |
| 529 | |
| 530 # # get max value for each codon for histogram | |
| 531 max_val = [] # # max value for graph | |
| 532 for i in cond1: | |
| 533 # # max value for each codon | |
| 534 max_val.append(max(cond1_norm[i], cond2_norm[i])) | |
| 535 | |
| 536 # plot result | |
| 537 fig = pl.figure(figsize=(24, 10), num=1) | |
| 538 width = .50 | |
| 539 ind = arange(len(codon_sorted)) | |
| 540 ax = fig.add_subplot(111) | |
| 541 pl.xlim(0, len(codon_sorted) + 1) | |
| 542 ax.spines['right'].set_color('none') | |
| 543 ax.spines['top'].set_color('none') | |
| 544 ax.xaxis.set_ticks([]) | |
| 545 ax.spines['left'].set_smart_bounds(True) | |
| 546 ax.yaxis.set_ticks_position('left') | |
| 547 ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1) | |
| 548 ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2) | |
| 549 for x, y, z in zip(ind, max_val, codon_sorted): | |
| 550 ax.text(x + width, y + 0.02, '%s' % z, ha='center', va='bottom', fontsize=8) | |
| 551 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)') | |
| 552 ax.set_xlabel('Codons') | |
| 553 handles, labels = ax.get_legend_handles_labels() | |
| 554 ax.legend(handles, labels) | |
| 555 pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340) | |
| 556 pl.clf() | |
| 557 | |
| 558 | |
| 559 else : | |
| 560 stop_err('Error running codon usage plotting : ' + str(e)) | |
| 561 | |
| 562 | |
| 563 return (cond1_norm, cond2_norm, chi[1]) | |
| 564 | |
| 565 def write_html_file(html, chi_pval, cond1, cond2): | |
| 566 try : | |
| 567 | |
| 568 | |
| 569 html_str = """ | |
| 570 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" | |
| 571 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | |
| 572 | |
| 573 <html xmlns="http://www.w3.org/1999/xhtml"> | |
| 574 <head> | |
| 575 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 576 <link href="/static/june_2007_style/blue/base.css" media="screen" rel="Stylesheet" type="text/css" /> | |
| 577 </head> | |
| 578 <body> | |
| 579 <h3>Global visualization</h3> | |
| 580 <p> | |
| 581 <h5>Visualization of density footprint in each codon.</h5><br> If user has selected analyse with replicats, standart error deviation between each replicate as plotting as error bar in histogram.<br> | |
| 582 <img border="0" src="hist_codons.png" width="1040"/> | |
| 583 </p> | |
| 584 <p> | |
| 585 <h5>Test for homogeneity distribution between each condition</h5><br> | |
| 586 H0 : %s and %s are same distribution <br> | |
| 587 Khi2 test p-value: %s<br><br> | |
| 588 If p-value less than 0.05, we can reject homogeneity distribution so we can hypothesize that distributions are not the same. Otherwise, we accept H0<br> | |
| 589 | |
| 590 </p> | |
| 591 <p> | |
| 592 <h5>Visualization of density footprint in each codon groupe by amino acid</h5><br> | |
| 593 <img border="0" src="hist_amino_acid.png" width="1040"/> | |
| 594 </p> | |
| 595 </body> | |
| 596 </html> """ % (cond1,cond2,chi_pval) | |
| 597 | |
| 598 | |
| 599 html_file = open(html, "w") | |
| 600 html_file.write(html_str) | |
| 601 html_file.close() | |
| 602 | |
| 603 except Exception, e : | |
| 604 stop_err('Error during html page creation : ' + str(e)) | |
| 605 | |
| 606 | |
| 607 | |
| 608 | |
| 609 def check_codons_list (codons) : | |
| 610 | |
| 611 for codon in codons : | |
| 612 if codon not in init_codon_dict().iterkeys() : | |
| 613 stop_err('Please to enter a valid codon : ' + codon + ' is not find\n') | |
| 614 | |
| 615 | |
| 616 def check_index_bam (bamfile) : | |
| 617 # #testing indexed bam file | |
| 618 if os.path.isfile(bamfile + ".bai") : | |
| 619 pass | |
| 620 else : | |
| 621 cmd = "samtools index %s " % (bamfile) | |
| 622 proc = subprocess.Popen(args=cmd, shell=True, stderr=subprocess.PIPE) | |
| 623 returncode = proc.wait() | |
| 624 # if returncode != 0: | |
| 625 # raise Exception | |
| 626 | |
| 627 def __main__(): | |
| 628 ''' | |
| 629 python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -1 psiM1_sorted.bam,psiM2_sorted.bam -2 psiP1_sorted.bam,psiP2_sorted.bam -c psiM -C psiP -l TAG,TAA,TGA -r yes -o psi_count -d psi.html,html_dir > log2 | |
| 630 python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -c psiM -C psiP -1 RPF_psi-_28sorted.bam -2 RPF_psi+_28sorted.bam -l TAG,TAA,TGA -n Stop Codon -r no -o psi_count -d psi.html,html_dir > log2 | |
| 631 ''' | |
| 632 | |
| 633 # Parse command line options | |
| 634 parser = optparse.OptionParser() | |
| 635 parser.add_option("-g", "--gff", dest="gff", type="string", | |
| 636 help="gff file", metavar="FILE") | |
| 637 | |
| 638 parser.add_option("-1", "--file1", dest="file1", type="string", | |
| 639 help="Bam Ribo-Seq alignments cond 1, if rep option, separate files by commas ", metavar="FILE") | |
| 640 | |
| 641 parser.add_option("-2", "--file2", dest="file2", type="string", | |
| 642 help="Bam Ribo-Seq alignments cond 2, if rep option, separate files by commas", metavar="FILE") | |
| 643 | |
| 644 parser.add_option("-c", "--cond1", dest="c1", type="string", | |
| 645 help="Name for first condition", metavar="STR") | |
| 646 | |
| 647 parser.add_option("-C", "--cond2", dest="c2", type="string", | |
| 648 help="Name of second condition", metavar="STR") | |
| 649 | |
| 650 parser.add_option("-k", "--kmer", dest="kmer", type="int", | |
| 651 help="Longer of your phasing reads", metavar="INT") | |
| 652 | |
| 653 # parser.add_option("-l", "--list", dest="list_cod", type= "string", | |
| 654 # help="list of codons to compare to other", metavar="STR") | |
| 655 | |
| 656 parser.add_option("-o", "--out", dest="outfile", type="string", | |
| 657 help="write report to FILE", metavar="FILE") | |
| 658 | |
| 659 parser.add_option("-d", "--dirout", dest="dirout", type="string", | |
| 660 help="write report to PNG files", metavar="FILE") | |
| 661 | |
| 662 parser.add_option("-a", "--asite", dest="asite", type="int", | |
| 663 help="Off-set from the 5'end of the footprint to the A-site", metavar="INT") | |
| 664 | |
| 665 parser.add_option("-s", "--site", dest="site", type="string", | |
| 666 help="Script can compute in site A, P or E", metavar="A|P|E") | |
| 667 | |
| 668 parser.add_option("-r", "--rep", dest="rep", type="string", | |
| 669 help="if replicate or not", metavar="yes|no") | |
| 670 | |
| 671 parser.add_option("-x", "--hex_col1", dest="color1", type= "string", | |
| 672 help="Color for first condition", metavar="STR") | |
| 673 | |
| 674 parser.add_option("-X", "--hex_col2", dest="color2", type= "string", | |
| 675 help="Color for second condition", metavar="STR") | |
| 676 | |
| 677 parser.add_option("-q", "--quiet", | |
| 678 action="store_false", dest="verbose", default=True, | |
| 679 help="don't print status messages to stdout") | |
| 680 | |
| 681 (options, args) = parser.parse_args() | |
| 682 print "Begin codon frequency analysis at", time.asctime(time.localtime(time.time())) | |
| 683 | |
| 684 try: | |
| 685 authorized_site = ["A", "P", "E"] | |
| 686 if options.site not in authorized_site : | |
| 687 stop_err(options.site + ' is not a authorized ribosome site') | |
| 688 | |
| 689 ## Check if colors exist | |
| 690 if not colors.is_color_like(options.color1) : | |
| 691 stop_err( options.color1+' is not a proper color' ) | |
| 692 if not colors.is_color_like(options.color2) : | |
| 693 stop_err( options.color2+' is not a proper color' ) | |
| 694 | |
| 695 GFF = store_gff(options.gff) | |
| 696 | |
| 697 #### NOT USE IN FINAL VERSION | |
| 698 # # get codon list | |
| 699 # codons = options.list_cod.upper().split(',') | |
| 700 # check_codons_list(codons) | |
| 701 | |
| 702 # # get html file and directory : | |
| 703 (html, html_dir) = options.dirout.split(',') | |
| 704 if os.path.exists(html_dir): | |
| 705 raise | |
| 706 try: | |
| 707 os.mkdir(html_dir) | |
| 708 except: | |
| 709 raise Exception(html_dir + ' mkdir') | |
| 710 # #RUN analysis | |
| 711 # #If there are replicats | |
| 712 if options.rep == "yes" : | |
| 713 result = [] | |
| 714 # split name of each file options by "," | |
| 715 cond1 = options.file1.split(',') | |
| 716 cond2 = options.file2.split(',') | |
| 717 # # calcul for each file | |
| 718 for fh in itertools.chain(cond1, cond2): | |
| 719 check_index_bam (fh) | |
| 720 result.append(get_codon_usage(fh, GFF, options.site, options.kmer, options.asite)) | |
| 721 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2) | |
| 722 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir) | |
| 723 | |
| 724 | |
| 725 # #If there are no replicat | |
| 726 elif options.rep == "no" : | |
| 727 result = [] | |
| 728 # #calcul for each cond | |
| 729 for fh in (options.file1, options.file2): | |
| 730 check_index_bam (fh) | |
| 731 result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite)) | |
| 732 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2) | |
| 733 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir) | |
| 734 | |
| 735 else : | |
| 736 sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time()))) | |
| 737 sys.exit() | |
| 738 | |
| 739 # write_html_file(html,chi_pval,t_pval,codons,options.c1, options.c2) | |
| 740 write_html_file(html, chi_pval, options.c1, options.c2) | |
| 741 | |
| 742 print "Finish codon frequency analysis at", time.asctime(time.localtime(time.time())) | |
| 743 except Exception, e: | |
| 744 stop_err('Error running codon frequency analysis (main program) : ' + str(e)) | |
| 745 | |
| 746 | |
| 747 if __name__=="__main__": | |
| 748 __main__() |
