comparison get_codon_frequency.py @ 17:c87c40e642af draft

Uploaded
author rlegendre
date Fri, 29 May 2015 09:17:29 -0400
parents fcfdb2607cb8
children 385fc64fa988
comparison
equal deleted inserted replaced
16:fcfdb2607cb8 17:c87c40e642af
21 matplotlib.use('Agg') 21 matplotlib.use('Agg')
22 import matplotlib.pyplot as pl 22 import matplotlib.pyplot as pl
23 from matplotlib import font_manager 23 from matplotlib import font_manager
24 from matplotlib import colors 24 from matplotlib import colors
25 import csv 25 import csv
26 from scipy import stats 26 from scipy import stats, errstate
27 from collections import OrderedDict 27 from collections import OrderedDict
28 import ribo_functions 28 import ribo_functions
29 import HTSeq 29 import HTSeq
30 # #libraries for debugg 30 # #libraries for debugg
31 #import pdb 31 #import pdb
58 if feature.type == 'gene' : 58 if feature.type == 'gene' :
59 codon_dict = init_codon_dict() 59 codon_dict = init_codon_dict()
60 chrom = feature.iv.chrom 60 chrom = feature.iv.chrom
61 start = feature.iv.start 61 start = feature.iv.start
62 stop = feature.iv.end 62 stop = feature.iv.end
63 if start+50 < stop-50 : 63 if start+50 < stop-50 :
64 region = chrom + ':' + str(start+50) + '-' + str(stop-50) 64 region = chrom + ':' + str(start+50) + '-' + str(stop-50)
65 else : 65 # #get all reads in this gene
66 break 66 reads = subprocess.check_output(["samtools", "view", bamfile, region])
67 67 head = subprocess.check_output(["samtools", "view", "-H", bamfile])
68 ## DEPRECATED 68 read_tab = reads.split('\n')
69 #for chrom in GFF.iterkeys(): 69 for read in read_tab:
70 #for gene in GFF[chrom] : 70 # # search mapper for eliminate multiple alignements
71 # codon_dict = init_codon_dict() 71 if 'bowtie' in head:
72 #start = GFF[chrom][gene]['start'] 72 multi_tag = "XS:i:"
73 #print start 73 elif 'bwa' in head:
74 #stop = GFF[chrom][gene]['stop'] 74 multi_tag = "XT:A:R"
75 #print stop 75 elif 'TopHat' in head:
76 #region = chrom + ':' + str(start) + '-' + str(stop) 76 tag = "NH:i:1"
77 ####### 77 else :
78 # #get all reads in this gene 78 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")
79 reads = subprocess.check_output(["samtools", "view", bamfile, region]) 79
80 head = subprocess.check_output(["samtools", "view", "-H", bamfile]) 80 if len(read) == 0:
81 read_tab = reads.split('\n') 81 continue
82 for read in read_tab: 82 len_read = len(read.split('\t')[9])
83 # # search mapper for eliminate multiple alignements 83 # if it's read of good length
84 if 'bowtie' in head: 84 if len_read == kmer and (tag in read or multi_tag not in read):
85 multi_tag = "XS:i:" 85 feat = read.split('\t')
86 elif 'bwa' in head: 86 seq = feat[9]
87 multi_tag = "XT:A:R" 87 # if it's a reverse read
88 elif 'TopHat' in head: 88 if feat[1] == '16' :
89 tag = "NH:i:1" 89 if site == "A" :
90 else : 90 # #get A-site
91 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") 91 cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement())
92 92 elif site == "P" :
93 if len(read) == 0: 93 # #get P-site
94 continue 94 cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement())
95 len_read = len(read.split('\t')[9]) 95 else :
96 # if it's read of good length 96 # #get site-E
97 if len_read == kmer and (tag in read or multi_tag not in read): 97 cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement())
98 feat = read.split('\t') 98 # # test if it's a true codon not a CNG codon for example
99 seq = feat[9] 99 if codon_dict.has_key(cod) :
100 # if it's a reverse read 100 codon_dict[cod] += 1
101 if feat[1] == '16' : 101 # if it's a forward read
102 if site == "A" : 102 elif feat[1] == '0' :
103 # #get A-site 103 if site == "A" :
104 cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement()) 104 # #get A-site
105 elif site == "P" : 105 cod = seq[a_pos:a_pos+3]
106 # #get P-site 106 elif site == "P" :
107 cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement()) 107 # #get P-site
108 else : 108 cod = seq[a_pos-3:a_pos]
109 # #get site-E 109 else :
110 cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement()) 110 # #get site-E
111 # # test if it's a true codon not a CNG codon for example 111 cod = seq[a_pos-6:a_pos-3]
112 if codon_dict.has_key(cod) : 112 if codon_dict.has_key(cod) :
113 codon_dict[cod] += 1 113 codon_dict[cod] += 1
114 # if it's a forward read 114 del(read)
115 elif feat[1] == '0' :
116 if site == "A" :
117 # #get A-site
118 cod = seq[a_pos:a_pos+3]
119 elif site == "P" :
120 # #get P-site
121 cod = seq[a_pos-3:a_pos]
122 else :
123 # #get site-E
124 cod = seq[a_pos-6:a_pos-3]
125 if codon_dict.has_key(cod) :
126 codon_dict[cod] += 1
127 del(read)
128 # # add in global dict 115 # # add in global dict
129 for cod, count in codon_dict.iteritems() : 116 for cod, count in codon_dict.iteritems() :
130 codon[cod] += count 117 codon[cod] += count
131 118 if sum(codon.values()) == 0 :
132 return codon 119 stop_err('There are no reads aligning on annotated genes in your GFF file')
120 else :
121 return codon
133 122
134 except Exception, e: 123 except Exception, e:
135 stop_err('Error during codon usage calcul: ' + str(e)) 124 stop_err('Error during codon usage calcul: ' + str(e))
136 125
137 126
315 cond2 = {} 304 cond2 = {}
316 std_cond1 = [] 305 std_cond1 = []
317 std_cond2 = [] 306 std_cond2 = []
318 max_val = [] # # max value for graph 307 max_val = [] # # max value for graph
319 for i in codon_sorted: 308 for i in codon_sorted:
320 # # cond1 = moyenne of replicats cond1 divided by max 309 # # cond1 = mean of replicats cond1 divided by max
321 cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2) 310 cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2)
322 cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2) 311 cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2)
323 # # standard deviation = absolute value of diffence between replicats of cond1 312 # # standard deviation = absolute value of difference between replicats of cond1
324 std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)]))) 313 std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)])))
325 # # cond2 = moyenne of replicats cond1divided by max 314 # # cond2 = mean of replicats cond1divided by max
326 cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2) 315 cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)
327 cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2) 316 cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2)
328 # # standard deviation = absolute value of difference between replicats of cond2 317 # # standard deviation = absolute value of difference between replicats of cond2
329 std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)]))) 318 std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)])))
330 # # max value for each codon 319 # # max value for each codon
386 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t1.0\n') 375 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t1.0\n')
387 elif cond1_norm[i] == 0 : 376 elif cond1_norm[i] == 0 :
388 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n') 377 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n')
389 else: 378 else:
390 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n') 379 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
391 chi = stats.chisquare(observed, expected) 380 with errstate(all='ignore'):
381 chi = stats.chisquare(observed, expected)
392 out.write('Khi2 test\n') 382 out.write('Khi2 test\n')
393 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') 383 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
394 384
395 385
396 386
433 sum2 = sum(list(cond2.itervalues())) 423 sum2 = sum(list(cond2.itervalues()))
434 # #Normalize values by sum of each libraries 424 # #Normalize values by sum of each libraries
435 cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items()) 425 cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items())
436 cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items()) 426 cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items())
437 except ZeroDivisionError: 427 except ZeroDivisionError:
438 stop_err("Not enough reads to compute the codon occupancy") 428 stop_err("Not enough reads to compute the codon occupancy. "+str(sum1)+" and "+str(sum2)+" reads are used for each condition, respectively.\n")
439 429
440 # # compute theorical count in COND2 430 # # compute theorical count in COND2
441 cond2_count = [] 431 cond2_count = []
442 for z in cond1_norm.itervalues() : 432 for z in cond1_norm.itervalues() :
443 count = int(z * sum2 / 100.0) 433 count = int(z * sum2 / 100.0)
495 elif cond1_norm[i] == 0 : 485 elif cond1_norm[i] == 0 :
496 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n') 486 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n')
497 else: 487 else:
498 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n') 488 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
499 out.write('Khi2 test\n') 489 out.write('Khi2 test\n')
500 chi = stats.chisquare(observed, expected) 490 with errstate(all='ignore'):
491 chi = stats.chisquare(observed, expected)
501 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') 492 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
502 493
503 # # get max value for each codon for histogram 494 # # get max value for each codon for histogram
504 max_val = [] # # max value for graph 495 max_val = [] # # max value for graph
505 for i in cond1: 496 for i in cond1:
601 def plot_fc (cond1, cond2, site, dirout): 592 def plot_fc (cond1, cond2, site, dirout):
602 593
603 fc = cond1.copy() 594 fc = cond1.copy()
604 595
605 for key, value in fc.iteritems(): 596 for key, value in fc.iteritems():
606 fc[key] = cond2[key]/cond1[key] 597 if cond1[key] == 0:
598 fc[key] = 1
599 else:
600 fc[key] = cond2[key]/cond1[key]
607 601
608 index = arange(len(fc.keys())) 602 index = arange(len(fc.keys()))
609 label = fc.keys() 603 label = fc.keys()
610 label = [w.replace('T','U') for w in label] 604 label = [w.replace('T','U') for w in label]
611 pl.figure(figsize=(15,10), num=1) 605 pl.figure(figsize=(15,10), num=1)
693 # codons = options.list_cod.upper().split(',') 687 # codons = options.list_cod.upper().split(',')
694 # check_codons_list(codons) 688 # check_codons_list(codons)
695 GFF = HTSeq.GFF_Reader(options.gff) 689 GFF = HTSeq.GFF_Reader(options.gff)
696 # # get html file and directory : 690 # # get html file and directory :
697 (html, html_dir) = options.dirout.split(',') 691 (html, html_dir) = options.dirout.split(',')
698 if os.path.exists(html_dir): 692 if not os.path.exists(html_dir):
699 raise 693 try:
700 try: 694 os.mkdir(html_dir)
701 os.mkdir(html_dir) 695 except Exception, e :
702 except: 696 stop_err('Error running make directory : ' + str(e))
703 raise Exception(html_dir + ' mkdir')
704 # #RUN analysis 697 # #RUN analysis
705 # #If there are replicats 698 # #If there are replicats
706 if options.rep == "yes" : 699 if options.rep == "yes" :
707 result = [] 700 result = []
708 # split name of each file options by "," 701 # split name of each file options by ","
722 # #calcul for each cond 715 # #calcul for each cond
723 for fh in (options.file1, options.file2): 716 for fh in (options.file1, options.file2):
724 check_index_bam (fh) 717 check_index_bam (fh)
725 result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite)) 718 result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite))
726 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2) 719 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
720
727 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir) 721 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
728 plot_fc (cond1, cond2, options.site, html_dir) 722 plot_fc (cond1, cond2, options.site, html_dir)
729 else : 723 else :
730 sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time()))) 724 sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time())))
731 sys.exit() 725 sys.exit()