Mercurial > repos > rlegendre > ribo_tools
comparison get_codon_frequency.py @ 17:c87c40e642af draft
Uploaded
| author | rlegendre |
|---|---|
| date | Fri, 29 May 2015 09:17:29 -0400 |
| parents | fcfdb2607cb8 |
| children | 385fc64fa988 |
comparison
equal
deleted
inserted
replaced
| 16:fcfdb2607cb8 | 17:c87c40e642af |
|---|---|
| 21 matplotlib.use('Agg') | 21 matplotlib.use('Agg') |
| 22 import matplotlib.pyplot as pl | 22 import matplotlib.pyplot as pl |
| 23 from matplotlib import font_manager | 23 from matplotlib import font_manager |
| 24 from matplotlib import colors | 24 from matplotlib import colors |
| 25 import csv | 25 import csv |
| 26 from scipy import stats | 26 from scipy import stats, errstate |
| 27 from collections import OrderedDict | 27 from collections import OrderedDict |
| 28 import ribo_functions | 28 import ribo_functions |
| 29 import HTSeq | 29 import HTSeq |
| 30 # #libraries for debugg | 30 # #libraries for debugg |
| 31 #import pdb | 31 #import pdb |
| 58 if feature.type == 'gene' : | 58 if feature.type == 'gene' : |
| 59 codon_dict = init_codon_dict() | 59 codon_dict = init_codon_dict() |
| 60 chrom = feature.iv.chrom | 60 chrom = feature.iv.chrom |
| 61 start = feature.iv.start | 61 start = feature.iv.start |
| 62 stop = feature.iv.end | 62 stop = feature.iv.end |
| 63 if start+50 < stop-50 : | 63 if start+50 < stop-50 : |
| 64 region = chrom + ':' + str(start+50) + '-' + str(stop-50) | 64 region = chrom + ':' + str(start+50) + '-' + str(stop-50) |
| 65 else : | 65 # #get all reads in this gene |
| 66 break | 66 reads = subprocess.check_output(["samtools", "view", bamfile, region]) |
| 67 | 67 head = subprocess.check_output(["samtools", "view", "-H", bamfile]) |
| 68 ## DEPRECATED | 68 read_tab = reads.split('\n') |
| 69 #for chrom in GFF.iterkeys(): | 69 for read in read_tab: |
| 70 #for gene in GFF[chrom] : | 70 # # search mapper for eliminate multiple alignements |
| 71 # codon_dict = init_codon_dict() | 71 if 'bowtie' in head: |
| 72 #start = GFF[chrom][gene]['start'] | 72 multi_tag = "XS:i:" |
| 73 #print start | 73 elif 'bwa' in head: |
| 74 #stop = GFF[chrom][gene]['stop'] | 74 multi_tag = "XT:A:R" |
| 75 #print stop | 75 elif 'TopHat' in head: |
| 76 #region = chrom + ':' + str(start) + '-' + str(stop) | 76 tag = "NH:i:1" |
| 77 ####### | 77 else : |
| 78 # #get all reads in this gene | 78 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") |
| 79 reads = subprocess.check_output(["samtools", "view", bamfile, region]) | 79 |
| 80 head = subprocess.check_output(["samtools", "view", "-H", bamfile]) | 80 if len(read) == 0: |
| 81 read_tab = reads.split('\n') | 81 continue |
| 82 for read in read_tab: | 82 len_read = len(read.split('\t')[9]) |
| 83 # # search mapper for eliminate multiple alignements | 83 # if it's read of good length |
| 84 if 'bowtie' in head: | 84 if len_read == kmer and (tag in read or multi_tag not in read): |
| 85 multi_tag = "XS:i:" | 85 feat = read.split('\t') |
| 86 elif 'bwa' in head: | 86 seq = feat[9] |
| 87 multi_tag = "XT:A:R" | 87 # if it's a reverse read |
| 88 elif 'TopHat' in head: | 88 if feat[1] == '16' : |
| 89 tag = "NH:i:1" | 89 if site == "A" : |
| 90 else : | 90 # #get A-site |
| 91 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") | 91 cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement()) |
| 92 | 92 elif site == "P" : |
| 93 if len(read) == 0: | 93 # #get P-site |
| 94 continue | 94 cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement()) |
| 95 len_read = len(read.split('\t')[9]) | 95 else : |
| 96 # if it's read of good length | 96 # #get site-E |
| 97 if len_read == kmer and (tag in read or multi_tag not in read): | 97 cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement()) |
| 98 feat = read.split('\t') | 98 # # test if it's a true codon not a CNG codon for example |
| 99 seq = feat[9] | 99 if codon_dict.has_key(cod) : |
| 100 # if it's a reverse read | 100 codon_dict[cod] += 1 |
| 101 if feat[1] == '16' : | 101 # if it's a forward read |
| 102 if site == "A" : | 102 elif feat[1] == '0' : |
| 103 # #get A-site | 103 if site == "A" : |
| 104 cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement()) | 104 # #get A-site |
| 105 elif site == "P" : | 105 cod = seq[a_pos:a_pos+3] |
| 106 # #get P-site | 106 elif site == "P" : |
| 107 cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement()) | 107 # #get P-site |
| 108 else : | 108 cod = seq[a_pos-3:a_pos] |
| 109 # #get site-E | 109 else : |
| 110 cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement()) | 110 # #get site-E |
| 111 # # test if it's a true codon not a CNG codon for example | 111 cod = seq[a_pos-6:a_pos-3] |
| 112 if codon_dict.has_key(cod) : | 112 if codon_dict.has_key(cod) : |
| 113 codon_dict[cod] += 1 | 113 codon_dict[cod] += 1 |
| 114 # if it's a forward read | 114 del(read) |
| 115 elif feat[1] == '0' : | |
| 116 if site == "A" : | |
| 117 # #get A-site | |
| 118 cod = seq[a_pos:a_pos+3] | |
| 119 elif site == "P" : | |
| 120 # #get P-site | |
| 121 cod = seq[a_pos-3:a_pos] | |
| 122 else : | |
| 123 # #get site-E | |
| 124 cod = seq[a_pos-6:a_pos-3] | |
| 125 if codon_dict.has_key(cod) : | |
| 126 codon_dict[cod] += 1 | |
| 127 del(read) | |
| 128 # # add in global dict | 115 # # add in global dict |
| 129 for cod, count in codon_dict.iteritems() : | 116 for cod, count in codon_dict.iteritems() : |
| 130 codon[cod] += count | 117 codon[cod] += count |
| 131 | 118 if sum(codon.values()) == 0 : |
| 132 return codon | 119 stop_err('There are no reads aligning on annotated genes in your GFF file') |
| 120 else : | |
| 121 return codon | |
| 133 | 122 |
| 134 except Exception, e: | 123 except Exception, e: |
| 135 stop_err('Error during codon usage calcul: ' + str(e)) | 124 stop_err('Error during codon usage calcul: ' + str(e)) |
| 136 | 125 |
| 137 | 126 |
| 315 cond2 = {} | 304 cond2 = {} |
| 316 std_cond1 = [] | 305 std_cond1 = [] |
| 317 std_cond2 = [] | 306 std_cond2 = [] |
| 318 max_val = [] # # max value for graph | 307 max_val = [] # # max value for graph |
| 319 for i in codon_sorted: | 308 for i in codon_sorted: |
| 320 # # cond1 = moyenne of replicats cond1 divided by max | 309 # # cond1 = mean of replicats cond1 divided by max |
| 321 cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2) | 310 cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2) |
| 322 cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2) | 311 cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2) |
| 323 # # standard deviation = absolute value of diffence between replicats of cond1 | 312 # # standard deviation = absolute value of difference between replicats of cond1 |
| 324 std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)]))) | 313 std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)]))) |
| 325 # # cond2 = moyenne of replicats cond1divided by max | 314 # # cond2 = mean of replicats cond1divided by max |
| 326 cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2) | 315 cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2) |
| 327 cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2) | 316 cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2) |
| 328 # # standard deviation = absolute value of difference between replicats of cond2 | 317 # # standard deviation = absolute value of difference between replicats of cond2 |
| 329 std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)]))) | 318 std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)]))) |
| 330 # # max value for each codon | 319 # # max value for each codon |
| 386 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t1.0\n') | 375 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t1.0\n') |
| 387 elif cond1_norm[i] == 0 : | 376 elif cond1_norm[i] == 0 : |
| 388 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n') | 377 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n') |
| 389 else: | 378 else: |
| 390 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n') | 379 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n') |
| 391 chi = stats.chisquare(observed, expected) | 380 with errstate(all='ignore'): |
| 381 chi = stats.chisquare(observed, expected) | |
| 392 out.write('Khi2 test\n') | 382 out.write('Khi2 test\n') |
| 393 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') | 383 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') |
| 394 | 384 |
| 395 | 385 |
| 396 | 386 |
| 433 sum2 = sum(list(cond2.itervalues())) | 423 sum2 = sum(list(cond2.itervalues())) |
| 434 # #Normalize values by sum of each libraries | 424 # #Normalize values by sum of each libraries |
| 435 cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items()) | 425 cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items()) |
| 436 cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items()) | 426 cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items()) |
| 437 except ZeroDivisionError: | 427 except ZeroDivisionError: |
| 438 stop_err("Not enough reads to compute the codon occupancy") | 428 stop_err("Not enough reads to compute the codon occupancy. "+str(sum1)+" and "+str(sum2)+" reads are used for each condition, respectively.\n") |
| 439 | 429 |
| 440 # # compute theorical count in COND2 | 430 # # compute theorical count in COND2 |
| 441 cond2_count = [] | 431 cond2_count = [] |
| 442 for z in cond1_norm.itervalues() : | 432 for z in cond1_norm.itervalues() : |
| 443 count = int(z * sum2 / 100.0) | 433 count = int(z * sum2 / 100.0) |
| 495 elif cond1_norm[i] == 0 : | 485 elif cond1_norm[i] == 0 : |
| 496 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n') | 486 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n') |
| 497 else: | 487 else: |
| 498 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n') | 488 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n') |
| 499 out.write('Khi2 test\n') | 489 out.write('Khi2 test\n') |
| 500 chi = stats.chisquare(observed, expected) | 490 with errstate(all='ignore'): |
| 491 chi = stats.chisquare(observed, expected) | |
| 501 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') | 492 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') |
| 502 | 493 |
| 503 # # get max value for each codon for histogram | 494 # # get max value for each codon for histogram |
| 504 max_val = [] # # max value for graph | 495 max_val = [] # # max value for graph |
| 505 for i in cond1: | 496 for i in cond1: |
| 601 def plot_fc (cond1, cond2, site, dirout): | 592 def plot_fc (cond1, cond2, site, dirout): |
| 602 | 593 |
| 603 fc = cond1.copy() | 594 fc = cond1.copy() |
| 604 | 595 |
| 605 for key, value in fc.iteritems(): | 596 for key, value in fc.iteritems(): |
| 606 fc[key] = cond2[key]/cond1[key] | 597 if cond1[key] == 0: |
| 598 fc[key] = 1 | |
| 599 else: | |
| 600 fc[key] = cond2[key]/cond1[key] | |
| 607 | 601 |
| 608 index = arange(len(fc.keys())) | 602 index = arange(len(fc.keys())) |
| 609 label = fc.keys() | 603 label = fc.keys() |
| 610 label = [w.replace('T','U') for w in label] | 604 label = [w.replace('T','U') for w in label] |
| 611 pl.figure(figsize=(15,10), num=1) | 605 pl.figure(figsize=(15,10), num=1) |
| 693 # codons = options.list_cod.upper().split(',') | 687 # codons = options.list_cod.upper().split(',') |
| 694 # check_codons_list(codons) | 688 # check_codons_list(codons) |
| 695 GFF = HTSeq.GFF_Reader(options.gff) | 689 GFF = HTSeq.GFF_Reader(options.gff) |
| 696 # # get html file and directory : | 690 # # get html file and directory : |
| 697 (html, html_dir) = options.dirout.split(',') | 691 (html, html_dir) = options.dirout.split(',') |
| 698 if os.path.exists(html_dir): | 692 if not os.path.exists(html_dir): |
| 699 raise | 693 try: |
| 700 try: | 694 os.mkdir(html_dir) |
| 701 os.mkdir(html_dir) | 695 except Exception, e : |
| 702 except: | 696 stop_err('Error running make directory : ' + str(e)) |
| 703 raise Exception(html_dir + ' mkdir') | |
| 704 # #RUN analysis | 697 # #RUN analysis |
| 705 # #If there are replicats | 698 # #If there are replicats |
| 706 if options.rep == "yes" : | 699 if options.rep == "yes" : |
| 707 result = [] | 700 result = [] |
| 708 # split name of each file options by "," | 701 # split name of each file options by "," |
| 722 # #calcul for each cond | 715 # #calcul for each cond |
| 723 for fh in (options.file1, options.file2): | 716 for fh in (options.file1, options.file2): |
| 724 check_index_bam (fh) | 717 check_index_bam (fh) |
| 725 result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite)) | 718 result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite)) |
| 726 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2) | 719 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2) |
| 720 | |
| 727 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir) | 721 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir) |
| 728 plot_fc (cond1, cond2, options.site, html_dir) | 722 plot_fc (cond1, cond2, options.site, html_dir) |
| 729 else : | 723 else : |
| 730 sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time()))) | 724 sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time()))) |
| 731 sys.exit() | 725 sys.exit() |
