ribo_tools: get_codon_frequency.py comparison

comparison get_codon_frequency.py @ 17:c87c40e642af draft

Uploaded

author	rlegendre
date	Fri, 29 May 2015 09:17:29 -0400
parents	fcfdb2607cb8
children	385fc64fa988

comparison

equal deleted inserted replaced

-:fcfdb2607cb8
+:c87c40e642af
 matplotlib.use('Agg')
 import matplotlib.pyplot as pl
 from matplotlib import font_manager
 from matplotlib import colors
 import csv
-from scipy import stats
+from scipy import stats, errstate
 from collections import OrderedDict
 import ribo_functions
 import HTSeq
 # #libraries for debugg
 #import pdb
 if feature.type == 'gene' :
 codon_dict = init_codon_dict()
 chrom = feature.iv.chrom
 start = feature.iv.start
 stop = feature.iv.end
 if start+50 < stop-50 :
 region = chrom + ':' + str(start+50) + '-' + str(stop-50)
-else :
+# #get all reads in this gene
-break
+reads = subprocess.check_output(["samtools", "view", bamfile, region])
+head = subprocess.check_output(["samtools", "view", "-H", bamfile])
-## DEPRECATED
+read_tab = reads.split('\n')
-#for chrom in GFF.iterkeys():
+for read in read_tab:
-#for gene in GFF[chrom] :
+# # search mapper for eliminate multiple alignements
-# codon_dict = init_codon_dict()
+if 'bowtie' in head:
-#start = GFF[chrom][gene]['start']
+multi_tag = "XS:i:"
-#print start
+elif 'bwa' in  head:
-#stop = GFF[chrom][gene]['stop']
+multi_tag = "XT:A:R"
-#print stop
+elif 'TopHat' in  head:
-#region = chrom + ':' + str(start) + '-' + str(stop)
+tag = "NH:i:1"
-#######
+else :
-# #get all reads in this gene
+stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")
-reads = subprocess.check_output(["samtools", "view", bamfile, region])
-head = subprocess.check_output(["samtools", "view", "-H", bamfile])
+if len(read) == 0:
-read_tab = reads.split('\n')
+continue
-for read in read_tab:
+len_read = len(read.split('\t')[9])
-# # search mapper for eliminate multiple alignements
+# if it's read of good length
-if 'bowtie' in head:
+if len_read == kmer and (tag in read or multi_tag not in read):
-multi_tag = "XS:i:"
+feat = read.split('\t')
-elif 'bwa' in  head:
+seq = feat[9]
-multi_tag = "XT:A:R"
+# if it's a reverse read
-elif 'TopHat' in  head:
+if feat[1] == '16' :
-tag = "NH:i:1"
+if site == "A" :
-else :
+# #get A-site
-stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")
+cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement())
+elif site == "P" :
-if len(read) == 0:
+# #get P-site
-continue
+cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement())
-len_read = len(read.split('\t')[9])
+else :
-# if it's read of good length
+# #get site-E
-if len_read == kmer and (tag in read or multi_tag not in read):
+cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement())
-feat = read.split('\t')
+# # test if it's a true codon not a CNG codon for example
-seq = feat[9]
+if codon_dict.has_key(cod) :
-# if it's a reverse read
+codon_dict[cod] += 1
-if feat[1] == '16' :
+# if it's a forward read
-if site == "A" :
+elif feat[1] == '0' :
-# #get A-site
+if site == "A" :
-cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement())
+# #get A-site
-elif site == "P" :
+cod = seq[a_pos:a_pos+3]
-# #get P-site
+elif site == "P" :
-cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement())
+# #get P-site
-else :
+cod = seq[a_pos-3:a_pos]
-# #get site-E
+else :
-cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement())
+# #get site-E
-# # test if it's a true codon not a CNG codon for example
+cod = seq[a_pos-6:a_pos-3]
 if codon_dict.has_key(cod) :
 codon_dict[cod] += 1
-# if it's a forward read
+del(read)
-elif feat[1] == '0' :
-if site == "A" :
-# #get A-site
-cod = seq[a_pos:a_pos+3]
-elif site == "P" :
-# #get P-site
-cod = seq[a_pos-3:a_pos]
-else :
-# #get site-E
-cod = seq[a_pos-6:a_pos-3]
-if codon_dict.has_key(cod) :
-codon_dict[cod] += 1
-del(read)
 # # add in global dict
 for cod, count in codon_dict.iteritems() :
 codon[cod] += count
+if sum(codon.values()) == 0 :
-return codon
+stop_err('There are no reads aligning on annotated genes in your GFF file')
+else :
+return codon
 except Exception, e:
 stop_err('Error during codon usage calcul: ' + str(e))
 cond2 = {}
 std_cond1 = []
 std_cond2 = []
 max_val = []  # # max value for graph
 for i in codon_sorted:
-# # cond1 = moyenne of replicats cond1 divided by max
+# # cond1 = mean of replicats cond1 divided by max
 cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2)
 cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2)
-# # standard deviation = absolute value of diffence between replicats of cond1
+# # standard deviation = absolute value of difference between replicats of cond1
 std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)])))
-# # cond2 = moyenne of replicats cond1divided by max
+# # cond2 = mean of replicats cond1divided by max
 cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)
 cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2)
 # # standard deviation = absolute value of difference between replicats of cond2
 std_cond2.append(std(array([((cond2_1[i]) * 100 / sum21), ((cond2_2[i]) * 100 / sum22)])))
 # # max value for each codon
 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t1.0\n')
 elif cond1_norm[i] == 0 :
 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n')
 else:
 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
-chi = stats.chisquare(observed, expected)
+with errstate(all='ignore'):
+chi = stats.chisquare(observed, expected)
 out.write('Khi2 test\n')
 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
 sum2 = sum(list(cond2.itervalues()))
 # #Normalize values by sum of each libraries
 cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items())
 cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items())
 except ZeroDivisionError:
-stop_err("Not enough reads to compute the codon occupancy")
+stop_err("Not enough reads to compute the codon occupancy. "+str(sum1)+" and "+str(sum2)+" reads are used for each condition, respectively.\n")
 # # compute theorical count in COND2
 cond2_count = []
 for z in cond1_norm.itervalues() :
 count = int(z * sum2 / 100.0)
 elif cond1_norm[i] == 0 :
 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n')
 else:
 out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
 out.write('Khi2 test\n')
-chi = stats.chisquare(observed, expected)
+with errstate(all='ignore'):
+chi = stats.chisquare(observed, expected)
 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
 # # get max value for each codon for histogram
 max_val = []  # # max value for graph
 for i in cond1:
 def plot_fc (cond1, cond2, site, dirout):
 fc = cond1.copy()
 for key, value in fc.iteritems():
-fc[key] = cond2[key]/cond1[key]
+if cond1[key] == 0:
+fc[key] = 1
+else:
+fc[key] = cond2[key]/cond1[key]
 index = arange(len(fc.keys()))
 label = fc.keys()
 label = [w.replace('T','U') for w in label]
 pl.figure(figsize=(15,10), num=1)
 # codons = options.list_cod.upper().split(',')
 # check_codons_list(codons)
 GFF = HTSeq.GFF_Reader(options.gff)
 # # get html file and directory :
 (html, html_dir) = options.dirout.split(',')
-if os.path.exists(html_dir):
+if not os.path.exists(html_dir):
-raise
+try:
-try:
+os.mkdir(html_dir)
-os.mkdir(html_dir)
+except Exception, e :
-except:
+stop_err('Error running make directory : ' + str(e))
-raise Exception(html_dir + ' mkdir')
 # #RUN analysis
 # #If there are replicats
 if options.rep == "yes" :
 result = []
 # split name of each file options by ","
 # #calcul for each cond
 for fh in (options.file1, options.file2):
 check_index_bam (fh)
 result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite))
 (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
 # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
 plot_fc (cond1, cond2, options.site, html_dir)
 else :
 sys.stderr.write("Please enter yes or no for --rep option. Programme aborted at %s" % time.asctime(time.localtime(time.time())))
 sys.exit()

Mercurial > repos > rlegendre > ribo_tools

comparison get_codon_frequency.py @ 17:c87c40e642af draft