ribo_tools: get_codon_frequency.py comparison

comparison get_codon_frequency.py @ 10:707807fee542

(none)

author	rlegendre
date	Thu, 22 Jan 2015 14:34:38 +0100
parents	b8c070add3b7
children	7c944fd9907e

comparison

equal deleted inserted replaced

-:d7739f797a26
+:707807fee542
 from matplotlib import font_manager
 from matplotlib import colors
 import csv
 from scipy import stats
 from collections import OrderedDict
+import ribo_functions
+import HTSeq
 # # libraries for debugging
-# import pdb
+import pdb
 # import cPickle
 def stop_err(msg):
     '''
     Report a fatal error on stderr and terminate the program.

     Writes msg, then a timestamped "Programme aborted" notice, to standard
     error and raises SystemExit with status 1.

     msg : text describing the failure (any object usable with %s).
     '''
     # wrap msg in a 1-tuple so that a tuple message is printed as-is
     # instead of being unpacked by the % operator
     sys.stderr.write("%s\n" % (msg,))
     # time.asctime() with no argument formats the current local time
     sys.stderr.write("Programme aborted at %s\n" % time.asctime())
     # exit with a non-zero status: a bare sys.exit() would report success
     # (status 0) to the calling process even though the run failed
     sys.exit(1)
 def store_gff(gff):
 '''
 Parse a GFF file and store it in a dictionary.
 '''
 try:
 #7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra
 #ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr
 #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
 #chrI    SGD     CDS     87286   87387   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
 #chrI    SGD     CDS     87501   87752   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
 def init_codon_dict():
     '''
     Build a fresh codon counter.

     Return a new OrderedDict mapping each of the 64 DNA codons
     ('AAA', 'AAC', ..., 'TTT', in alphabetical order) to 0. A new object
     is created on every call, so callers can update their copy freely.
     '''
     # bases listed alphabetically so the generated keys follow the same
     # order as the previously hand-written table
     bases = 'ACGT'
     return OrderedDict((first + second + third, 0)
                        for first in bases
                        for second in bases
                        for third in bases)
 Iterate over the GFF annotation and compute codon usage for each gene.
 Return a dict of codon usage.
 '''
 try:
 codon = init_codon_dict()
+for feature in GFF :
-for chrom in GFF.iterkeys():
+if feature.type == 'gene' :
-for gene in GFF[chrom] :
 codon_dict = init_codon_dict()
-start = GFF[chrom][gene]['start']
+chrom = feature.iv.chrom
-stop = GFF[chrom][gene]['stop']
+start = feature.iv.start
+stop = feature.iv.end
 region = chrom + ':' + str(start) + '-' + str(stop)
+## DEPRECATED
+#for chrom in GFF.iterkeys():
+#for gene in GFF[chrom] :
+# codon_dict = init_codon_dict()
+#start = GFF[chrom][gene]['start']
+#print start
+#stop = GFF[chrom][gene]['stop']
+#print stop
+#region = chrom + ':' + str(start) + '-' + str(stop)
+#######
 # #get all reads in this gene
 reads = subprocess.check_output(["samtools", "view", bamfile, region])
 head = subprocess.check_output(["samtools", "view", "-H", bamfile])
 read_tab = reads.split('\n')
 for read in read_tab:
 # # identify the mapper in order to eliminate multiple alignments
 if 'bowtie' in head:
 multi_tag = "XS:i:"
 elif 'bwa' in  head:
 multi_tag = "XT:A:R"
 else :
 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")
 if len(read) == 0:
 continue
 len_read = len(read.split('\t')[9])
 # keep the read only if it has the expected length and is not multi-mapped
 if len_read == kmer and multi_tag not in read:
 feat = read.split('\t')
 seq = feat[9]
 except Exception, e:
 stop_err('Error during codon usage calcul: ' + str(e))
 '''
 http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html
 '''
 def violin_plot(ax, data, pos, bp=False):
 '''
 cond2_aa.append(z[1])
 max_val.append(max(z))
 # # plot amino acid profile :
 fig = pl.figure(num=1)
-width = .35
+width = .45
 ax = fig.add_subplot(111)
 ind = arange(21)
 pl.xlim(0, 21)
 #kwargs = {"hatch":'x'}
 #ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1, **kwargs)
 #ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2, **kwargs)
 ax.bar(ind, cond1_aa, width, facecolor=color1, label=c1)
 ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)
 #for x, y, z in zip(ind, max_val, aa_name):
 #    ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
-axis_font = {'size':'16'}
+axis_font = {'size':'10'}
 pl.xticks(ind + width, aa_name,**axis_font)
 ax.spines['right'].set_visible(False)
 ax.spines['top'].set_visible(False)
 ax.yaxis.set_ticks_position('left')
 ax.xaxis.set_ticks_position('bottom')
 #ax.xaxis.set_ticks([])
 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)',**axis_font)
 ax.set_xlabel('Amino Acids', **axis_font)
 handles, labels = ax.get_legend_handles_labels()
-font_prop = font_manager.FontProperties(size=12)
+font_prop = font_manager.FontProperties(size=8)
 ax.legend(handles, labels, prop=font_prop)
 pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
 pl.clf()
 # write result
 for i in cond1:
 # # max value for each codon
 max_val.append(max(cond1_norm[i], cond2_norm[i]))
 # plot result
-fig = pl.figure(figsize=(24, 10), num=1)
+fig = pl.figure(figsize=(30, 10), num=1)
-width = .50
+#fig = pl.figure(num=1)
+width = .40
 ind = arange(len(codon_sorted))
 ax = fig.add_subplot(111)
 pl.xlim(0, len(codon_sorted) + 1)
 ax.spines['right'].set_color('none')
 ax.spines['top'].set_color('none')
 returncode = proc.wait()
 # if returncode != 0:
 #    raise Exception
 def __main__():
-'''
-python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -1 psiM1_sorted.bam,psiM2_sorted.bam -2 psiP1_sorted.bam,psiP2_sorted.bam -c psiM -C psiP -l TAG,TAA,TGA -r yes -o psi_count -d psi.html,html_dir > log2
-python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -c psiM -C psiP -1 RPF_psi-_28sorted.bam -2 RPF_psi+_28sorted.bam -l TAG,TAA,TGA -n Stop Codon -r no -o psi_count -d psi.html,html_dir > log2
-'''
 # Parse command line options
 parser = optparse.OptionParser()
 parser.add_option("-g", "--gff", dest="gff", type="string",
 help="gff file", metavar="FILE")
 parser.add_option("-C", "--cond2", dest="c2", type="string",
 help="Name of second condition", metavar="STR")
 parser.add_option("-k", "--kmer", dest="kmer", type="int",
-help="Longer of your phasing reads", metavar="INT")
+help="Length of your phasing reads", metavar="INT")
 #     parser.add_option("-l", "--list", dest="list_cod", type= "string",
 #                   help="list of codons to compare to other", metavar="STR")
 parser.add_option("-o", "--out", dest="outfile", type="string",
 if not colors.is_color_like(options.color1) :
 stop_err( options.color1+' is not a proper color' )
 if not colors.is_color_like(options.color2) :
 stop_err( options.color2+' is not a proper color' )
-GFF = store_gff(options.gff)
+## identify GFF or GTF format from 9th column
+#with open (options.gff,"r") as gffile :
+#    for line in gffile :
+#        if '#' in line :
+#            ## skip header
+#            gffile.next()
+#       elif 'gene_id' in line :
+#            ## launch gtf reader :
+#            GFF = ribo_functions.store_gtf(options.gff)
+#            break
+#        elif 'ID=' in line :
+#            ## launch gff reader
+#            GFF = ribo_functions.store_gff(options.gff)
+#            break
+#        else :
+#            stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
+#GFF = store_gff(options.gff)
+#GFF = ribo_functions.store_gtf(options.gff)
+## check gff reading
+#if not GFF['order'] :
+#   stop_err( 'Incorrect GFF file' + str( e ) )
 #### NOT USED IN FINAL VERSION
 # # get codon list
 # codons = options.list_cod.upper().split(',')
 # check_codons_list(codons)
+GFF = HTSeq.GFF_Reader(options.gff)
 # # get html file and directory :
 (html, html_dir) = options.dirout.split(',')
 if os.path.exists(html_dir):
 raise
 try:

Mercurial > repos > rlegendre > ribo_tools

comparison get_codon_frequency.py @ 10:707807fee542