diff get_codon_frequency.py @ 10:707807fee542

(none)
author rlegendre
date Thu, 22 Jan 2015 14:34:38 +0100
parents b8c070add3b7
children 7c944fd9907e
line wrap: on
line diff
--- a/get_codon_frequency.py	Tue Dec 09 09:43:07 2014 -0500
+++ b/get_codon_frequency.py	Thu Jan 22 14:34:38 2015 +0100
@@ -25,8 +25,10 @@
 import csv
 from scipy import stats
 from collections import OrderedDict
+import ribo_functions
+import HTSeq
 # #libraries for debugg
-# import pdb
+import pdb
 # import cPickle
 
 def stop_err(msg):
@@ -34,7 +36,6 @@
     sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
     sys.exit()
 
-    
 def store_gff(gff):
     '''
         parse and store gff file in a dictionnary
@@ -98,8 +99,7 @@
 #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
 #chrI    SGD     CDS     87286   87387   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
 #chrI    SGD     CDS     87501   87752   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified    
-
-
+    
 
 def init_codon_dict():
     
@@ -115,17 +115,28 @@
     '''
     try:
         codon = init_codon_dict()
-                    
-        for chrom in GFF.iterkeys():
-            for gene in GFF[chrom] :
+        for feature in GFF :
+            if feature.type == 'gene' :
                 codon_dict = init_codon_dict()
-                start = GFF[chrom][gene]['start']
-                stop = GFF[chrom][gene]['stop']
+                chrom = feature.iv.chrom
+                start = feature.iv.start
+                stop = feature.iv.end 
                 region = chrom + ':' + str(start) + '-' + str(stop)
+        
+        ## DEPRECATED
+        #for chrom in GFF.iterkeys():
+            #for gene in GFF[chrom] :
+               # codon_dict = init_codon_dict()
+                #start = GFF[chrom][gene]['start']
+                #print start
+                #stop = GFF[chrom][gene]['stop']
+                #print stop
+                #region = chrom + ':' + str(start) + '-' + str(stop)
+        #######
                 # #get all reads in this gene
                 reads = subprocess.check_output(["samtools", "view", bamfile, region])
                 head = subprocess.check_output(["samtools", "view", "-H", bamfile])
-                read_tab = reads.split('\n')
+                read_tab = reads.split('\n')        
                 for read in read_tab:
                     # # search mapper for eliminate multiple alignements
                     if 'bowtie' in head:
@@ -136,7 +147,6 @@
                         stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")  
                     if len(read) == 0:
                         continue
-                    
                     len_read = len(read.split('\t')[9])        
                     # if it's read of good length
                     if len_read == kmer and multi_tag not in read:
@@ -179,6 +189,8 @@
         stop_err('Error during codon usage calcul: ' + str(e))
 
 
+
+
 '''
 http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html
 '''
@@ -491,7 +503,7 @@
         
         # # plot amino acid profile :
         fig = pl.figure(num=1)
-        width = .35
+        width = .45
         ax = fig.add_subplot(111)
         ind = arange(21)
         pl.xlim(0, 21) 
@@ -503,7 +515,7 @@
         ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)        
         #for x, y, z in zip(ind, max_val, aa_name):
         #    ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
-        axis_font = {'size':'16'}
+        axis_font = {'size':'10'}
         pl.xticks(ind + width, aa_name,**axis_font)
         ax.spines['right'].set_visible(False)
         ax.spines['top'].set_visible(False)
@@ -513,7 +525,7 @@
         ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)',**axis_font)
         ax.set_xlabel('Amino Acids', **axis_font)
         handles, labels = ax.get_legend_handles_labels()
-        font_prop = font_manager.FontProperties(size=12)
+        font_prop = font_manager.FontProperties(size=8)
         ax.legend(handles, labels, prop=font_prop)
         pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
         pl.clf()
@@ -534,8 +546,9 @@
             max_val.append(max(cond1_norm[i], cond2_norm[i]))
             
         # plot result
-        fig = pl.figure(figsize=(24, 10), num=1)
-        width = .50
+        fig = pl.figure(figsize=(30, 10), num=1)
+        #fig = pl.figure(num=1)
+        width = .40
         ind = arange(len(codon_sorted))
         ax = fig.add_subplot(111)
         pl.xlim(0, len(codon_sorted) + 1) 
@@ -625,10 +638,7 @@
         #    raise Exception        
         
 def __main__():
-    '''   
-        python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -1 psiM1_sorted.bam,psiM2_sorted.bam -2 psiP1_sorted.bam,psiP2_sorted.bam -c psiM -C psiP -l TAG,TAA,TGA -r yes -o psi_count -d psi.html,html_dir > log2
-        python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -c psiM -C psiP -1 RPF_psi-_28sorted.bam -2 RPF_psi+_28sorted.bam -l TAG,TAA,TGA -n Stop Codon -r no -o psi_count -d psi.html,html_dir > log2
-    '''     
+    
     
     # Parse command line options
     parser = optparse.OptionParser()
@@ -648,7 +658,7 @@
                   help="Name of second condition", metavar="STR")
    
     parser.add_option("-k", "--kmer", dest="kmer", type="int",
-                  help="Longer of your phasing reads", metavar="INT") 
+                  help="Length of your phasing reads", metavar="INT") 
         
 #     parser.add_option("-l", "--list", dest="list_cod", type= "string",
 #                   help="list of codons to compare to other", metavar="STR")
@@ -692,13 +702,34 @@
         if not colors.is_color_like(options.color2) :
             stop_err( options.color2+' is not a proper color' )
         
-        GFF = store_gff(options.gff)
+        ## identify GFF or GTF format from 9th column
+        #with open (options.gff,"r") as gffile :
+        #    for line in gffile :
+        #        if '#' in line :
+        #            ## skip header
+        #            gffile.next()
+         #       elif 'gene_id' in line :
+        #            ## launch gtf reader :
+        #            GFF = ribo_functions.store_gtf(options.gff)
+        #            break
+        #        elif 'ID=' in line :
+        #            ## launch gff reader 
+        #            GFF = ribo_functions.store_gff(options.gff)
+        #            break
+        #        else :
+        #            stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
+            
+        #GFF = store_gff(options.gff)
+        #GFF = ribo_functions.store_gtf(options.gff)
+        ## check gff reading
+        #if not GFF['order'] :
+         #   stop_err( 'Incorrect GFF file' + str( e ) )
         
         #### NOT USE IN FINAL VERSION
         # # get codon list
         # codons = options.list_cod.upper().split(',')
         # check_codons_list(codons)
-        
+        GFF = HTSeq.GFF_Reader(options.gff)
         # # get html file and directory :
         (html, html_dir) = options.dirout.split(',')
         if os.path.exists(html_dir):