diff get_codon_frequency.py @ 13:7c944fd9907e draft

release 2
author rlegendre
date Thu, 09 Apr 2015 11:35:48 -0400
parents 707807fee542
children 344bacf6acb8
line wrap: on
line diff
--- a/get_codon_frequency.py	Fri Jan 23 03:31:37 2015 -0500
+++ b/get_codon_frequency.py	Thu Apr 09 11:35:48 2015 -0400
@@ -28,7 +28,7 @@
 import ribo_functions
 import HTSeq
 # #libraries for debugg
-import pdb
+#import pdb
 # import cPickle
 
 def stop_err(msg):
@@ -36,70 +36,6 @@
     sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
     sys.exit()
 
-def store_gff(gff):
-    '''
-        parse and store gff file in a dictionnary
-    '''
-    try:
-        GFF = OrderedDict({})
-        with open(gff, 'r') as f_gff : 
-            # GFF['order'] = []
-            for line in f_gff:
-                # # switch commented lines
-                head = line.split("#")[0]
-                if head != "" :
-                    feature = (line.split('\t')[8]).split(';')
-                    chrom = line.split('\t')[0]
-                    if chrom not in GFF :
-                        GFF[chrom] = {}
-                # first line is already gene line :
-                    if line.split('\t')[2] == 'gene' :
-                        gene = feature[0].replace("ID=", "")
-                        if re.search('gene', feature[2]) :
-                            Name = feature[2].replace("gene=", "")
-                        else :
-                            Name = "Unknown"
-                        # #get annotation
-                        note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
-                        note = urllib.unquote(str(note)).replace("\n", "")
-                        # # store gene information
-                        # GFF['order'].append(gene)
-                        GFF[chrom][gene] = {}
-                        GFF[chrom][gene]['chrom'] = line.split('\t')[0]
-                        GFF[chrom][gene]['start'] = int(line.split('\t')[3])
-                        GFF[chrom][gene]['stop'] = int(line.split('\t')[4])
-                        GFF[chrom][gene]['strand'] = line.split('\t')[6]
-                        GFF[chrom][gene]['name'] = Name
-                        GFF[chrom][gene]['note'] = note
-                        GFF[chrom][gene]['exon'] = {}
-                        GFF[chrom][gene]['exon_number'] = 0
-                        # print Name
-                    elif line.split('\t')[2] == 'CDS' :
-                        gene = re.sub(r"Parent\=(.+)_mRNA", r"\1", feature[0])
-                        if GFF[chrom].has_key(gene) :
-                            GFF[chrom][gene]['exon_number'] += 1
-                            exon_number = GFF[chrom][gene]['exon_number'] 
-                            GFF[chrom][gene]['exon'][exon_number] = {}
-                            GFF[chrom][gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
-                            GFF[chrom][gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
-                            GFF[chrom][gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
-                    # # if there is a five prim UTR intron, we change start of gene
-                    elif line.split('\t')[2] == 'five_prime_UTR_intron' :
-                        if GFF[chrom][gene]['strand'] == "+" :
-                            GFF[chrom][gene]['start'] = GFF[chrom][gene]['exon'][1]['start']
-                        else :
-                            GFF[chrom][gene]['stop'] = GFF[chrom][gene]['exon'][exon_number]['stop']
-        return GFF
-    except Exception, e:
-        stop_err('Error during gff storage : ' + str(e))
-  
-#chrI    SGD     gene    87286   87752   .       +       .       ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689
-#7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra
-#ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr
-#om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
-#chrI    SGD     CDS     87286   87387   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
-#chrI    SGD     CDS     87501   87752   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified    
-    
 
 def init_codon_dict():
     
@@ -115,13 +51,19 @@
     '''
     try:
         codon = init_codon_dict()
+        multi_tag = "XS:i:" ## bowtie Tag
+        tag = "IH:i:1" ## RUM tag
+        
         for feature in GFF :
             if feature.type == 'gene' :
                 codon_dict = init_codon_dict()
                 chrom = feature.iv.chrom
                 start = feature.iv.start
                 stop = feature.iv.end 
-                region = chrom + ':' + str(start) + '-' + str(stop)
+                if start+50 < stop-50 :
+                    region = chrom + ':' + str(start+50) + '-' + str(stop-50)
+                else :
+                    break
         
         ## DEPRECATED
         #for chrom in GFF.iterkeys():
@@ -143,13 +85,16 @@
                         multi_tag = "XS:i:"
                     elif 'bwa' in  head:
                         multi_tag = "XT:A:R"
+                    elif 'TopHat' in  head:
+                        tag = "NH:i:1"
                     else :
-                        stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")  
+                        stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")  
+
                     if len(read) == 0:
                         continue
                     len_read = len(read.split('\t')[9])        
                     # if it's read of good length
-                    if len_read == kmer and multi_tag not in read:
+                    if len_read == kmer and (tag in line or multi_tag not in line):
                         feat = read.split('\t')
                         seq = feat[9]
                         # if it's a reverse read
@@ -179,6 +124,7 @@
                                 cod = seq[a_pos-6:a_pos-3]
                             if codon_dict.has_key(cod) :
                                 codon_dict[cod] += 1
+                del(read)
                 # # add in global dict   
                 for cod, count in codon_dict.iteritems() :
                     codon[cod] += count
@@ -400,7 +346,7 @@
             cond2_aa.append(z[1])
             max_valaa.append(max(z))
         # # plot amino acid profile :
-        fig = pl.figure(figsize=(24, 10), num=1)
+        fig = pl.figure(figsize=(30, 10), num=1)
         width = .50
         ax = fig.add_subplot(111)
         ax.xaxis.set_ticks([])
@@ -440,8 +386,8 @@
             
             
         # plot result
-        fig = pl.figure(figsize=(24, 10), num=1)
-        width = .50
+        fig = pl.figure(figsize=(30, 10), num=1)
+        width = .40
         ind = arange(len(codon_sorted))
         ax = fig.add_subplot(111)
         pl.xlim(0, len(codon_sorted) + 1) 
@@ -546,7 +492,7 @@
             max_val.append(max(cond1_norm[i], cond2_norm[i]))
             
         # plot result
-        fig = pl.figure(figsize=(30, 10), num=1)
+        fig = pl.figure(figsize=(40, 10), num=1)
         #fig = pl.figure(num=1)
         width = .40
         ind = arange(len(codon_sorted))
@@ -560,7 +506,7 @@
         ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1)
         ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2)
         for x, y, z in zip(ind, max_val, codon_sorted):
-            ax.text(x + width, y + 0.02, '%s' % z, ha='center', va='bottom', fontsize=8)   
+            ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8)   
         ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
         ax.set_xlabel('Codons')
         handles, labels = ax.get_legend_handles_labels()
@@ -657,7 +603,7 @@
     parser.add_option("-C", "--cond2", dest="c2", type="string",
                   help="Name of second condition", metavar="STR")
    
-    parser.add_option("-k", "--kmer", dest="kmer", type="int",
+    parser.add_option("-k", "--kmer", dest="kmer", type="int",  default = 28 ,
                   help="Length of your phasing reads", metavar="INT") 
         
 #     parser.add_option("-l", "--list", dest="list_cod", type= "string",
@@ -669,19 +615,19 @@
     parser.add_option("-d", "--dirout", dest="dirout", type="string",
                   help="write report to PNG files", metavar="FILE")   
 
-    parser.add_option("-a", "--asite", dest="asite", type="int",
-                  help="Off-set from the 5'end of the footprint to the A-site", metavar="INT")  
+    parser.add_option("-a", "--asite", dest="asite", type="int",  default = 15 ,
+                  help="Off-set from the 5'end of the footprint to the A-site (default is 15)", metavar="INT")  
     
-    parser.add_option("-s", "--site", dest="site", type="string",
-                  help="Script can compute in site A, P or E", metavar="A|P|E")      
+    parser.add_option("-s", "--site", dest="site", type="string",  default = "A" ,
+                  help="Script can compute in site A, P or E (default is A-site)", metavar="A|P|E")      
 
-    parser.add_option("-r", "--rep", dest="rep", type="string",
+    parser.add_option("-r", "--rep", dest="rep", type="string", default = "no" ,
                   help="if replicate or not", metavar="yes|no")   
      
-    parser.add_option("-x", "--hex_col1", dest="color1", type= "string",
+    parser.add_option("-x", "--hex_col1", dest="color1", type= "string",  default = "SkyBlue" ,
                   help="Color for first condition", metavar="STR")
     
-    parser.add_option("-X", "--hex_col2", dest="color2", type= "string",
+    parser.add_option("-X", "--hex_col2", dest="color2", type= "string",  default = "Plum" ,
                   help="Color for second condition", metavar="STR")
 
     parser.add_option("-q", "--quiet",
@@ -702,28 +648,6 @@
         if not colors.is_color_like(options.color2) :
             stop_err( options.color2+' is not a proper color' )
         
-        ## identify GFF or GTF format from 9th column
-        #with open (options.gff,"r") as gffile :
-        #    for line in gffile :
-        #        if '#' in line :
-        #            ## skip header
-        #            gffile.next()
-         #       elif 'gene_id' in line :
-        #            ## launch gtf reader :
-        #            GFF = ribo_functions.store_gtf(options.gff)
-        #            break
-        #        elif 'ID=' in line :
-        #            ## launch gff reader 
-        #            GFF = ribo_functions.store_gff(options.gff)
-        #            break
-        #        else :
-        #            stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
-            
-        #GFF = store_gff(options.gff)
-        #GFF = ribo_functions.store_gtf(options.gff)
-        ## check gff reading
-        #if not GFF['order'] :
-         #   stop_err( 'Incorrect GFF file' + str( e ) )
         
         #### NOT USE IN FINAL VERSION
         # # get codon list