diff metagene_readthrough.py @ 19:385fc64fa988 draft default tip

Uploaded
author rlegendre
date Fri, 12 Jun 2015 11:32:59 -0400
parents c87c40e642af
children
line wrap: on
line diff
--- a/metagene_readthrough.py	Tue Jun 09 09:06:17 2015 -0400
+++ b/metagene_readthrough.py	Fri Jun 12 11:32:59 2015 -0400
@@ -16,6 +16,7 @@
 import HTSeq
 #from matplotlib import pyplot as pl
 import matplotlib
+from jinja2.nodes import Pos
 matplotlib.use('Agg')
 import matplotlib.pyplot as pl
 from numpy import arange
@@ -404,14 +405,49 @@
     except Exception, e:
         stop_err( 'Error during gene plotting : ' + str( e ) )
 
+
+def __percent__(prop):
+        
+    if sum(prop) != 0 :
+        perc = [0,0,0]
+        if prop[0] != 0 :
+            perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
+        if prop[1] != 0 :
+            perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
+        if prop[2] != 0 :
+            perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
+        return perc
+    else :
+        return prop
+
+def compute_phasing(chrom, start, stop, aln, kmer, strand):
     
-def compute_analysis(bam, GFF, fasta, gff, dirout, extend) :
+    phasing = [0,0,0]
+    for track in aln.fetch(chrom, start, stop):
+        if strand == '-':
+            if track.is_reverse :
+                if len(track.seq) == kmer :
+                    pos = stop - (track.pos + track.rlen - 12) 
+                    if pos > 0:  
+                        phasing[pos%3] += 1
+        else :
+            if not track.is_reverse :
+                if len(track.seq) == kmer :
+                    pos = (track.pos + 12) - start
+                    if pos > 0:  
+                        phasing[pos%3] += 1
+
+    #return __percent__(phasing)
+    return phasing
+
+    
+def compute_analysis(bam, GFF, fasta, gff, dirout, extend, kmer) :
     
     try:
-        background_file = dirout+"/background_sequence.fasta"
-        file_back = open(background_file, 'w')
-        file_context = open(dirout+"/stop_context.fasta", 'w')
-        file_extension = open(dirout+"/extensions.fasta", 'w')
+        #background_file = dirout+"/background_sequence.fasta"
+        #file_back = open(background_file, 'w')
+        #file_context = open(dirout+"/stop_context.fasta", 'w')
+        #file_extension = open(dirout+"/extensions.fasta", 'w')
         ## Opening Bam file with pysam librarie
         pysam.index(bam)
         aln = pysam.Samfile(bam, "rb",header=True, check_header = True)
@@ -429,7 +465,7 @@
 
         with open(dirout+"/readthrough_result.csv","w") as out :
             myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence']
-            writer = csv.writer(out,delimiter='\t')
+            writer = csv.writer(out,delimiter=',')
             writer.writerow(myheader)    
             for gene in GFF['order'] :
                 #print GFF[gene]
@@ -522,7 +558,7 @@
                                     if (seq):
                                         res = find_stop(seq)
                                         if res == -1 :
-                                            mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq]
+                                            mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
                                             writer.writerow(mylist)
                                         else :
                                             indic = 'ok'
@@ -541,7 +577,7 @@
                                     if (seq):
                                         res = find_stop(seq)
                                         if res == -1 :
-                                            mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq]
+                                            mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
                                             writer.writerow(mylist)
                                         else :
                                             indic = 'ok'
@@ -584,6 +620,26 @@
                                     ## if we have footprint in stop codon extension and stop extension is further than cds_stop+9
                                     if count_stop > 2 and stop_ok == 1 :
                                         homo_cov = check_homo_coverage(gene,GFF,start_extension,stop_extension,aln)
+                                       # phasing = compute_phasing(GFF[gene]['chrom'],start_extension,stop_extension, aln, kmer,GFF[gene]['strand'])
+                                       # print gene, phasing
+#                                         count_after_stop = 0
+#                                         try :
+#                                             ### count method of pysam doesn't strand information
+#                                             if GFF[gene]['strand'] == '+' :
+#                                                 for track in aln.fetch(GFF[gene]['chrom'],stop_extension+15,stop_extension+40) :
+#                                                     if not track.is_reverse :
+#                                                         count_after_stop += 1
+#                                             else :
+#                                                 for track in aln.fetch(GFF[gene]['chrom'],start_extension-40,start_extension-15) :
+#                                                     if track.is_reverse :
+#                                                         count_after_stop += 1
+#                                         except ValueError:
+#                                             ## genere warning about gtf coordinates
+#                                             warnings.warn("Please check coordinates in GFF : maybe stop or start codon are missing" )
+#                                             pass
+#                                         print gene+"\t"+str(count_stop)+"\t"+str(count_after_stop)
+#                                         if (count_stop*5)/100 > count_after_stop :
+#                                             print "moins de 5%...\n"
                                         if (homo_cov) :
                                             '''
                                                 write result witch corresponding to readthrough
@@ -600,10 +656,10 @@
                                                 ratio = rpkm_ext/rpkm_cds
                                                 #print gene,ratio
                                                 #print start_extension,stop_extension
-                                            mylist = [gene,GFF[gene]['name'],'-',contexte_stop,GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq]
+                                            mylist = [gene,GFF[gene]['name'],'-',contexte_stop,GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,seq,GFF[gene]['note']]
                                             writer.writerow(mylist)
-                                            file_context.write('>'+gene+'\n'+contexte_stop+'\n')
-                                            file_extension.write('>'+gene+'\n'+seq+'\n')
+                                            #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
+                                            #file_extension.write('>'+gene+'\n'+seq+'\n')
                                         else :
                                             '''
                                                 write result witch corresponding to readthrough but with no homogeneous coverage
@@ -617,10 +673,10 @@
                                                 rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot)
                                                 ## compute ratio between extension coverage and cds coverage (rpkm)
                                                 ratio = rpkm_ext/rpkm_cds
-                                            mylist = [gene,GFF[gene]['name'],'hetero cov',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq]
+                                            mylist = [gene,GFF[gene]['name'],'hetero cov',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,seq,GFF[gene]['note']]
                                             writer.writerow(mylist)
-                                            file_context.write('>'+gene+'\n'+contexte_stop+'\n')
-                                            file_extension.write('>'+gene+'\n'+seq+'\n')
+                                            #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
+                                            #file_extension.write('>'+gene+'\n'+seq+'\n')
                                             #print ">"+gene+"\n"+contexte_stop
                                         
                                         ## plot gene :
@@ -632,38 +688,38 @@
                                         '''
                                             write result with no footprint in stop codon of extension
                                         '''
-                                        mylist = [gene,GFF[gene]['name'],'no RPF in stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq]
+                                        mylist = [gene,GFF[gene]['name'],'no RPF in stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
                                         writer.writerow(mylist)
-                                        file_context.write('>'+gene+'\n'+contexte_stop+'\n')
-                                        file_extension.write('>'+gene+'\n'+seq+'\n')
+                                        #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
+                                        #file_extension.write('>'+gene+'\n'+seq+'\n')
                                         #print ">"+gene+"\n"+contexte_stop
                                 else :
                                     '''
                                         write result with RPF maybe result of reinitiation on a start codon
                                     '''
                                     if pass_length(start_extension,stop_extension) :
-                                        mylist = [gene,GFF[gene]['name'],'Met after stop', contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq]
+                                        mylist = [gene,GFF[gene]['name'],'Met after stop', contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
                                         writer.writerow(mylist)
-                                        file_context.write('>'+gene+'\n'+contexte_stop+'\n')
-                                        file_extension.write('>'+gene+'\n'+seq+'\n')
+                                        #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
+                                        #file_extension.write('>'+gene+'\n'+seq+'\n')
                                         #print ">"+gene+"\n"+contexte_stop
-                        else :
-                            ## if its not a interesting case, we get stop context of genes without readthrough
-                            if GFF[gene]['strand'] == '+' :
-                                contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6])
-                                file_back.write(contexte_stop+'\n')
-                            else :
-                                contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement())
-                                file_back.write(contexte_stop+'\n')
-                            
+#                         else :
+#                             ## if its not a interesting case, we get stop context of genes without readthrough
+#                             if GFF[gene]['strand'] == '+' :
+#                                 contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6])
+#                                 file_back.write(contexte_stop+'\n')
+#                             else :
+#                                 contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement())
+#                                 file_back.write(contexte_stop+'\n')
+#                             
                     ## excluded UT with incorrect positions        
                     except ValueError:
                         pass
                             
               
-        file_context.close()
-        file_back.close()
-        file_extension.close()
+        #file_context.close()
+        #file_back.close()
+        #file_extension.close()
     except Exception,e:
         stop_err( 'Error during computing analysis : ' + str( e ) ) 
 
@@ -679,14 +735,14 @@
         gene_table += '<thead><tr><th data-sort="string">Gene</th><th>Plot</th><th data-sort="string">Name</th><th>Stop context</th><th>Coordinates</th><th>RPKM CDS</th><th>RPKM extension</th><th data-sort="float">ratio</th><th>Extension</th></tr></thead><tbody>'
 
         with open(os.path.join(subfolder,'readthrough_result.csv'), 'rb') as csvfile:
-            spamreader = csv.reader(csvfile, delimiter='\t')
+            spamreader = csv.reader(csvfile, delimiter=',')
             ## skip the header
             next(spamreader, None)
             ##test if file is empty or not
             if next(spamreader, None):
                 for row in spamreader:
                     if row[2] == '-' :
-                        gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s:%s-%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(row[0], row[0], row[0], row[0], row[11], row[1], row[3], row[4], row[5], row[6], row[8], row[9], row[10], row[12])
+                        gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s:%s-%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(row[0], row[0], row[0], row[0], row[11], row[1], row[3], row[4], row[5], row[6], row[8], row[9], row[10], row[11])
                     
                 gene_table += '</tbody></table>' 
             else :
@@ -931,7 +987,10 @@
                   help="Bam Ribo-Seq alignments ", metavar="FILE")
                     
     parser.add_option("-d", "--dirout", dest="dirout", type= "string",
-                  help="write report in this html file and in associated directory", metavar="STR,STR")   
+                  help="write report in this html file and in associated directory", metavar="STR,STR")
+      
+    parser.add_option("-k", "--kmer", dest="kmer", type= "int",default = 28 ,
+                  help="Longer of your phasing reads (Default is 28)", metavar="INT") 
     
     parser.add_option("-e", "--extend", dest="extend", type= "int",default = 300 ,
                   help="Lenght of extension after stop in number of base pairs (depends on your organisme)", metavar="INT")     
@@ -972,8 +1031,13 @@
         ## check gff reading
         if not GFF['order'] :
             stop_err( 'Incorrect GFF file' )
+        for gene in GFF['order']:
+            if not GFF[gene]['exon'] :
+                del GFF[gene]
+                GFF['order'].remove(gene)              
+
         clean_file = ribo_functions.cleaning_bam(options.bamfile)
-        compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend)
+        compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend, options.kmer)
         if os.path.exists( clean_file ):
             os.remove( clean_file )