diff metagene_readthrough.py @ 10:707807fee542

(none)
author rlegendre
date Thu, 22 Jan 2015 14:34:38 +0100
parents 29c9c86e17e1
children 7c944fd9907e
line wrap: on
line diff
--- a/metagene_readthrough.py	Tue Dec 09 09:43:07 2014 -0500
+++ b/metagene_readthrough.py	Thu Jan 22 14:34:38 2015 +0100
@@ -21,70 +21,17 @@
 from numpy import arange
 from matplotlib import ticker as t
 from PIL import Image
+import ribo_functions
+## suppress matplotlib warnings
+import warnings
+
+
 
 def stop_err( msg ):
     sys.stderr.write( "%s\n" % msg )
     sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
     sys.exit()
     
-    
-def store_gff(gff):
-    '''
-        parse and store gff file in a dictionnary
-    '''
-    try:
-        GFF = {}
-        with open(gff, 'r') as f_gff : 
-            GFF['order'] = []
-            for line in f_gff:
-                ## switch commented lines
-                line = line.split("#")[0]
-                if line != "" :
-                    feature = (line.split('\t')[8]).split(';')
-                # first line is already gene line :
-                    if line.split('\t')[2] == 'gene' :
-                        gene = feature[0].replace("ID=","")
-                        if re.search('gene',feature[2]) :
-                            Name = feature[2].replace("gene=","")
-                        else :
-                            Name = "Unknown"
-                        ##get annotation
-                        note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
-                        note = unquote(str(note)).replace("\n","")
-                        ## store gene information
-                        GFF['order'].append(gene)
-                        GFF[gene] = {}
-                        GFF[gene]['chrom'] = line.split('\t')[0]
-                        GFF[gene]['start'] = int(line.split('\t')[3])
-                        GFF[gene]['stop'] = int(line.split('\t')[4])
-                        GFF[gene]['strand'] = line.split('\t')[6]
-                        GFF[gene]['name'] = Name
-                        GFF[gene]['note'] = note
-                        GFF[gene]['exon'] = {}
-                        GFF[gene]['exon_number'] = 0
-                        #print Name
-                    elif line.split('\t')[2] == 'CDS' :
-                        gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0])
-                        if GFF.has_key(gene) :
-                            GFF[gene]['exon_number'] += 1
-                            exon_number = GFF[gene]['exon_number'] 
-                            GFF[gene]['exon'][exon_number] = {}
-                            GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
-                            GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
-                            GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
-                        
-                    ## if there is a five prim UTR intron, we change start of gene
-                    elif   line.split('\t')[2] == 'five_prime_UTR_intron' :
-                        if GFF[gene]['strand'] == "+" :
-                            GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
-                        else :
-                            GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
-        return GFF
-    except Exception,e:
-        stop_err( 'Error during gff storage : ' + str( e ) ) 
-
-
-       
         
 def compute_rpkm(length,count_gene,count_tot) :
     
@@ -135,7 +82,7 @@
       ranges, this in one more than the coordinate of the last base
       that is considered part of the sequence.
     strand: The strand, as a single character, '+' or '-'. '.' indicates
-      that the strand is irrelavant. (Alternatively, pass a Strand object.)
+      that the strand is irrelevant. (Alternatively, pass a Strand object.)
     length: The length of the interval, i.e., end - start
     start_d: The "directional start" position. This is the position of the
      first base of the interval, taking the strand into account. Hence, 
@@ -145,14 +92,14 @@
      strand=='-1', it is start-1.
   
 '''
-def check_overlapping(gff_reader,chrm,start,stop,strand):
+def check_overlapping(gff_reader,chrm,start,stop,strand, name):
     
     #### probleme avec les genes completement inclu...
     
     iv2 = HTSeq.GenomicInterval(chrm,start,stop,strand)
     for feature in gff_reader:
         if feature.type == "gene" :
-            if feature.iv.overlaps(iv2) :
+            if feature.iv.overlaps(iv2) and feature.attr.get('gene_name') != name :
                 ## if its a reverse gene, we replace start of extension by start of previous gene
                 if strand == '-' :
                     return (feature.iv.end+3,stop)
@@ -200,6 +147,9 @@
 
 def plot_gene ( aln, gene, start_extension, stop_extension, dirout ) :
     
+    
+    ### ignore matplotlib warnings for galaxy
+    warnings.filterwarnings('ignore')
     try:
         strand = gene['strand']
         len_gene = gene['stop']-gene['start']
@@ -446,13 +396,13 @@
         im = Image.open(infile)
         im.thumbnail(size, Image.ANTIALIAS)
         im.save(dirout+"_thumbnail.png", "PNG")
-    
+        warnings.resetwarnings()
                 
     except Exception, e:
         stop_err( 'Error during gene plotting : ' + str( e ) )
 
     
-def compute_analysis(bam, GFF, fasta, gff, dirout) :
+def compute_analysis(bam, GFF, fasta, gff, dirout, extend) :
     
     try:
         background_file = dirout+"/background_sequence.fasta"
@@ -479,20 +429,46 @@
             writer = csv.writer(out,delimiter='\t')
             writer.writerow(myheader)    
             for gene in GFF['order'] :
+                #print GFF[gene]
+                ## maybe no start position in GTF file so we must to check and replace
+                exon_number = GFF[gene]['exon_number']
+                try : GFF[gene]['start'] 
+                except :
+                    if GFF[gene]['strand'] == '+' :
+                        GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
+                    else :
+                        GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop']
+                ## also for stop coordinates
+                try : GFF[gene]['stop'] 
+                except :
+                    if GFF[gene]['strand'] == '+' :
+                        GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
+    
+                    else :
+                        GFF[gene]['stop'] = GFF[gene]['exon'][1]['start']
+                
+                               
                 indic = ""
                 # compute rpkm of CDS :
                 len_cds = GFF[gene]['stop']-GFF[gene]['start']
                 count_cds = 0
-                ### count method of pysam doesn't strand information
-                if GFF[gene]['strand'] == '+' :
-                    for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) :
-                        if not track.is_reverse :
-                            count_cds += 1
-                else :
-                    for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) :
-                        if track.is_reverse :
-                            count_cds += 1
-                rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot)
+                rpkm_cds = 0
+                count_cds = 0
+                try :
+                    ### count method of pysam doesn't strand information
+                    if GFF[gene]['strand'] == '+' :
+                        for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) :
+                            if not track.is_reverse :
+                                count_cds += 1
+                    else :
+                        for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) :
+                            if track.is_reverse :
+                                count_cds += 1
+                    rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot)
+                except ValueError:
+                    ## genere warning about gtf coordinates
+                    #warnings.warn("Please check coordinates in GFF : maybe stop or start codon are missing" )
+                    pass
                 ## Only if gene is translate :
                 if rpkm_cds > 0 and count_cds > 128:  
                     ## search footprint in UTR3
@@ -520,7 +496,8 @@
                                     count += 1
                             ## get sequence of extension
                             seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement())
-                                
+                             
+                       
                         ## if we have coverage after cds stop codon    
                         if count > 10 :    
                             res = find_stop(seq)
@@ -530,9 +507,11 @@
                                 '''
                                 ## check if next in-frame codon is far than 90 nt extension :
                                 if GFF[gene]['strand'] == '+' :
-                                    pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+300,'+')
+                                    pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+extend,'+',GFF[gene]['name'])
                                     start_extension = pos[0]-1
                                     stop_extension = pos[1]
+                                    #start_extension = GFF[gene]['stop']
+                                    #stop_extension = GFF[gene]['stop']+extend
                                     seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension])
                                     
                                     #print gene,count,pos,'\n',seq
@@ -547,9 +526,11 @@
                                             #print res
                                             #stop_extension = start_extension + res +3
                                 else :
-                                    pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-300,GFF[gene]['start']-1,'-')
+                                    pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-extend,GFF[gene]['start']-1,'-',GFF[gene]['name'])
                                     start_extension = pos[0]
                                     stop_extension = pos[1]+1
+                                    #start_extension = GFF[gene]['start']-extend
+                                    #stop_extension = GFF[gene]['start']
                                     seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement())
                                     
                                     #print gene,count,pos,'\n',seq
@@ -970,11 +951,6 @@
         
 def __main__():
 
-    ## python metagene_readthrough.py -g Saccer3.gff -f Saccer3.fa -b B1_sorted.bam -o Bstrain_readthrough.csv
-    #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /data/Mapping/read_mapped_unique_psiP_sorted.bam -o /home/rlegendre/Documents/Translecture/plop_py.csv -d /home/rlegendre/Documents/Translecture/out_trans/
-    #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /home/rlegendre/Documents/PDE2S/PDE2S_trim_no_rRNA_sorted.bam -o /home/rlegendre/Documents/PDE2S/Readthrough_pde2s.csv -d /home/rlegendre/Documents/PDE2S/out_trans
-
-
     #Parse command line options
     parser = optparse.OptionParser()
     parser.add_option("-g", "--gff", dest="gff", type= "string",
@@ -987,7 +963,10 @@
                   help="Bam Ribo-Seq alignments ", metavar="FILE")
                     
     parser.add_option("-d", "--dirout", dest="dirout", type= "string",
-                  help="write report in this html file and in associated directory", metavar="STR,STR")    
+                  help="write report in this html file and in associated directory", metavar="STR,STR")   
+    
+    parser.add_option("-e", "--extend", dest="extend", type= "int",
+                  help="Lenght of extension after stop in number of base pairs (depends on your organisme)", metavar="INT")     
                     
     parser.add_option("-q", "--quiet",
                   action="store_false", dest="verbose", default=True,
@@ -1004,10 +983,30 @@
             os.mkdir(subfolder)
         except:
             raise
-        
-        GFF = store_gff(options.gff)
+        ## identify GFF or GTF format from 9th column
+        with open (options.gff,"r") as gffile :
+            for line in gffile :
+                if '#' in line :
+                    ## skip header
+                    gffile.next()
+                elif 'gene_id' in line :
+                    ## launch gtf reader :
+                    GFF = ribo_functions.store_gtf(options.gff)
+                    break
+                elif 'ID=' in line :
+                    ## launch gff reader 
+                    GFF = ribo_functions.store_gff(options.gff)
+                    break
+                else :
+                    stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
+            
+        #GFF = store_gff(options.gff)
+        #GFF = ribo_functions.store_gtf(options.gff)
+        ## check gff reading
+        if not GFF['order'] :
+            stop_err( 'Incorrect GFF file' + str( e ) )
         clean_file = cleaning_bam(options.bamfile)
-        compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder)
+        compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend)
         if os.path.exists( clean_file ):
             os.remove( clean_file )