diff ribo_functions.py @ 10:707807fee542

(none)
author rlegendre
date Thu, 22 Jan 2015 14:34:38 +0100
parents d7739f797a26
children 7c944fd9907e
line wrap: on
line diff
--- a/ribo_functions.py	Tue Dec 09 09:43:07 2014 -0500
+++ b/ribo_functions.py	Thu Jan 22 14:34:38 2015 +0100
@@ -8,6 +8,8 @@
 '''
 
 import sys, subprocess, re, commands, time, urllib
+from copy import copy
+
 
 def stop_err( msg ):
     sys.stderr.write( "%s\n" % msg )
@@ -154,7 +156,7 @@
                         GFF[gene]['exon_number'] = 0
                         #print Name
                     elif line.split('\t')[2] == 'CDS' :
-                        gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0])
+                        gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0])
                         if GFF.has_key(gene) :
                             GFF[gene]['exon_number'] += 1
                             exon_number = GFF[gene]['exon_number'] 
@@ -184,7 +186,7 @@
 
 def store_gtf(gff):
     '''
-        parse and store gtf file in a dictionnary (DEPRECATED)
+        parse and store gtf file in a dictionnary 
     '''
     try:
         GFF = {}
@@ -195,11 +197,11 @@
                 line = line.split("#")[0]
                 if line != "" :
                 # first line is already gene line :
-                    if line.split('\t')[1] == 'protein_coding' :
+                    if 'protein_coding' in line :
                         ##get name
-                        gene = re.sub(r".+ transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip()
-                        Name = re.sub(r".+ transcript_name \"([\w|-]+)\";.*", r"\1", line).rstrip()
-                        if line.split('\t')[2] == 'exon' :
+                        gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip()
+                        Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip()
+                        if line.split('\t')[2] == 'CDS' :
                             ##if its first time we get this gene
                             if gene not in GFF.keys() :
                             ## store gene information
@@ -207,35 +209,41 @@
                                 GFF[gene] = {}
                                 GFF[gene]['chrom'] = line.split('\t')[0]
                                 GFF[gene]['strand'] = line.split('\t')[6]
+                                GFF[gene]['start'] = int(line.split('\t')[3])
+                                GFF[gene]['stop'] = int(line.split('\t')[4])
                                 GFF[gene]['name'] = Name
+                                GFF[gene]['note'] = ""
                                 GFF[gene]['exon_number'] = 1
                                 GFF[gene]['exon'] = {}
-                                exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
+                                #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
+                                ## some exons are non codant
+                                exon_number = 1
                                 GFF[gene]['exon'][exon_number] = {}
                                 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
                                 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
                             else :
                                 ## we add exon
-                                exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
+                                #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
+                                exon_number += 1
                                 GFF[gene]['exon_number'] = exon_number
                                 GFF[gene]['exon'][exon_number] = {}
                                 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
                                 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
-                        elif line.split('\t')[2] == 'CDS' :
-                            exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
+                        #elif line.split('\t')[2] == 'CDS' :
+                            #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
                             GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
                         elif line.split('\t')[2] == 'start_codon' :
                             if GFF[gene]['strand'] == '-' :
-                                GFF[gene]['start'] = int(line.split('\t')[4])
+                                GFF[gene]['stop'] = int(line.split('\t')[4])
                             else :
                                 GFF[gene]['start'] = int(line.split('\t')[3])
                         elif line.split('\t')[2] == 'stop_codon' :
                             if GFF[gene]['strand'] == '-' :
-                                GFF[gene]['stop'] = int(line.split('\t')[3])
+                                GFF[gene]['start'] = int(line.split('\t')[3])
                             else :
                                 GFF[gene]['stop'] = int(line.split('\t')[4])
 
-        return GFF
+        return __reverse_coordinates__(GFF)
     except Exception,e:
         stop_err( 'Error during gff storage : ' + str( e ) )
 
@@ -252,8 +260,26 @@
 ## protein_id "YDL083C";
 ##IV      protein_coding  stop_codon      306926  306928  .       -       0       gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "
 ##RPS16B";
+def __reverse_coordinates__(GFF):
+    
+    for gene in GFF['order']:
+        ## for reverse gene
+        if GFF[gene]['strand'] == "-":
+            ## if this gene have many exon and the stop of gene is the stop of first (and not last) exon, we reverse exon coordinates
+            if GFF[gene]['stop'] == GFF[gene]['exon'][1]['stop'] and GFF[gene]['exon_number'] > 1 :
+                tmp = copy(GFF[gene]['exon'])
+                exon_number = GFF[gene]['exon_number']
+                rev_index = exon_number+1
+                for z in range(1,exon_number+1):
+                    rev_index -= 1
+                    GFF[gene]['exon'][z] = tmp[rev_index]
 
+                ## check start
+                if GFF[gene]['start'] != GFF[gene]['exon'][1]['start'] and GFF[gene]['start']:
+                    GFF[gene]['exon'][1]['start'] = GFF[gene]['start']
 
+    return GFF
+    
 
 def cleaning_bam(bam):
     '''