diff ribo_functions.py @ 18:a121cce43f90 draft

Uploaded
author rlegendre
date Tue, 09 Jun 2015 09:06:17 -0400
parents c87c40e642af
children
line wrap: on
line diff
--- a/ribo_functions.py	Fri May 29 09:17:29 2015 -0400
+++ b/ribo_functions.py	Tue Jun 09 09:06:17 2015 -0400
@@ -141,6 +141,7 @@
     '''
     try:
         GFF = {}
+        mRNA = {}
         with open(gff, 'r') as f_gff : 
 
             GFF['order'] = []
@@ -153,8 +154,9 @@
                 # first line is already gene line :
                     if line.split('\t')[2] == 'gene' :
                         gene = feature[0].replace("ID=","")
+                        curent_gene = gene
                         if 'Name' in line :
-                            regex = re.compile('(Name=)([^;]*);')
+                            regex = re.compile('(Name=)([^;]*);?')
                             res = regex.search(line.split('\t')[8])
                             Name = res.group(2)
                             Name = Name.rstrip()
@@ -162,7 +164,7 @@
                             Name = "Unknown"
                         ##get annotation
                         if 'Note' in line :
-                            regex = re.compile('(Note=)([^;]*);')
+                            regex = re.compile('(Note=)([^;]*);?')
                             res = regex.search(line.split('\t')[8])
                             note = res.group(2)                     
                             note = urllib.unquote(str(note)).replace("\n","")
@@ -180,12 +182,27 @@
                         GFF[gene]['exon'] = {}
                         GFF[gene]['exon_number'] = 0
                         #print Name
+                    elif line.split('\t')[2] == 'mRNA' :
+                        regex = re.compile('(Parent=)([^;]*);?')
+                        res = regex.search(line.split('\t')[8])
+                        gene_name = res.group(2)
+                        regex = re.compile('(ID=)([^;]*);?')
+                        res = regex.search(line.split('\t')[8])
+                        mRNA_name = res.group(2)
+                        if gene not in mRNA.viewvalues() and gene_name == curent_gene :
+                            mRNA[mRNA_name] = gene_name
+
                     elif line.split('\t')[2] == 'CDS' :
-                        regex = re.compile('(Parent=)([^;]*);')
+                        regex = re.compile('(Parent=)([^;]*);?')
                         res = regex.search(line.split('\t')[8])
                         gene = res.group(2) 
+
                         if 'mRNA' in gene:
                             gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene)
+                        if mRNA.has_key(gene) and GFF.has_key(mRNA[gene]):
+
+                            gene = gene_name
+
                         if GFF.has_key(gene) :
                             GFF[gene]['exon_number'] += 1
                             exon_number = GFF[gene]['exon_number'] 
@@ -193,7 +210,7 @@
                             GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
                             GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
                             GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
-                        
+
                     ## if there is a five prime UTR intron, we change the start of the gene
                     elif line.split('\t')[2] == 'five_prime_UTR_intron' :
                         if GFF[gene]['strand'] == "+" :