changeset 18:a121cce43f90 draft

Uploaded
author rlegendre
date Tue, 09 Jun 2015 09:06:17 -0400
parents c87c40e642af
children 385fc64fa988
files kmer_analysis.py metagene_frameshift_analysis.py ribo_functions.py
diffstat 3 files changed, 209 insertions(+), 179 deletions(-) [+]
line wrap: on
line diff
--- a/kmer_analysis.py	Fri May 29 09:17:29 2015 -0400
+++ b/kmer_analysis.py	Tue Jun 09 09:06:17 2015 -0400
@@ -367,12 +367,11 @@
 
     try:
         
-        if os.path.exists(options.dirout):
-            raise
-        try:
-            os.mkdir(options.dirout)
-        except:
-            raise 
+        if not os.path.exists(options.dirout):
+            try:
+                os.mkdir(options.dirout)
+            except Exception, e :
+                stop_err('Error running make directory : ' + str(e)) 
         
         ##testing indexed bam file
         if os.path.isfile(options.bamfile+".bai") :
@@ -404,7 +403,10 @@
         ## check gff reading
         if not GFF['order'] :
             stop_err( 'Incorrect GFF file' )
-        
+        for gene in GFF['order']:
+            if not GFF[gene]['exon'] :
+                del GFF[gene]
+                GFF['order'].remove(gene)  
         ## split bam
         split_bam(options.bamfile,tmpdir)
         ###################################
--- a/metagene_frameshift_analysis.py	Fri May 29 09:17:29 2015 -0400
+++ b/metagene_frameshift_analysis.py	Tue Jun 09 09:06:17 2015 -0400
@@ -204,32 +204,37 @@
                             data = f.readlines()
                 except IOError :
                     print tmpdir+"/assoCov_"+chrom+".txt doesn't exist" 
-                     
-                ## if a gene without intron :
-                if GFF[gene]['exon_number'] == 1:
-                        
-                    ## get coverage for each gene   
-                    if GFF[gene]['strand'] == "+":
-                        for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
-                            cov.append(int((data[i].rstrip()).split("\t")[0]))
+                
+                try :     
+                    ## if a gene without intron :
+                    if GFF[gene]['exon_number'] == 1:
+                            
+                        ## get coverage for each gene   
+                        if GFF[gene]['strand'] == "+":
+                            for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
+                                cov.append(int((data[i].rstrip()).split("\t")[0]))
+                        else :
+                            for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
+                                cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+                            cov.reverse()
                     else :
-                        for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
-                            cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
-                        cov.reverse()
-                else :
-                    ## For each gene, get coverage and sum of exon size
-                    if GFF[gene]['strand'] == "+":
-                            
-                        for exon in range(1,GFF[gene]['exon_number']+1) :
-                            for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
-                                cov.append(int((data[i].rstrip()).split("\t")[0]))  
-                    else :
-                              
-                        for exon in range(1,GFF[gene]['exon_number']+1) :
-                            for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
-                                cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
-                        cov.reverse()        
+                        ## For each gene, get coverage and sum of exon size
+                        if GFF[gene]['strand'] == "+":
                                 
+                            for exon in range(1,GFF[gene]['exon_number']+1) :
+                                for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
+                                    cov.append(int((data[i].rstrip()).split("\t")[0]))  
+                        else :
+                                  
+                            for exon in range(1,GFF[gene]['exon_number']+1) :
+                                for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
+                                    cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+                            cov.reverse()  
+                except :
+                    print gene+" could not be analysed.\n"
+                    del GFF[gene]
+                    continue      
+              
                 len_cov = len(cov)
                 ## segmentation by orf_length
                 nb_segment = int(len_cov/orf_length)
@@ -237,14 +242,6 @@
                 if len_cov < orf_length :
                     nb_segment = 1
                 
-#                 #segmentation by gene size
-#                 if len_cov < 1000 :
-#                     nb_segment = 3
-#                 elif len_cov > 1000 and len_cov < 2000:
-#                     nb_segment = 6
-#                 else :
-#                     nb_segment = 9
-
                 ## number of aa in gene :
                 nb_aa = len_cov/3
                 len_frag = len_cov/nb_segment
@@ -255,7 +252,7 @@
                 seg_rpkm = []
                 window = 0
                 GFF[gene]['dual_coding'] = '-'
-                ''' segmentation et total proportions '''
+                ''' segmentation and total proportions '''
                 if sum(cov) > MIN_COV_REQUIREMENT:
                     ## compute mean of ORF length
                     mean_orf += len_cov
@@ -290,16 +287,16 @@
                         idx += 1
                     for z in range(0,3) :
                         whole_phasing_table[z].append(__percent__(tot_prop)[z])         
-                else :
-                    for nuc in range(0,len_cov-2,3) :
-                        ## Calculate proportion
-                        prop[0] += cov[nuc]
-                        prop[1] += cov[nuc+1]
-                        prop[2] += cov[nuc+2]
-                    whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))
-                    #for z in range(0,3) :
-                    #    whole_phasing_table[z].append(__percent__(tot_prop)[z])
-                            
+#                 else :
+#                     for nuc in range(0,len_cov-2,3) :
+#                         ## Calculate proportion
+#                         prop[0] += cov[nuc]
+#                         prop[1] += cov[nuc+1]
+#                         prop[2] += cov[nuc+2]
+#                     whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))
+#                     for z in range(0,3) :
+#                         whole_phasing_table[z].append(__percent__(tot_prop)[z])
+         
                 if GFF[gene]['dual_coding'] == 'yes' :
                     mylist = [gene,GFF[gene]['name'],GFF[gene]['total_prop'],GFF[gene]['seg_prop'],GFF[gene]['total_rpkm'],GFF[gene]['seg_rpkm'],GFF[gene]['note']]
                     #out.write(gene+'\t'+GFF[gene]['name']+'\t'+str(GFF[gene]['total_prop'])+'\t'+str(GFF[gene]['seg_prop'])+'\t'+str(GFF[gene]['total_rpkm'])+'\t'+GFF[gene]['note']+'\n')
@@ -309,20 +306,23 @@
                     del GFF[gene]
                         
         del GFF['order']
-            
-           
+   
         whole_phasing = __percent__(whole_phasing)
         sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing)
-        sys.stdout.write("Mean length of ORF in your organism : %s\n" % int(mean_orf/nb_gene))
-        sys.stdout.write("%d genes are used for frame analysis\n" % nb_gene)
+        if nb_gene == 0:
+            sys.stdout.write("%d gene are used for frame analysis: each gene require %d reads at least to be analysed.\n" % (nb_gene, MIN_COV_REQUIREMENT))
+        else :
+            sys.stdout.write("Mean length of ORF in your organism : %s\n" % int(mean_orf/nb_gene))
+            sys.stdout.write("%d genes are used for frame analysis\n" % nb_gene)
             
         ##produce boxplot
         pl.boxplot(whole_phasing_table,notch=True,sym='ro')
         pl.ylabel('Total number of footprints (percent)')
+        pl.ylim(0,100)
         pl.draw()
         pl.savefig(subfolder+'/boxplot.png',format='png', dpi=640)
         pl.savefig(box_file, format='png', dpi=640)
-        
+        pl.clf()
         
         return GFF
 
@@ -381,6 +381,8 @@
     except Exception, e:
         stop_err( 'Error during ORF computing : ' + str( e ) )
     
+
+
 def plot_subprofile(GFF, directory,fasta) :
     '''
     This function plot subprofiles of footprint and ORFs for each gene in GFF dict
@@ -389,116 +391,119 @@
     try :
         
         for gene in GFF.iterkeys():
-            nb_aa = len(GFF[gene]['coverage'])/3
-            subprofile_1 = []
-            subprofile_2 = []
-            subprofile_3 = []
-            cov = GFF[gene]['coverage']
+            if GFF[gene]['coverage'] :
+                nb_aa = len(GFF[gene]['coverage'])/3
+                subprofile_1 = []
+                subprofile_2 = []
+                subprofile_3 = []
+                cov = GFF[gene]['coverage']
+    
                 
-            for z in range(0,len(cov)-2,3):
-                for i in range(3):
-                    subprofile_1.append(cov[z])
-                    subprofile_2.append(cov[z+1])
-                    subprofile_3.append(cov[z+2])
-                
-            ## Normalisation of density plot
-            tot = max(cov)
-                
-            subprofile_1 = [(x * _MAX/tot) for x in subprofile_1]
-            subprofile_2 = [(x * _MAX/tot) for x in subprofile_2]
-            subprofile_3 = [(x * _MAX/tot) for x in subprofile_3]
-                
-                
-            if len(subprofile_1) == len(subprofile_2) and len(subprofile_1) == len(subprofile_3) :
                     
-                ind = arange(len(subprofile_1))
-                fig = pl.figure(num=1)
-                #sub profile frame 0
-                pl.subplot(4,1,1)
-                ax1 = fig.add_subplot(4,1,1)
-                pl.title( "%s (%s)\n$(%s RPKM)$"% (gene, GFF[gene]['name'], GFF[gene]['total_rpkm']))
-                pl.ylim(0,_MAX)
-                pl.xlim(0,len(subprofile_1))
-                pl.yticks([0, _MAX/2, _MAX])
-                ax1.plot(ind, subprofile_1, 'b-', lw = 3,label = 'Frame 0')
-                ax1.fill_between(ind, 0,subprofile_1,color='b')
-                ax1.legend(loc=1,prop={'size':8})
-                pl.ylabel('#RPF')
+                for z in range(0,len(cov)-2,3):
+                    for i in range(3):
+                        subprofile_1.append(cov[z])
+                        subprofile_2.append(cov[z+1])
+                        subprofile_3.append(cov[z+2])
                     
-                ##sub profile frame +1
-                pl.subplot(4,1,2)
-                ax2 = fig.add_subplot(4,1,2)
-                pl.ylim(0,_MAX)
-                pl.xlim(0,len(subprofile_2))
-                pl.yticks([0, _MAX/2, _MAX])
-                ax2.plot(ind,subprofile_2,'r-',lw = 3,label = 'Frame +1')
-                ax2.fill_between(ind, 0,subprofile_2,color='r')
-                ax2.legend(loc=1,prop={'size':8})
-                pl.ylabel('#RPF')
+                ## Normalisation of density plot
+                tot = max(cov)
+                    
+                subprofile_1 = [(x * _MAX/tot) for x in subprofile_1]
+                subprofile_2 = [(x * _MAX/tot) for x in subprofile_2]
+                subprofile_3 = [(x * _MAX/tot) for x in subprofile_3]
+                    
                     
-                ##sub profile frame -1
-                pl.subplot(4,1,3)
-                ax3 = fig.add_subplot(4,1,3)
-                pl.ylim(0,_MAX)
-                pl.yticks([0, _MAX/2, _MAX])
-                pl.xlim(0,len(subprofile_3))
-                ax3.plot(ind,subprofile_3,'g-',lw =3,label = 'Frame -1')
-                ax3.fill_between(ind, 0,subprofile_3,color='g')
-                ax3.legend(loc=1,prop={'size':8})
-                pl.ylabel('#RPF')
-                   
-                ## get orf for each frame
-                gene_seq,met_1,stop_1,met_2,stop_2,met_3,stop_3 = get_orf(fasta,GFF[gene])  
-                ## ORF plot
-                idc = arange(len(gene_seq))
-                ##define axe for frame :
-                x = [0]*len(gene_seq)
-                y = [1]*len(gene_seq)
-                z = [2]*len(gene_seq)
-                ## define last subplot
-                #pdb.set_trace()
-                pl.subplot(4,1,4)
-                ## modify axes setting
-                ax4 = fig.add_subplot(4,1,4)
-                ax4.spines['right'].set_color('none')
-                ax4.spines['top'].set_color('none')
-                ax4.spines['left'].set_color('none')
-                ax4.spines['bottom'].set_color('none')
-                ax4.tick_params(bottom = 'off',top='off',left='off', right='off')
-                pl.ylim(0,3)
-                pl.xlim(0,len(gene_seq))
-                pl.yticks([0.5,1.5,2.5],[r'-1',r'+1',r'0'])
-                pl.xticks([])
-                ## plot line for delimiting frame and arrow corresponding to start and stop
-                pl.plot(idc,x,'k',lw=2)
-                for val in met_1:
-                    pl.arrow( val, 2, 0.0, 0.5, fc="g", ec="g",lw=2)
-                for val in stop_1:
-                    pl.arrow( val, 2, 0.0, 1, fc="r", ec="r",lw=2)
-                pl.plot(idc,y,'k')
-                for val in met_2:
-                    pl.arrow( val, 1, 0.0, 0.5, fc="g", ec="g",lw=2)
-                for val in stop_2:
-                    pl.arrow( val, 1, 0.0, 1, fc="r", ec="r",lw=2)
-                pl.plot(idc,z,'k')
-                for val in met_3:
-                    pl.arrow( val, 0, 0.0, 0.5, fc="g", ec="g",lw=2)
-                for val in stop_3:
-                    pl.arrow( val, 0, 0.0, 1, fc="r", ec="r",lw=2)
-                    
-
-                pl.ylabel('ORF',fontsize='small')
-                pl.draw()
-                pl.savefig(directory+'/'+gene+'.png',format='png', dpi=600)
-                pl.clf()
-                ##pl.show()
-                    
-                ## Make thumbnail for html page
-                infile = directory+'/'+gene+'.png'
-                size = 128, 128
-                im = Image.open(infile)
-                im.thumbnail(size, Image.ANTIALIAS)
-                im.save(directory+'/'+gene + "_thumbnail.png", "PNG")
+                if len(subprofile_1) == len(subprofile_2) and len(subprofile_1) == len(subprofile_3) :
+   
+                    ind = arange(len(subprofile_1))
+                    fig = pl.figure(num=1)
+                    #sub profile frame 0
+                    pl.subplot(4,1,1)
+                    ax1 = fig.add_subplot(4,1,1)
+                    pl.title( "%s (%s)\n$(%s RPKM)$"% (gene, GFF[gene]['name'], GFF[gene]['total_rpkm']))
+                    pl.ylim(0,_MAX)
+                    pl.xlim(0,len(subprofile_1))
+                    pl.yticks([0, _MAX/2, _MAX])
+                    ax1.plot(ind, subprofile_1, 'b-', lw = 3,label = 'Frame 0')
+                    ax1.fill_between(ind, 0,subprofile_1,color='b')
+                    ax1.legend(loc=1,prop={'size':8})
+                    pl.ylabel('#RPF')
+                        
+                    ##sub profile frame +1
+                    pl.subplot(4,1,2)
+                    ax2 = fig.add_subplot(4,1,2)
+                    pl.ylim(0,_MAX)
+                    pl.xlim(0,len(subprofile_2))
+                    pl.yticks([0, _MAX/2, _MAX])
+                    ax2.plot(ind,subprofile_2,'r-',lw = 3,label = 'Frame +1')
+                    ax2.fill_between(ind, 0,subprofile_2,color='r')
+                    ax2.legend(loc=1,prop={'size':8})
+                    pl.ylabel('#RPF')
+                        
+                    ##sub profile frame -1
+                    pl.subplot(4,1,3)
+                    ax3 = fig.add_subplot(4,1,3)
+                    pl.ylim(0,_MAX)
+                    pl.yticks([0, _MAX/2, _MAX])
+                    pl.xlim(0,len(subprofile_3))
+                    ax3.plot(ind,subprofile_3,'g-',lw =3,label = 'Frame -1')
+                    ax3.fill_between(ind, 0,subprofile_3,color='g')
+                    ax3.legend(loc=1,prop={'size':8})
+                    pl.ylabel('#RPF')
+                       
+                    ## get orf for each frame
+                    gene_seq,met_1,stop_1,met_2,stop_2,met_3,stop_3 = get_orf(fasta,GFF[gene])  
+                    ## ORF plot
+                    idc = arange(len(gene_seq))
+                    ##define axe for frame :
+                    x = [0]*len(gene_seq)
+                    y = [1]*len(gene_seq)
+                    z = [2]*len(gene_seq)
+                    ## define last subplot
+                    #pdb.set_trace()
+                    pl.subplot(4,1,4)
+                    ## modify axes setting
+                    ax4 = fig.add_subplot(4,1,4)
+                    ax4.spines['right'].set_color('none')
+                    ax4.spines['top'].set_color('none')
+                    ax4.spines['left'].set_color('none')
+                    ax4.spines['bottom'].set_color('none')
+                    ax4.tick_params(bottom = 'off',top='off',left='off', right='off')
+                    pl.ylim(0,3)
+                    pl.xlim(0,len(gene_seq))
+                    pl.yticks([0.5,1.5,2.5],[r'-1',r'+1',r'0'])
+                    pl.xticks([])
+                    ## plot line for delimiting frame and arrow corresponding to start and stop
+                    pl.plot(idc,x,'k',lw=2)
+                    for val in met_1:
+                        pl.arrow( val, 2, 0.0, 0.5, fc="g", ec="g",lw=2)
+                    for val in stop_1:
+                        pl.arrow( val, 2, 0.0, 1, fc="r", ec="r",lw=2)
+                    pl.plot(idc,y,'k')
+                    for val in met_2:
+                        pl.arrow( val, 1, 0.0, 0.5, fc="g", ec="g",lw=2)
+                    for val in stop_2:
+                        pl.arrow( val, 1, 0.0, 1, fc="r", ec="r",lw=2)
+                    pl.plot(idc,z,'k')
+                    for val in met_3:
+                        pl.arrow( val, 0, 0.0, 0.5, fc="g", ec="g",lw=2)
+                    for val in stop_3:
+                        pl.arrow( val, 0, 0.0, 1, fc="r", ec="r",lw=2)
+                        
+    
+                    pl.ylabel('ORF',fontsize='small')
+                    pl.draw()
+                    pl.savefig(directory+'/'+gene+'.png',format='png', dpi=600)
+                    pl.clf()
+                    ##pl.show()
+                        
+                    ## Make thumbnail for html page
+                    infile = directory+'/'+gene+'.png'
+                    size = 128, 128
+                    im = Image.open(infile)
+                    im.thumbnail(size, Image.ANTIALIAS)
+                    im.save(directory+'/'+gene + "_thumbnail.png", "PNG")
                     
                     
             else :
@@ -511,12 +516,15 @@
         
         
     try :
-        gene_table = ''
-        gene_table += '<table>'
-        gene_table += '<thead><tr><th data-sort="string">Gene</th><th>Plot</th><th data-sort="string">Name</th><th>Total Proportion</th><th>Segment proportion</th><th data-sort="float">RPKM</th><th>Segment RPKM</th></tr></thead><tbody>'
-        for gene in GFF.iterkeys():
-            gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(gene,gene,gene,gene,GFF[gene]['note'],GFF[gene]['name'],GFF[gene]['total_prop'],GFF[gene]['seg_prop'],GFF[gene]['total_rpkm'],GFF[gene]['seg_rpkm'])
-        gene_table += '</tbody></table>'  
+        if not GFF: 
+            gene_table = 'Sorry, there are no dual coding events in your data\n'
+        else :
+            gene_table = ''
+            gene_table += '<table>'
+            gene_table += '<thead><tr><th data-sort="string">Gene</th><th>Plot</th><th data-sort="string">Name</th><th>Total Proportion</th><th>Segment proportion</th><th data-sort="float">RPKM</th><th>Segment RPKM</th></tr></thead><tbody>'
+            for gene in GFF.iterkeys():
+                gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(gene,gene,gene,gene,GFF[gene]['note'],GFF[gene]['name'],GFF[gene]['total_prop'],GFF[gene]['seg_prop'],GFF[gene]['total_rpkm'],GFF[gene]['seg_rpkm'])
+            gene_table += '</tbody></table>'  
             
  
         html_str = """  
@@ -797,12 +805,11 @@
             proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
             returncode = proc.wait()
 
-        if os.path.exists(subfolder):
-            raise
-        try:
-            os.mkdir(subfolder)
-        except:
-            raise 
+        if not os.path.exists(subfolder):
+            try:
+                os.mkdir(subfolder)
+            except Exception, e :
+                stop_err('Error running make directory : ' + str(e)) 
         
         ## check frame arg
         if options.frame not in [1,2,3]:
@@ -831,7 +838,11 @@
         ## check gff reading
         if not GFF['order'] :
             stop_err( 'Incorrect GFF file')
-        #
+        for gene in GFF['order']:
+            if not GFF[gene]['exon'] :
+                del GFF[gene]
+                GFF['order'].remove(gene) 
+                
         #### cPickles for Test ####
         #if os.path.isfile("/home/rlegendre/Documents/FrameShift/OutOfFrameWithIntron/pyt_dict") :
         #    with open("/home/rlegendre/Documents/FrameShift/OutOfFrameWithIntron/pyt_dict",'rb') as fp:
@@ -861,7 +872,7 @@
         stop_err( 'Error running metagene analysis : ' + str( e ) )
             
     if os.path.exists( tmp_dir ):
-        shutil.rmtree( tmp_dir )
+         shutil.rmtree( tmp_dir )
 
 
 
--- a/ribo_functions.py	Fri May 29 09:17:29 2015 -0400
+++ b/ribo_functions.py	Tue Jun 09 09:06:17 2015 -0400
@@ -141,6 +141,7 @@
     '''
     try:
         GFF = {}
+        mRNA = {}
         with open(gff, 'r') as f_gff : 
 
             GFF['order'] = []
@@ -153,8 +154,9 @@
                 # first line is already gene line :
                     if line.split('\t')[2] == 'gene' :
                         gene = feature[0].replace("ID=","")
+                        curent_gene = gene
                         if 'Name' in line :
-                            regex = re.compile('(Name=)([^;]*);')
+                            regex = re.compile('(Name=)([^;]*);?')
                             res = regex.search(line.split('\t')[8])
                             Name = res.group(2)
                             Name = Name.rstrip()
@@ -162,7 +164,7 @@
                             Name = "Unknown"
                         ##get annotation
                         if 'Note' in line :
-                            regex = re.compile('(Note=)([^;]*);')
+                            regex = re.compile('(Note=)([^;]*);?')
                             res = regex.search(line.split('\t')[8])
                             note = res.group(2)                     
                             note = urllib.unquote(str(note)).replace("\n","")
@@ -180,12 +182,27 @@
                         GFF[gene]['exon'] = {}
                         GFF[gene]['exon_number'] = 0
                         #print Name
+                    elif line.split('\t')[2] == 'mRNA' :
+                        regex = re.compile('(Parent=)([^;]*);?')
+                        res = regex.search(line.split('\t')[8])
+                        gene_name = res.group(2)
+                        regex = re.compile('(ID=)([^;]*);?')
+                        res = regex.search(line.split('\t')[8])
+                        mRNA_name = res.group(2)
+                        if gene not in mRNA.viewvalues() and gene_name == curent_gene :
+                            mRNA[mRNA_name] = gene_name
+
                     elif line.split('\t')[2] == 'CDS' :
-                        regex = re.compile('(Parent=)([^;]*);')
+                        regex = re.compile('(Parent=)([^;]*);?')
                         res = regex.search(line.split('\t')[8])
                         gene = res.group(2) 
+
                         if 'mRNA' in gene:
                             gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene)
+                        if mRNA.has_key(gene) and GFF.has_key(mRNA[gene]):
+
+                            gene = gene_name
+
                         if GFF.has_key(gene) :
                             GFF[gene]['exon_number'] += 1
                             exon_number = GFF[gene]['exon_number'] 
@@ -193,7 +210,7 @@
                             GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
                             GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
                             GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
-                        
+
                     ## if there is a five prim UTR intron, we change start of gene
                     elif line.split('\t')[2] == 'five_prime_UTR_intron' :
                         if GFF[gene]['strand'] == "+" :