changeset 10:707807fee542

(none)
author rlegendre
date Thu, 22 Jan 2015 14:34:38 +0100
parents d7739f797a26
children 313b8f7d2a92
files get_codon_frequency.py kmer_analysis.py metagene_frameshift_analysis.py metagene_readthrough.py ribo_functions.py
diffstat 5 files changed, 370 insertions(+), 259 deletions(-) [+]
line wrap: on
line diff
--- a/get_codon_frequency.py	Tue Dec 09 09:43:07 2014 -0500
+++ b/get_codon_frequency.py	Thu Jan 22 14:34:38 2015 +0100
@@ -25,8 +25,10 @@
 import csv
 from scipy import stats
 from collections import OrderedDict
+import ribo_functions
+import HTSeq
 # #libraries for debugg
-# import pdb
+import pdb
 # import cPickle
 
 def stop_err(msg):
@@ -34,7 +36,6 @@
     sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
     sys.exit()
 
-    
 def store_gff(gff):
     '''
         parse and store gff file in a dictionnary
@@ -98,8 +99,7 @@
 #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
 #chrI    SGD     CDS     87286   87387   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
 #chrI    SGD     CDS     87501   87752   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified    
-
-
+    
 
 def init_codon_dict():
     
@@ -115,17 +115,28 @@
     '''
     try:
         codon = init_codon_dict()
-                    
-        for chrom in GFF.iterkeys():
-            for gene in GFF[chrom] :
+        for feature in GFF :
+            if feature.type == 'gene' :
                 codon_dict = init_codon_dict()
-                start = GFF[chrom][gene]['start']
-                stop = GFF[chrom][gene]['stop']
+                chrom = feature.iv.chrom
+                start = feature.iv.start
+                stop = feature.iv.end 
                 region = chrom + ':' + str(start) + '-' + str(stop)
+        
+        ## DEPRECATED
+        #for chrom in GFF.iterkeys():
+            #for gene in GFF[chrom] :
+               # codon_dict = init_codon_dict()
+                #start = GFF[chrom][gene]['start']
+                #print start
+                #stop = GFF[chrom][gene]['stop']
+                #print stop
+                #region = chrom + ':' + str(start) + '-' + str(stop)
+        #######
                 # #get all reads in this gene
                 reads = subprocess.check_output(["samtools", "view", bamfile, region])
                 head = subprocess.check_output(["samtools", "view", "-H", bamfile])
-                read_tab = reads.split('\n')
+                read_tab = reads.split('\n')        
                 for read in read_tab:
                     # # search mapper for eliminate multiple alignements
                     if 'bowtie' in head:
@@ -136,7 +147,6 @@
                         stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")  
                     if len(read) == 0:
                         continue
-                    
                     len_read = len(read.split('\t')[9])        
                     # if it's read of good length
                     if len_read == kmer and multi_tag not in read:
@@ -179,6 +189,8 @@
         stop_err('Error during codon usage calcul: ' + str(e))
 
 
+
+
 '''
 http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html
 '''
@@ -491,7 +503,7 @@
         
         # # plot amino acid profile :
         fig = pl.figure(num=1)
-        width = .35
+        width = .45
         ax = fig.add_subplot(111)
         ind = arange(21)
         pl.xlim(0, 21) 
@@ -503,7 +515,7 @@
         ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2)        
         #for x, y, z in zip(ind, max_val, aa_name):
         #    ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14)
-        axis_font = {'size':'16'}
+        axis_font = {'size':'10'}
         pl.xticks(ind + width, aa_name,**axis_font)
         ax.spines['right'].set_visible(False)
         ax.spines['top'].set_visible(False)
@@ -513,7 +525,7 @@
         ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)',**axis_font)
         ax.set_xlabel('Amino Acids', **axis_font)
         handles, labels = ax.get_legend_handles_labels()
-        font_prop = font_manager.FontProperties(size=12)
+        font_prop = font_manager.FontProperties(size=8)
         ax.legend(handles, labels, prop=font_prop)
         pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340)
         pl.clf()
@@ -534,8 +546,9 @@
             max_val.append(max(cond1_norm[i], cond2_norm[i]))
             
         # plot result
-        fig = pl.figure(figsize=(24, 10), num=1)
-        width = .50
+        fig = pl.figure(figsize=(30, 10), num=1)
+        #fig = pl.figure(num=1)
+        width = .40
         ind = arange(len(codon_sorted))
         ax = fig.add_subplot(111)
         pl.xlim(0, len(codon_sorted) + 1) 
@@ -625,10 +638,7 @@
         #    raise Exception        
         
 def __main__():
-    '''   
-        python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -1 psiM1_sorted.bam,psiM2_sorted.bam -2 psiP1_sorted.bam,psiP2_sorted.bam -c psiM -C psiP -l TAG,TAA,TGA -r yes -o psi_count -d psi.html,html_dir > log2
-        python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -c psiM -C psiP -1 RPF_psi-_28sorted.bam -2 RPF_psi+_28sorted.bam -l TAG,TAA,TGA -n Stop Codon -r no -o psi_count -d psi.html,html_dir > log2
-    '''     
+    
     
     # Parse command line options
     parser = optparse.OptionParser()
@@ -648,7 +658,7 @@
                   help="Name of second condition", metavar="STR")
    
     parser.add_option("-k", "--kmer", dest="kmer", type="int",
-                  help="Longer of your phasing reads", metavar="INT") 
+                  help="Length of your phasing reads", metavar="INT") 
         
 #     parser.add_option("-l", "--list", dest="list_cod", type= "string",
 #                   help="list of codons to compare to other", metavar="STR")
@@ -692,13 +702,34 @@
         if not colors.is_color_like(options.color2) :
             stop_err( options.color2+' is not a proper color' )
         
-        GFF = store_gff(options.gff)
+        ## identify GFF or GTF format from 9th column
+        #with open (options.gff,"r") as gffile :
+        #    for line in gffile :
+        #        if '#' in line :
+        #            ## skip header
+        #            gffile.next()
+         #       elif 'gene_id' in line :
+        #            ## launch gtf reader :
+        #            GFF = ribo_functions.store_gtf(options.gff)
+        #            break
+        #        elif 'ID=' in line :
+        #            ## launch gff reader 
+        #            GFF = ribo_functions.store_gff(options.gff)
+        #            break
+        #        else :
+        #            stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
+            
+        #GFF = store_gff(options.gff)
+        #GFF = ribo_functions.store_gtf(options.gff)
+        ## check gff reading
+        #if not GFF['order'] :
+         #   stop_err( 'Incorrect GFF file' + str( e ) )
         
         #### NOT USE IN FINAL VERSION
         # # get codon list
         # codons = options.list_cod.upper().split(',')
         # check_codons_list(codons)
-        
+        GFF = HTSeq.GFF_Reader(options.gff)
         # # get html file and directory :
         (html, html_dir) = options.dirout.split(',')
         if os.path.exists(html_dir):
--- a/kmer_analysis.py	Tue Dec 09 09:43:07 2014 -0500
+++ b/kmer_analysis.py	Thu Jan 22 14:34:38 2015 +0100
@@ -14,9 +14,13 @@
 matplotlib.use('Agg')
 import matplotlib.pyplot as pl
 from numpy import arange
+from collections import OrderedDict
 import ribo_functions
-from collections import OrderedDict
 #import cPickle
+## suppress matplotlib warnings
+import warnings
+warnings.filterwarnings('ignore')
+
 
 total_mapped_read = 0 
 
@@ -25,9 +29,51 @@
     sys.stderr.write( "%s\n" % msg )
     sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) )
     sys.exit()
-
+    
+    
+def split_bam(bamfile,tmpdir):
+    '''
+        split bam by chromosome and write sam file in tmpdir
+    '''
+    try:
+        #get header
+        results = subprocess.check_output(['samtools', 'view', '-H',bamfile])
+        header = results.split('\n')
+       
+        #define genome size
+        genome = []
+        for line in header:
+            result = re.search('SN', line)
+            if result :
+                #print line    
+                feat = line.split('\t')
+                chrom = re.split(":", feat[1])
+                #print feat[1]
+                genome.append(chrom[1])
+       
+        #split sam by chrom
+        n = 0
+        for chrm in genome:
+            with open(tmpdir+'/'+chrm+'.sam', 'w') as f : 
+                #write header correctly for each chromosome
+                f.write(header[0]+'\n')
+                expr = re.compile(chrm+'\t')
+                el =[elem for elem in header if expr.search(elem)][0]
+                f.write(el+'\n')            
+                f.write(header[-2]+'\n')
+                #write all reads for each chromosome
+                reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
+                f.write(reads)
+                # calculate number of reads 
+                n += reads.count(chrm)
 
+        sys.stdout.write("%d reads are presents in your bam file\n" % n)
+    
+    except Exception, e:
+        stop_err( 'Error during bam file splitting : ' + str( e ) )
         
+
+       
 def get_first_base(tmpdir, kmer): 
     '''
         write footprint coverage file for each sam file in tmpdir and get kmer distribution
@@ -47,18 +93,20 @@
         
                 for line in sam:
                     #initialize dictionnary
-                    if '@SQ' in line :
+                    if '@SQ\tSN:' in line :
                         size = int(line.split('LN:')[1])
                         genomeF = [0]*size
                         genomeR = [0]*size
                     # define multiple reads keys from mapper
-                    elif '@PG' in line :
+                    elif '@PG\tID' in line :
                         if 'bowtie' in line:
                             multi_tag = "XS:i:"
                         elif 'bwa' in  line:
                             multi_tag = "XT:A:R"
+                        #elif 'TopHat' in  line:
+                        #    multi_tag = "NH:i:1"
                         else :
-                            stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")  
+                            stop_err("No PG tag find in "+samfile+". Please use bowtie or bwa for mapping")  
                         
                     # get footprint
                     elif re.search('^[^@].+', line) :
@@ -128,33 +176,37 @@
         whole_phasing = [0,0,0]
         for gene in GFF['order']:
             ## maybe no start position in GTF file so we must to check and replace
+            exon_number = GFF[gene]['exon_number']
             try : GFF[gene]['start'] 
             except :
                 if GFF[gene]['strand'] == '+' :
                     GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
                 else :
-                    GFF[gene]['start'] = GFF[gene]['exon'][1]['stop']
+                    GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop']
             ## also for stop coordinates
             try : GFF[gene]['stop'] 
             except :
-                exon_number = GFF[gene]['exon_number']
                 if GFF[gene]['strand'] == '+' :
                     GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
 
                 else :
-                    GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['start']
+                    GFF[gene]['stop'] = GFF[gene]['exon'][1]['start']
             cov = []
             ##first chromosome : we open corresponding file
-            if chrom == "" :
-                chrom = GFF[gene]['chrom']
-                with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
-                    data = f.readlines()
-            ##if we change chrosomosome
-            elif chrom != GFF[gene]['chrom'] :
-                chrom = GFF[gene]['chrom']
-                with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
-                    data = f.readlines()
-                 
+            try: 
+                if chrom == "" :
+                    chrom = GFF[gene]['chrom']
+                    with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+                        data = f.readlines()
+                ##if we change chromosome
+                elif chrom != GFF[gene]['chrom'] :
+                    chrom = GFF[gene]['chrom']
+                    with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+                        data = f.readlines()
+            except IOError :
+                print tmpdir+"/assoCov_"+chrom+".txt doesn't exist"     
+
+            
             ## if a gene without intron :
             if GFF[gene]['exon_number'] == 1:
                     
@@ -172,14 +224,14 @@
                         
                     for exon in range(1,GFF[gene]['exon_number']+1) :
                         for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
-                            if i <= GFF[gene]['stop'] :
-                                cov.append(int((data[i].rstrip()).split("\t")[0]))  
+                            #if i <= GFF[gene]['stop'] :
+                            cov.append(int((data[i].rstrip()).split("\t")[0]))  
                 else :
                           
                     for exon in range(1,GFF[gene]['exon_number']+1) :
                         for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
-                            if i <= GFF[gene]['start'] :
-                                cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+                            #if i <= GFF[gene]['start'] :
+                            cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
                     cov.reverse()        
                             
             len_cov = len(cov)
@@ -218,6 +270,7 @@
     pl.title('Number of reads for each k-mer')
     pl.xticks(index + bar_width, label, **axis_font)
     #pl.show()
+    fig.subplots_adjust()
     pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640)
     pl.clf()
     
@@ -231,10 +284,11 @@
         pl.ylabel('Percent of read', **axis_font)
         pl.title('Proportion of reads in each frame for '+str(key)+'-mer')
         pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font)
-        pl.tight_layout()
+        #pl.tight_layout()
         pl.ylim(0,100)
+        fig.subplots_adjust() 
         pl.draw()
-        #pl.show()
+        pl.show()
         pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300)
         pl.clf()
     
@@ -288,7 +342,7 @@
 
     #Parse command line options
     parser = optparse.OptionParser()
-    parser.add_option("-g", "--gff", dest="gfffile", type= "string",
+    parser.add_option("-g", "--gff", dest="gff", type= "string",
                   help="GFF annotation file", metavar="FILE")
                   
     parser.add_option("-b", "--bam", dest="bamfile", type= "string",
@@ -325,9 +379,30 @@
             returncode = proc.wait()
         
         tmpdir = tempfile.mkdtemp()
-        GFF = ribo_functions.store_gff(options.gfffile)
+        ## identify GFF or GTF format from 9th column
+        with open (options.gff,"r") as gffile :
+            for line in gffile :
+                if '#' in line :
+                    ## skip header
+                    gffile.next()
+                elif 'gene_id' in line :
+                    ## launch gtf reader :
+                    GFF = ribo_functions.store_gtf(options.gff)
+                    break
+                elif 'ID=' in line :
+                    ## launch gff reader 
+                    GFF = ribo_functions.store_gff(options.gff)
+                    break
+                else :
+                    stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
+        #GFF = store_gff(options.gff)
+        #GFF = ribo_functions.store_gtf(options.gff)
+        ## check gff reading
+        if not GFF['order'] :
+            stop_err( 'Incorrect GFF file' + str( e ) )
+        
         ## split bam
-        ribo_functions.split_bam(options.bamfile,tmpdir)
+        split_bam(options.bamfile,tmpdir)
         ###################################
         ## First analysis with 28mer :
         ###################################
@@ -351,6 +426,7 @@
                     tmp = get_first_base(tmpdir, keys)
                     ## compute phasing
                     whole_phasing = frame_analysis(tmpdir,GFF)
+
                     results[keys] = whole_phasing
                  ## get report
         make_report(options.html_file, options.dirout, kmer, results)
--- a/metagene_frameshift_analysis.py	Tue Dec 09 09:43:07 2014 -0500
+++ b/metagene_frameshift_analysis.py	Thu Jan 22 14:34:38 2015 +0100
@@ -11,15 +11,16 @@
 import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time, csv
 import math
 from numpy import arange
-#from matplotlib import pyplot as pl
-import matplotlib
-matplotlib.use('Agg')
-import matplotlib.pyplot as pl
+from matplotlib import pyplot as pl
+#import matplotlib
+#matplotlib.use('Agg')
+#import matplotlib.pyplot as pl
 import itertools
 from Bio import SeqIO
 from Bio.Seq import Seq
 from Bio.Alphabet import IUPAC
 from PIL import Image
+import ribo_functions
 ##libraries for debugg
 #import cPickle
 #import pdb
@@ -81,12 +82,15 @@
     
     
     
-def get_first_base(tmpdir, kmer):
+def get_first_base(tmpdir, kmer, frame):
     '''
         write footprint coverage file for each sam file in tmpdir
     '''
     global total_mapped_read
     try:
+        ### compute position of P-site according to frame (P-site -> +15 starting from 5' end of footprint)
+        p_site_pos = 16-frame
+        
         file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
         ##write coverage for each sam file in tmpdir
         for samfile in file_array:
@@ -123,12 +127,14 @@
                             #if it's a forward read
                             if read_sens == 0 :
                                 #get P-site : start read + 12 nt
-                                read_pos += 15
+                                #read_pos += 15
+                                read_pos += p_site_pos
                                 genomeF[read_pos] += 1
                             #if it's a reverse read
                             elif read_sens == 16 :
                                 #get P-site
-                                read_pos += 12
+                                #read_pos += 12
+                                read_pos += (len_read-1) - p_site_pos
                                 genomeR[read_pos] += 1
 
             #try:
@@ -145,74 +151,6 @@
         stop_err( 'Error during footprinting : ' + str( e ) )
 
 
-
-
-    
-def store_gff(gff):
-    '''
-        parse and store gff file in a dictionnary
-    '''
-    try:
-        GFF = {}
-        with open(gff, 'r') as f_gff : 
-            GFF['order'] = []
-            for line in f_gff:
-                ## switch commented lines
-                line = line.split("#")[0]
-                if line != "" :
-                    feature = (line.split('\t')[8]).split(';')
-                # first line is already gene line :
-                    if line.split('\t')[2] == 'gene' :
-                        gene = feature[0].replace("ID=","")
-                        if re.search('gene',feature[2]) :
-                            Name = feature[2].replace("gene=","")
-                        else :
-                            Name = "Unknown"
-                        ##get annotation
-                        note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
-                        note = urllib.unquote(str(note)).replace("\n","")
-                        ## store gene information
-                        GFF['order'].append(gene)
-                        GFF[gene] = {}
-                        GFF[gene]['chrom'] = line.split('\t')[0]
-                        GFF[gene]['start'] = int(line.split('\t')[3])
-                        GFF[gene]['stop'] = int(line.split('\t')[4])
-                        GFF[gene]['strand'] = line.split('\t')[6]
-                        GFF[gene]['name'] = Name
-                        GFF[gene]['note'] = note
-                        GFF[gene]['exon'] = {}
-                        GFF[gene]['exon_number'] = 0
-                        #print Name
-                    elif line.split('\t')[2] == 'CDS' :
-                        gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0])
-                        if GFF.has_key(gene) :
-                            GFF[gene]['exon_number'] += 1
-                            exon_number = GFF[gene]['exon_number'] 
-                            GFF[gene]['exon'][exon_number] = {}
-                            GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
-                            GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
-                            GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
-                        
-                    ## if there is a five prim UTR intron, we change start of gene
-                    elif   line.split('\t')[2] == 'five_prime_UTR_intron' :
-                        if GFF[gene]['strand'] == "+" :
-                            GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
-                        else :
-                            GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
-        return GFF
-    except Exception,e:
-        stop_err( 'Error during gff storage : ' + str( e ) )
-  
-  
-#chrI    SGD     gene    87286   87752   .       +       .       ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689
-#7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra
-#ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr
-#om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
-#chrI    SGD     CDS     87286   87387   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
-#chrI    SGD     CDS     87501   87752   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
-#chrI    SGD     intron  87388   87500   .       +       .       Parent=YAL030W_mRNA;Name=YAL030W_intron;orf_classification=Verified
-
-
 def __percent__(prop):
         
     if sum(prop) != 0 :
@@ -228,7 +166,7 @@
         return prop
     
     
-def frame_analysis(tmpdir,subfolder,cutoff,GFF,box_file):
+def frame_analysis(tmpdir,subfolder,cutoff,GFF,box_file, orf_length):
     '''
     This function take footprint and annotation (gff) for analyse reading frame in each gene
     '''
@@ -237,6 +175,7 @@
     try:
         chrom = "" # initializing chromosome
         nb_gene = 0 # number of analysed genes
+        mean_orf = 0
         myheader = ['Gene','Name','Total_proportion','Dual_coding','RPKM','Segment_RPKM','Note']
 
         with open(subfolder+'/dual_coding.tab',"w") as out :
@@ -247,15 +186,18 @@
             for gene in GFF['order']:
                 cov = []
                 ##first chromosome : we open corresponding file
-                if chrom == "" :
-                    chrom = GFF[gene]['chrom']
-                    with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
-                        data = f.readlines()
-                ##if we change chrosomosome
-                elif chrom != GFF[gene]['chrom'] :
-                    chrom = GFF[gene]['chrom']
-                    with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
-                        data = f.readlines()
+                try: 
+                    if chrom == "" :
+                        chrom = GFF[gene]['chrom']
+                        with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+                            data = f.readlines()
+                    ##if we change chromosome
+                    elif chrom != GFF[gene]['chrom'] :
+                        chrom = GFF[gene]['chrom']
+                        with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+                            data = f.readlines()
+                except IOError :
+                    print tmpdir+"/assoCov_"+chrom+".txt doesn't exist" 
                      
                 ## if a gene without intron :
                 if GFF[gene]['exon_number'] == 1:
@@ -283,13 +225,20 @@
                         cov.reverse()        
                                 
                 len_cov = len(cov)
-                #segmentation by gene size
-                if len_cov < 1000 :
-                    nb_segment = 3
-                elif len_cov > 1000 and len_cov < 2000:
-                    nb_segment = 6
-                else :
-                    nb_segment = 9
+                ## segmentation by orf_length
+                nb_segment = int(len_cov/orf_length)
+                ## too short segment gestion
+                if len_cov < orf_length :
+                    nb_segment = 1
+                
+#                 #segmentation by gene size
+#                 if len_cov < 1000 :
+#                     nb_segment = 3
+#                 elif len_cov > 1000 and len_cov < 2000:
+#                     nb_segment = 6
+#                 else :
+#                     nb_segment = 9
+
                 ## number of aa in gene :
                 nb_aa = len_cov/3
                 len_frag = len_cov/nb_segment
@@ -302,6 +251,8 @@
                 GFF[gene]['dual_coding'] = '-'
                 ''' segmentation et total proportions '''
                 if sum(cov) > MIN_COV_REQUIREMENT:
+                    ## compute mean of ORF length
+                    mean_orf += len_cov
                     nb_gene += 1
                     for nuc in range(0,len_cov-2,3) :
                         window += 1
@@ -356,6 +307,7 @@
            
         whole_phasing = __percent__(whole_phasing)
         sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing)
+        sys.stdout.write("Mean length of ORF in your organisme : %s\n" % int(mean_orf/nb_gene))
         sys.stdout.write("%d genes are used for frame analysis\n" % nb_gene)
             
         ##produce boxplot
@@ -363,8 +315,9 @@
         pl.ylabel('Total number of footprints (percent)')
         pl.draw()
         pl.savefig(subfolder+'/boxplot.png',format='png', dpi=640)
-        pl.savefig(box_file, format='png', dpi=640)
-        pl.clf()
+        #pl.savefig(box_file, format='png', dpi=640)
+        
+        
         return GFF
 
     except Exception, e:
@@ -790,13 +743,9 @@
 
 def __main__():
     
-    #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_frameshift_analysis.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /data/BY_project/BY_BAM/LMA830_sort.bam -o /data/BY_project/dual_coding_lma.csv -d /data/BY_project/LMA_dual/index.html,/data/BY_project/LMA_dual -c 60
-    #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_frameshift_analysis.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /home/rlegendre/Documents/PDE2S/PDE2S_trim_no_rRNA_28mer.bam -o /home/rlegendre/Documents/PDE2S/dual_coding_pde2s.csv -d /home/rlegendre/Documents/PDE2S/index.html,/home/rlegendre/Documents/PDE2S/PDE2S_dual -c 60
-    #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_frameshift_analysis.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/RPF_psi+_28sorted.bam -o /home/rlegendre/Documents/test_rpkmlesszero.tab -d /home/rlegendre/Documents/index.html,/home/rlegendre/Documents/plouf -c 60
-
     #Parse command line options
     parser = optparse.OptionParser()
-    parser.add_option("-i", "--input", dest="gfffile", type= "string",
+    parser.add_option("-g", "--gff", dest="gff", type= "string",
                   help="GFF annotation file", metavar="FILE")
                     
     parser.add_option("-b", "--bam", dest="bamfile", type= "string",
@@ -809,13 +758,19 @@
                   help="write report in this html file and in associated directory", metavar="STR,STR")     
     
     parser.add_option("-x", "--boxplot", dest="box", type= "string",
-                  help="report boxplot in this file", metavar="STR,STR")                     
+                  help="report boxplot in this file", metavar="STR")                     
     
     parser.add_option("-c", "--cutoff", dest="cutoff", type= "int",
                   help="Integer value for selecting all genes that have less than 60 % (default) of reads in coding frame ", metavar="INT")
     
     parser.add_option("-k", "--kmer", dest="kmer", type= "int",
-                  help="Longer of your phasing reads", metavar="INT")               
+                  help="Longer of your phasing reads", metavar="INT")
+    
+    parser.add_option("-o", "--orf_length", dest="orf_length", type= "int",
+                  help="Mean of ORF's length for segmentation", metavar="INT")    
+
+    parser.add_option("-p", "--frame", dest="frame", type="int",
+                  help="Script can compute in frame 1, 2 or 3", metavar="1|2|3")               
     
     parser.add_option("-q", "--quiet",
                   action="store_false", dest="verbose", default=True,
@@ -835,11 +790,41 @@
             cmd = "samtools index %s " % (options.bamfile)
             proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
             returncode = proc.wait()
-            
+
+        if os.path.exists(subfolder):
+            raise
+        try:
+            os.mkdir(subfolder)
+        except:
+            raise 
+        
+        ## check frame arg
+        if options.frame not in [1,2,3]:
+            stop_err( 'Please enter a good value for frame argument : must be 1, 2 or 3' )
+        
         ##RUN analysis
         split_bam(options.bamfile,tmp_dir)
-        get_first_base (tmp_dir,options.kmer)
-        GFF = store_gff(options.gfffile)
+        get_first_base (tmp_dir,options.kmer,options.frame)
+        ## identify GFF or GTF format from 9th column
+        with open (options.gff,"r") as gffile :
+            for line in gffile :
+                if '#' in line :
+                    ## skip header
+                    gffile.next()
+                elif 'gene_id' in line :
+                    ## launch gtf reader :
+                    GFF = ribo_functions.store_gtf(options.gff)
+                    break
+                elif 'ID=' in line :
+                    ## launch gff reader 
+                    GFF = ribo_functions.store_gff(options.gff)
+                    break
+                else :
+                    stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
+        #GFF = store_gff(options.gfffile)
+        ## check gff reading
+        if not GFF['order'] :
+            stop_err( 'Incorrect GFF file' + str( e ) )
         #
         #### cPickles for Test ####
         #if os.path.isfile("/home/rlegendre/Documents/FrameShift/OutOfFrameWithIntron/pyt_dict") :
@@ -855,13 +840,7 @@
         #    
         #### cPickles for Test ####
        
-        if os.path.exists(subfolder):
-            raise
-        try:
-            os.mkdir(subfolder)
-        except:
-            raise 
-        GFF = frame_analysis (tmp_dir,subfolder,options.cutoff,GFF, options.box)
+        GFF = frame_analysis (tmp_dir,subfolder,options.cutoff,GFF, options.box, options.orf_length)
         plot_subprofile (GFF,subfolder,options.fasta)
         write_html_page(html_file,subfolder,GFF)
         
--- a/metagene_readthrough.py	Tue Dec 09 09:43:07 2014 -0500
+++ b/metagene_readthrough.py	Thu Jan 22 14:34:38 2015 +0100
@@ -21,70 +21,17 @@
 from numpy import arange
 from matplotlib import ticker as t
 from PIL import Image
+import ribo_functions
+## suppress matplotlib warnings
+import warnings
+
+
 
 def stop_err( msg ):
     sys.stderr.write( "%s\n" % msg )
     sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
     sys.exit()
     
-    
-def store_gff(gff):
-    '''
-        parse and store gff file in a dictionnary
-    '''
-    try:
-        GFF = {}
-        with open(gff, 'r') as f_gff : 
-            GFF['order'] = []
-            for line in f_gff:
-                ## switch commented lines
-                line = line.split("#")[0]
-                if line != "" :
-                    feature = (line.split('\t')[8]).split(';')
-                # first line is already gene line :
-                    if line.split('\t')[2] == 'gene' :
-                        gene = feature[0].replace("ID=","")
-                        if re.search('gene',feature[2]) :
-                            Name = feature[2].replace("gene=","")
-                        else :
-                            Name = "Unknown"
-                        ##get annotation
-                        note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
-                        note = unquote(str(note)).replace("\n","")
-                        ## store gene information
-                        GFF['order'].append(gene)
-                        GFF[gene] = {}
-                        GFF[gene]['chrom'] = line.split('\t')[0]
-                        GFF[gene]['start'] = int(line.split('\t')[3])
-                        GFF[gene]['stop'] = int(line.split('\t')[4])
-                        GFF[gene]['strand'] = line.split('\t')[6]
-                        GFF[gene]['name'] = Name
-                        GFF[gene]['note'] = note
-                        GFF[gene]['exon'] = {}
-                        GFF[gene]['exon_number'] = 0
-                        #print Name
-                    elif line.split('\t')[2] == 'CDS' :
-                        gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0])
-                        if GFF.has_key(gene) :
-                            GFF[gene]['exon_number'] += 1
-                            exon_number = GFF[gene]['exon_number'] 
-                            GFF[gene]['exon'][exon_number] = {}
-                            GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
-                            GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
-                            GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
-                        
-                    ## if there is a five prim UTR intron, we change start of gene
-                    elif   line.split('\t')[2] == 'five_prime_UTR_intron' :
-                        if GFF[gene]['strand'] == "+" :
-                            GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
-                        else :
-                            GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
-        return GFF
-    except Exception,e:
-        stop_err( 'Error during gff storage : ' + str( e ) ) 
-
-
-       
         
 def compute_rpkm(length,count_gene,count_tot) :
     
@@ -135,7 +82,7 @@
       ranges, this in one more than the coordinate of the last base
       that is considered part of the sequence.
     strand: The strand, as a single character, '+' or '-'. '.' indicates
-      that the strand is irrelavant. (Alternatively, pass a Strand object.)
+      that the strand is irrelevant. (Alternatively, pass a Strand object.)
     length: The length of the interval, i.e., end - start
     start_d: The "directional start" position. This is the position of the
      first base of the interval, taking the strand into account. Hence, 
@@ -145,14 +92,14 @@
      strand=='-1', it is start-1.
   
 '''
-def check_overlapping(gff_reader,chrm,start,stop,strand):
+def check_overlapping(gff_reader,chrm,start,stop,strand, name):
     
     #### probleme avec les genes completement inclu...
     
     iv2 = HTSeq.GenomicInterval(chrm,start,stop,strand)
     for feature in gff_reader:
         if feature.type == "gene" :
-            if feature.iv.overlaps(iv2) :
+            if feature.iv.overlaps(iv2) and feature.attr.get('gene_name') != name :
                 ## if its a reverse gene, we replace start of extension by start of previous gene
                 if strand == '-' :
                     return (feature.iv.end+3,stop)
@@ -200,6 +147,9 @@
 
 def plot_gene ( aln, gene, start_extension, stop_extension, dirout ) :
     
+    
+    ### ignore matplotlib warnings for galaxy
+    warnings.filterwarnings('ignore')
     try:
         strand = gene['strand']
         len_gene = gene['stop']-gene['start']
@@ -446,13 +396,13 @@
         im = Image.open(infile)
         im.thumbnail(size, Image.ANTIALIAS)
         im.save(dirout+"_thumbnail.png", "PNG")
-    
+        warnings.resetwarnings()
                 
     except Exception, e:
         stop_err( 'Error during gene plotting : ' + str( e ) )
 
     
-def compute_analysis(bam, GFF, fasta, gff, dirout) :
+def compute_analysis(bam, GFF, fasta, gff, dirout, extend) :
     
     try:
         background_file = dirout+"/background_sequence.fasta"
@@ -479,20 +429,46 @@
             writer = csv.writer(out,delimiter='\t')
             writer.writerow(myheader)    
             for gene in GFF['order'] :
+                #print GFF[gene]
+                ## maybe no start position in GTF file so we must to check and replace
+                exon_number = GFF[gene]['exon_number']
+                try : GFF[gene]['start'] 
+                except :
+                    if GFF[gene]['strand'] == '+' :
+                        GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
+                    else :
+                        GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop']
+                ## also for stop coordinates
+                try : GFF[gene]['stop'] 
+                except :
+                    if GFF[gene]['strand'] == '+' :
+                        GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
+    
+                    else :
+                        GFF[gene]['stop'] = GFF[gene]['exon'][1]['start']
+                
+                               
                 indic = ""
                 # compute rpkm of CDS :
                 len_cds = GFF[gene]['stop']-GFF[gene]['start']
                 count_cds = 0
-                ### count method of pysam doesn't strand information
-                if GFF[gene]['strand'] == '+' :
-                    for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) :
-                        if not track.is_reverse :
-                            count_cds += 1
-                else :
-                    for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) :
-                        if track.is_reverse :
-                            count_cds += 1
-                rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot)
+                rpkm_cds = 0
+                count_cds = 0
+                try :
+                    ### count method of pysam doesn't strand information
+                    if GFF[gene]['strand'] == '+' :
+                        for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) :
+                            if not track.is_reverse :
+                                count_cds += 1
+                    else :
+                        for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) :
+                            if track.is_reverse :
+                                count_cds += 1
+                    rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot)
+                except ValueError:
+                    ## genere warning about gtf coordinates
+                    #warnings.warn("Please check coordinates in GFF : maybe stop or start codon are missing" )
+                    pass
                 ## Only if gene is translate :
                 if rpkm_cds > 0 and count_cds > 128:  
                     ## search footprint in UTR3
@@ -520,7 +496,8 @@
                                     count += 1
                             ## get sequence of extension
                             seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement())
-                                
+                             
+                       
                         ## if we have coverage after cds stop codon    
                         if count > 10 :    
                             res = find_stop(seq)
@@ -530,9 +507,11 @@
                                 '''
                                 ## check if next in-frame codon is far than 90 nt extension :
                                 if GFF[gene]['strand'] == '+' :
-                                    pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+300,'+')
+                                    pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+extend,'+',GFF[gene]['name'])
                                     start_extension = pos[0]-1
                                     stop_extension = pos[1]
+                                    #start_extension = GFF[gene]['stop']
+                                    #stop_extension = GFF[gene]['stop']+extend
                                     seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension])
                                     
                                     #print gene,count,pos,'\n',seq
@@ -547,9 +526,11 @@
                                             #print res
                                             #stop_extension = start_extension + res +3
                                 else :
-                                    pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-300,GFF[gene]['start']-1,'-')
+                                    pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-extend,GFF[gene]['start']-1,'-',GFF[gene]['name'])
                                     start_extension = pos[0]
                                     stop_extension = pos[1]+1
+                                    #start_extension = GFF[gene]['start']-extend
+                                    #stop_extension = GFF[gene]['start']
                                     seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement())
                                     
                                     #print gene,count,pos,'\n',seq
@@ -970,11 +951,6 @@
         
 def __main__():
 
-    ## python metagene_readthrough.py -g Saccer3.gff -f Saccer3.fa -b B1_sorted.bam -o Bstrain_readthrough.csv
-    #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /data/Mapping/read_mapped_unique_psiP_sorted.bam -o /home/rlegendre/Documents/Translecture/plop_py.csv -d /home/rlegendre/Documents/Translecture/out_trans/
-    #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /home/rlegendre/Documents/PDE2S/PDE2S_trim_no_rRNA_sorted.bam -o /home/rlegendre/Documents/PDE2S/Readthrough_pde2s.csv -d /home/rlegendre/Documents/PDE2S/out_trans
-
-
     #Parse command line options
     parser = optparse.OptionParser()
     parser.add_option("-g", "--gff", dest="gff", type= "string",
@@ -987,7 +963,10 @@
                   help="Bam Ribo-Seq alignments ", metavar="FILE")
                     
     parser.add_option("-d", "--dirout", dest="dirout", type= "string",
-                  help="write report in this html file and in associated directory", metavar="STR,STR")    
+                  help="write report in this html file and in associated directory", metavar="STR,STR")   
+    
+    parser.add_option("-e", "--extend", dest="extend", type= "int",
+                  help="Lenght of extension after stop in number of base pairs (depends on your organisme)", metavar="INT")     
                     
     parser.add_option("-q", "--quiet",
                   action="store_false", dest="verbose", default=True,
@@ -1004,10 +983,30 @@
             os.mkdir(subfolder)
         except:
             raise
-        
-        GFF = store_gff(options.gff)
+        ## identify GFF or GTF format from 9th column
+        with open (options.gff,"r") as gffile :
+            for line in gffile :
+                if '#' in line :
+                    ## skip header
+                    gffile.next()
+                elif 'gene_id' in line :
+                    ## launch gtf reader :
+                    GFF = ribo_functions.store_gtf(options.gff)
+                    break
+                elif 'ID=' in line :
+                    ## launch gff reader 
+                    GFF = ribo_functions.store_gff(options.gff)
+                    break
+                else :
+                    stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
+            
+        #GFF = store_gff(options.gff)
+        #GFF = ribo_functions.store_gtf(options.gff)
+        ## check gff reading
+        if not GFF['order'] :
+            stop_err( 'Incorrect GFF file' + str( e ) )
         clean_file = cleaning_bam(options.bamfile)
-        compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder)
+        compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend)
         if os.path.exists( clean_file ):
             os.remove( clean_file )
         
--- a/ribo_functions.py	Tue Dec 09 09:43:07 2014 -0500
+++ b/ribo_functions.py	Thu Jan 22 14:34:38 2015 +0100
@@ -8,6 +8,8 @@
 '''
 
 import sys, subprocess, re, commands, time, urllib
+from copy import copy
+
 
 def stop_err( msg ):
     sys.stderr.write( "%s\n" % msg )
@@ -154,7 +156,7 @@
                         GFF[gene]['exon_number'] = 0
                         #print Name
                     elif line.split('\t')[2] == 'CDS' :
-                        gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0])
+                        gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0])
                         if GFF.has_key(gene) :
                             GFF[gene]['exon_number'] += 1
                             exon_number = GFF[gene]['exon_number'] 
@@ -184,7 +186,7 @@
 
 def store_gtf(gff):
     '''
-        parse and store gtf file in a dictionnary (DEPRECATED)
+        parse and store gtf file in a dictionnary 
     '''
     try:
         GFF = {}
@@ -195,11 +197,11 @@
                 line = line.split("#")[0]
                 if line != "" :
                 # first line is already gene line :
-                    if line.split('\t')[1] == 'protein_coding' :
+                    if 'protein_coding' in line :
                         ##get name
-                        gene = re.sub(r".+ transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip()
-                        Name = re.sub(r".+ transcript_name \"([\w|-]+)\";.*", r"\1", line).rstrip()
-                        if line.split('\t')[2] == 'exon' :
+                        gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip()
+                        Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip()
+                        if line.split('\t')[2] == 'CDS' :
                             ##if its first time we get this gene
                             if gene not in GFF.keys() :
                             ## store gene information
@@ -207,35 +209,41 @@
                                 GFF[gene] = {}
                                 GFF[gene]['chrom'] = line.split('\t')[0]
                                 GFF[gene]['strand'] = line.split('\t')[6]
+                                GFF[gene]['start'] = int(line.split('\t')[3])
+                                GFF[gene]['stop'] = int(line.split('\t')[4])
                                 GFF[gene]['name'] = Name
+                                GFF[gene]['note'] = ""
                                 GFF[gene]['exon_number'] = 1
                                 GFF[gene]['exon'] = {}
-                                exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
+                                #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
+                                ## some exons are non codant
+                                exon_number = 1
                                 GFF[gene]['exon'][exon_number] = {}
                                 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
                                 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
                             else :
                                 ## we add exon
-                                exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
+                                #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
+                                exon_number += 1
                                 GFF[gene]['exon_number'] = exon_number
                                 GFF[gene]['exon'][exon_number] = {}
                                 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
                                 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
-                        elif line.split('\t')[2] == 'CDS' :
-                            exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
+                        #elif line.split('\t')[2] == 'CDS' :
+                            #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
                             GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
                         elif line.split('\t')[2] == 'start_codon' :
                             if GFF[gene]['strand'] == '-' :
-                                GFF[gene]['start'] = int(line.split('\t')[4])
+                                GFF[gene]['stop'] = int(line.split('\t')[4])
                             else :
                                 GFF[gene]['start'] = int(line.split('\t')[3])
                         elif line.split('\t')[2] == 'stop_codon' :
                             if GFF[gene]['strand'] == '-' :
-                                GFF[gene]['stop'] = int(line.split('\t')[3])
+                                GFF[gene]['start'] = int(line.split('\t')[3])
                             else :
                                 GFF[gene]['stop'] = int(line.split('\t')[4])
 
-        return GFF
+        return __reverse_coordinates__(GFF)
     except Exception,e:
         stop_err( 'Error during gff storage : ' + str( e ) )
 
@@ -252,8 +260,26 @@
 ## protein_id "YDL083C";
 ##IV      protein_coding  stop_codon      306926  306928  .       -       0       gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "
 ##RPS16B";
+def __reverse_coordinates__(GFF):
+    
+    for gene in GFF['order']:
+        ## for reverse gene
+        if GFF[gene]['strand'] == "-":
+            ## if this gene have many exon and the stop of gene is the stop of first (and not last) exon, we reverse exon coordinates
+            if GFF[gene]['stop'] == GFF[gene]['exon'][1]['stop'] and GFF[gene]['exon_number'] > 1 :
+                tmp = copy(GFF[gene]['exon'])
+                exon_number = GFF[gene]['exon_number']
+                rev_index = exon_number+1
+                for z in range(1,exon_number+1):
+                    rev_index -= 1
+                    GFF[gene]['exon'][z] = tmp[rev_index]
 
+                ## check start
+                if GFF[gene]['start'] != GFF[gene]['exon'][1]['start'] and GFF[gene]['start']:
+                    GFF[gene]['exon'][1]['start'] = GFF[gene]['start']
 
+    return GFF
+    
 
 def cleaning_bam(bam):
     '''