changeset 4:eea5fec46e5c

Uploaded
author rlegendre
date Mon, 20 Oct 2014 11:07:19 -0400
parents 8d8552899d20
children 0b8fa8b7f356
files metagene_frameshift_analysis.py
diffstat 1 files changed, 884 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metagene_frameshift_analysis.py	Mon Oct 20 11:07:19 2014 -0400
@@ -0,0 +1,884 @@
+#!/usr/bin/env python2.7
+#-*- coding: utf-8 -*-
+'''
+    Created on sep. 2013
+    @author: rachel legendre
+    @copyright: rachel.legendre@igmors.u-psud.fr
+    @license: GPL v3
+'''
+
+
+import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time, csv
+import math
+from numpy import arange
+#from matplotlib import pyplot as pl
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as pl
+import itertools
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+from PIL import Image
+##libraries for debugg
+#import cPickle
+#import pdb
+
+
+##Setting
+MIN_COV_REQUIREMENT = 500
+total_mapped_read = 0 
+_MAX = 30
+whole_phasing_table = [[],[],[]]
+
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
+    sys.exit()
+
+
+def split_bam(bamfile,tmpdir):
+    '''
+        split bam by chromosome and write sam file in tmpdir
+    '''
+    try:
+        #get header
+        results = subprocess.check_output(['samtools', 'view', '-H',bamfile])
+        header = results.split('\n')
+       
+        #define genome size
+        genome = []
+        for line in header:
+            result = re.search('SN', line)
+            if result :
+                #print line    
+                feat = line.split('\t')
+                chrom = re.split(":", feat[1])
+                #print feat[1]
+                genome.append(chrom[1])
+       
+        #split sam by chrom
+        n = 0
+        for chrm in genome:
+            with open(tmpdir+'/'+chrm+'.sam', 'w') as f : 
+                #write header correctly for each chromosome
+                f.write(header[0]+'\n')
+                expr = re.compile(chrm+'\t')
+                el =[elem for elem in header if expr.search(elem)][0]
+                f.write(el+'\n')            
+                f.write(header[-2]+'\n')
+                #write all reads for each chromosome
+                reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
+                f.write(reads)
+                # calculate number of reads 
+                n += reads.count(chrm)
+
+        sys.stdout.write("%d reads are presents in your bam file\n" % n)
+    
+    except Exception, e:
+        stop_err( 'Error during bam file splitting : ' + str( e ) )
+    
+    
+    
+def get_first_base(tmpdir, kmer):
+    '''
+        write footprint coverage file for each sam file in tmpdir
+    '''
+    global total_mapped_read
+    try:
+        file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
+        ##write coverage for each sam file in tmpdir
+        for samfile in file_array:
+            with open(tmpdir+'/'+samfile, 'r') as sam : 
+                ##get chromosome name
+                chrom = samfile.split(".sam")[0]
+        
+                for line in sam:
+                    #initialize dictionnary
+                    if '@SQ' in line :
+                        size = int(line.split('LN:')[1])
+                        genomeF = [0]*size
+                        genomeR = [0]*size
+                    # define multiple reads keys from mapper
+                    elif '@PG' in line :
+                        if 'bowtie' in line:
+                            multi_tag = "XS:i:"
+                        elif 'bwa' in  line:
+                            multi_tag = "XT:A:R"
+                        else :
+                            stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")  
+                        
+                    # get footprint
+                    elif re.search('^[^@].+', line) :
+                        #print line.rstrip()
+                        read_pos = int(line.split('\t')[3])
+                        read_sens = int(line.split('\t')[1])
+                        len_read = len(line.split('\t')[9])
+                        read_qual = int(line.split('\t')[4])
+                        if len_read == kmer and multi_tag not in line and read_qual > 20 :
+                        ###if line.split('\t')[5] == '28M' :
+                           # print line.rstrip()
+                            total_mapped_read +=1
+                            #if it's a forward read
+                            if read_sens == 0 :
+                                #get P-site : start read + 12 nt
+                                read_pos += 15
+                                genomeF[read_pos] += 1
+                            #if it's a reverse read
+                            elif read_sens == 16 :
+                                #get P-site
+                                read_pos += 12
+                                genomeR[read_pos] += 1
+
+            #try:
+            #write coverage in files
+            with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov : 
+                for i in range(0,size):
+                    cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
+            #except Exception,e:
+            #    stop_err( 'Error during coverage file writting : ' + str( e ) )            
+            del genomeF
+            del genomeR      
+        sys.stdout.write("%d reads are used for frame analysis\n" % total_mapped_read)              
+    except Exception, e:
+        stop_err( 'Error during footprinting : ' + str( e ) )
+
+
+
+
+    
+def store_gff(gff):
+    '''
+        parse and store gff file in a dictionnary
+    '''
+    try:
+        GFF = {}
+        with open(gff, 'r') as f_gff : 
+            GFF['order'] = []
+            for line in f_gff:
+                ## switch commented lines
+                line = line.split("#")[0]
+                if line != "" :
+                    feature = (line.split('\t')[8]).split(';')
+                # first line is already gene line :
+                    if line.split('\t')[2] == 'gene' :
+                        gene = feature[0].replace("ID=","")
+                        if re.search('gene',feature[2]) :
+                            Name = feature[2].replace("gene=","")
+                        else :
+                            Name = "Unknown"
+                        ##get annotation
+                        note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
+                        note = urllib.unquote(str(note)).replace("\n","")
+                        ## store gene information
+                        GFF['order'].append(gene)
+                        GFF[gene] = {}
+                        GFF[gene]['chrom'] = line.split('\t')[0]
+                        GFF[gene]['start'] = int(line.split('\t')[3])
+                        GFF[gene]['stop'] = int(line.split('\t')[4])
+                        GFF[gene]['strand'] = line.split('\t')[6]
+                        GFF[gene]['name'] = Name
+                        GFF[gene]['note'] = note
+                        GFF[gene]['exon'] = {}
+                        GFF[gene]['exon_number'] = 0
+                        #print Name
+                    elif line.split('\t')[2] == 'CDS' :
+                        gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0])
+                        if GFF.has_key(gene) :
+                            GFF[gene]['exon_number'] += 1
+                            exon_number = GFF[gene]['exon_number'] 
+                            GFF[gene]['exon'][exon_number] = {}
+                            GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
+                            GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
+                            GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
+                        
+                    ## if there is a five prim UTR intron, we change start of gene
+                    elif   line.split('\t')[2] == 'five_prime_UTR_intron' :
+                        if GFF[gene]['strand'] == "+" :
+                            GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
+                        else :
+                            GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
+        return GFF
+    except Exception,e:
+        stop_err( 'Error during gff storage : ' + str( e ) )
+  
+  
+#chrI    SGD     gene    87286   87752   .       +       .       ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689
+#7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra
+#ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr
+#om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
+#chrI    SGD     CDS     87286   87387   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
+#chrI    SGD     CDS     87501   87752   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
+#chrI    SGD     intron  87388   87500   .       +       .       Parent=YAL030W_mRNA;Name=YAL030W_intron;orf_classification=Verified
+
+
+def __percent__(prop):
+        
+    if sum(prop) != 0 :
+        perc = [0,0,0]
+        if prop[0] != 0 :
+            perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
+        if prop[1] != 0 :
+            perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
+        if prop[2] != 0 :
+            perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
+        return perc
+    else :
+        return prop
+    
+    
+def frame_analysis(tmpdir,subfolder,cutoff,GFF,box_file):
+    '''
+    This function take footprint and annotation (gff) for analyse reading frame in each gene
+    '''
+    global total_mapped_read
+    global whole_phasing_table 
+    try:
+        chrom = "" # initializing chromosome
+        nb_gene = 0 # number of analysed genes
+        myheader = ['Gene','Name','Total_proportion','Dual_coding','RPKM','Segment_RPKM','Note']
+
+        with open(subfolder+'/dual_coding.tab',"w") as out :
+            #out.write('Gene\tName\tTotal_proportion\tDual_coding\tRPKM\tSegment_RPKM\tNote\n')
+            writer = csv.writer(out,delimiter='\t')
+            writer.writerow(myheader)
+            whole_phasing = [0,0,0]
+            for gene in GFF['order']:
+                cov = []
+                ##first chromosome : we open corresponding file
+                if chrom == "" :
+                    chrom = GFF[gene]['chrom']
+                    with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+                        data = f.readlines()
+                ##if we change chrosomosome
+                elif chrom != GFF[gene]['chrom'] :
+                    chrom = GFF[gene]['chrom']
+                    with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+                        data = f.readlines()
+                     
+                ## if a gene without intron :
+                if GFF[gene]['exon_number'] == 1:
+                        
+                    ## get coverage for each gene   
+                    if GFF[gene]['strand'] == "+":
+                        for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
+                            cov.append(int((data[i].rstrip()).split("\t")[0]))
+                    else :
+                        for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
+                            cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+                        cov.reverse()
+                else :
+                    ## For each gene, get coverage and sum of exon size
+                    if GFF[gene]['strand'] == "+":
+                            
+                        for exon in range(1,GFF[gene]['exon_number']+1) :
+                            for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
+                                cov.append(int((data[i].rstrip()).split("\t")[0]))  
+                    else :
+                              
+                        for exon in range(1,GFF[gene]['exon_number']+1) :
+                            for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
+                                cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+                        cov.reverse()        
+                                
+                len_cov = len(cov)
+                #segmentation by gene size
+                if len_cov < 1000 :
+                    nb_segment = 3
+                elif len_cov > 1000 and len_cov < 2000:
+                    nb_segment = 6
+                else :
+                    nb_segment = 9
+                ## number of aa in gene :
+                nb_aa = len_cov/3
+                len_frag = len_cov/nb_segment
+                prop = [0,0,0]
+                seg_prop = []
+                tot_prop = prop
+                tot_rpkm = 0
+                seg_rpkm = []
+                window = 0
+                GFF[gene]['dual_coding'] = '-'
+                ''' segmentation et total proportions '''
+                if sum(cov) > MIN_COV_REQUIREMENT:
+                    nb_gene += 1
+                    for nuc in range(0,len_cov-2,3) :
+                        window += 1
+                        tot_rpkm += sum(cov[nuc:nuc+2])
+                        ## Calculate proportion
+                        prop[0] += cov[nuc]
+                        prop[1] += cov[nuc+1]
+                        prop[2] += cov[nuc+2]
+                        #segmented proportion
+                        if window == nb_aa/nb_segment:
+                            seg_prop.append(__percent__(prop))
+                            whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))
+                            tot_prop = (map(lambda (x, y): x + y, zip(tot_prop, prop)))
+                            seg_rpkm.append("{0:.2f}".format((tot_rpkm*1000000.0)/(total_mapped_read*len_frag)))
+                            window = 0
+                            tot_rpkm = 0
+                            prop = [0,0,0]
+                                
+                    GFF[gene]['total_prop'] = __percent__(tot_prop)
+                    GFF[gene]['seg_prop'] = seg_prop
+                    GFF[gene]['seg_rpkm'] = seg_rpkm
+                    tot_rpkm = "{0:.2f}".format(sum(cov)*1000000.0/(total_mapped_read*len_cov))
+                    GFF[gene]['total_rpkm'] = tot_rpkm
+                    ## If gene has non standard proportion
+                    idx = 0
+                    for seg in GFF[gene]['seg_prop'] :
+                        if (seg[0] < cutoff) and (sum(seg) != 0 and GFF[gene]['seg_rpkm'][idx] != 0.0) :
+                            GFF[gene]['dual_coding'] = 'yes'
+                        idx += 1
+                    for z in range(0,3) :
+                        whole_phasing_table[z].append(__percent__(tot_prop)[z])         
+                else :
+                    for nuc in range(0,len_cov-2,3) :
+                        ## Calculate proportion
+                        prop[0] += cov[nuc]
+                        prop[1] += cov[nuc+1]
+                        prop[2] += cov[nuc+2]
+                    whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))
+                    #for z in range(0,3) :
+                    #    whole_phasing_table[z].append(__percent__(tot_prop)[z])
+                            
+                if GFF[gene]['dual_coding'] == 'yes' :
+                    mylist = [gene,GFF[gene]['name'],GFF[gene]['total_prop'],GFF[gene]['seg_prop'],GFF[gene]['total_rpkm'],GFF[gene]['seg_rpkm'],GFF[gene]['note']]
+                    #out.write(gene+'\t'+GFF[gene]['name']+'\t'+str(GFF[gene]['total_prop'])+'\t'+str(GFF[gene]['seg_prop'])+'\t'+str(GFF[gene]['total_rpkm'])+'\t'+GFF[gene]['note']+'\n')
+                    writer.writerow(mylist)
+                    GFF[gene]['coverage'] = cov
+                else :
+                    del GFF[gene]
+                        
+        del GFF['order']
+            
+           
+        whole_phasing = __percent__(whole_phasing)
+        sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing)
+        sys.stdout.write("%d genes are used for frame analysis\n" % nb_gene)
+            
+        ##produce boxplot
+        pl.boxplot(whole_phasing_table,notch=True,sym='ro')
+        pl.ylabel('Total number of footprints (percent)')
+        pl.draw()
+        pl.savefig(subfolder+'/boxplot.png',format='png', dpi=640)
+        pl.savefig(box_file, format='png', dpi=640)
+        pl.clf()
+        return GFF
+
+    except Exception, e:
+        stop_err( 'Error during frame analysis : ' + str( e ) )
+
+def get_orf(fasta,gene):
+    '''
+    This function compute ORF in each gene and return multiple lists of methionines and stop in each frame
+    '''
+    try :
+        SeqDict = SeqIO.to_dict(SeqIO.parse(open(fasta,"r"),"fasta"))
+        if gene['exon_number'] == 1 :
+            if gene['strand'] == '-' :
+                gene_seq = str(SeqDict[gene['chrom']].seq[gene['start']-2:gene['stop']+1].reverse_complement())
+            else :
+                gene_seq = str(SeqDict[gene['chrom']].seq[gene['start']-2:gene['stop']+1])
+        else :
+            ## get sequence of gene with intron with -1 and +1 nt for detect all phases
+            gene_seq = str(SeqDict[gene['chrom']].seq[gene['start']-2:gene['start']-1])
+            for exon in range(1,gene['exon_number']+1) :
+                gene_seq = gene_seq+str(SeqDict[gene['chrom']].seq[gene['exon'][exon]['start']-1:gene['exon'][exon]['stop']])
+            gene_seq = gene_seq+str(SeqDict[gene['chrom']].seq[gene['stop']:gene['stop']+1])
+            if gene['strand'] == '-' :
+                my_seq = Seq(gene_seq,IUPAC.unambiguous_dna).reverse_complement()
+                #gene_seq = my_seq.tostring() DEPRECATED
+                gene_seq = str(my_seq)
+        # initialize array of start/stop array_1 is fame 0, array 2 is frame +1, array_3 is frame -1
+        stop_1 = []
+        stop_2 = []
+        stop_3 = []
+        met_1 = []
+        met_2 = []
+        met_3 = []
+        stops = ['TAA','TGA','TAG']
+        for z in range(0,len(gene_seq),3):
+            # Frame -1
+            if gene_seq[z:z+3] == 'ATG' :
+                met_3.append(z)
+            elif gene_seq[z:z+3] in stops :
+                stop_3.append(z)
+            # Frame 0
+            if gene_seq[z+1:z+4] == 'ATG' :
+                met_1.append(z+1)
+            elif gene_seq[z+1:z+4] in stops :
+                stop_1.append(z+1)
+            # Frame +1
+            if gene_seq[z+2:z+5] == 'ATG' :
+                met_2.append(z+2)
+            elif gene_seq[z+2:z+5] in stops :
+                stop_2.append(z+2)
+   
+                
+        return gene_seq,met_1,stop_1,met_2,stop_2,met_3,stop_3
+        
+    except Exception, e:
+        stop_err( 'Error during ORF computing : ' + str( e ) )
+    
+def plot_subprofile(GFF, directory,fasta) :
+    '''
+    This function plot subprofiles of footprint and ORFs for each gene in GFF dict
+    '''
+    global _MAX
+    try :
+        
+        for gene in GFF.iterkeys():
+            nb_aa = len(GFF[gene]['coverage'])/3
+            subprofile_1 = []
+            subprofile_2 = []
+            subprofile_3 = []
+            cov = GFF[gene]['coverage']
+                
+            for z in range(0,len(cov)-2,3):
+                for i in range(3):
+                    subprofile_1.append(cov[z])
+                    subprofile_2.append(cov[z+1])
+                    subprofile_3.append(cov[z+2])
+                
+            ## Normalisation of density plot
+            tot = max(cov)
+                
+            subprofile_1 = [(x * _MAX/tot) for x in subprofile_1]
+            subprofile_2 = [(x * _MAX/tot) for x in subprofile_2]
+            subprofile_3 = [(x * _MAX/tot) for x in subprofile_3]
+                
+                
+            if len(subprofile_1) == len(subprofile_2) and len(subprofile_1) == len(subprofile_3) :
+                    
+                ind = arange(len(subprofile_1))
+                fig = pl.figure(num=1)
+                #sub profile frame 0
+                pl.subplot(4,1,1)
+                ax1 = fig.add_subplot(4,1,1)
+                pl.title( "%s (%s)\n$(%s RPKM)$"% (gene, GFF[gene]['name'], GFF[gene]['total_rpkm']))
+                pl.ylim(0,_MAX)
+                pl.xlim(0,len(subprofile_1))
+                pl.yticks([0, _MAX/2, _MAX])
+                ax1.plot(ind, subprofile_1, 'b-', lw = 3,label = 'Frame 0')
+                ax1.fill_between(ind, 0,subprofile_1,color='b')
+                ax1.legend(loc=1,prop={'size':8})
+                pl.ylabel('#RPF')
+                    
+                ##sub profile frame +1
+                pl.subplot(4,1,2)
+                ax2 = fig.add_subplot(4,1,2)
+                pl.ylim(0,_MAX)
+                pl.xlim(0,len(subprofile_2))
+                pl.yticks([0, _MAX/2, _MAX])
+                ax2.plot(ind,subprofile_2,'r-',lw = 3,label = 'Frame +1')
+                ax2.fill_between(ind, 0,subprofile_2,color='r')
+                ax2.legend(loc=1,prop={'size':8})
+                pl.ylabel('#RPF')
+                    
+                ##sub profile frame -1
+                pl.subplot(4,1,3)
+                ax3 = fig.add_subplot(4,1,3)
+                pl.ylim(0,_MAX)
+                pl.yticks([0, _MAX/2, _MAX])
+                pl.xlim(0,len(subprofile_3))
+                ax3.plot(ind,subprofile_3,'g-',lw =3,label = 'Frame -1')
+                ax3.fill_between(ind, 0,subprofile_3,color='g')
+                ax3.legend(loc=1,prop={'size':8})
+                pl.ylabel('#RPF')
+                   
+                ## get orf for each frame
+                gene_seq,met_1,stop_1,met_2,stop_2,met_3,stop_3 = get_orf(fasta,GFF[gene])  
+                ## ORF plot
+                idc = arange(len(gene_seq))
+                ##define axe for frame :
+                x = [0]*len(gene_seq)
+                y = [1]*len(gene_seq)
+                z = [2]*len(gene_seq)
+                ## define last subplot
+                #pdb.set_trace()
+                pl.subplot(4,1,4)
+                ## modify axes setting
+                ax4 = fig.add_subplot(4,1,4)
+                ax4.spines['right'].set_color('none')
+                ax4.spines['top'].set_color('none')
+                ax4.spines['left'].set_color('none')
+                ax4.spines['bottom'].set_color('none')
+                ax4.tick_params(bottom = 'off',top='off',left='off', right='off')
+                pl.ylim(0,3)
+                pl.xlim(0,len(gene_seq))
+                pl.yticks([0.5,1.5,2.5],[r'-1',r'+1',r'0'])
+                pl.xticks([])
+                ## plot line for delimiting frame and arrow corresponding to start and stop
+                pl.plot(idc,x,'k',lw=2)
+                for val in met_1:
+                    pl.arrow( val, 2, 0.0, 0.5, fc="g", ec="g",lw=2)
+                for val in stop_1:
+                    pl.arrow( val, 2, 0.0, 1, fc="r", ec="r",lw=2)
+                pl.plot(idc,y,'k')
+                for val in met_2:
+                    pl.arrow( val, 1, 0.0, 0.5, fc="g", ec="g",lw=2)
+                for val in stop_2:
+                    pl.arrow( val, 1, 0.0, 1, fc="r", ec="r",lw=2)
+                pl.plot(idc,z,'k')
+                for val in met_3:
+                    pl.arrow( val, 0, 0.0, 0.5, fc="g", ec="g",lw=2)
+                for val in stop_3:
+                    pl.arrow( val, 0, 0.0, 1, fc="r", ec="r",lw=2)
+                    
+
+                pl.ylabel('ORF',fontsize='small')
+                pl.draw()
+                pl.savefig(directory+'/'+gene+'.png',format='png', dpi=600)
+                pl.clf()
+                ##pl.show()
+                    
+                ## Make thumbnail for html page
+                infile = directory+'/'+gene+'.png'
+                size = 128, 128
+                im = Image.open(infile)
+                im.thumbnail(size, Image.ANTIALIAS)
+                im.save(directory+'/'+gene + "_thumbnail.png", "PNG")
+                    
+                    
+            else :
+                stop_err( 'Error during plot : lenght of subprofile are different ')
+                
+    except Exception, e:
+        stop_err( 'Error during subprofiles plotting : ' + str( e ) )
+
+def write_html_page(html,subfolder,GFF) :
+        
+        
+    try :
+        gene_table = ''
+        gene_table += '<table>'
+        gene_table += '<thead><tr><th data-sort="string">Gene</th><th>Plot</th><th data-sort="string">Name</th><th>Total Proportion</th><th>Segment proportion</th><th data-sort="float">RPKM</th><th>Segment RPKM</th></tr></thead><tbody>'
+        for gene in GFF.iterkeys():
+            gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(gene,gene,gene,gene,GFF[gene]['note'],GFF[gene]['name'],GFF[gene]['total_prop'],GFF[gene]['seg_prop'],GFF[gene]['total_rpkm'],GFF[gene]['seg_rpkm'])
+        gene_table += '</tbody></table>'  
+            
+ 
+        html_str = """  
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+    
+<html xmlns="http://www.w3.org/1999/xhtml">
+    <head>
+        <link rel="stylesheet" href="http://code.jquery.com/ui/1.10.3/themes/smoothness/jquery-ui.css" />
+        <script src="http://code.jquery.com/jquery-1.10.2.min.js"></script>
+        <script src="http://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script>
+        <script src="http://code.jquery.com/ui/1.10.3/jquery-ui.js"></script>
+        <script src="lightbox/js/jquery-1.10.2.min.js"></script>
+        <script src="lightbox/js/lightbox-2.6.min.js"></script>
+        <link href="lightbox/css/lightbox.css" rel="stylesheet" />
+        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+        <title>Dual coding result file</title>
+        <link rel="stylesheet" href="http://code.jquery.com/ui/1.10.3/themes/smoothness/jquery-ui.css" />
+        <script>
+          (function(d){d.fn.stupidtable=function(b){return this.each(function(){var a=d(this);b=b||{};b=d.extend({},d.fn.stupidtable.default_sort_fns,b);var n=function(a,b){for(var f=[],c=0,e=a.slice(0).sort(b),h=0;h<a.length;h++){for(c=d.inArray(a[h],e);-1!=d.inArray(c,f);)c++;f.push(c)}return f},q=function(a,b){for(var d=a.slice(0),c=0,e=0;e<b.length;e++)c=b[e],d[c]=a[e];return d};a.on("click","th",function(){var m=a.children("tbody").children("tr"),g=d(this),f=0,c=d.fn.stupidtable.dir;a.find("th").slice(0, g.index()).each(function(){var a=d(this).attr("colspan")||1;f+=parseInt(a,10)});var e=g.data("sort-default")||c.ASC;g.data("sort-dir")&&(e=g.data("sort-dir")===c.ASC?c.DESC:c.ASC);var h=g.data("sort")||null;null!==h&&(a.trigger("beforetablesort",{column:f,direction:e}),a.css("display"),setTimeout(function(){var l=[],p=b[h];m.each(function(a,b){var c=d(b).children().eq(f),e=c.data("sort-value"),c="undefined"!==typeof e?e:c.text();l.push(c)});var k;k=e==c.ASC?n(l,p):n(l,function(a,b){return-p(a,b)}); a.find("th").data("sort-dir",null).removeClass("sorting-desc sorting-asc");g.data("sort-dir",e).addClass("sorting-"+e);k=d(q(m,k));a.children("tbody").remove();a.append("<tbody />").append(k);a.trigger("aftertablesort",{column:f,direction:e});a.css("display")},10))})})};d.fn.stupidtable.dir={ASC:"asc",DESC:"desc"};d.fn.stupidtable.default_sort_fns={"int":function(b,a){return parseInt(b,10)-parseInt(a,10)},"float":function(b,a){return parseFloat(b)-parseFloat(a)},string:function(b,a){return b<a?-1: b>a?1:0},"string-ins":function(b,a){b=b.toLowerCase();a=a.toLowerCase();return b<a?-1:b>a?1:0}}})(jQuery);
+          (function($) {
+
+            $.fn.stupidtable = function(sortFns) {
+              return this.each(function() {
+                var $table = $(this);
+                sortFns = sortFns || {};
+          
+                // ==================================================== //
+                //                  Utility functions                   //
+                // ==================================================== //
+          
+                // Merge sort functions with some default sort functions.
+                sortFns = $.extend({}, $.fn.stupidtable.default_sort_fns, sortFns);
+          
+                // Return the resulting indexes of a sort so we can apply
+                // this result elsewhere. This returns an array of index numbers.
+                // return[0] = x means "arr's 0th element is now at x"
+                var sort_map = function(arr, sort_function) {
+                  var map = [];
+                  var index = 0;
+                  var sorted = arr.slice(0).sort(sort_function);
+                  for (var i=0; i<arr.length; i++) {
+                      index = $.inArray(arr[i], sorted);
+          
+                      // If this index is already in the map, look for the next index.
+                      // This handles the case of duplicate entries.
+                      while ($.inArray(index, map) != -1) {
+                          index++;
+                      }
+                      map.push(index);
+                  }
+          
+                  return map;
+                };
+          
+                // Apply a sort map to the array.
+                var apply_sort_map = function(arr, map) {
+                  var clone = arr.slice(0),
+                      newIndex = 0;
+                  for (var i=0; i<map.length; i++) {
+                    newIndex = map[i];
+                    clone[newIndex] = arr[i];
+                  }
+                  return clone;
+                };
+          
+                // ==================================================== //
+                //                  Begin execution!                    //
+                // ==================================================== //
+          
+                // Do sorting when THs are clicked
+                $table.on("click", "th", function() {
+                  var trs = $table.children("tbody").children("tr");
+                  var $this = $(this);
+                  var th_index = 0;
+                  var dir = $.fn.stupidtable.dir;
+          
+                  $table.find("th").slice(0, $this.index()).each(function() {
+                    var cols = $(this).attr("colspan") || 1;
+                    th_index += parseInt(cols,10);
+                  });
+          
+                  // Determine (and/or reverse) sorting direction, default `asc`
+                  var sort_dir = $this.data("sort-default") || dir.ASC;
+                  if ($this.data("sort-dir"))
+                     sort_dir = $this.data("sort-dir") === dir.ASC ? dir.DESC : dir.ASC;
+          
+                  // Choose appropriate sorting function. 
+                  var type = $this.data("sort") || null;
+          
+                  // Prevent sorting if no type defined
+                  if (type === null) {
+                    return;
+                  }
+          
+                  // Trigger `beforetablesort` event that calling scripts can hook into;
+                  // pass parameters for sorted column index and sorting direction
+                  $table.trigger("beforetablesort", {column: th_index, direction: sort_dir});
+                  // More reliable method of forcing a redraw
+                  $table.css("display");
+          
+                  // Run sorting asynchronously on a timout to force browser redraw after
+                  // `beforetablesort` callback. Also avoids locking up the browser too much.
+                  setTimeout(function() {
+                    // Gather the elements for this column
+                    var column = [];
+                    var sortMethod = sortFns[type];
+          
+                    // Push either the value of the `data-order-by` attribute if specified
+                    // or just the text() value in this column to column[] for comparison.
+                    trs.each(function(index,tr) {
+                      var $e = $(tr).children().eq(th_index);
+                      var sort_val = $e.data("sort-value");
+                      var order_by = typeof(sort_val) !== "undefined" ? sort_val : $e.text();
+                      column.push(order_by);
+                    });
+          
+                    // Create the sort map. This column having a sort-dir implies it was
+                    // the last column sorted. As long as no data-sort-desc is specified,
+                    // we're free to just reverse the column.
+                    var theMap;
+                    if (sort_dir == dir.ASC)
+                      theMap = sort_map(column, sortMethod);
+                    else
+                      theMap = sort_map(column, function(a, b) { return -sortMethod(a, b); });
+          
+                    // Reset siblings
+                    $table.find("th").data("sort-dir", null).removeClass("sorting-desc sorting-asc");
+                    $this.data("sort-dir", sort_dir).addClass("sorting-"+sort_dir);
+          
+                    var sortedTRs = $(apply_sort_map(trs, theMap));
+                    $table.children("tbody").remove();
+                    $table.append("<tbody />").append(sortedTRs);
+          
+                    // Trigger `aftertablesort` event. Similar to `beforetablesort`
+                    $table.trigger("aftertablesort", {column: th_index, direction: sort_dir});
+                    // More reliable method of forcing a redraw
+                    $table.css("display");
+                  }, 10);
+                });
+              });
+            };
+          
+            // Enum containing sorting directions
+            $.fn.stupidtable.dir = {ASC: "asc", DESC: "desc"};
+          
+            $.fn.stupidtable.default_sort_fns = {
+              "int": function(a, b) {
+                return parseInt(a, 10) - parseInt(b, 10);
+              },
+              "float": function(a, b) {
+                return parseFloat(a) - parseFloat(b);
+              },
+              "string": function(a, b) {
+                if (a < b) return -1;
+                if (a > b) return +1;
+                return 0;
+              },
+              "string-ins": function(a, b) {
+                a = a.toLowerCase();
+                b = b.toLowerCase();
+                if (a < b) return -1;
+                if (a > b) return +1;
+                return 0;
+              }
+            };
+          
+          })(jQuery);
+          $(function(){
+            var table = $("table").stupidtable();
+          
+            table.on("beforetablesort", function (event, data) {
+              // data.column - the index of the column sorted after a click
+              // data.direction - the sorting direction (either asc or desc)
+              $("#msg").text("Sorting index " + data.column)
+            });
+            table.on("aftertablesort", function (event, data) {
+              var th = $(this).find("th");
+              th.find(".arrow").remove();
+              var dir = $.fn.stupidtable.dir;
+              var arrow = data.direction === dir.ASC ? "&uarr;" : "&darr;";
+              th.eq(data.column).append('<span class="arrow">' + arrow +'</span>');
+            });
+          });
+        </script>
+        <style type="text/css">
+            label {
+              display: inline-block;
+              width: 5em;
+            }
+            table {
+              border-collapse: collapse;
+            }
+            th, td {
+              padding: 5px 10px;
+              border: 1px solid #999;
+            }
+            th {
+              background-color: #a7d3ff;
+            }
+            th[data-sort]{
+              cursor:pointer;
+            }
+            a[data-lightbox]{
+              cursor:zoom-in;
+            }
+            #msg {
+              color: green;
+            }
+        </style>
+    </head>
+    <body>
+        <h1>Dual coding analyse results</h1>
+            %s
+    </body>
+</html> """ % (gene_table)
+         
+        html_file = open(html,"w")
+        html_file.write(html_str)
+        html_file.close()    
+        html_index = open(subfolder+'/index.html','w')
+        html_index.write(html_str)
+        html_index.close() 
+        
+        
+    except Exception, e :
+        stop_err('Error during html page creation : ' + str( e ) )
+    
+    
+
+def __main__():
+    
+    #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_frameshift_analysis.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /data/BY_project/BY_BAM/LMA830_sort.bam -o /data/BY_project/dual_coding_lma.csv -d /data/BY_project/LMA_dual/index.html,/data/BY_project/LMA_dual -c 60
+    #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_frameshift_analysis.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /home/rlegendre/Documents/PDE2S/PDE2S_trim_no_rRNA_28mer.bam -o /home/rlegendre/Documents/PDE2S/dual_coding_pde2s.csv -d /home/rlegendre/Documents/PDE2S/index.html,/home/rlegendre/Documents/PDE2S/PDE2S_dual -c 60
+    #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_frameshift_analysis.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/RPF_psi+_28sorted.bam -o /home/rlegendre/Documents/test_rpkmlesszero.tab -d /home/rlegendre/Documents/index.html,/home/rlegendre/Documents/plouf -c 60
+
+    #Parse command line options
+    parser = optparse.OptionParser()
+    parser.add_option("-i", "--input", dest="gfffile", type= "string",
+                  help="GFF annotation file", metavar="FILE")
+                    
+    parser.add_option("-b", "--bam", dest="bamfile", type= "string",
+                  help="Bam Ribo-Seq alignments ", metavar="FILE")
+                    
+    parser.add_option("-f", "--fasta", dest="fasta", type= "string",
+                  help="Fasta file ", metavar="FILE")
+                    
+    parser.add_option("-d", "--dirout", dest="dirout", type= "string",
+                  help="write report in this html file and in associated directory", metavar="STR,STR")     
+    
+    parser.add_option("-x", "--boxplot", dest="box", type= "string",
+                  help="report boxplot in this file", metavar="STR,STR")                     
+    
+    parser.add_option("-c", "--cutoff", dest="cutoff", type= "int",
+                  help="Integer value for selecting all genes that have less than 60 % (default) of reads in coding frame ", metavar="INT")
+    
+    parser.add_option("-k", "--kmer", dest="kmer", type= "int",
+                  help="Longer of your phasing reads", metavar="INT")               
+    
+    parser.add_option("-q", "--quiet",
+                  action="store_false", dest="verbose", default=True,
+                  help="don't print status messages to stdout")
+                    
+    (options, args) = parser.parse_args()
+    sys.stdout.write("Begin frame analysis at %s\n" % time.asctime( time.localtime(time.time())))
+    # Create temp dir
+    try:
+        tmp_dir = tempfile.mkdtemp()
+        (html_file, subfolder ) = options.dirout.split(",")
+        
+        ##testing indexed bam file
+        if os.path.isfile(options.bamfile+".bai") :
+            pass
+        else :
+            cmd = "samtools index %s " % (options.bamfile)
+            proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
+            returncode = proc.wait()
+            
+        ##RUN analysis
+        split_bam(options.bamfile,tmp_dir)
+        get_first_base (tmp_dir,options.kmer)
+        GFF = store_gff(options.gfffile)
+        #
+        #### cPickles for Test ####
+        #if os.path.isfile("/home/rlegendre/Documents/FrameShift/OutOfFrameWithIntron/pyt_dict") :
+        #    with open("/home/rlegendre/Documents/FrameShift/OutOfFrameWithIntron/pyt_dict",'rb') as fp:
+        #        GFF = cPickle.load(fp)
+        #else :
+        #    split_bam(options.bamfile,tmp_dir)
+        #    get_first_base (tmp_dir)
+        #    GFF = store_gff(options.gfffile)
+        #    GFF = frame_analysis (tmp_dir,options.outfile,options.cutoff,GFF,subfolder)
+        #    with open("/home/rlegendre/OutOfFrameWithIntron/pyt_dict",'wb') as fp:
+        #        cPickle.dump(GFF,fp)
+        #    
+        #### cPickles for Test ####
+       
+        if os.path.exists(subfolder):
+            raise
+        try:
+            os.mkdir(subfolder)
+        except:
+            raise 
+        GFF = frame_analysis (tmp_dir,subfolder,options.cutoff,GFF, options.box)
+        plot_subprofile (GFF,subfolder,options.fasta)
+        write_html_page(html_file,subfolder,GFF)
+        
+        ##paste jquery script in result directory :
+        jq_src = os.path.join(os.path.dirname(__file__),'lightbox')
+        shutil.copytree(jq_src,os.path.join(subfolder,'lightbox'))
+        
+        
+    
+        sys.stdout.write("Finish frame analysis at %s\n" % time.asctime( time.localtime(time.time())))
+    except Exception, e:
+        stop_err( 'Error running metagene analysis : ' + str( e ) )
+            
+    if os.path.exists( tmp_dir ):
+        shutil.rmtree( tmp_dir )
+
+
+
+if __name__=="__main__":
+    __main__()
\ No newline at end of file