diff kmer_analysis.py @ 2:da126b91f9ea

Uploaded
author rlegendre
date Mon, 20 Oct 2014 11:06:53 -0400
parents
children 707807fee542
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kmer_analysis.py	Mon Oct 20 11:06:53 2014 -0400
@@ -0,0 +1,379 @@
+#!/usr/bin/env python2.7
+#-*- coding: utf-8 -*-
+'''
+    Created on Jun. 2014
+    @author: rachel legendre
+    @copyright: rachel.legendre@igmors.u-psud.fr
+    @license: GPL v3
+'''
+
+import os, sys, time, optparse, re, urllib, subprocess, tempfile, commands
+import pysam
+#from matplotlib import pyplot as pl
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as pl
+from numpy import arange
+import ribo_functions
+from collections import OrderedDict
+#import cPickle
+
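+# global counter of reads used for coverage, incremented in get_first_base()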
+total_mapped_read = 0 
+
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) )
+    sys.exit()
+
+
+        
+def get_first_base(tmpdir, kmer): 
+    '''
+        write a footprint coverage file for each SAM file in tmpdir and return the kmer (read length) distribution
+    '''
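+    # One file assoCov_<chrom>.txt is written per chromosome, one line per genomic
+    # position: forward-strand count, a tab, then the reverse-strand count prefixed
+    # with a minus sign.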
+    global total_mapped_read
+    
+    ## init kmer dict
+    KMER = OrderedDict({})
+    
+    try:
+        file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
+        ##write coverage for each sam file in tmpdir
+        for samfile in file_array:
+            with open(tmpdir+'/'+samfile, 'r') as sam : 
+                ##get chromosome name
+                chrom = samfile.split(".sam")[0]
+        
+                for line in sam:
+                    # initialize forward and reverse coverage arrays from the @SQ header
+                    if '@SQ' in line :
+                        size = int(line.split('LN:')[1])
+                        genomeF = [0]*size
+                        genomeR = [0]*size
+                    # define the multi-mapping tag according to the mapper (@PG header)
+                    elif '@PG' in line :
+                        if 'bowtie' in line:
+                            multi_tag = "XS:i:"
+                        elif 'bwa' in  line:
+                            multi_tag = "XT:A:R"
+                        else :
+                            stop_err("Unknown mapper in @PG tag of "+samfile+". Please use bowtie or bwa for mapping")
+                        
+                    # get footprint
+                    elif re.search('^[^@].+', line) :
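+                        # SAM fields used below: FLAG (column 2), POS (column 4) and SEQ (column 10);
+                        # the read length is taken from the length of SEQ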
+                        len_read = len(line.split('\t')[9])
+                        ## fill kmer dict
+                        if KMER.has_key(len_read):
+                            KMER[len_read] += 1
+                        else :
+                            KMER[len_read] = 1
+                            
+                        #print line.rstrip()
+                        read_pos = int(line.split('\t')[3])
+                        read_sens = int(line.split('\t')[1])
+                        #len_read = len(line.split('\t')[9])
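+                        # only uniquely mapped reads of the requested length are counted in the coverage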
+                        if len_read == kmer and multi_tag not in line:
+                        ###if line.split('\t')[5] == '28M' :
+                            total_mapped_read +=1
+                            #if it's a forward read
+                            if read_sens == 0 :
+                                #get 5' base
+                                genomeF[read_pos] += 1
+                            #if it's a reverse read
+                            elif read_sens == 16 :
+                                #get 5' base
+                                read_pos += (len_read-1)
+                                genomeR[read_pos] += 1
+
+            #try:
+            #write coverage in files
+            with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov : 
+                for i in range(0,size):
+                    cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
+            #except Exception,e:
+            #    stop_err( 'Error during coverage file writing : ' + str( e ) )
+        
+        #sys.stdout.write("%d reads are in your bam file\n" % total_mapped_read)      
+        
+        return KMER       
+        
+    except Exception, e:
+        stop_err( 'Error during footprinting : ' + str( e ) )
+
+
+def __percent__(prop):
+        
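+    # convert the three per-frame counts into integer percentages of their sum
+    # (the counts are returned unchanged if their sum is zero)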
+    if sum(prop) != 0 :
+        perc = [0,0,0]
+        if prop[0] != 0 :
+            perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
+        if prop[1] != 0 :
+            perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
+        if prop[2] != 0 :
+            perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
+        return perc
+    else :
+        return prop
+    
+
+def frame_analysis(tmpdir,GFF):
+    '''
+    Use footprint coverage and the GFF annotation to analyse the reading frame of each gene
+    '''
+    global total_mapped_read
+    try:
+        chrom = "" # initializing chromosome
+        nb_gene = 0 # number of analysed genes
+        whole_phasing = [0,0,0]
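+        # whole_phasing accumulates, over all genes, the 5' read counts falling on
+        # each of the three codon positions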
+        for gene in GFF['order']:
+            ## the GFF may lack a start position, so check and fill it from the first exon
+            try : GFF[gene]['start'] 
+            except :
+                if GFF[gene]['strand'] == '+' :
+                    GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
+                else :
+                    GFF[gene]['start'] = GFF[gene]['exon'][1]['stop']
+            ## also for stop coordinates
+            try : GFF[gene]['stop'] 
+            except :
+                exon_number = GFF[gene]['exon_number']
+                if GFF[gene]['strand'] == '+' :
+                    GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
+
+                else :
+                    GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['start']
+            cov = []
+            ## first chromosome: open the corresponding coverage file
+            if chrom == "" :
+                chrom = GFF[gene]['chrom']
+                with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+                    data = f.readlines()
+            ## if we change chromosome, open the corresponding coverage file
+            elif chrom != GFF[gene]['chrom'] :
+                chrom = GFF[gene]['chrom']
+                with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+                    data = f.readlines()
+                 
+            ## if the gene has no intron :
+            if GFF[gene]['exon_number'] == 1:
+                    
+                ## get coverage for each gene   
+                if GFF[gene]['strand'] == "+":
+                    for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
+                        cov.append(int((data[i].rstrip()).split("\t")[0]))
+                else :
+                    for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
+                        cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+                    cov.reverse()
+            else :
+                ## For each gene, get coverage and sum of exon size
+                if GFF[gene]['strand'] == "+":
+                        
+                    for exon in range(1,GFF[gene]['exon_number']+1) :
+                        for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
+                            if i <= GFF[gene]['stop'] :
+                                cov.append(int((data[i].rstrip()).split("\t")[0]))  
+                else :
+                          
+                    for exon in range(1,GFF[gene]['exon_number']+1) :
+                        for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
+                            if i <= GFF[gene]['start'] :
+                                cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+                    cov.reverse()        
+                            
+            len_cov = len(cov)
+            prop = [0,0,0]
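+            # walk the CDS codon by codon and add the coverage of each codon position
+            # to its frame counter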
+            for nuc in range(0,len_cov-2,3) :
+                ## Calculate proportion
+                prop[0] += cov[nuc]
+                prop[1] += cov[nuc+1]
+                prop[2] += cov[nuc+2]
+            whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))
+
+        whole_phasing = __percent__(whole_phasing)
+        #sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing)
+        return whole_phasing
+        
+    except Exception, e:
+        stop_err( 'Error during frame analysis : ' + str( e ) )
+
+
+
+def make_report(html_file, dirout, kmer, results) :
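+    '''
+    Plot the read-length distribution, one phasing bar chart per analysed k-mer,
+    and write an HTML report embedding the PNG files.
+    '''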
+    
+    array = sorted(kmer.items(), key=lambda x: x[0])
+    values = []
+    label = []
+    for x,y in array :
+        values.append(y)
+        label.append(x)
+    index = arange(len(label))
+    bar_width = 0.35
+    axis_font = {'size':'10'}
+    fig, ax = pl.subplots()
+    pl.bar(index ,values, color='LightsteelBlue')
+    pl.xlabel('kmer value', **axis_font)
+    pl.ylabel('Number of reads', **axis_font)
+    pl.title('Number of reads for each k-mer')
+    pl.xticks(index + bar_width, label, **axis_font)
+    #pl.show()
+    pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640)
+    pl.clf()
+    
+    for key, phase in results.iteritems() :
+        fig = pl.figure(num=1)
+        frame = phase
+        index = arange(3)
+        bar_width = 0.5
+        pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue'])
+        pl.xlabel('Frame in gene', **axis_font)
+        pl.ylabel('Percent of reads', **axis_font)
+        pl.title('Proportion of reads in each frame for '+str(key)+'-mer')
+        pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font)
+        pl.tight_layout()
+        pl.ylim(0,100)
+        pl.draw()
+        #pl.show()
+        pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300)
+        pl.clf()
+    
+    kmer_summary = ''
+    kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0]))
+    for key, number in kmer_sorted.iteritems() :
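+        # only k-mers with more than 100 reads get a phasing plot in the report;
+        # the others are listed with their read count only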
+        if number > 100 :
+            kmer_summary += '<li>'
+            kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
+            kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer reads<br>'
+            kmer_summary += 'Phasing of '+str(key)+'-mer is : '+str(results[key])+'<br>'
+            kmer_summary += '<img border="0" src="'+str(key)+'_phasing.png" width="50%" height="50%"/><br>'
+            kmer_summary += '</li>'
+        else :
+            kmer_summary += '<li>'
+            kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
+            kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer reads<br>'
+            kmer_summary += '</li>'        
+    
+    html_str = """ 
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+    
+<html xmlns="http://www.w3.org/1999/xhtml">
+    <head>
+        <link href="lightbox/css/lightbox.css" rel="stylesheet" />
+        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+        <title>kmer analysis</title>
+    </head>
+    <body>
+        <h1>k-mer analysis results</h1>
+        <p>
+            <ul>
+                <li>
+                    Number of reads for each k-mer in your sample : <br />
+                    <img border="0" src="kmer_proportion.png" width="50%%" height="50%%"/>
+                </li>
+                %s
+            </ul>
+        </p>
+    </body>
+</html> 
+"""
+    all = html_str % kmer_summary
+        
+    html = open(html_file,"w")
+    html.write(all)
+    html.close()  
+       
+def __main__():
+
+    #Parse command line options
+    parser = optparse.OptionParser()
+    parser.add_option("-g", "--gff", dest="gfffile", type= "string",
+                  help="GFF annotation file", metavar="FILE")
+                  
+    parser.add_option("-b", "--bam", dest="bamfile", type= "string",
+                  help="BAM file of Ribo-Seq alignments", metavar="FILE")
+                    
+    parser.add_option("-o", "--out", dest="html_file", type= "string",
+                  help="write report to FILE", metavar="FILE")
+                    
+    parser.add_option("-d", "--dirout", dest="dirout", type= "string",
+                  help="directory in which PNG files are written", metavar="FILE")
+                    
+    parser.add_option("-q", "--quiet",
+                  action="store_false", dest="verbose", default=True,
+                  help="don't print status messages to stdout")
+                    
+    (options, args) = parser.parse_args()
+    sys.stdout.write("Begin kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )
+
+    try:
+        
+        if os.path.exists(options.dirout):
+            stop_err( 'Output directory ' + options.dirout + ' already exists' )
+        os.mkdir(options.dirout)
+        
+        ## index the bam file with samtools if no index (.bai) is present
+        if os.path.isfile(options.bamfile+".bai") :
+            pass
+        else :
+            cmd = "samtools index %s " % (options.bamfile)
+            proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
+            returncode = proc.wait()
+        
+        tmpdir = tempfile.mkdtemp()
+        GFF = ribo_functions.store_gff(options.gfffile)
+        ## split the bam into one SAM file per chromosome in tmpdir
+        ribo_functions.split_bam(options.bamfile,tmpdir)
+        ###################################
+        ## First analysis with 28mer :
+        ###################################
+        ## compute coverage and kmer distribution
+        kmer = get_first_base(tmpdir, 28)
+        ## compute phasing
+        whole_phasing = frame_analysis(tmpdir,GFF)
+        ## save phasing
+        results = {}
+        results[28] = whole_phasing 
+        ## compute analysis with other kmer
+        for keys in kmer.iterkeys() :
+            if keys != 28 :
+                ## remove all coverage files from the tmp directory
+                if os.system("rm "+tmpdir+"/*.txt") != 0 :
+                    stop_err( 'Error during tmp directory cleaning' )
+                    
+                ## only analyse kmers with enough reads :
+                if kmer[keys] > 100 :
+                    ## compute coverage and kmer distribution
+                    tmp = get_first_base(tmpdir, keys)
+                    ## compute phasing
+                    whole_phasing = frame_analysis(tmpdir,GFF)
+                    results[keys] = whole_phasing
+        ## get report
+        make_report(options.html_file, options.dirout, kmer, results)
+        #=======================================================================
+        # ############
+        # #  tests
+        # ############
+        #  with open("kmer_dict",'rb') as km:
+        #          kmer = cPickle.load(km)
+        #  with open("results_dict",'rb') as res:
+        #          results = cPickle.load(res)
+        #  with open("kmer_dict",'wb') as km:
+        #          cPickle.dump(kmer,km)  
+        #  with open("results_dict",'wb') as res:
+        #          cPickle.dump(results,res)  
+        # make_report(options.html_file, options.dirout, kmer, results)
+        #=======================================================================
+        
+        sys.stdout.write("Finish kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )
+    
+    except Exception, e:
+        stop_err( 'Error running kmer and phasing analysis : ' + str( e ) )
+    
+
+if __name__=="__main__":
+    __main__()
\ No newline at end of file