view kmer_analysis.py @ 19:385fc64fa988 draft default tip

Uploaded
author rlegendre
date Fri, 12 Jun 2015 11:32:59 -0400
parents a121cce43f90
children
line wrap: on
line source

#!/usr/bin/env python2.7.3
#-*- coding: utf-8 -*-
'''
    Created on Jun. 2014
    @author: rachel legendre
    @copyright: rachel.legendre@igmors.u-psud.fr
    @license: GPL v3
'''

import os, sys, time, optparse, re, urllib, subprocess, tempfile, commands
import pysam
#from matplotlib import pyplot as pl
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as pl
from numpy import arange
from collections import OrderedDict
import ribo_functions
#import cPickle
## suppress matplotlib warnings
import warnings
warnings.filterwarnings('ignore')


total_mapped_read = 0 


def stop_err( msg ):
    """Report *msg* on stderr together with an abort timestamp, then exit."""
    now = time.asctime(time.localtime(time.time()))
    for text in ("%s\n" % msg, "Programme aborted at %s\n" % now):
        sys.stderr.write(text)
    sys.exit()
    
    
def split_bam(bamfile,tmpdir):
    '''
        Split a BAM file by chromosome and write one SAM file per
        chromosome into *tmpdir*.

        bamfile -- path to a (indexed) BAM file; `samtools` must be on PATH
        tmpdir  -- existing directory receiving the <chrom>.sam files

        Side effect: prints the total number of reads to stdout.
    '''
    try:
        # get the SAM header of the whole alignment file
        results = subprocess.check_output(['samtools', 'view', '-H', bamfile])
        header = results.split('\n')

        # collect every chromosome name from the @SQ lines (SN:<name>)
        genome = []
        for line in header:
            if re.search('SN', line):
                feat = line.split('\t')
                chrom = re.split(":", feat[1])
                genome.append(chrom[1])

        # write one SAM file per chromosome
        n = 0
        for chrm in genome:
            with open(tmpdir + '/' + chrm + '.sam', 'w') as f:
                # minimal per-chromosome header: first line, matching @SQ, @PG
                f.write(header[0] + '\n')
                # re.escape so chromosome names containing regex
                # metacharacters (e.g. '.') cannot match the wrong @SQ line
                expr = re.compile(re.escape(chrm) + '\t')
                el = [elem for elem in header if expr.search(elem)][0]
                f.write(el + '\n')
                # NOTE(review): assumes the @PG record is the second-to-last
                # split element (the last one is the empty string after the
                # trailing newline) -- TODO confirm for multi-@PG headers
                f.write(header[-2] + '\n')
                # dump every read mapped on this chromosome (uses the .bai index)
                reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
                f.write(reads)
                # one alignment record per line: count newlines rather than
                # occurrences of the chromosome name, which may also appear
                # in read names and would inflate the total
                n += reads.count('\n')

        sys.stdout.write("%d reads are presents in your bam file\n" % n)

    except Exception as e:
        stop_err( 'Error during bam file splitting : ' + str( e ) )
        

       
def get_first_base(tmpdir, kmer): 
    '''
        Write a footprint coverage file (assoCov_<chrom>.txt) for every SAM
        file found in *tmpdir* and return the read-length distribution.

        tmpdir -- directory holding one <chrom>.sam file per chromosome
        kmer   -- read length whose 5' ends are accumulated into the coverage

        Returns an OrderedDict mapping read length -> number of read lines.
        Side effect: increments the module-level total_mapped_read counter
        for every uniquely mapped read of length *kmer*.
    '''
    global total_mapped_read
    
    ## default tags identifying multiple/unique hits (bowtie conventions)
    multi_tag = "XS:i:"
    tag = "IH:i:1"
    
    ## read-length (k-mer) distribution over all alignment lines
    KMER = OrderedDict({})
    
    try:
        # sorted() mirrors the alphabetical order of the former `ls` call
        file_array = sorted(os.listdir(tmpdir))
        ##write coverage for each sam file in tmpdir
        for samfile in file_array:
            with open(tmpdir+'/'+samfile, 'r') as sam : 
                ##chromosome name is encoded in the file name
                chrom = samfile.split(".sam")[0]
        
                for line in sam:
                    if '@SQ\tSN:' in line :
                        # one forward and one reverse coverage vector,
                        # sized from the @SQ LN: field
                        size = int(line.split('LN:')[1])
                        genomeF = [0]*size
                        genomeR = [0]*size
                    elif '@PG\tID' in line :
                        # adapt the unique/multiple-hit tags to the mapper
                        if 'bowtie' in line:
                            multi_tag = "XS:i:"
                        elif 'bwa' in  line:
                            multi_tag = "XT:A:R"
                        elif 'TopHat' in  line:
                            tag = "NH:i:1"
                        else :
                            stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")  
                        
                    # alignment record (any line not starting with '@')
                    elif re.search('^[^@].+', line) :
                        # split once and reuse the fields (was split 3x per line)
                        fields = line.split('\t')
                        len_read = len(fields[9])
                        ##update the k-mer distribution
                        if len_read in KMER:
                            KMER[len_read] += 1
                        else :
                            KMER[len_read] = 1
                            
                        read_pos = int(fields[3])
                        read_sens = int(fields[1])
                        # keep only uniquely mapped reads of the requested length
                        if len_read == kmer and (tag in line or multi_tag not in line):
                            total_mapped_read += 1
                            #forward read: 5' base is the leftmost position
                            if read_sens == 0 :
                                genomeF[read_pos] += 1
                            #reverse read: 5' base is the rightmost position
                            elif read_sens == 16 :
                                read_pos += (len_read-1)
                                genomeR[read_pos] += 1

            #write coverage: one line per position, "forward<TAB>-reverse"
            with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov : 
                for i in range(0,size):
                    cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
        
        return KMER       
        
    except Exception as e:
        stop_err( 'Error during footprinting : ' + str( e ) )


def __percent__(prop):
        
    if sum(prop) != 0 :
        perc = [0,0,0]
        if prop[0] != 0 :
            perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
        if prop[1] != 0 :
            perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
        if prop[2] != 0 :
            perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
        return perc
    else :
        return prop
    

def frame_analysis(tmpdir,GFF):
    '''
    Compute the proportion of footprint 5' ends falling on each of the three
    codon positions (reading frames) summed over all genes of the annotation.

    tmpdir -- directory holding the assoCov_<chrom>.txt coverage files
              written by get_first_base()
    GFF    -- annotation dict from ribo_functions.store_gff/store_gtf;
              GFF['order'] lists gene keys, GFF[gene] holds 'chrom',
              'strand', 'exon_number' and per-exon start/stop coordinates
              (presumably 1-based, matching coverage-file line indices --
              TODO confirm against ribo_functions)

    Returns a 3-element list of integer percentages (frames 0, 1, 2).
    '''
    global total_mapped_read  # declared but never used here; kept for symmetry
    try:
        chrom = "" # initializing chromosome
        nb_gene = 0 # number of analysed genes (never incremented -- unused)
        whole_phasing = [0,0,0]  # coverage summed per frame over all genes
        for gene in GFF['order']:
            ## maybe no start position in GTF file so we must to check and replace
            exon_number = GFF[gene]['exon_number']
            # derive a missing 'start' from the first/last exon depending on strand
            try : GFF[gene]['start'] 
            except :
                if GFF[gene]['strand'] == '+' :
                    GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
                else :
                    GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop']
            ## also for stop coordinates
            try : GFF[gene]['stop'] 
            except :
                if GFF[gene]['strand'] == '+' :
                    GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']

                else :
                    GFF[gene]['stop'] = GFF[gene]['exon'][1]['start']
            cov = []  # 5' coverage along the coding sequence, read 5'->3'
            ##first chromosome : we open corresponding file
            # One coverage file is cached in `data` and reloaded only when the
            # chromosome changes, so GFF['order'] is assumed to be grouped by
            # chromosome -- TODO confirm against ribo_functions ordering.
            try: 
                if chrom == "" :
                    chrom = GFF[gene]['chrom']
                    with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
                        data = f.readlines()
                ##if we change chromosome
                elif chrom != GFF[gene]['chrom'] :
                    chrom = GFF[gene]['chrom']
                    with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
                        data = f.readlines()
            except IOError :
                print tmpdir+"/assoCov_"+chrom+".txt doesn't exist"     

            try:
                ## if a gene without intron :
                if GFF[gene]['exon_number'] == 1:
                        
                    ## get coverage for each gene   
                    # column 0 of a coverage line is the forward strand,
                    # column 1 the reverse strand (stored negated, hence the
                    # "-" stripping); reverse-strand coverage is flipped so
                    # cov always runs 5'->3'
                    if GFF[gene]['strand'] == "+":
                        for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
                            cov.append(int((data[i].rstrip()).split("\t")[0]))
                    else :
                        for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
                            cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
                        cov.reverse()
                else :
                    ## For each gene, get coverage and sum of exon size
                    # multi-exon gene: concatenate the coverage of all exons
                    if GFF[gene]['strand'] == "+":
                            
                        for exon in range(1,GFF[gene]['exon_number']+1) :
                            for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
                                #if i <= GFF[gene]['stop'] :
                                cov.append(int((data[i].rstrip()).split("\t")[0]))  
                    else :
                              
                        for exon in range(1,GFF[gene]['exon_number']+1) :
                            for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
                                #if i <= GFF[gene]['start'] :
                                cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
                        cov.reverse()        
            except :
                # NOTE(review): bare except deliberately drops any gene whose
                # coordinates fall outside the loaded coverage file (or any
                # other per-gene failure) -- best-effort behaviour
                continue          
            len_cov = len(cov)
            prop = [0,0,0]
            # accumulate coverage by codon position, whole codons only
            for nuc in range(0,len_cov-2,3) :
                ## Calculate proportion
                prop[0] += cov[nuc]
                prop[1] += cov[nuc+1]
                prop[2] += cov[nuc+2]
            # element-wise sum of this gene's counts into the running totals
            whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))

        # convert the summed counts to integer percentages
        whole_phasing = __percent__(whole_phasing)
        return whole_phasing
        
    except Exception, e:
        stop_err( 'Error during frame analysis : ' + str( e ) )



def make_report(html_file, dirout, kmer, results) :
    '''
        Build the PNG plots and the final HTML report.

        html_file -- path of the HTML report to write
        dirout    -- directory receiving the PNG files
        kmer      -- dict mapping read length -> read count
        results   -- dict mapping read length -> [f0, f1, f2] percentages
                     (must contain a key for every kmer with > 100 reads)
    '''
    # k-mer distribution barplot, lengths sorted ascending
    array = sorted(kmer.items(), key=lambda x: x[0])
    values = [y for (x, y) in array]
    label = [x for (x, y) in array]
    index = arange(len(label))
    bar_width = 0.35
    axis_font = {'size':'10'}
    fig, ax = pl.subplots()
    pl.bar(index ,values, color='LightsteelBlue')
    pl.xlabel('kmer value', **axis_font)
    pl.ylabel('Number of reads', **axis_font)
    pl.title('Number of reads for each k-mer')
    pl.xticks(index + bar_width, label, **axis_font)
    fig.subplots_adjust()
    pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640)
    pl.clf()

    # one phasing barplot per analysed k-mer
    # (pl.show()/pl.draw() removed: no-ops with the Agg backend)
    for key, phase in results.items() :
        fig = pl.figure(num=1)
        index = arange(3)
        bar_width = 0.5
        pl.bar(index, phase, color=['RoyalBlue','LightSkyBlue','LightBlue'])
        pl.xlabel('Frame in gene', **axis_font)
        pl.ylabel('Percent of read', **axis_font)
        pl.title('Proportion of reads in each frame for '+str(key)+'-mer')
        pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font)
        pl.ylim(0,100)
        fig.subplots_adjust() 
        pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300)
        pl.clf()

    # HTML summary: a detailed entry (with the phasing plot) for k-mers with
    # more than 100 reads, a short entry otherwise.
    # NOTE: this text is substituted as an *argument* of html_str % ..., so
    # it is NOT %-escaped -- write literal '50%', not '50%%' (the old '%%'
    # ended up verbatim in the generated HTML).
    kmer_summary = ''
    for key, number in array :
        if number > 100 :
            kmer_summary += '<li>'
            kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
            kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'
            kmer_summary += 'Phasing of '+str(key)+'-mer is : '+str(results[key])+'<br>'
            kmer_summary += '<img border="0" src="'+str(key)+'_phasing.png" width="50%" height="50%"/><br>'
            kmer_summary += '</li>'
        else :
            kmer_summary += '<li>'
            kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
            kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'            
            kmer_summary += '</li>'        

    # %% below is correct: html_str IS the %-format string
    html_str = """ 
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
    
<html xmlns="http://www.w3.org/1999/xhtml">
    <head>
        <link href="lightbox/css/lightbox.css" rel="stylesheet" />
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
        <title>kmer analysis</title>
    </head>
    <body>
        <h1>k-mer analysis results</h1>
        <p>
            <ul>
                <li>
                    Number of reads for each k-mer in your sample : <br />
                    <img border="0" src="kmer_proportion.png" width="50%%" height="50%%"/>
                </li>
                %s
            </ul>
        </p>
    </body>
</html> 
"""
    # avoid shadowing the builtin `all`
    page = html_str % kmer_summary
    with open(html_file, "w") as html:
        html.write(page)
       
def __main__():
    '''Parse command-line options, run the k-mer/phasing analysis and
    write the PNG plots plus the HTML report.'''
    #Parse command line options
    parser = optparse.OptionParser()
    parser.add_option("-g", "--gff", dest="gff", type= "string",
                  help="GFF annotation file", metavar="FILE")
                  
    parser.add_option("-b", "--bam", dest="bamfile", type= "string",
                  help="Bam Ribo-Seq alignments ", metavar="FILE")
                    
    parser.add_option("-o", "--out", dest="html_file", type= "string",
                  help="write report to FILE", metavar="FILE")
                    
    parser.add_option("-d", "--dirout", dest="dirout", type= "string",
                  help="write report to PNG files", metavar="FILE")    
                    
    parser.add_option("-q", "--quiet",
                  action="store_false", dest="verbose", default=True,
                  help="don't print status messages to stdout")
                    
    (options, args) = parser.parse_args()
    sys.stdout.write("Begin kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )

    try:
        # output directory for the PNG files
        if not os.path.exists(options.dirout):
            try:
                os.mkdir(options.dirout)
            except Exception as e :
                stop_err('Error running make directory : ' + str(e)) 
        
        ##index the bam file if no .bai is present (needed by samtools view <region>)
        if not os.path.isfile(options.bamfile+".bai") :
            cmd = "samtools index %s " % (options.bamfile)
            proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
            returncode = proc.wait()
        
        tmpdir = tempfile.mkdtemp()
        ## identify GFF or GTF format from the attributes (9th) column of the
        ## first data line: GTF uses 'gene_id', GFF3 uses 'ID='
        with open (options.gff,"r") as gffile :
            for line in gffile :
                if line.startswith('#') :
                    ## skip the header line itself (the old code called
                    ## gffile.next() here, which consumed the FOLLOWING
                    ## line and could swallow the first data line)
                    continue
                elif 'gene_id' in line :
                    ## launch gtf reader :
                    GFF = ribo_functions.store_gtf(options.gff)
                    break
                elif 'ID=' in line :
                    ## launch gff reader 
                    GFF = ribo_functions.store_gff(options.gff)
                    break
                else :
                    stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
        ## check gff reading
        if not GFF['order'] :
            stop_err( 'Incorrect GFF file' )
        ## drop genes without exon information; iterate over a copy because
        ## removing from GFF['order'] while iterating it skips elements
        for gene in list(GFF['order']):
            if not GFF[gene]['exon'] :
                del GFF[gene]
                GFF['order'].remove(gene)  
        ## split bam per chromosome into tmpdir
        split_bam(options.bamfile,tmpdir)
        ###################################
        ## First analysis with 28mer :
        ###################################
        ## compute coverage and distribution kmer
        kmer = get_first_base(tmpdir, 28)
        ## compute phasing
        whole_phasing = frame_analysis(tmpdir,GFF)
        ## save phasing
        results = {}
        results[28] = whole_phasing 
        ## same analysis for every other sufficiently populated kmer
        for keys in kmer :
            if keys != 28 :
                ## skip kmers with too few reads
                if kmer[keys] > 100 :
                    ## remove the coverage files of the previous kmer
                    if os.system("rm "+tmpdir+"/*.txt") != 0 :
                        stop_err( 'Error during tmp directory cleaning')
                    ## recompute coverage for this kmer (distribution ignored)
                    tmp = get_first_base(tmpdir, keys)
                    ## compute phasing
                    whole_phasing = frame_analysis(tmpdir,GFF)
                    results[keys] = whole_phasing
        ## build PNG plots and the HTML report
        make_report(options.html_file, options.dirout, kmer, results)
        
        sys.stdout.write("Finish kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )
    
    except Exception as e:
        stop_err( 'Error running kmer and phasing analysis : ' + str( e ) )
    

# script entry point
if __name__=="__main__":
    __main__()