Mercurial > repos > rlegendre > ribo_tools
diff kmer_analysis.py @ 2:da126b91f9ea
Uploaded
author | rlegendre |
---|---|
date | Mon, 20 Oct 2014 11:06:53 -0400 |
parents | |
children | 707807fee542 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kmer_analysis.py Mon Oct 20 11:06:53 2014 -0400 @@ -0,0 +1,379 @@ +#!/usr/bin/env python2.7.3 +#-*- coding: utf-8 -*- +''' + Created on Jun. 2014 + @author: rachel legendre + @copyright: rachel.legendre@igmors.u-psud.fr + @license: GPL v3 +''' + +import os, sys, time, optparse, re, urllib, subprocess, tempfile, commands +import pysam +#from matplotlib import pyplot as pl +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as pl +from numpy import arange +import ribo_functions +from collections import OrderedDict +#import cPickle + +total_mapped_read = 0 + + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) ) + sys.exit() + + + +def get_first_base(tmpdir, kmer): + ''' + write footprint coverage file for each sam file in tmpdir and get kmer distribution + ''' + global total_mapped_read + + ## init kmer dict + KMER = OrderedDict({}) + + try: + file_array = (commands.getoutput('ls '+tmpdir)).split('\n') + ##write coverage for each sam file in tmpdir + for samfile in file_array: + with open(tmpdir+'/'+samfile, 'r') as sam : + ##get chromosome name + chrom = samfile.split(".sam")[0] + + for line in sam: + #initialize dictionnary + if '@SQ' in line : + size = int(line.split('LN:')[1]) + genomeF = [0]*size + genomeR = [0]*size + # define multiple reads keys from mapper + elif '@PG' in line : + if 'bowtie' in line: + multi_tag = "XS:i:" + elif 'bwa' in line: + multi_tag = "XT:A:R" + else : + stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") + + # get footprint + elif re.search('^[^@].+', line) : + len_read = len(line.split('\t')[9]) + ##full kmer dict + if KMER.has_key(len_read): + KMER[len_read] += 1 + else : + KMER[len_read] = 1 + + #print line.rstrip() + read_pos = int(line.split('\t')[3]) + read_sens = int(line.split('\t')[1]) + #len_read = len(line.split('\t')[9]) + if len_read == kmer and multi_tag not in line: + ###if line.split('\t')[5] == '28M' : + total_mapped_read +=1 + #if it's a forward read + if read_sens == 0 : + #get 5' base + genomeF[read_pos] += 1 + #if it's a reverse read + elif read_sens == 16 : + #get 5' base + read_pos += (len_read-1) + genomeR[read_pos] += 1 + + #try: + #write coverage in files + with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov : + for i in range(0,size): + cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n') + #except Exception,e: + # stop_err( 'Error during coverage file writting : ' + str( e ) ) + + #sys.stdout.write("%d reads are in your bam file\n" % total_mapped_read) + + return KMER + + except Exception, e: + stop_err( 'Error during footprinting : ' + str( e ) ) + + +def __percent__(prop): + + if sum(prop) != 0 : + perc = [0,0,0] + if prop[0] != 0 : + perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop))) + if prop[1] != 0 : + perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop))) + if prop[2] != 0 : + perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop))) + return perc + else : + return prop + + +def frame_analysis(tmpdir,GFF): + ''' + This function take footprint and annotation (gff) for analyse reading frame in each gene + ''' + global total_mapped_read + try: + chrom = "" # initializing chromosome + nb_gene = 0 # number of analysed genes + whole_phasing = [0,0,0] + for gene in GFF['order']: + ## maybe no start position in GTF file so we must to check and replace + try : GFF[gene]['start'] + except : + if GFF[gene]['strand'] == '+' : + GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] + else : + GFF[gene]['start'] = GFF[gene]['exon'][1]['stop'] + ## also for stop coordinates + try : GFF[gene]['stop'] + except : + exon_number = GFF[gene]['exon_number'] + if GFF[gene]['strand'] == '+' : + GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] + + else : + GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['start'] + cov = [] + ##first chromosome : we open corresponding file + if chrom == "" : + chrom = GFF[gene]['chrom'] + with open(tmpdir+"/assoCov_"+chrom+".txt") as f : + data = f.readlines() + ##if we change chrosomosome + elif chrom != GFF[gene]['chrom'] : + chrom = GFF[gene]['chrom'] + with open(tmpdir+"/assoCov_"+chrom+".txt") as f : + data = f.readlines() + + ## if a gene without intron : + if GFF[gene]['exon_number'] == 1: + + ## get coverage for each gene + if GFF[gene]['strand'] == "+": + for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): + cov.append(int((data[i].rstrip()).split("\t")[0])) + else : + for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): + cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) + cov.reverse() + else : + ## For each gene, get coverage and sum of exon size + if GFF[gene]['strand'] == "+": + + for exon in range(1,GFF[gene]['exon_number']+1) : + for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): + if i <= GFF[gene]['stop'] : + cov.append(int((data[i].rstrip()).split("\t")[0])) + else : + + for exon in range(1,GFF[gene]['exon_number']+1) : + for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): + if i <= GFF[gene]['start'] : + cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) + cov.reverse() + + len_cov = len(cov) + prop = [0,0,0] + for nuc in range(0,len_cov-2,3) : + ## Calculate proportion + prop[0] += cov[nuc] + prop[1] += cov[nuc+1] + prop[2] += cov[nuc+2] + whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop))) + + whole_phasing = __percent__(whole_phasing) + #sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing) + return whole_phasing + + except Exception, e: + stop_err( 'Error during frame analysis : ' + str( e ) ) + + + +def make_report(html_file, dirout, kmer, results) : + + array = sorted(kmer.items(), key=lambda x: x[0]) + values = [] + label = [] + for x,y in array : + values.append(y) + label.append(x) + index = arange(len(label)) + bar_width = 0.35 + axis_font = {'size':'10'} + fig, ax = pl.subplots() + pl.bar(index ,values, color='LightsteelBlue') + pl.xlabel('kmer value', **axis_font) + pl.ylabel('Number of reads', **axis_font) + pl.title('Number of reads for each k-mer') + pl.xticks(index + bar_width, label, **axis_font) + #pl.show() + pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640) + pl.clf() + + for key, phase in results.iteritems() : + fig = pl.figure(num=1) + frame = phase + index = arange(3) + bar_width = 0.5 + pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue']) + pl.xlabel('Frame in gene', **axis_font) + pl.ylabel('Percent of read', **axis_font) + pl.title('Proportion of reads in each frame for '+str(key)+'-mer') + pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font) + pl.tight_layout() + pl.ylim(0,100) + pl.draw() + #pl.show() + pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300) + pl.clf() + + kmer_summary = '' + kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0])) + for key, number in kmer_sorted.iteritems() : + if number > 100 : + kmer_summary += '<li>' + kmer_summary += 'Analysis of '+str(key)+'-mer : <br>' + kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>' + kmer_summary += 'Phasing of '+str(key)+'-mer is : '+str(results[key])+'<br>' + kmer_summary += '<img border="0" src="'+str(key)+'_phasing.png" width="50%%" height="50%%"/><br>' + kmer_summary += '</li>' + else : + kmer_summary += '<li>' + kmer_summary += 'Analysis of '+str(key)+'-mer : <br>' + kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>' + kmer_summary += '</li>' + + html_str = """ +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> + +<html xmlns="http://www.w3.org/1999/xhtml"> + <head> + <link href="lightbox/css/lightbox.css" rel="stylesheet" /> + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> + <title>kmer analysis</title> + </head> + <body> + <h1>k-mer analysis results</h1> + <p> + <ul> + <li> + Number of reads for each k-mer in your sample : <br /> + <img border="0" src="kmer_proportion.png" width="50%%" height="50%%"/> + </li> + %s + </ul> + </p> + </body> +</html> +""" + all = html_str % kmer_summary + + html = open(html_file,"w") + html.write(all) + html.close() + +def __main__(): + + #Parse command line options + parser = optparse.OptionParser() + parser.add_option("-g", "--gff", dest="gfffile", type= "string", + help="GFF annotation file", metavar="FILE") + + parser.add_option("-b", "--bam", dest="bamfile", type= "string", + help="Bam Ribo-Seq alignments ", metavar="FILE") + + parser.add_option("-o", "--out", dest="html_file", type= "string", + help="write report to FILE", metavar="FILE") + + parser.add_option("-d", "--dirout", dest="dirout", type= "string", + help="write report to PNG files", metavar="FILE") + + parser.add_option("-q", "--quiet", + action="store_false", dest="verbose", default=True, + help="don't print status messages to stdout") + + (options, args) = parser.parse_args() + sys.stdout.write("Begin kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) ) + + try: + + if os.path.exists(options.dirout): + raise + try: + os.mkdir(options.dirout) + except: + raise + + ##testing indexed bam file + if os.path.isfile(options.bamfile+".bai") : + pass + else : + cmd = "samtools index %s " % (options.bamfile) + proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) + returncode = proc.wait() + + tmpdir = tempfile.mkdtemp() + GFF = ribo_functions.store_gff(options.gfffile) + ## split bam + ribo_functions.split_bam(options.bamfile,tmpdir) + ################################### + ## First analysis with 28mer : + ################################### + ## compute coverage and distribution kmer + kmer = get_first_base(tmpdir, 28) + ## compute phasing + whole_phasing = frame_analysis(tmpdir,GFF) + ## save phasing + results = {} + results[28] = whole_phasing + ## compute analysis with other kmer + for keys in kmer.iterkeys() : + if keys != 28 : + ## remove all txt files in tmp directory + if os.system("rm "+tmpdir+"/*.txt") != 0 : + stop_err( 'Error during tmp directory cleaning : ' + str( e ) ) + + ## If not enought reads in this kmer : + if kmer[keys] > 100 : + ## compute coverage and distribution kmer + tmp = get_first_base(tmpdir, keys) + ## compute phasing + whole_phasing = frame_analysis(tmpdir,GFF) + results[keys] = whole_phasing + ## get report + make_report(options.html_file, options.dirout, kmer, results) + #======================================================================= + # ############ + # # tests + # ############ + # with open("kmer_dict",'rb') as km: + # kmer = cPickle.load(km) + # with open("results_dict",'rb') as res: + # results = cPickle.load(res) + # with open("kmer_dict",'wb') as km: + # cPickle.dump(kmer,km) + # with open("results_dict",'wb') as res: + # cPickle.dump(results,res) + # make_report(options.html_file, options.dirout, kmer, results) + #======================================================================= + + sys.stdout.write("Finish kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) ) + + except Exception, e: + stop_err( 'Error running kmer and phasing analysis : ' + str( e ) ) + + +if __name__=="__main__": + __main__() \ No newline at end of file