Mercurial > repos > rlegendre > ribo_tools
view kmer_analysis.py @ 10:707807fee542
(none)
author | rlegendre |
---|---|
date | Thu, 22 Jan 2015 14:34:38 +0100 |
parents | da126b91f9ea |
children | 7c944fd9907e |
line wrap: on
line source
#!/usr/bin/env python2.7.3 #-*- coding: utf-8 -*- ''' Created on Jun. 2014 @author: rachel legendre @copyright: rachel.legendre@igmors.u-psud.fr @license: GPL v3 ''' import os, sys, time, optparse, re, urllib, subprocess, tempfile, commands import pysam #from matplotlib import pyplot as pl import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as pl from numpy import arange from collections import OrderedDict import ribo_functions #import cPickle ## suppress matplotlib warnings import warnings warnings.filterwarnings('ignore') total_mapped_read = 0 def stop_err( msg ): sys.stderr.write( "%s\n" % msg ) sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) ) sys.exit() def split_bam(bamfile,tmpdir): ''' split bam by chromosome and write sam file in tmpdir ''' try: #get header results = subprocess.check_output(['samtools', 'view', '-H',bamfile]) header = results.split('\n') #define genome size genome = [] for line in header: result = re.search('SN', line) if result : #print line feat = line.split('\t') chrom = re.split(":", feat[1]) #print feat[1] genome.append(chrom[1]) #split sam by chrom n = 0 for chrm in genome: with open(tmpdir+'/'+chrm+'.sam', 'w') as f : #write header correctly for each chromosome f.write(header[0]+'\n') expr = re.compile(chrm+'\t') el =[elem for elem in header if expr.search(elem)][0] f.write(el+'\n') f.write(header[-2]+'\n') #write all reads for each chromosome reads = subprocess.check_output(["samtools", "view", bamfile, chrm]) f.write(reads) # calculate number of reads n += reads.count(chrm) sys.stdout.write("%d reads are presents in your bam file\n" % n) except Exception, e: stop_err( 'Error during bam file splitting : ' + str( e ) ) def get_first_base(tmpdir, kmer): ''' write footprint coverage file for each sam file in tmpdir and get kmer distribution ''' global total_mapped_read ## init kmer dict KMER = OrderedDict({}) try: file_array = (commands.getoutput('ls '+tmpdir)).split('\n') ##write coverage for each sam file in tmpdir for samfile in file_array: with open(tmpdir+'/'+samfile, 'r') as sam : ##get chromosome name chrom = samfile.split(".sam")[0] for line in sam: #initialize dictionnary if '@SQ\tSN:' in line : size = int(line.split('LN:')[1]) genomeF = [0]*size genomeR = [0]*size # define multiple reads keys from mapper elif '@PG\tID' in line : if 'bowtie' in line: multi_tag = "XS:i:" elif 'bwa' in line: multi_tag = "XT:A:R" #elif 'TopHat' in line: # multi_tag = "NH:i:1" else : stop_err("No PG tag find in "+samfile+". Please use bowtie or bwa for mapping") # get footprint elif re.search('^[^@].+', line) : len_read = len(line.split('\t')[9]) ##full kmer dict if KMER.has_key(len_read): KMER[len_read] += 1 else : KMER[len_read] = 1 #print line.rstrip() read_pos = int(line.split('\t')[3]) read_sens = int(line.split('\t')[1]) #len_read = len(line.split('\t')[9]) if len_read == kmer and multi_tag not in line: ###if line.split('\t')[5] == '28M' : total_mapped_read +=1 #if it's a forward read if read_sens == 0 : #get 5' base genomeF[read_pos] += 1 #if it's a reverse read elif read_sens == 16 : #get 5' base read_pos += (len_read-1) genomeR[read_pos] += 1 #try: #write coverage in files with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov : for i in range(0,size): cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n') #except Exception,e: # stop_err( 'Error during coverage file writting : ' + str( e ) ) #sys.stdout.write("%d reads are in your bam file\n" % total_mapped_read) return KMER except Exception, e: stop_err( 'Error during footprinting : ' + str( e ) ) def __percent__(prop): if sum(prop) != 0 : perc = [0,0,0] if prop[0] != 0 : perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop))) if prop[1] != 0 : perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop))) if prop[2] != 0 : perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop))) return perc else : return prop def frame_analysis(tmpdir,GFF): ''' This function take footprint and annotation (gff) for analyse reading frame in each gene ''' global total_mapped_read try: chrom = "" # initializing chromosome nb_gene = 0 # number of analysed genes whole_phasing = [0,0,0] for gene in GFF['order']: ## maybe no start position in GTF file so we must to check and replace exon_number = GFF[gene]['exon_number'] try : GFF[gene]['start'] except : if GFF[gene]['strand'] == '+' : GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] else : GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop'] ## also for stop coordinates try : GFF[gene]['stop'] except : if GFF[gene]['strand'] == '+' : GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] else : GFF[gene]['stop'] = GFF[gene]['exon'][1]['start'] cov = [] ##first chromosome : we open corresponding file try: if chrom == "" : chrom = GFF[gene]['chrom'] with open(tmpdir+"/assoCov_"+chrom+".txt") as f : data = f.readlines() ##if we change chromosome elif chrom != GFF[gene]['chrom'] : chrom = GFF[gene]['chrom'] with open(tmpdir+"/assoCov_"+chrom+".txt") as f : data = f.readlines() except IOError : print tmpdir+"/assoCov_"+chrom+".txt doesn't exist" ## if a gene without intron : if GFF[gene]['exon_number'] == 1: ## get coverage for each gene if GFF[gene]['strand'] == "+": for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): cov.append(int((data[i].rstrip()).split("\t")[0])) else : for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) cov.reverse() else : ## For each gene, get coverage and sum of exon size if GFF[gene]['strand'] == "+": for exon in range(1,GFF[gene]['exon_number']+1) : for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): #if i <= GFF[gene]['stop'] : cov.append(int((data[i].rstrip()).split("\t")[0])) else : for exon in range(1,GFF[gene]['exon_number']+1) : for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): #if i <= GFF[gene]['start'] : cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) cov.reverse() len_cov = len(cov) prop = [0,0,0] for nuc in range(0,len_cov-2,3) : ## Calculate proportion prop[0] += cov[nuc] prop[1] += cov[nuc+1] prop[2] += cov[nuc+2] whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop))) whole_phasing = __percent__(whole_phasing) #sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing) return whole_phasing except Exception, e: stop_err( 'Error during frame analysis : ' + str( e ) ) def make_report(html_file, dirout, kmer, results) : array = sorted(kmer.items(), key=lambda x: x[0]) values = [] label = [] for x,y in array : values.append(y) label.append(x) index = arange(len(label)) bar_width = 0.35 axis_font = {'size':'10'} fig, ax = pl.subplots() pl.bar(index ,values, color='LightsteelBlue') pl.xlabel('kmer value', **axis_font) pl.ylabel('Number of reads', **axis_font) pl.title('Number of reads for each k-mer') pl.xticks(index + bar_width, label, **axis_font) #pl.show() fig.subplots_adjust() pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640) pl.clf() for key, phase in results.iteritems() : fig = pl.figure(num=1) frame = phase index = arange(3) bar_width = 0.5 pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue']) pl.xlabel('Frame in gene', **axis_font) pl.ylabel('Percent of read', **axis_font) pl.title('Proportion of reads in each frame for '+str(key)+'-mer') pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font) #pl.tight_layout() pl.ylim(0,100) fig.subplots_adjust() pl.draw() pl.show() pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300) pl.clf() kmer_summary = '' kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0])) for key, number in kmer_sorted.iteritems() : if number > 100 : kmer_summary += '<li>' kmer_summary += 'Analysis of '+str(key)+'-mer : <br>' kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>' kmer_summary += 'Phasing of '+str(key)+'-mer is : '+str(results[key])+'<br>' kmer_summary += '<img border="0" src="'+str(key)+'_phasing.png" width="50%%" height="50%%"/><br>' kmer_summary += '</li>' else : kmer_summary += '<li>' kmer_summary += 'Analysis of '+str(key)+'-mer : <br>' kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>' kmer_summary += '</li>' html_str = """ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <link href="lightbox/css/lightbox.css" rel="stylesheet" /> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <title>kmer analysis</title> </head> <body> <h1>k-mer analysis results</h1> <p> <ul> <li> Number of reads for each k-mer in your sample : <br /> <img border="0" src="kmer_proportion.png" width="50%%" height="50%%"/> </li> %s </ul> </p> </body> </html> """ all = html_str % kmer_summary html = open(html_file,"w") html.write(all) html.close() def __main__(): #Parse command line options parser = optparse.OptionParser() parser.add_option("-g", "--gff", dest="gff", type= "string", help="GFF annotation file", metavar="FILE") parser.add_option("-b", "--bam", dest="bamfile", type= "string", help="Bam Ribo-Seq alignments ", metavar="FILE") parser.add_option("-o", "--out", dest="html_file", type= "string", help="write report to FILE", metavar="FILE") parser.add_option("-d", "--dirout", dest="dirout", type= "string", help="write report to PNG files", metavar="FILE") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") (options, args) = parser.parse_args() sys.stdout.write("Begin kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) ) try: if os.path.exists(options.dirout): raise try: os.mkdir(options.dirout) except: raise ##testing indexed bam file if os.path.isfile(options.bamfile+".bai") : pass else : cmd = "samtools index %s " % (options.bamfile) proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) returncode = proc.wait() tmpdir = tempfile.mkdtemp() ## identify GFF or GTF format from 9th column with open (options.gff,"r") as gffile : for line in gffile : if '#' in line : ## skip header gffile.next() elif 'gene_id' in line : ## launch gtf reader : GFF = ribo_functions.store_gtf(options.gff) break elif 'ID=' in line : ## launch gff reader GFF = ribo_functions.store_gff(options.gff) break else : stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) #GFF = store_gff(options.gff) #GFF = ribo_functions.store_gtf(options.gff) ## check gff reading if not GFF['order'] : stop_err( 'Incorrect GFF file' + str( e ) ) ## split bam split_bam(options.bamfile,tmpdir) ################################### ## First analysis with 28mer : ################################### ## compute coverage and distribution kmer kmer = get_first_base(tmpdir, 28) ## compute phasing whole_phasing = frame_analysis(tmpdir,GFF) ## save phasing results = {} results[28] = whole_phasing ## compute analysis with other kmer for keys in kmer.iterkeys() : if keys != 28 : ## remove all txt files in tmp directory if os.system("rm "+tmpdir+"/*.txt") != 0 : stop_err( 'Error during tmp directory cleaning : ' + str( e ) ) ## If not enought reads in this kmer : if kmer[keys] > 100 : ## compute coverage and distribution kmer tmp = get_first_base(tmpdir, keys) ## compute phasing whole_phasing = frame_analysis(tmpdir,GFF) results[keys] = whole_phasing ## get report make_report(options.html_file, options.dirout, kmer, results) #======================================================================= # ############ # # tests # ############ # with open("kmer_dict",'rb') as km: # kmer = cPickle.load(km) # with open("results_dict",'rb') as res: # results = cPickle.load(res) # with open("kmer_dict",'wb') as km: # cPickle.dump(kmer,km) # with open("results_dict",'wb') as res: # cPickle.dump(results,res) # make_report(options.html_file, options.dirout, kmer, results) #======================================================================= sys.stdout.write("Finish kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) ) except Exception, e: stop_err( 'Error running kmer and phasing analysis : ' + str( e ) ) if __name__=="__main__": __main__()