ribo_tools: kmer_analysis.py comparison

comparison kmer_analysis.py @ 2:da126b91f9ea

Uploaded

author	rlegendre
date	Mon, 20 Oct 2014 11:06:53 -0400
parents
children	707807fee542

comparison

equal deleted inserted replaced

-:1fbddace2db6
+:da126b91f9ea
+#!/usr/bin/env python2.7.3
+#-*- coding: utf-8 -*-
+'''
+Created on Jun. 2014
+@author: rachel legendre
+@copyright: rachel.legendre@igmors.u-psud.fr
+@license: GPL v3
+'''
+import os, sys, time, optparse, re, urllib, subprocess, tempfile, commands
+import pysam
+#from matplotlib import pyplot as pl
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as pl
+from numpy import arange
+import ribo_functions
+from collections import OrderedDict
+#import cPickle
+total_mapped_read = 0
+def stop_err( msg ):
+sys.stderr.write( "%s\n" % msg )
+sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) )
+sys.exit()
+def get_first_base(tmpdir, kmer):
+'''
+write footprint coverage file for each sam file in tmpdir and get kmer distribution
+'''
+global total_mapped_read
+## init kmer dict
+KMER = OrderedDict({})
+try:
+file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
+##write coverage for each sam file in tmpdir
+for samfile in file_array:
+with open(tmpdir+'/'+samfile, 'r') as sam :
+##get chromosome name
+chrom = samfile.split(".sam")[0]
+for line in sam:
+#initialize dictionnary
+if '@SQ' in line :
+size = int(line.split('LN:')[1])
+genomeF = [0]*size
+genomeR = [0]*size
+# define multiple reads keys from mapper
+elif '@PG' in line :
+if 'bowtie' in line:
+multi_tag = "XS:i:"
+elif 'bwa' in  line:
+multi_tag = "XT:A:R"
+else :
+stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")
+# get footprint
+elif re.search('^[^@].+', line) :
+len_read = len(line.split('\t')[9])
+##full kmer dict
+if KMER.has_key(len_read):
+KMER[len_read] += 1
+else :
+KMER[len_read] = 1
+#print line.rstrip()
+read_pos = int(line.split('\t')[3])
+read_sens = int(line.split('\t')[1])
+#len_read = len(line.split('\t')[9])
+if len_read == kmer and multi_tag not in line:
+###if line.split('\t')[5] == '28M' :
+total_mapped_read +=1
+#if it's a forward read
+if read_sens == 0 :
+#get 5' base
+genomeF[read_pos] += 1
+#if it's a reverse read
+elif read_sens == 16 :
+#get 5' base
+read_pos += (len_read-1)
+genomeR[read_pos] += 1
+#try:
+#write coverage in files
+with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov :
+for i in range(0,size):
+cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
+#except Exception,e:
+#    stop_err( 'Error during coverage file writting : ' + str( e ) )
+#sys.stdout.write("%d reads are in your bam file\n" % total_mapped_read)
+return KMER
+except Exception, e:
+stop_err( 'Error during footprinting : ' + str( e ) )
+def __percent__(prop):
+if sum(prop) != 0 :
+perc = [0,0,0]
+if prop[0] != 0 :
+perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
+if prop[1] != 0 :
+perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
+if prop[2] != 0 :
+perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
+return perc
+else :
+return prop
+def frame_analysis(tmpdir,GFF):
+'''
+This function take footprint and annotation (gff) for analyse reading frame in each gene
+'''
+global total_mapped_read
+try:
+chrom = "" # initializing chromosome
+nb_gene = 0 # number of analysed genes
+whole_phasing = [0,0,0]
+for gene in GFF['order']:
+## maybe no start position in GTF file so we must to check and replace
+try : GFF[gene]['start']
+except :
+if GFF[gene]['strand'] == '+' :
+GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
+else :
+GFF[gene]['start'] = GFF[gene]['exon'][1]['stop']
+## also for stop coordinates
+try : GFF[gene]['stop']
+except :
+exon_number = GFF[gene]['exon_number']
+if GFF[gene]['strand'] == '+' :
+GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
+else :
+GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['start']
+cov = []
+##first chromosome : we open corresponding file
+if chrom == "" :
+chrom = GFF[gene]['chrom']
+with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+data = f.readlines()
+##if we change chrosomosome
+elif chrom != GFF[gene]['chrom'] :
+chrom = GFF[gene]['chrom']
+with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+data = f.readlines()
+## if a gene without intron :
+if GFF[gene]['exon_number'] == 1:
+## get coverage for each gene
+if GFF[gene]['strand'] == "+":
+for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
+cov.append(int((data[i].rstrip()).split("\t")[0]))
+else :
+for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
+cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+cov.reverse()
+else :
+## For each gene, get coverage and sum of exon size
+if GFF[gene]['strand'] == "+":
+for exon in range(1,GFF[gene]['exon_number']+1) :
+for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
+if i <= GFF[gene]['stop'] :
+cov.append(int((data[i].rstrip()).split("\t")[0]))
+else :
+for exon in range(1,GFF[gene]['exon_number']+1) :
+for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
+if i <= GFF[gene]['start'] :
+cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+cov.reverse()
+len_cov = len(cov)
+prop = [0,0,0]
+for nuc in range(0,len_cov-2,3) :
+## Calculate proportion
+prop[0] += cov[nuc]
+prop[1] += cov[nuc+1]
+prop[2] += cov[nuc+2]
+whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))
+whole_phasing = __percent__(whole_phasing)
+#sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing)
+return whole_phasing
+except Exception, e:
+stop_err( 'Error during frame analysis : ' + str( e ) )
+def make_report(html_file, dirout, kmer, results) :
+array = sorted(kmer.items(), key=lambda x: x[0])
+values = []
+label = []
+for x,y in array :
+values.append(y)
+label.append(x)
+index = arange(len(label))
+bar_width = 0.35
+axis_font = {'size':'10'}
+fig, ax = pl.subplots()
+pl.bar(index ,values, color='LightsteelBlue')
+pl.xlabel('kmer value', **axis_font)
+pl.ylabel('Number of reads', **axis_font)
+pl.title('Number of reads for each k-mer')
+pl.xticks(index + bar_width, label, **axis_font)
+#pl.show()
+pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640)
+pl.clf()
+for key, phase in results.iteritems() :
+fig = pl.figure(num=1)
+frame = phase
+index = arange(3)
+bar_width = 0.5
+pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue'])
+pl.xlabel('Frame in gene', **axis_font)
+pl.ylabel('Percent of read', **axis_font)
+pl.title('Proportion of reads in each frame for '+str(key)+'-mer')
+pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font)
+pl.tight_layout()
+pl.ylim(0,100)
+pl.draw()
+#pl.show()
+pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300)
+pl.clf()
+kmer_summary = ''
+kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0]))
+for key, number in kmer_sorted.iteritems() :
+if number > 100 :
+kmer_summary += '<li>'
+kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
+kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'
+kmer_summary += 'Phasing of '+str(key)+'-mer is : '+str(results[key])+'<br>'
+kmer_summary += '<img border="0" src="'+str(key)+'_phasing.png" width="50%%" height="50%%"/><br>'
+kmer_summary += '</li>'
+else :
+kmer_summary += '<li>'
+kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
+kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'
+kmer_summary += '</li>'
+html_str = """
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<link href="lightbox/css/lightbox.css" rel="stylesheet" />
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<title>kmer analysis</title>
+</head>
+<body>
+<h1>k-mer analysis results</h1>
+<p>
+<ul>
+<li>
+Number of reads for each k-mer in your sample : <br />
+<img border="0" src="kmer_proportion.png" width="50%%" height="50%%"/>
+</li>
+%s
+</ul>
+</p>
+</body>
+</html>
+"""
+all = html_str % kmer_summary
+html = open(html_file,"w")
+html.write(all)
+html.close()
+def __main__():
+#Parse command line options
+parser = optparse.OptionParser()
+parser.add_option("-g", "--gff", dest="gfffile", type= "string",
+help="GFF annotation file", metavar="FILE")
+parser.add_option("-b", "--bam", dest="bamfile", type= "string",
+help="Bam Ribo-Seq alignments ", metavar="FILE")
+parser.add_option("-o", "--out", dest="html_file", type= "string",
+help="write report to FILE", metavar="FILE")
+parser.add_option("-d", "--dirout", dest="dirout", type= "string",
+help="write report to PNG files", metavar="FILE")
+parser.add_option("-q", "--quiet",
+action="store_false", dest="verbose", default=True,
+help="don't print status messages to stdout")
+(options, args) = parser.parse_args()
+sys.stdout.write("Begin kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )
+try:
+if os.path.exists(options.dirout):
+raise
+try:
+os.mkdir(options.dirout)
+except:
+raise
+##testing indexed bam file
+if os.path.isfile(options.bamfile+".bai") :
+pass
+else :
+cmd = "samtools index %s " % (options.bamfile)
+proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
+returncode = proc.wait()
+tmpdir = tempfile.mkdtemp()
+GFF = ribo_functions.store_gff(options.gfffile)
+## split bam
+ribo_functions.split_bam(options.bamfile,tmpdir)
+###################################
+## First analysis with 28mer :
+###################################
+## compute coverage and distribution kmer
+kmer = get_first_base(tmpdir, 28)
+## compute phasing
+whole_phasing = frame_analysis(tmpdir,GFF)
+## save phasing
+results = {}
+results[28] = whole_phasing
+## compute analysis with other kmer
+for keys in kmer.iterkeys() :
+if keys != 28 :
+## remove all txt files in tmp directory
+if os.system("rm "+tmpdir+"/*.txt") != 0 :
+stop_err( 'Error during tmp directory cleaning : ' + str( e ) )
+## If not enought reads in this kmer :
+if kmer[keys] > 100 :
+## compute coverage and distribution kmer
+tmp = get_first_base(tmpdir, keys)
+## compute phasing
+whole_phasing = frame_analysis(tmpdir,GFF)
+results[keys] = whole_phasing
+## get report
+make_report(options.html_file, options.dirout, kmer, results)
+#=======================================================================
+# ############
+# #  tests
+# ############
+#  with open("kmer_dict",'rb') as km:
+#          kmer = cPickle.load(km)
+#  with open("results_dict",'rb') as res:
+#          results = cPickle.load(res)
+#  with open("kmer_dict",'wb') as km:
+#          cPickle.dump(kmer,km)
+#  with open("results_dict",'wb') as res:
+#          cPickle.dump(results,res)
+# make_report(options.html_file, options.dirout, kmer, results)
+#=======================================================================
+sys.stdout.write("Finish kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )
+except Exception, e:
+stop_err( 'Error running kmer and phasing analysis : ' + str( e ) )
+if __name__=="__main__":
+__main__()

Mercurial > repos > rlegendre > ribo_tools

comparison kmer_analysis.py @ 2:da126b91f9ea