comparison kmer_analysis.py @ 2:da126b91f9ea

Uploaded
author rlegendre
date Mon, 20 Oct 2014 11:06:53 -0400
parents
children 707807fee542
comparison
equal deleted inserted replaced
1:1fbddace2db6 2:da126b91f9ea
1 #!/usr/bin/env python2.7.3
2 #-*- coding: utf-8 -*-
3 '''
4 Created on Jun. 2014
5 @author: rachel legendre
6 @copyright: rachel.legendre@igmors.u-psud.fr
7 @license: GPL v3
8 '''
9
10 import os, sys, time, optparse, re, urllib, subprocess, tempfile, commands
11 import pysam
12 #from matplotlib import pyplot as pl
13 import matplotlib
14 matplotlib.use('Agg')
15 import matplotlib.pyplot as pl
16 from numpy import arange
17 import ribo_functions
18 from collections import OrderedDict
19 #import cPickle
20
21 total_mapped_read = 0
22
23
24 def stop_err( msg ):
25 sys.stderr.write( "%s\n" % msg )
26 sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) )
27 sys.exit()
28
29
30
31 def get_first_base(tmpdir, kmer):
32 '''
33 write footprint coverage file for each sam file in tmpdir and get kmer distribution
34 '''
35 global total_mapped_read
36
37 ## init kmer dict
38 KMER = OrderedDict({})
39
40 try:
41 file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
42 ##write coverage for each sam file in tmpdir
43 for samfile in file_array:
44 with open(tmpdir+'/'+samfile, 'r') as sam :
45 ##get chromosome name
46 chrom = samfile.split(".sam")[0]
47
48 for line in sam:
49 #initialize dictionnary
50 if '@SQ' in line :
51 size = int(line.split('LN:')[1])
52 genomeF = [0]*size
53 genomeR = [0]*size
54 # define multiple reads keys from mapper
55 elif '@PG' in line :
56 if 'bowtie' in line:
57 multi_tag = "XS:i:"
58 elif 'bwa' in line:
59 multi_tag = "XT:A:R"
60 else :
61 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")
62
63 # get footprint
64 elif re.search('^[^@].+', line) :
65 len_read = len(line.split('\t')[9])
66 ##full kmer dict
67 if KMER.has_key(len_read):
68 KMER[len_read] += 1
69 else :
70 KMER[len_read] = 1
71
72 #print line.rstrip()
73 read_pos = int(line.split('\t')[3])
74 read_sens = int(line.split('\t')[1])
75 #len_read = len(line.split('\t')[9])
76 if len_read == kmer and multi_tag not in line:
77 ###if line.split('\t')[5] == '28M' :
78 total_mapped_read +=1
79 #if it's a forward read
80 if read_sens == 0 :
81 #get 5' base
82 genomeF[read_pos] += 1
83 #if it's a reverse read
84 elif read_sens == 16 :
85 #get 5' base
86 read_pos += (len_read-1)
87 genomeR[read_pos] += 1
88
89 #try:
90 #write coverage in files
91 with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov :
92 for i in range(0,size):
93 cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
94 #except Exception,e:
95 # stop_err( 'Error during coverage file writting : ' + str( e ) )
96
97 #sys.stdout.write("%d reads are in your bam file\n" % total_mapped_read)
98
99 return KMER
100
101 except Exception, e:
102 stop_err( 'Error during footprinting : ' + str( e ) )
103
104
105 def __percent__(prop):
106
107 if sum(prop) != 0 :
108 perc = [0,0,0]
109 if prop[0] != 0 :
110 perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
111 if prop[1] != 0 :
112 perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
113 if prop[2] != 0 :
114 perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
115 return perc
116 else :
117 return prop
118
119
120 def frame_analysis(tmpdir,GFF):
121 '''
122 This function take footprint and annotation (gff) for analyse reading frame in each gene
123 '''
124 global total_mapped_read
125 try:
126 chrom = "" # initializing chromosome
127 nb_gene = 0 # number of analysed genes
128 whole_phasing = [0,0,0]
129 for gene in GFF['order']:
130 ## maybe no start position in GTF file so we must to check and replace
131 try : GFF[gene]['start']
132 except :
133 if GFF[gene]['strand'] == '+' :
134 GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
135 else :
136 GFF[gene]['start'] = GFF[gene]['exon'][1]['stop']
137 ## also for stop coordinates
138 try : GFF[gene]['stop']
139 except :
140 exon_number = GFF[gene]['exon_number']
141 if GFF[gene]['strand'] == '+' :
142 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
143
144 else :
145 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['start']
146 cov = []
147 ##first chromosome : we open corresponding file
148 if chrom == "" :
149 chrom = GFF[gene]['chrom']
150 with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
151 data = f.readlines()
152 ##if we change chrosomosome
153 elif chrom != GFF[gene]['chrom'] :
154 chrom = GFF[gene]['chrom']
155 with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
156 data = f.readlines()
157
158 ## if a gene without intron :
159 if GFF[gene]['exon_number'] == 1:
160
161 ## get coverage for each gene
162 if GFF[gene]['strand'] == "+":
163 for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
164 cov.append(int((data[i].rstrip()).split("\t")[0]))
165 else :
166 for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
167 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
168 cov.reverse()
169 else :
170 ## For each gene, get coverage and sum of exon size
171 if GFF[gene]['strand'] == "+":
172
173 for exon in range(1,GFF[gene]['exon_number']+1) :
174 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
175 if i <= GFF[gene]['stop'] :
176 cov.append(int((data[i].rstrip()).split("\t")[0]))
177 else :
178
179 for exon in range(1,GFF[gene]['exon_number']+1) :
180 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
181 if i <= GFF[gene]['start'] :
182 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
183 cov.reverse()
184
185 len_cov = len(cov)
186 prop = [0,0,0]
187 for nuc in range(0,len_cov-2,3) :
188 ## Calculate proportion
189 prop[0] += cov[nuc]
190 prop[1] += cov[nuc+1]
191 prop[2] += cov[nuc+2]
192 whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))
193
194 whole_phasing = __percent__(whole_phasing)
195 #sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing)
196 return whole_phasing
197
198 except Exception, e:
199 stop_err( 'Error during frame analysis : ' + str( e ) )
200
201
202
203 def make_report(html_file, dirout, kmer, results) :
204
205 array = sorted(kmer.items(), key=lambda x: x[0])
206 values = []
207 label = []
208 for x,y in array :
209 values.append(y)
210 label.append(x)
211 index = arange(len(label))
212 bar_width = 0.35
213 axis_font = {'size':'10'}
214 fig, ax = pl.subplots()
215 pl.bar(index ,values, color='LightsteelBlue')
216 pl.xlabel('kmer value', **axis_font)
217 pl.ylabel('Number of reads', **axis_font)
218 pl.title('Number of reads for each k-mer')
219 pl.xticks(index + bar_width, label, **axis_font)
220 #pl.show()
221 pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640)
222 pl.clf()
223
224 for key, phase in results.iteritems() :
225 fig = pl.figure(num=1)
226 frame = phase
227 index = arange(3)
228 bar_width = 0.5
229 pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue'])
230 pl.xlabel('Frame in gene', **axis_font)
231 pl.ylabel('Percent of read', **axis_font)
232 pl.title('Proportion of reads in each frame for '+str(key)+'-mer')
233 pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font)
234 pl.tight_layout()
235 pl.ylim(0,100)
236 pl.draw()
237 #pl.show()
238 pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300)
239 pl.clf()
240
241 kmer_summary = ''
242 kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0]))
243 for key, number in kmer_sorted.iteritems() :
244 if number > 100 :
245 kmer_summary += '<li>'
246 kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
247 kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'
248 kmer_summary += 'Phasing of '+str(key)+'-mer is : '+str(results[key])+'<br>'
249 kmer_summary += '<img border="0" src="'+str(key)+'_phasing.png" width="50%%" height="50%%"/><br>'
250 kmer_summary += '</li>'
251 else :
252 kmer_summary += '<li>'
253 kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
254 kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'
255 kmer_summary += '</li>'
256
257 html_str = """
258 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
259 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
260
261 <html xmlns="http://www.w3.org/1999/xhtml">
262 <head>
263 <link href="lightbox/css/lightbox.css" rel="stylesheet" />
264 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
265 <title>kmer analysis</title>
266 </head>
267 <body>
268 <h1>k-mer analysis results</h1>
269 <p>
270 <ul>
271 <li>
272 Number of reads for each k-mer in your sample : <br />
273 <img border="0" src="kmer_proportion.png" width="50%%" height="50%%"/>
274 </li>
275 %s
276 </ul>
277 </p>
278 </body>
279 </html>
280 """
281 all = html_str % kmer_summary
282
283 html = open(html_file,"w")
284 html.write(all)
285 html.close()
286
287 def __main__():
288
289 #Parse command line options
290 parser = optparse.OptionParser()
291 parser.add_option("-g", "--gff", dest="gfffile", type= "string",
292 help="GFF annotation file", metavar="FILE")
293
294 parser.add_option("-b", "--bam", dest="bamfile", type= "string",
295 help="Bam Ribo-Seq alignments ", metavar="FILE")
296
297 parser.add_option("-o", "--out", dest="html_file", type= "string",
298 help="write report to FILE", metavar="FILE")
299
300 parser.add_option("-d", "--dirout", dest="dirout", type= "string",
301 help="write report to PNG files", metavar="FILE")
302
303 parser.add_option("-q", "--quiet",
304 action="store_false", dest="verbose", default=True,
305 help="don't print status messages to stdout")
306
307 (options, args) = parser.parse_args()
308 sys.stdout.write("Begin kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )
309
310 try:
311
312 if os.path.exists(options.dirout):
313 raise
314 try:
315 os.mkdir(options.dirout)
316 except:
317 raise
318
319 ##testing indexed bam file
320 if os.path.isfile(options.bamfile+".bai") :
321 pass
322 else :
323 cmd = "samtools index %s " % (options.bamfile)
324 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
325 returncode = proc.wait()
326
327 tmpdir = tempfile.mkdtemp()
328 GFF = ribo_functions.store_gff(options.gfffile)
329 ## split bam
330 ribo_functions.split_bam(options.bamfile,tmpdir)
331 ###################################
332 ## First analysis with 28mer :
333 ###################################
334 ## compute coverage and distribution kmer
335 kmer = get_first_base(tmpdir, 28)
336 ## compute phasing
337 whole_phasing = frame_analysis(tmpdir,GFF)
338 ## save phasing
339 results = {}
340 results[28] = whole_phasing
341 ## compute analysis with other kmer
342 for keys in kmer.iterkeys() :
343 if keys != 28 :
344 ## remove all txt files in tmp directory
345 if os.system("rm "+tmpdir+"/*.txt") != 0 :
346 stop_err( 'Error during tmp directory cleaning : ' + str( e ) )
347
348 ## If not enought reads in this kmer :
349 if kmer[keys] > 100 :
350 ## compute coverage and distribution kmer
351 tmp = get_first_base(tmpdir, keys)
352 ## compute phasing
353 whole_phasing = frame_analysis(tmpdir,GFF)
354 results[keys] = whole_phasing
355 ## get report
356 make_report(options.html_file, options.dirout, kmer, results)
357 #=======================================================================
358 # ############
359 # # tests
360 # ############
361 # with open("kmer_dict",'rb') as km:
362 # kmer = cPickle.load(km)
363 # with open("results_dict",'rb') as res:
364 # results = cPickle.load(res)
365 # with open("kmer_dict",'wb') as km:
366 # cPickle.dump(kmer,km)
367 # with open("results_dict",'wb') as res:
368 # cPickle.dump(results,res)
369 # make_report(options.html_file, options.dirout, kmer, results)
370 #=======================================================================
371
372 sys.stdout.write("Finish kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )
373
374 except Exception, e:
375 stop_err( 'Error running kmer and phasing analysis : ' + str( e ) )
376
377
378 if __name__=="__main__":
379 __main__()