#!/usr/bin/env python2.7
#-*- coding: utf-8 -*-
'''
Created on Jun. 2014
@author: rachel legendre
@copyright: rachel.legendre@igmors.u-psud.fr
@license: GPL v3
'''

import os, sys, time, optparse, re, urllib, subprocess, tempfile, commands
import pysam
#from matplotlib import pyplot as pl
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as pl
from numpy import arange
from collections import OrderedDict
import ribo_functions
#import cPickle
## suppress matplotlib warnings
import warnings
warnings.filterwarnings('ignore')


total_mapped_read = 0


def stop_err( msg ):
    sys.stderr.write( "%s\n" % msg )
    sys.stderr.write( "Program aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) )
    sys.exit()


def split_bam(bamfile, tmpdir):
    '''
    Split the bam file by chromosome and write one sam file per chromosome in tmpdir.
    '''
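    ## The header returned by `samtools view -H` has one @SQ line per reference
    ## sequence; a typical (illustrative) line is "@SQ\tSN:chrI\tLN:230218".
    ## The loop below pulls the chromosome name out of the SN: field.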
    try:
        # get header
        results = subprocess.check_output(['samtools', 'view', '-H', bamfile])
        header = results.split('\n')

        # define genome size
        genome = []
        for line in header:
            result = re.search('SN', line)
            if result :
                #print line
                feat = line.split('\t')
                chrom = re.split(":", feat[1])
                #print feat[1]
                genome.append(chrom[1])

        # split sam by chromosome
        n = 0
        for chrm in genome:
            with open(tmpdir+'/'+chrm+'.sam', 'w') as f :
                # write the header correctly for each chromosome
                f.write(header[0]+'\n')
                expr = re.compile(chrm+'\t')
                el = [elem for elem in header if expr.search(elem)][0]
                f.write(el+'\n')
                f.write(header[-2]+'\n')
                # write all reads for each chromosome
                reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
                f.write(reads)
                # count the number of reads
                n += reads.count(chrm)

        sys.stdout.write("%d reads are present in your bam file\n" % n)

    except Exception, e:
        stop_err( 'Error during bam file splitting : ' + str( e ) )


def get_first_base(tmpdir, kmer):
    '''
    Write a footprint coverage file for each sam file in tmpdir and get the kmer distribution.
    '''
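    ## SAM fields used below (tab-separated; column numbers are 1-based):
    ##   column 2  = FLAG (0 -> forward strand, 16 -> reverse strand)
    ##   column 4  = POS  (leftmost mapping position, 1-based)
    ##   column 10 = SEQ  (read sequence; its length gives the k-mer size)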
    global total_mapped_read

    ## init kmer dict
    KMER = OrderedDict({})

    try:
        file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
        ## write coverage for each sam file in tmpdir
        for samfile in file_array:
            with open(tmpdir+'/'+samfile, 'r') as sam :
                ## get chromosome name
                chrom = samfile.split(".sam")[0]

                for line in sam:
                    # initialize dictionary
                    if '@SQ\tSN:' in line :
                        size = int(line.split('LN:')[1])
                        genomeF = [0]*size
                        genomeR = [0]*size
                    # define the multiple-read tag used by the mapper
                    elif '@PG\tID' in line :
                        if 'bowtie' in line:
                            multi_tag = "XS:i:"
                        elif 'bwa' in line:
                            multi_tag = "XT:A:R"
                        #elif 'TopHat' in line:
                        #    multi_tag = "NH:i:1"
                        else :
                            stop_err("No PG tag found in "+samfile+". Please use bowtie or bwa for mapping")

                    # get footprint
                    elif re.search('^[^@].+', line) :
                        len_read = len(line.split('\t')[9])
                        ## fill kmer dict
                        if KMER.has_key(len_read):
                            KMER[len_read] += 1
                        else :
                            KMER[len_read] = 1

                        #print line.rstrip()
                        read_pos = int(line.split('\t')[3])
                        read_sens = int(line.split('\t')[1])
                        #len_read = len(line.split('\t')[9])
                        if len_read == kmer and multi_tag not in line:
                            ###if line.split('\t')[5] == '28M' :
                            total_mapped_read += 1
                            # forward read
                            if read_sens == 0 :
                                # get 5' base
                                genomeF[read_pos] += 1
                            # reverse read
                            elif read_sens == 16 :
                                # get 5' base
                                read_pos += (len_read-1)
                                genomeR[read_pos] += 1

                #try:
                # write coverage in files
                with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov :
                    for i in range(0,size):
                        cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
                #except Exception,e:
                #    stop_err( 'Error during coverage file writing : ' + str( e ) )
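                ## Each line of assoCov_<chrom>.txt corresponds to one genomic
                ## position: the forward 5'-end count, a tab, then the reverse
                ## count written as a negative number (e.g. "3\t-1").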

        #sys.stdout.write("%d reads are in your bam file\n" % total_mapped_read)

        return KMER

    except Exception, e:
        stop_err( 'Error during footprinting : ' + str( e ) )


def __percent__(prop):

    if sum(prop) != 0 :
        perc = [0,0,0]
        if prop[0] != 0 :
            perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
        if prop[1] != 0 :
            perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
        if prop[2] != 0 :
            perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
        return perc
    else :
        return prop

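## Illustrative example: __percent__([600, 300, 100]) returns [60, 30, 10];
## an all-zero input is returned unchanged.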

def frame_analysis(tmpdir,GFF):
    '''
    Take footprints and annotation (GFF) and analyse the reading frame of each gene.
    '''
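    ## Assumed annotation structure (as produced by ribo_functions.store_gff /
    ## store_gtf): GFF['order'] lists gene names, and GFF[gene] holds 'chrom',
    ## 'strand', 'exon_number' and a 1-based 'exon' dict with 'start'/'stop'.
    ## Coverage along each coding sequence is binned into the three codon
    ## positions (frames) and summed over all genes.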
    global total_mapped_read
    try:
        chrom = "" # initializing chromosome
        nb_gene = 0 # number of analysed genes
        whole_phasing = [0,0,0]
        for gene in GFF['order']:
            ## the start position may be missing from the GTF file, so check it and set it if needed
            exon_number = GFF[gene]['exon_number']
            try :
                GFF[gene]['start']
            except :
                if GFF[gene]['strand'] == '+' :
                    GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
                else :
                    GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop']
            ## same for the stop coordinates
            try :
                GFF[gene]['stop']
            except :
                if GFF[gene]['strand'] == '+' :
                    GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
                else :
                    GFF[gene]['stop'] = GFF[gene]['exon'][1]['start']
            cov = []
            ## first chromosome : open the corresponding coverage file
            try:
                if chrom == "" :
                    chrom = GFF[gene]['chrom']
                    with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
                        data = f.readlines()
                ## if we change chromosome
                elif chrom != GFF[gene]['chrom'] :
                    chrom = GFF[gene]['chrom']
                    with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
                        data = f.readlines()
            except IOError :
                print tmpdir+"/assoCov_"+chrom+".txt doesn't exist"

            ## gene without intron :
            if GFF[gene]['exon_number'] == 1:

                ## get coverage for each gene
                if GFF[gene]['strand'] == "+":
                    for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
                        cov.append(int((data[i].rstrip()).split("\t")[0]))
                else :
                    for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
                        cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
                    cov.reverse()
            else :
                ## for each gene, get coverage over all exons
                if GFF[gene]['strand'] == "+":

                    for exon in range(1,GFF[gene]['exon_number']+1) :
                        for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
                            #if i <= GFF[gene]['stop'] :
                            cov.append(int((data[i].rstrip()).split("\t")[0]))
                else :

                    for exon in range(1,GFF[gene]['exon_number']+1) :
                        for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
                            #if i <= GFF[gene]['start'] :
                            cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
                    cov.reverse()

            len_cov = len(cov)
            prop = [0,0,0]
            for nuc in range(0,len_cov-2,3) :
                ## calculate the proportion in each frame
                prop[0] += cov[nuc]
                prop[1] += cov[nuc+1]
                prop[2] += cov[nuc+2]
            whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))

        whole_phasing = __percent__(whole_phasing)
        #sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing)
        return whole_phasing

    except Exception, e:
        stop_err( 'Error during frame analysis : ' + str( e ) )


def make_report(html_file, dirout, kmer, results) :

    array = sorted(kmer.items(), key=lambda x: x[0])
    values = []
    label = []
    for x,y in array :
        values.append(y)
        label.append(x)
    index = arange(len(label))
    bar_width = 0.35
    axis_font = {'size':'10'}
    fig, ax = pl.subplots()
    pl.bar(index, values, color='LightsteelBlue')
    pl.xlabel('kmer value', **axis_font)
    pl.ylabel('Number of reads', **axis_font)
    pl.title('Number of reads for each k-mer')
    pl.xticks(index + bar_width, label, **axis_font)
    #pl.show()
    fig.subplots_adjust()
    pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640)
    pl.clf()

    for key, phase in results.iteritems() :
        fig = pl.figure(num=1)
        frame = phase
        index = arange(3)
        bar_width = 0.5
        pl.bar(index, frame, color=['RoyalBlue','LightSkyBlue','LightBlue'])
        pl.xlabel('Frame in gene', **axis_font)
        pl.ylabel('Percent of reads', **axis_font)
        pl.title('Proportion of reads in each frame for '+str(key)+'-mer')
        pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font)
        #pl.tight_layout()
        pl.ylim(0,100)
        fig.subplots_adjust()
        pl.draw()
        pl.show()
        pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300)
        pl.clf()

    kmer_summary = ''
    kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0]))
    for key, number in kmer_sorted.iteritems() :
        if number > 100 :
            kmer_summary += '<li>'
            kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
            kmer_summary += 'You have '+str(number)+' '+str(key)+'-mers : <br>'
            kmer_summary += 'Phasing of '+str(key)+'-mer is : '+str(results[key])+'<br>'
            kmer_summary += '<img border="0" src="'+str(key)+'_phasing.png" width="50%" height="50%"/><br>'
            kmer_summary += '</li>'
        else :
            kmer_summary += '<li>'
            kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
            kmer_summary += 'You have '+str(number)+' '+str(key)+'-mers : <br>'
            kmer_summary += '</li>'

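    ## Note: html_str below is filled in with the % operator, so literal percent
    ## signs inside the template itself are doubled (%%); the substituted
    ## kmer_summary string is inserted verbatim and uses plain % signs.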
    html_str = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <link href="lightbox/css/lightbox.css" rel="stylesheet" />
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <title>kmer analysis</title>
</head>
<body>
    <h1>k-mer analysis results</h1>
    <p>
    <ul>
        <li>
        Number of reads for each k-mer in your sample : <br />
        <img border="0" src="kmer_proportion.png" width="50%%" height="50%%"/>
        </li>
        %s
    </ul>
    </p>
</body>
</html>
"""
    all = html_str % kmer_summary

    html = open(html_file,"w")
    html.write(all)
    html.close()

def __main__():

    # parse command line options
    parser = optparse.OptionParser()
    parser.add_option("-g", "--gff", dest="gff", type="string",
                      help="GFF annotation file", metavar="FILE")

    parser.add_option("-b", "--bam", dest="bamfile", type="string",
                      help="Bam Ribo-Seq alignments", metavar="FILE")

    parser.add_option("-o", "--out", dest="html_file", type="string",
                      help="write report to FILE", metavar="FILE")

    parser.add_option("-d", "--dirout", dest="dirout", type="string",
                      help="directory in which to write the report PNG files", metavar="DIR")

    parser.add_option("-q", "--quiet",
                      action="store_false", dest="verbose", default=True,
                      help="don't print status messages to stdout")

    (options, args) = parser.parse_args()
    sys.stdout.write("Begin kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )

    try:

        ## create the output directory if it does not already exist
        if not os.path.exists(options.dirout):
            try:
                os.mkdir(options.dirout)
            except:
                raise

        ## check for an indexed bam file
        if os.path.isfile(options.bamfile+".bai") :
            pass
        else :
            cmd = "samtools index %s " % (options.bamfile)
            proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
            returncode = proc.wait()

        tmpdir = tempfile.mkdtemp()
        ## identify GFF or GTF format from the 9th column
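        ## Typical 9th-column attributes (illustrative values):
        ##   GTF  : gene_id "YAL069W"; transcript_id "YAL069W";
        ##   GFF3 : ID=YAL069W;Name=YAL069W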
        with open(options.gff, "r") as gffile :
            for line in gffile :
                if '#' in line :
                    ## skip header
                    gffile.next()
                elif 'gene_id' in line :
                    ## launch gtf reader :
                    GFF = ribo_functions.store_gtf(options.gff)
                    break
                elif 'ID=' in line :
                    ## launch gff reader
                    GFF = ribo_functions.store_gff(options.gff)
                    break
                else :
                    stop_err( 'Please check that your annotation file is in GFF or GTF format' )
        #GFF = store_gff(options.gff)
        #GFF = ribo_functions.store_gtf(options.gff)
        ## check gff reading
        if not GFF['order'] :
            stop_err( 'Incorrect GFF file' )

        ## split bam
        split_bam(options.bamfile,tmpdir)
        ###################################
        ## First analysis with 28mer :
        ###################################
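        ## 28 nt is the typical length of a ribosome-protected footprint, hence
        ## the first pass (and the reference phasing) is computed on 28-mers.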
        ## compute coverage and kmer distribution
        kmer = get_first_base(tmpdir, 28)
        ## compute phasing
        whole_phasing = frame_analysis(tmpdir,GFF)
        ## save phasing
        results = {}
        results[28] = whole_phasing
        ## compute the analysis for the other kmers
        for keys in kmer.iterkeys() :
            if keys != 28 :
                ## remove all txt files in the tmp directory
                if os.system("rm "+tmpdir+"/*.txt") != 0 :
                    stop_err( 'Error during tmp directory cleaning' )

                ## only analyse kmers with enough reads :
                if kmer[keys] > 100 :
                    ## compute coverage and kmer distribution
                    tmp = get_first_base(tmpdir, keys)
                    ## compute phasing
                    whole_phasing = frame_analysis(tmpdir,GFF)

                    results[keys] = whole_phasing
        ## get report
        make_report(options.html_file, options.dirout, kmer, results)
        #=======================================================================
        # ############
        # # tests
        # ############
        # with open("kmer_dict",'rb') as km:
        #     kmer = cPickle.load(km)
        # with open("results_dict",'rb') as res:
        #     results = cPickle.load(res)
        # with open("kmer_dict",'wb') as km:
        #     cPickle.dump(kmer,km)
        # with open("results_dict",'wb') as res:
        #     cPickle.dump(results,res)
        # make_report(options.html_file, options.dirout, kmer, results)
        #=======================================================================

        sys.stdout.write("Finish kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )

    except Exception, e:
        stop_err( 'Error running kmer and phasing analysis : ' + str( e ) )


if __name__ == "__main__":
    __main__()