2
|
1 #!/usr/bin/env python2.7.3
|
|
2 #-*- coding: utf-8 -*-
|
|
3 '''
|
|
4 Created on Jun. 2014
|
|
5 @author: rachel legendre
|
|
6 @copyright: rachel.legendre@igmors.u-psud.fr
|
|
7 @license: GPL v3
|
|
8 '''
|
|
9
|
|
10 import os, sys, time, optparse, re, urllib, subprocess, tempfile, commands
|
|
11 import pysam
|
|
12 #from matplotlib import pyplot as pl
|
|
13 import matplotlib
|
|
14 matplotlib.use('Agg')
|
|
15 import matplotlib.pyplot as pl
|
|
16 from numpy import arange
|
10
|
17 from collections import OrderedDict
|
2
|
18 import ribo_functions
|
|
19 #import cPickle
|
10
|
20 ## suppress matplotlib warnings
|
|
21 import warnings
|
|
22 warnings.filterwarnings('ignore')
|
|
23
|
2
|
24
|
|
25 total_mapped_read = 0
|
|
26
|
|
27
|
|
def stop_err( msg ):
    '''
    Print an error message and a timestamp to stderr, then abort the program.

    msg -- description of the error that caused the abort
    '''
    sys.stderr.write( "%s\n" % msg )
    sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) )
    ## exit with a non-zero status so shells/pipelines can detect the failure
    ## (the original sys.exit() exited with status 0 even on error)
    sys.exit(1)
|
10
|
32
|
|
33
|
|
def split_bam(bamfile,tmpdir):
    '''
    Split a BAM file by chromosome and write one SAM file per chromosome in tmpdir.

    bamfile -- path to an indexed BAM file (samtools must be on the PATH)
    tmpdir  -- directory where the per-chromosome SAM files are written

    Aborts via stop_err() on any failure.
    '''
    try:
        # get header
        results = subprocess.check_output(['samtools', 'view', '-H', bamfile])
        header = results.split('\n')

        # collect chromosome names from the @SQ header lines (SN:<name>)
        genome = []
        for line in header:
            if re.search('SN', line):
                feat = line.split('\t')
                chrom = re.split(":", feat[1])
                genome.append(chrom[1])

        # split reads by chromosome
        n = 0
        for chrm in genome:
            with open(tmpdir+'/'+chrm+'.sam', 'w') as f:
                # write a minimal header for this chromosome:
                # the @HD line, the matching @SQ line and the @PG line
                f.write(header[0]+'\n')
                # re.escape so special characters in the name are taken literally
                expr = re.compile(re.escape(chrm)+'\t')
                el = [elem for elem in header if expr.search(elem)][0]
                f.write(el+'\n')
                f.write(header[-2]+'\n')
                # write all reads mapped on this chromosome
                reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
                f.write(reads)
                # count records by line: the chromosome name may occur more
                # than once per record (read name, mate field), so counting
                # substring occurrences over-counts
                n += reads.count('\n')

        sys.stdout.write("%d reads are presents in your bam file\n" % n)

    except Exception as e:
        stop_err( 'Error during bam file splitting : ' + str( e ) )
|
2
|
74
|
10
|
75
|
|
76
|
2
|
def get_first_base(tmpdir, kmer):
    '''
    Write a footprint (5' base) coverage file for each SAM file in tmpdir and
    return the k-mer (read length) distribution.

    tmpdir -- directory containing the per-chromosome SAM files
    kmer   -- read length retained for the coverage computation

    Returns an OrderedDict mapping read length -> number of reads seen.
    Side effects: writes tmpdir/assoCov_<chrom>.txt for every chromosome and
    increments the module-level counter total_mapped_read for every read of
    length `kmer` kept for the coverage.
    '''
    global total_mapped_read

    ## tags by default (overridden from the @PG header line)
    multi_tag = "XS:i:"
    tag = "IH:i:1"

    ## init kmer dict
    KMER = OrderedDict({})

    try:
        # os.listdir instead of parsing `ls` output (robust, py3-compatible);
        # keep only SAM files, sorted for a deterministic order
        file_array = sorted(f for f in os.listdir(tmpdir) if f.endswith('.sam'))
        ## write coverage for each sam file in tmpdir
        for samfile in file_array:
            with open(tmpdir+'/'+samfile, 'r') as sam:
                ## chromosome name comes from the file name
                chrom = samfile.split(".sam")[0]

                for line in sam:
                    # initialize forward/reverse coverage vectors from @SQ
                    if '@SQ\tSN:' in line:
                        size = int(line.split('LN:')[1])
                        genomeF = [0]*size
                        genomeR = [0]*size
                    # define multiple-read tags depending on the mapper
                    elif '@PG\tID' in line:
                        if 'bowtie' in line:
                            multi_tag = "XS:i:"
                        elif 'bwa' in line:
                            multi_tag = "XT:A:R"
                        elif 'TopHat' in line:
                            tag = "NH:i:1"
                        else:
                            stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")

                    # alignment record (does not start with '@'): get footprint
                    elif re.search('^[^@].+', line):
                        # split once and reuse the fields
                        fields = line.split('\t')
                        len_read = len(fields[9])
                        ## fill kmer dict
                        if len_read in KMER:
                            KMER[len_read] += 1
                        else:
                            KMER[len_read] = 1

                        read_pos = int(fields[3])
                        read_sens = int(fields[1])
                        # keep reads of the requested length that are uniquely
                        # mapped (have the unique tag, or lack the multi tag)
                        if len_read == kmer and (tag in line or multi_tag not in line):
                            total_mapped_read += 1
                            # forward read: 5' base is the leftmost position
                            if read_sens == 0:
                                genomeF[read_pos] += 1
                            # reverse read: 5' base is the rightmost position
                            elif read_sens == 16:
                                read_pos += (len_read-1)
                                genomeR[read_pos] += 1

            # write coverage for this chromosome: "forward\t-reverse" per base
            with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov:
                for i in range(0, size):
                    cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')

        return KMER

    except Exception as e:
        stop_err( 'Error during footprinting : ' + str( e ) )
|
|
155
|
|
156
|
|
157 def __percent__(prop):
|
|
158
|
|
159 if sum(prop) != 0 :
|
|
160 perc = [0,0,0]
|
|
161 if prop[0] != 0 :
|
|
162 perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
|
|
163 if prop[1] != 0 :
|
|
164 perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
|
|
165 if prop[2] != 0 :
|
|
166 perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
|
|
167 return perc
|
|
168 else :
|
|
169 return prop
|
|
170
|
|
171
|
|
def frame_analysis(tmpdir,GFF):
    '''
    Use the footprint coverage files and the annotation (GFF) to compute the
    proportion of reads in each of the three reading frames over all genes.

    tmpdir -- directory containing the assoCov_<chrom>.txt coverage files
    GFF    -- annotation dict (as produced by ribo_functions): keyed by gene
              name, plus an 'order' list giving the gene iteration order

    Returns a list of three percentages [frame0, frame1, frame2].
    '''
    try:
        chrom = ""  # chromosome whose coverage is currently loaded
        whole_phasing = [0,0,0]
        for gene in GFF['order']:
            exon_number = GFF[gene]['exon_number']
            ## a GTF file may lack an explicit start: derive it from the exons
            try:
                GFF[gene]['start']
            except KeyError:
                if GFF[gene]['strand'] == '+':
                    GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
                else:
                    GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop']
            ## same for the stop coordinate
            try:
                GFF[gene]['stop']
            except KeyError:
                if GFF[gene]['strand'] == '+':
                    GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
                else:
                    GFF[gene]['stop'] = GFF[gene]['exon'][1]['start']
            cov = []
            ## (re)load the coverage file whenever the chromosome changes
            ## (also covers the very first gene, since chrom starts empty)
            try:
                if chrom != GFF[gene]['chrom']:
                    chrom = GFF[gene]['chrom']
                    with open(tmpdir+"/assoCov_"+chrom+".txt") as f:
                        data = f.readlines()
            except IOError:
                print(tmpdir+"/assoCov_"+chrom+".txt doesn't exist")

            try:
                ## gene without intron
                if GFF[gene]['exon_number'] == 1:
                    ## get coverage between the exon bounds (inclusive)
                    if GFF[gene]['strand'] == "+":
                        for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
                            cov.append(int((data[i].rstrip()).split("\t")[0]))
                    else:
                        for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
                            cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
                        cov.reverse()
                else:
                    ## multi-exon gene: concatenate the coverage of every exon
                    if GFF[gene]['strand'] == "+":
                        for exon in range(1,GFF[gene]['exon_number']+1):
                            for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
                                cov.append(int((data[i].rstrip()).split("\t")[0]))
                    else:
                        for exon in range(1,GFF[gene]['exon_number']+1):
                            for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
                                cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
                        cov.reverse()
            except Exception:
                # gene falls outside the coverage vector (or is malformed):
                # skip it, as the original best-effort behaviour intended
                continue

            ## sum coverage per frame position (codon-wise walk)
            prop = [0,0,0]
            for nuc in range(0, len(cov)-2, 3):
                prop[0] += cov[nuc]
                prop[1] += cov[nuc+1]
                prop[2] += cov[nuc+2]
            # py2/py3-compatible pairwise sum (the original used the
            # python2-only `lambda (x, y)` tuple parameter)
            whole_phasing = [w + p for w, p in zip(whole_phasing, prop)]

        return __percent__(whole_phasing)

    except Exception as e:
        stop_err( 'Error during frame analysis : ' + str( e ) )
|
|
259
|
|
260
|
|
261
|
|
def make_report(html_file, dirout, kmer, results) :
    '''
    Plot the k-mer distribution and the per-kmer phasing, then write an HTML
    report referencing the generated PNG files.

    html_file -- path of the HTML report to write
    dirout    -- directory where the PNG files are written
    kmer      -- dict mapping read length -> number of reads
    results   -- dict mapping read length -> [frame0, frame1, frame2] percents
    '''
    ## k-mer distribution bar plot
    array = sorted(kmer.items(), key=lambda x: x[0])
    values = []
    label = []
    for x, y in array:
        values.append(y)
        label.append(x)
    index = arange(len(label))
    bar_width = 0.35
    axis_font = {'size':'10'}
    fig, ax = pl.subplots()
    pl.bar(index, values, color='LightsteelBlue')
    pl.xlabel('kmer value', **axis_font)
    pl.ylabel('Number of reads', **axis_font)
    pl.title('Number of reads for each k-mer')
    pl.xticks(index + bar_width, label, **axis_font)
    fig.subplots_adjust()
    pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640)
    pl.clf()

    ## one phasing bar plot per analysed k-mer
    for key, phase in results.items():
        fig = pl.figure(num=1)
        index = arange(3)
        bar_width = 0.5
        pl.bar(index, phase, color=['RoyalBlue','LightSkyBlue','LightBlue'])
        pl.xlabel('Frame in gene', **axis_font)
        pl.ylabel('Percent of read', **axis_font)
        pl.title('Proportion of reads in each frame for '+str(key)+'-mer')
        pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font)
        pl.ylim(0,100)
        fig.subplots_adjust()
        pl.draw()
        # note: the stray pl.show() was removed -- it is useless (and can
        # block with an interactive backend) before savefig, and Agg is
        # selected at import time anyway
        pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300)
        pl.clf()

    ## build the per-kmer HTML summary (plot only kmers with > 100 reads)
    kmer_summary = ''
    kmer_sorted = OrderedDict(sorted(kmer.items(), key=lambda x: x[0]))
    for key, number in kmer_sorted.items():
        if number > 100:
            kmer_summary += '<li>'
            kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
            kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'
            kmer_summary += 'Phasing of '+str(key)+'-mer is : '+str(results[key])+'<br>'
            kmer_summary += '<img border="0" src="'+str(key)+'_phasing.png" width="50%%" height="50%%"/><br>'
            kmer_summary += '</li>'
        else:
            kmer_summary += '<li>'
            kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
            kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'
            kmer_summary += '</li>'

    html_str = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<link href="lightbox/css/lightbox.css" rel="stylesheet" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>kmer analysis</title>
</head>
<body>
<h1>k-mer analysis results</h1>
<p>
<ul>
<li>
Number of reads for each k-mer in your sample : <br />
<img border="0" src="kmer_proportion.png" width="50%%" height="50%%"/>
</li>
%s
</ul>
</p>
</body>
</html>
"""
    # renamed from `all`, which shadowed the builtin of the same name
    page = html_str % kmer_summary

    # `with` guarantees the report file is closed even on a write error
    with open(html_file, "w") as html:
        html.write(page)
|
|
347
|
|
def __main__():
    '''
    Command-line entry point: parse options, split the BAM per chromosome,
    compute the k-mer distribution and the phasing for each abundant k-mer,
    and write the HTML/PNG report.
    '''
    # Parse command line options
    parser = optparse.OptionParser()
    parser.add_option("-g", "--gff", dest="gff", type= "string",
                  help="GFF annotation file", metavar="FILE")

    parser.add_option("-b", "--bam", dest="bamfile", type= "string",
                  help="Bam Ribo-Seq alignments ", metavar="FILE")

    parser.add_option("-o", "--out", dest="html_file", type= "string",
                  help="write report to FILE", metavar="FILE")

    parser.add_option("-d", "--dirout", dest="dirout", type= "string",
                  help="write report to PNG files", metavar="FILE")

    parser.add_option("-q", "--quiet",
                  action="store_false", dest="verbose", default=True,
                  help="don't print status messages to stdout")

    (options, args) = parser.parse_args()
    sys.stdout.write("Begin kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )

    try:
        ## create the output directory if needed
        if not os.path.exists(options.dirout):
            try:
                os.mkdir(options.dirout)
            except Exception as e:
                stop_err('Error running make directory : ' + str(e))

        ## index the bam file when no .bai index is present
        if not os.path.isfile(options.bamfile+".bai"):
            cmd = "samtools index %s " % (options.bamfile)
            proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
            returncode = proc.wait()

        tmpdir = tempfile.mkdtemp()
        ## identify GFF or GTF format from 9th column
        with open (options.gff,"r") as gffile :
            for line in gffile :
                if '#' in line :
                    ## skip header
                    next(gffile)
                elif 'gene_id' in line :
                    ## launch gtf reader :
                    GFF = ribo_functions.store_gtf(options.gff)
                    break
                elif 'ID=' in line :
                    ## launch gff reader
                    GFF = ribo_functions.store_gff(options.gff)
                    break
                else :
                    stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )

        ## check gff reading
        if not GFF['order'] :
            stop_err( 'Incorrect GFF file' )
        # iterate over a copy: removing from the list while iterating it
        # silently skips the element that follows each removal
        for gene in GFF['order'][:]:
            if not GFF[gene]['exon'] :
                del GFF[gene]
                GFF['order'].remove(gene)

        ## split bam
        split_bam(options.bamfile,tmpdir)

        ###################################
        ## First analysis with 28mer :
        ###################################
        ## compute coverage and distribution kmer
        kmer = get_first_base(tmpdir, 28)
        ## compute phasing
        whole_phasing = frame_analysis(tmpdir,GFF)
        ## save phasing
        results = {28: whole_phasing}

        ## compute analysis with the other kmers
        for keys in kmer:
            ## skip 28 (already done) and kmers without enough reads
            if keys != 28 and kmer[keys] > 100:
                ## remove all txt files in tmp directory
                if os.system("rm "+tmpdir+"/*.txt") != 0 :
                    stop_err( 'Error during tmp directory cleaning')
                ## compute coverage and distribution kmer (side effect:
                ## rewrites the assoCov files for this kmer)
                get_first_base(tmpdir, keys)
                ## compute phasing
                results[keys] = frame_analysis(tmpdir,GFF)

        ## get report
        make_report(options.html_file, options.dirout, kmer, results)

        sys.stdout.write("Finish kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )

    except Exception as e:
        stop_err( 'Error running kmer and phasing analysis : ' + str( e ) )
|
|
461
|
|
462
|
|
## run the analysis only when executed as a script (not on import)
if __name__ == "__main__":
    __main__()