2
|
1 #!/usr/bin/env python2.7.3
|
|
2 #-*- coding: utf-8 -*-
|
|
3 '''
|
|
4 Created on Jun. 2014
|
|
5 @author: rachel legendre
|
|
6 @copyright: rachel.legendre@igmors.u-psud.fr
|
|
7 @license: GPL v3
|
|
8 '''
|
|
9
|
|
10 import os, sys, time, optparse, re, urllib, subprocess, tempfile, commands
|
|
11 import pysam
|
|
12 #from matplotlib import pyplot as pl
|
|
13 import matplotlib
|
|
14 matplotlib.use('Agg')
|
|
15 import matplotlib.pyplot as pl
|
|
16 from numpy import arange
|
|
17 import ribo_functions
|
|
18 from collections import OrderedDict
|
|
19 #import cPickle
|
|
20
|
|
21 total_mapped_read = 0
|
|
22
|
|
23
|
|
24 def stop_err( msg ):
|
|
25 sys.stderr.write( "%s\n" % msg )
|
|
26 sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) )
|
|
27 sys.exit()
|
|
28
|
|
29
|
|
30
|
|
31 def get_first_base(tmpdir, kmer):
|
|
32 '''
|
|
33 write footprint coverage file for each sam file in tmpdir and get kmer distribution
|
|
34 '''
|
|
35 global total_mapped_read
|
|
36
|
|
37 ## init kmer dict
|
|
38 KMER = OrderedDict({})
|
|
39
|
|
40 try:
|
|
41 file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
|
|
42 ##write coverage for each sam file in tmpdir
|
|
43 for samfile in file_array:
|
|
44 with open(tmpdir+'/'+samfile, 'r') as sam :
|
|
45 ##get chromosome name
|
|
46 chrom = samfile.split(".sam")[0]
|
|
47
|
|
48 for line in sam:
|
|
49 #initialize dictionnary
|
|
50 if '@SQ' in line :
|
|
51 size = int(line.split('LN:')[1])
|
|
52 genomeF = [0]*size
|
|
53 genomeR = [0]*size
|
|
54 # define multiple reads keys from mapper
|
|
55 elif '@PG' in line :
|
|
56 if 'bowtie' in line:
|
|
57 multi_tag = "XS:i:"
|
|
58 elif 'bwa' in line:
|
|
59 multi_tag = "XT:A:R"
|
|
60 else :
|
|
61 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")
|
|
62
|
|
63 # get footprint
|
|
64 elif re.search('^[^@].+', line) :
|
|
65 len_read = len(line.split('\t')[9])
|
|
66 ##full kmer dict
|
|
67 if KMER.has_key(len_read):
|
|
68 KMER[len_read] += 1
|
|
69 else :
|
|
70 KMER[len_read] = 1
|
|
71
|
|
72 #print line.rstrip()
|
|
73 read_pos = int(line.split('\t')[3])
|
|
74 read_sens = int(line.split('\t')[1])
|
|
75 #len_read = len(line.split('\t')[9])
|
|
76 if len_read == kmer and multi_tag not in line:
|
|
77 ###if line.split('\t')[5] == '28M' :
|
|
78 total_mapped_read +=1
|
|
79 #if it's a forward read
|
|
80 if read_sens == 0 :
|
|
81 #get 5' base
|
|
82 genomeF[read_pos] += 1
|
|
83 #if it's a reverse read
|
|
84 elif read_sens == 16 :
|
|
85 #get 5' base
|
|
86 read_pos += (len_read-1)
|
|
87 genomeR[read_pos] += 1
|
|
88
|
|
89 #try:
|
|
90 #write coverage in files
|
|
91 with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov :
|
|
92 for i in range(0,size):
|
|
93 cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
|
|
94 #except Exception,e:
|
|
95 # stop_err( 'Error during coverage file writting : ' + str( e ) )
|
|
96
|
|
97 #sys.stdout.write("%d reads are in your bam file\n" % total_mapped_read)
|
|
98
|
|
99 return KMER
|
|
100
|
|
101 except Exception, e:
|
|
102 stop_err( 'Error during footprinting : ' + str( e ) )
|
|
103
|
|
104
|
|
105 def __percent__(prop):
|
|
106
|
|
107 if sum(prop) != 0 :
|
|
108 perc = [0,0,0]
|
|
109 if prop[0] != 0 :
|
|
110 perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
|
|
111 if prop[1] != 0 :
|
|
112 perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
|
|
113 if prop[2] != 0 :
|
|
114 perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
|
|
115 return perc
|
|
116 else :
|
|
117 return prop
|
|
118
|
|
119
|
|
120 def frame_analysis(tmpdir,GFF):
|
|
121 '''
|
|
122 This function take footprint and annotation (gff) for analyse reading frame in each gene
|
|
123 '''
|
|
124 global total_mapped_read
|
|
125 try:
|
|
126 chrom = "" # initializing chromosome
|
|
127 nb_gene = 0 # number of analysed genes
|
|
128 whole_phasing = [0,0,0]
|
|
129 for gene in GFF['order']:
|
|
130 ## maybe no start position in GTF file so we must to check and replace
|
|
131 try : GFF[gene]['start']
|
|
132 except :
|
|
133 if GFF[gene]['strand'] == '+' :
|
|
134 GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
|
|
135 else :
|
|
136 GFF[gene]['start'] = GFF[gene]['exon'][1]['stop']
|
|
137 ## also for stop coordinates
|
|
138 try : GFF[gene]['stop']
|
|
139 except :
|
|
140 exon_number = GFF[gene]['exon_number']
|
|
141 if GFF[gene]['strand'] == '+' :
|
|
142 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
|
|
143
|
|
144 else :
|
|
145 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['start']
|
|
146 cov = []
|
|
147 ##first chromosome : we open corresponding file
|
|
148 if chrom == "" :
|
|
149 chrom = GFF[gene]['chrom']
|
|
150 with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
|
|
151 data = f.readlines()
|
|
152 ##if we change chrosomosome
|
|
153 elif chrom != GFF[gene]['chrom'] :
|
|
154 chrom = GFF[gene]['chrom']
|
|
155 with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
|
|
156 data = f.readlines()
|
|
157
|
|
158 ## if a gene without intron :
|
|
159 if GFF[gene]['exon_number'] == 1:
|
|
160
|
|
161 ## get coverage for each gene
|
|
162 if GFF[gene]['strand'] == "+":
|
|
163 for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
|
|
164 cov.append(int((data[i].rstrip()).split("\t")[0]))
|
|
165 else :
|
|
166 for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
|
|
167 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
|
|
168 cov.reverse()
|
|
169 else :
|
|
170 ## For each gene, get coverage and sum of exon size
|
|
171 if GFF[gene]['strand'] == "+":
|
|
172
|
|
173 for exon in range(1,GFF[gene]['exon_number']+1) :
|
|
174 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
|
|
175 if i <= GFF[gene]['stop'] :
|
|
176 cov.append(int((data[i].rstrip()).split("\t")[0]))
|
|
177 else :
|
|
178
|
|
179 for exon in range(1,GFF[gene]['exon_number']+1) :
|
|
180 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
|
|
181 if i <= GFF[gene]['start'] :
|
|
182 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
|
|
183 cov.reverse()
|
|
184
|
|
185 len_cov = len(cov)
|
|
186 prop = [0,0,0]
|
|
187 for nuc in range(0,len_cov-2,3) :
|
|
188 ## Calculate proportion
|
|
189 prop[0] += cov[nuc]
|
|
190 prop[1] += cov[nuc+1]
|
|
191 prop[2] += cov[nuc+2]
|
|
192 whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))
|
|
193
|
|
194 whole_phasing = __percent__(whole_phasing)
|
|
195 #sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing)
|
|
196 return whole_phasing
|
|
197
|
|
198 except Exception, e:
|
|
199 stop_err( 'Error during frame analysis : ' + str( e ) )
|
|
200
|
|
201
|
|
202
|
|
203 def make_report(html_file, dirout, kmer, results) :
|
|
204
|
|
205 array = sorted(kmer.items(), key=lambda x: x[0])
|
|
206 values = []
|
|
207 label = []
|
|
208 for x,y in array :
|
|
209 values.append(y)
|
|
210 label.append(x)
|
|
211 index = arange(len(label))
|
|
212 bar_width = 0.35
|
|
213 axis_font = {'size':'10'}
|
|
214 fig, ax = pl.subplots()
|
|
215 pl.bar(index ,values, color='LightsteelBlue')
|
|
216 pl.xlabel('kmer value', **axis_font)
|
|
217 pl.ylabel('Number of reads', **axis_font)
|
|
218 pl.title('Number of reads for each k-mer')
|
|
219 pl.xticks(index + bar_width, label, **axis_font)
|
|
220 #pl.show()
|
|
221 pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640)
|
|
222 pl.clf()
|
|
223
|
|
224 for key, phase in results.iteritems() :
|
|
225 fig = pl.figure(num=1)
|
|
226 frame = phase
|
|
227 index = arange(3)
|
|
228 bar_width = 0.5
|
|
229 pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue'])
|
|
230 pl.xlabel('Frame in gene', **axis_font)
|
|
231 pl.ylabel('Percent of read', **axis_font)
|
|
232 pl.title('Proportion of reads in each frame for '+str(key)+'-mer')
|
|
233 pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font)
|
|
234 pl.tight_layout()
|
|
235 pl.ylim(0,100)
|
|
236 pl.draw()
|
|
237 #pl.show()
|
|
238 pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300)
|
|
239 pl.clf()
|
|
240
|
|
241 kmer_summary = ''
|
|
242 kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0]))
|
|
243 for key, number in kmer_sorted.iteritems() :
|
|
244 if number > 100 :
|
|
245 kmer_summary += '<li>'
|
|
246 kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
|
|
247 kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'
|
|
248 kmer_summary += 'Phasing of '+str(key)+'-mer is : '+str(results[key])+'<br>'
|
|
249 kmer_summary += '<img border="0" src="'+str(key)+'_phasing.png" width="50%%" height="50%%"/><br>'
|
|
250 kmer_summary += '</li>'
|
|
251 else :
|
|
252 kmer_summary += '<li>'
|
|
253 kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
|
|
254 kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'
|
|
255 kmer_summary += '</li>'
|
|
256
|
|
257 html_str = """
|
|
258 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
259 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
260
|
|
261 <html xmlns="http://www.w3.org/1999/xhtml">
|
|
262 <head>
|
|
263 <link href="lightbox/css/lightbox.css" rel="stylesheet" />
|
|
264 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
|
265 <title>kmer analysis</title>
|
|
266 </head>
|
|
267 <body>
|
|
268 <h1>k-mer analysis results</h1>
|
|
269 <p>
|
|
270 <ul>
|
|
271 <li>
|
|
272 Number of reads for each k-mer in your sample : <br />
|
|
273 <img border="0" src="kmer_proportion.png" width="50%%" height="50%%"/>
|
|
274 </li>
|
|
275 %s
|
|
276 </ul>
|
|
277 </p>
|
|
278 </body>
|
|
279 </html>
|
|
280 """
|
|
281 all = html_str % kmer_summary
|
|
282
|
|
283 html = open(html_file,"w")
|
|
284 html.write(all)
|
|
285 html.close()
|
|
286
|
|
287 def __main__():
|
|
288
|
|
289 #Parse command line options
|
|
290 parser = optparse.OptionParser()
|
|
291 parser.add_option("-g", "--gff", dest="gfffile", type= "string",
|
|
292 help="GFF annotation file", metavar="FILE")
|
|
293
|
|
294 parser.add_option("-b", "--bam", dest="bamfile", type= "string",
|
|
295 help="Bam Ribo-Seq alignments ", metavar="FILE")
|
|
296
|
|
297 parser.add_option("-o", "--out", dest="html_file", type= "string",
|
|
298 help="write report to FILE", metavar="FILE")
|
|
299
|
|
300 parser.add_option("-d", "--dirout", dest="dirout", type= "string",
|
|
301 help="write report to PNG files", metavar="FILE")
|
|
302
|
|
303 parser.add_option("-q", "--quiet",
|
|
304 action="store_false", dest="verbose", default=True,
|
|
305 help="don't print status messages to stdout")
|
|
306
|
|
307 (options, args) = parser.parse_args()
|
|
308 sys.stdout.write("Begin kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )
|
|
309
|
|
310 try:
|
|
311
|
|
312 if os.path.exists(options.dirout):
|
|
313 raise
|
|
314 try:
|
|
315 os.mkdir(options.dirout)
|
|
316 except:
|
|
317 raise
|
|
318
|
|
319 ##testing indexed bam file
|
|
320 if os.path.isfile(options.bamfile+".bai") :
|
|
321 pass
|
|
322 else :
|
|
323 cmd = "samtools index %s " % (options.bamfile)
|
|
324 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
|
|
325 returncode = proc.wait()
|
|
326
|
|
327 tmpdir = tempfile.mkdtemp()
|
|
328 GFF = ribo_functions.store_gff(options.gfffile)
|
|
329 ## split bam
|
|
330 ribo_functions.split_bam(options.bamfile,tmpdir)
|
|
331 ###################################
|
|
332 ## First analysis with 28mer :
|
|
333 ###################################
|
|
334 ## compute coverage and distribution kmer
|
|
335 kmer = get_first_base(tmpdir, 28)
|
|
336 ## compute phasing
|
|
337 whole_phasing = frame_analysis(tmpdir,GFF)
|
|
338 ## save phasing
|
|
339 results = {}
|
|
340 results[28] = whole_phasing
|
|
341 ## compute analysis with other kmer
|
|
342 for keys in kmer.iterkeys() :
|
|
343 if keys != 28 :
|
|
344 ## remove all txt files in tmp directory
|
|
345 if os.system("rm "+tmpdir+"/*.txt") != 0 :
|
|
346 stop_err( 'Error during tmp directory cleaning : ' + str( e ) )
|
|
347
|
|
348 ## If not enought reads in this kmer :
|
|
349 if kmer[keys] > 100 :
|
|
350 ## compute coverage and distribution kmer
|
|
351 tmp = get_first_base(tmpdir, keys)
|
|
352 ## compute phasing
|
|
353 whole_phasing = frame_analysis(tmpdir,GFF)
|
|
354 results[keys] = whole_phasing
|
|
355 ## get report
|
|
356 make_report(options.html_file, options.dirout, kmer, results)
|
|
357 #=======================================================================
|
|
358 # ############
|
|
359 # # tests
|
|
360 # ############
|
|
361 # with open("kmer_dict",'rb') as km:
|
|
362 # kmer = cPickle.load(km)
|
|
363 # with open("results_dict",'rb') as res:
|
|
364 # results = cPickle.load(res)
|
|
365 # with open("kmer_dict",'wb') as km:
|
|
366 # cPickle.dump(kmer,km)
|
|
367 # with open("results_dict",'wb') as res:
|
|
368 # cPickle.dump(results,res)
|
|
369 # make_report(options.html_file, options.dirout, kmer, results)
|
|
370 #=======================================================================
|
|
371
|
|
372 sys.stdout.write("Finish kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )
|
|
373
|
|
374 except Exception, e:
|
|
375 stop_err( 'Error running kmer and phasing analysis : ' + str( e ) )
|
|
376
|
|
377
|
|
378 if __name__=="__main__":
|
|
379 __main__() |