Mercurial > repos > rlegendre > ribo_tools
comparison kmer_analysis.py @ 2:da126b91f9ea
Uploaded
author | rlegendre |
---|---|
date | Mon, 20 Oct 2014 11:06:53 -0400 |
parents | |
children | 707807fee542 |
comparison
equal
deleted
inserted
replaced
1:1fbddace2db6 | 2:da126b91f9ea |
---|---|
1 #!/usr/bin/env python2.7.3 | |
2 #-*- coding: utf-8 -*- | |
3 ''' | |
4 Created on Jun. 2014 | |
5 @author: rachel legendre | |
6 @copyright: rachel.legendre@igmors.u-psud.fr | |
7 @license: GPL v3 | |
8 ''' | |
9 | |
10 import os, sys, time, optparse, re, urllib, subprocess, tempfile, commands | |
11 import pysam | |
12 #from matplotlib import pyplot as pl | |
13 import matplotlib | |
14 matplotlib.use('Agg') | |
15 import matplotlib.pyplot as pl | |
16 from numpy import arange | |
17 import ribo_functions | |
18 from collections import OrderedDict | |
19 #import cPickle | |
20 | |
21 total_mapped_read = 0 | |
22 | |
23 | |
24 def stop_err( msg ): | |
25 sys.stderr.write( "%s\n" % msg ) | |
26 sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) ) | |
27 sys.exit() | |
28 | |
29 | |
30 | |
31 def get_first_base(tmpdir, kmer): | |
32 ''' | |
33 write footprint coverage file for each sam file in tmpdir and get kmer distribution | |
34 ''' | |
35 global total_mapped_read | |
36 | |
37 ## init kmer dict | |
38 KMER = OrderedDict({}) | |
39 | |
40 try: | |
41 file_array = (commands.getoutput('ls '+tmpdir)).split('\n') | |
42 ##write coverage for each sam file in tmpdir | |
43 for samfile in file_array: | |
44 with open(tmpdir+'/'+samfile, 'r') as sam : | |
45 ##get chromosome name | |
46 chrom = samfile.split(".sam")[0] | |
47 | |
48 for line in sam: | |
49 #initialize dictionnary | |
50 if '@SQ' in line : | |
51 size = int(line.split('LN:')[1]) | |
52 genomeF = [0]*size | |
53 genomeR = [0]*size | |
54 # define multiple reads keys from mapper | |
55 elif '@PG' in line : | |
56 if 'bowtie' in line: | |
57 multi_tag = "XS:i:" | |
58 elif 'bwa' in line: | |
59 multi_tag = "XT:A:R" | |
60 else : | |
61 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") | |
62 | |
63 # get footprint | |
64 elif re.search('^[^@].+', line) : | |
65 len_read = len(line.split('\t')[9]) | |
66 ##full kmer dict | |
67 if KMER.has_key(len_read): | |
68 KMER[len_read] += 1 | |
69 else : | |
70 KMER[len_read] = 1 | |
71 | |
72 #print line.rstrip() | |
73 read_pos = int(line.split('\t')[3]) | |
74 read_sens = int(line.split('\t')[1]) | |
75 #len_read = len(line.split('\t')[9]) | |
76 if len_read == kmer and multi_tag not in line: | |
77 ###if line.split('\t')[5] == '28M' : | |
78 total_mapped_read +=1 | |
79 #if it's a forward read | |
80 if read_sens == 0 : | |
81 #get 5' base | |
82 genomeF[read_pos] += 1 | |
83 #if it's a reverse read | |
84 elif read_sens == 16 : | |
85 #get 5' base | |
86 read_pos += (len_read-1) | |
87 genomeR[read_pos] += 1 | |
88 | |
89 #try: | |
90 #write coverage in files | |
91 with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov : | |
92 for i in range(0,size): | |
93 cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n') | |
94 #except Exception,e: | |
95 # stop_err( 'Error during coverage file writting : ' + str( e ) ) | |
96 | |
97 #sys.stdout.write("%d reads are in your bam file\n" % total_mapped_read) | |
98 | |
99 return KMER | |
100 | |
101 except Exception, e: | |
102 stop_err( 'Error during footprinting : ' + str( e ) ) | |
103 | |
104 | |
105 def __percent__(prop): | |
106 | |
107 if sum(prop) != 0 : | |
108 perc = [0,0,0] | |
109 if prop[0] != 0 : | |
110 perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop))) | |
111 if prop[1] != 0 : | |
112 perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop))) | |
113 if prop[2] != 0 : | |
114 perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop))) | |
115 return perc | |
116 else : | |
117 return prop | |
118 | |
119 | |
120 def frame_analysis(tmpdir,GFF): | |
121 ''' | |
122 This function take footprint and annotation (gff) for analyse reading frame in each gene | |
123 ''' | |
124 global total_mapped_read | |
125 try: | |
126 chrom = "" # initializing chromosome | |
127 nb_gene = 0 # number of analysed genes | |
128 whole_phasing = [0,0,0] | |
129 for gene in GFF['order']: | |
130 ## maybe no start position in GTF file so we must to check and replace | |
131 try : GFF[gene]['start'] | |
132 except : | |
133 if GFF[gene]['strand'] == '+' : | |
134 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] | |
135 else : | |
136 GFF[gene]['start'] = GFF[gene]['exon'][1]['stop'] | |
137 ## also for stop coordinates | |
138 try : GFF[gene]['stop'] | |
139 except : | |
140 exon_number = GFF[gene]['exon_number'] | |
141 if GFF[gene]['strand'] == '+' : | |
142 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] | |
143 | |
144 else : | |
145 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['start'] | |
146 cov = [] | |
147 ##first chromosome : we open corresponding file | |
148 if chrom == "" : | |
149 chrom = GFF[gene]['chrom'] | |
150 with open(tmpdir+"/assoCov_"+chrom+".txt") as f : | |
151 data = f.readlines() | |
152 ##if we change chrosomosome | |
153 elif chrom != GFF[gene]['chrom'] : | |
154 chrom = GFF[gene]['chrom'] | |
155 with open(tmpdir+"/assoCov_"+chrom+".txt") as f : | |
156 data = f.readlines() | |
157 | |
158 ## if a gene without intron : | |
159 if GFF[gene]['exon_number'] == 1: | |
160 | |
161 ## get coverage for each gene | |
162 if GFF[gene]['strand'] == "+": | |
163 for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): | |
164 cov.append(int((data[i].rstrip()).split("\t")[0])) | |
165 else : | |
166 for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): | |
167 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) | |
168 cov.reverse() | |
169 else : | |
170 ## For each gene, get coverage and sum of exon size | |
171 if GFF[gene]['strand'] == "+": | |
172 | |
173 for exon in range(1,GFF[gene]['exon_number']+1) : | |
174 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): | |
175 if i <= GFF[gene]['stop'] : | |
176 cov.append(int((data[i].rstrip()).split("\t")[0])) | |
177 else : | |
178 | |
179 for exon in range(1,GFF[gene]['exon_number']+1) : | |
180 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): | |
181 if i <= GFF[gene]['start'] : | |
182 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) | |
183 cov.reverse() | |
184 | |
185 len_cov = len(cov) | |
186 prop = [0,0,0] | |
187 for nuc in range(0,len_cov-2,3) : | |
188 ## Calculate proportion | |
189 prop[0] += cov[nuc] | |
190 prop[1] += cov[nuc+1] | |
191 prop[2] += cov[nuc+2] | |
192 whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop))) | |
193 | |
194 whole_phasing = __percent__(whole_phasing) | |
195 #sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing) | |
196 return whole_phasing | |
197 | |
198 except Exception, e: | |
199 stop_err( 'Error during frame analysis : ' + str( e ) ) | |
200 | |
201 | |
202 | |
203 def make_report(html_file, dirout, kmer, results) : | |
204 | |
205 array = sorted(kmer.items(), key=lambda x: x[0]) | |
206 values = [] | |
207 label = [] | |
208 for x,y in array : | |
209 values.append(y) | |
210 label.append(x) | |
211 index = arange(len(label)) | |
212 bar_width = 0.35 | |
213 axis_font = {'size':'10'} | |
214 fig, ax = pl.subplots() | |
215 pl.bar(index ,values, color='LightsteelBlue') | |
216 pl.xlabel('kmer value', **axis_font) | |
217 pl.ylabel('Number of reads', **axis_font) | |
218 pl.title('Number of reads for each k-mer') | |
219 pl.xticks(index + bar_width, label, **axis_font) | |
220 #pl.show() | |
221 pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640) | |
222 pl.clf() | |
223 | |
224 for key, phase in results.iteritems() : | |
225 fig = pl.figure(num=1) | |
226 frame = phase | |
227 index = arange(3) | |
228 bar_width = 0.5 | |
229 pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue']) | |
230 pl.xlabel('Frame in gene', **axis_font) | |
231 pl.ylabel('Percent of read', **axis_font) | |
232 pl.title('Proportion of reads in each frame for '+str(key)+'-mer') | |
233 pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font) | |
234 pl.tight_layout() | |
235 pl.ylim(0,100) | |
236 pl.draw() | |
237 #pl.show() | |
238 pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300) | |
239 pl.clf() | |
240 | |
241 kmer_summary = '' | |
242 kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0])) | |
243 for key, number in kmer_sorted.iteritems() : | |
244 if number > 100 : | |
245 kmer_summary += '<li>' | |
246 kmer_summary += 'Analysis of '+str(key)+'-mer : <br>' | |
247 kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>' | |
248 kmer_summary += 'Phasing of '+str(key)+'-mer is : '+str(results[key])+'<br>' | |
249 kmer_summary += '<img border="0" src="'+str(key)+'_phasing.png" width="50%%" height="50%%"/><br>' | |
250 kmer_summary += '</li>' | |
251 else : | |
252 kmer_summary += '<li>' | |
253 kmer_summary += 'Analysis of '+str(key)+'-mer : <br>' | |
254 kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>' | |
255 kmer_summary += '</li>' | |
256 | |
257 html_str = """ | |
258 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" | |
259 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> | |
260 | |
261 <html xmlns="http://www.w3.org/1999/xhtml"> | |
262 <head> | |
263 <link href="lightbox/css/lightbox.css" rel="stylesheet" /> | |
264 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
265 <title>kmer analysis</title> | |
266 </head> | |
267 <body> | |
268 <h1>k-mer analysis results</h1> | |
269 <p> | |
270 <ul> | |
271 <li> | |
272 Number of reads for each k-mer in your sample : <br /> | |
273 <img border="0" src="kmer_proportion.png" width="50%%" height="50%%"/> | |
274 </li> | |
275 %s | |
276 </ul> | |
277 </p> | |
278 </body> | |
279 </html> | |
280 """ | |
281 all = html_str % kmer_summary | |
282 | |
283 html = open(html_file,"w") | |
284 html.write(all) | |
285 html.close() | |
286 | |
287 def __main__(): | |
288 | |
289 #Parse command line options | |
290 parser = optparse.OptionParser() | |
291 parser.add_option("-g", "--gff", dest="gfffile", type= "string", | |
292 help="GFF annotation file", metavar="FILE") | |
293 | |
294 parser.add_option("-b", "--bam", dest="bamfile", type= "string", | |
295 help="Bam Ribo-Seq alignments ", metavar="FILE") | |
296 | |
297 parser.add_option("-o", "--out", dest="html_file", type= "string", | |
298 help="write report to FILE", metavar="FILE") | |
299 | |
300 parser.add_option("-d", "--dirout", dest="dirout", type= "string", | |
301 help="write report to PNG files", metavar="FILE") | |
302 | |
303 parser.add_option("-q", "--quiet", | |
304 action="store_false", dest="verbose", default=True, | |
305 help="don't print status messages to stdout") | |
306 | |
307 (options, args) = parser.parse_args() | |
308 sys.stdout.write("Begin kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) ) | |
309 | |
310 try: | |
311 | |
312 if os.path.exists(options.dirout): | |
313 raise | |
314 try: | |
315 os.mkdir(options.dirout) | |
316 except: | |
317 raise | |
318 | |
319 ##testing indexed bam file | |
320 if os.path.isfile(options.bamfile+".bai") : | |
321 pass | |
322 else : | |
323 cmd = "samtools index %s " % (options.bamfile) | |
324 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) | |
325 returncode = proc.wait() | |
326 | |
327 tmpdir = tempfile.mkdtemp() | |
328 GFF = ribo_functions.store_gff(options.gfffile) | |
329 ## split bam | |
330 ribo_functions.split_bam(options.bamfile,tmpdir) | |
331 ################################### | |
332 ## First analysis with 28mer : | |
333 ################################### | |
334 ## compute coverage and distribution kmer | |
335 kmer = get_first_base(tmpdir, 28) | |
336 ## compute phasing | |
337 whole_phasing = frame_analysis(tmpdir,GFF) | |
338 ## save phasing | |
339 results = {} | |
340 results[28] = whole_phasing | |
341 ## compute analysis with other kmer | |
342 for keys in kmer.iterkeys() : | |
343 if keys != 28 : | |
344 ## remove all txt files in tmp directory | |
345 if os.system("rm "+tmpdir+"/*.txt") != 0 : | |
346 stop_err( 'Error during tmp directory cleaning : ' + str( e ) ) | |
347 | |
348 ## If not enought reads in this kmer : | |
349 if kmer[keys] > 100 : | |
350 ## compute coverage and distribution kmer | |
351 tmp = get_first_base(tmpdir, keys) | |
352 ## compute phasing | |
353 whole_phasing = frame_analysis(tmpdir,GFF) | |
354 results[keys] = whole_phasing | |
355 ## get report | |
356 make_report(options.html_file, options.dirout, kmer, results) | |
357 #======================================================================= | |
358 # ############ | |
359 # # tests | |
360 # ############ | |
361 # with open("kmer_dict",'rb') as km: | |
362 # kmer = cPickle.load(km) | |
363 # with open("results_dict",'rb') as res: | |
364 # results = cPickle.load(res) | |
365 # with open("kmer_dict",'wb') as km: | |
366 # cPickle.dump(kmer,km) | |
367 # with open("results_dict",'wb') as res: | |
368 # cPickle.dump(results,res) | |
369 # make_report(options.html_file, options.dirout, kmer, results) | |
370 #======================================================================= | |
371 | |
372 sys.stdout.write("Finish kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) ) | |
373 | |
374 except Exception, e: | |
375 stop_err( 'Error running kmer and phasing analysis : ' + str( e ) ) | |
376 | |
377 | |
378 if __name__=="__main__": | |
379 __main__() |