comparison metagene_readthrough.py @ 6:29c9c86e17e1

Uploaded
author rlegendre
date Mon, 20 Oct 2014 11:07:40 -0400
parents
children 707807fee542
comparison
equal deleted inserted replaced
5:0b8fa8b7f356 6:29c9c86e17e1
1 #!/usr/bin/env python2.7.3
2 #-*- coding: utf-8 -*-
3
4 '''
5 Created on Dec. 2013
6 @author: rachel legendre
7 @copyright: rachel.legendre@igmors.u-psud.fr
8 @license: GPL v3
9 '''
10
11 import os, sys, time, optparse, shutil, re, urllib, subprocess, tempfile
12 from urllib import unquote
13 from Bio import SeqIO
14 import csv
15 import pysam
16 import HTSeq
17 #from matplotlib import pyplot as pl
18 import matplotlib
19 matplotlib.use('Agg')
20 import matplotlib.pyplot as pl
21 from numpy import arange
22 from matplotlib import ticker as t
23 from PIL import Image
24
25 def stop_err( msg ):
26 sys.stderr.write( "%s\n" % msg )
27 sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
28 sys.exit()
29
30
31 def store_gff(gff):
32 '''
33 parse and store gff file in a dictionnary
34 '''
35 try:
36 GFF = {}
37 with open(gff, 'r') as f_gff :
38 GFF['order'] = []
39 for line in f_gff:
40 ## switch commented lines
41 line = line.split("#")[0]
42 if line != "" :
43 feature = (line.split('\t')[8]).split(';')
44 # first line is already gene line :
45 if line.split('\t')[2] == 'gene' :
46 gene = feature[0].replace("ID=","")
47 if re.search('gene',feature[2]) :
48 Name = feature[2].replace("gene=","")
49 else :
50 Name = "Unknown"
51 ##get annotation
52 note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
53 note = unquote(str(note)).replace("\n","")
54 ## store gene information
55 GFF['order'].append(gene)
56 GFF[gene] = {}
57 GFF[gene]['chrom'] = line.split('\t')[0]
58 GFF[gene]['start'] = int(line.split('\t')[3])
59 GFF[gene]['stop'] = int(line.split('\t')[4])
60 GFF[gene]['strand'] = line.split('\t')[6]
61 GFF[gene]['name'] = Name
62 GFF[gene]['note'] = note
63 GFF[gene]['exon'] = {}
64 GFF[gene]['exon_number'] = 0
65 #print Name
66 elif line.split('\t')[2] == 'CDS' :
67 gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0])
68 if GFF.has_key(gene) :
69 GFF[gene]['exon_number'] += 1
70 exon_number = GFF[gene]['exon_number']
71 GFF[gene]['exon'][exon_number] = {}
72 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
73 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
74 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
75
76 ## if there is a five prim UTR intron, we change start of gene
77 elif line.split('\t')[2] == 'five_prime_UTR_intron' :
78 if GFF[gene]['strand'] == "+" :
79 GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
80 else :
81 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
82 return GFF
83 except Exception,e:
84 stop_err( 'Error during gff storage : ' + str( e ) )
85
86
87
88
89 def compute_rpkm(length,count_gene,count_tot) :
90
91 try :
92 rpkm = "{0:.4f}".format(count_gene*1000000.0/(count_tot*length))
93 except ArithmeticError :
94 stop_err( 'Illegal division by zero')
95 return float(rpkm)
96
97
98 def find_stop(seq) :
99 '''
100 Find stop codon in a sequence and return position of first nucleotide in stop
101 '''
102 stop_codon = ['TAA','TAG','TGA']
103 for nt in range(0,len(seq)-3,3) :
104 codon = seq[nt:nt+3]
105 if codon in stop_codon :
106 return nt
107
108 return -1
109
110
111 def check_met(seq):
112 '''
113 Boolean function for testing presence or absence of methionine in 5 codons following stop codon
114 '''
115 met = 'ATG'
116 for pos in range(0,15,3) :
117 codon = seq[pos:pos+3]
118 if codon in met :
119 return True
120
121 return False
122
123
124 '''
125 feature.iv is a GenomicInterval object :
126 A GenomicInterval object has the following slots, some of which
127 are calculated from the other:
128
129 chrom: The name of a sequence (i.e., chromosome, contig, or
130 the like).
131 start: The start of the interval. Even on the reverse strand,
132 this is always the smaller of the two values 'start' and 'end'.
133 Note that all positions should be given as 0-based value!
134 end: The end of the interval. Following Python convention for
135 ranges, this in one more than the coordinate of the last base
136 that is considered part of the sequence.
137 strand: The strand, as a single character, '+' or '-'. '.' indicates
138 that the strand is irrelavant. (Alternatively, pass a Strand object.)
139 length: The length of the interval, i.e., end - start
140 start_d: The "directional start" position. This is the position of the
141 first base of the interval, taking the strand into account. Hence,
142 this is the same as 'start' except when strand == '-', in which
143 case it is end-1.
144 end_d: The "directional end": Usually, the same as 'end', but for
145 strand=='-1', it is start-1.
146
147 '''
148 def check_overlapping(gff_reader,chrm,start,stop,strand):
149
150 #### probleme avec les genes completement inclu...
151
152 iv2 = HTSeq.GenomicInterval(chrm,start,stop,strand)
153 for feature in gff_reader:
154 if feature.type == "gene" :
155 if feature.iv.overlaps(iv2) :
156 ## if its a reverse gene, we replace start of extension by start of previous gene
157 if strand == '-' :
158 return (feature.iv.end+3,stop)
159 ## else we replace stop of extension by start of following gene
160 else :
161 return (start,feature.iv.start-3)
162 ## if no overlap are find, we return -1
163 return (start,stop)
164
165
166 def pass_length(start,stop) :
167
168 if (stop-start) > 25 :
169 return True
170 else :
171 return False
172
173
174 def check_homo_coverage(gene,GFF,start,stop, aln) :
175
176 chrom = GFF[gene]['chrom']
177 ## compute nunber of nucleotides in CDS with a coverage equal to zero
178 nt_cds_cov = 0
179 nt_cds_num = 0
180 for i in range(1,GFF[gene]['exon_number']+1) :
181 for z in range(GFF[gene]['exon'][i]['start'],GFF[gene]['exon'][i]['stop']):
182 nt_cds_num += 1
183 if aln.count(chrom,z,z+1) == 0 :
184 nt_cds_cov += 1
185
186 ## compute percent of CDS no covering
187 percent = nt_cds_cov*100/nt_cds_num
188
189 ## compute number of nucleotides with no coverage in extension
190 nt_no_cover = 0
191 for pos in range(start,stop,1) :
192 if aln.count(chrom,pos,pos+1) == 0 :
193 nt_no_cover += 1
194 #print gene, nt_cds_cov, percent, nt_no_cover
195 #percent10 = (stop-start)*50/100
196 if (nt_no_cover*100)/(stop-start) > percent :
197 return False
198 else :
199 return True
200
201 def plot_gene ( aln, gene, start_extension, stop_extension, dirout ) :
202
203 try:
204 strand = gene['strand']
205 len_gene = gene['stop']-gene['start']
206 if strand is "-" :
207 len_ext = gene['stop']-start_extension
208 ## coverage in all gene + extension
209 start = start_extension-100
210 stop = gene['stop']+100
211 vector1 = [0]*(stop-start)
212 #for pileupcolumn in aln.pileup( gene['chrom'], start, stop):
213 # vector.append(pileupcolumn.n)
214 for read in aln.fetch(gene['chrom'], start, stop):
215 if read.is_reverse :
216 ## for get footprint in P-site (estimate) we take 3 nt in middle of read
217 #pos = (read.pos+13)-start
218 pos = (read.pos-start) + (read.rlen/2)-1
219 for z in range(pos,(pos+3)) :
220 ## le fetch contient des reads qui chevauchent les 30 nt de la fin du gene mais dont le site P
221 ## se trouve avant notre vector, le z devient negatif et la couverture augmente en fin de vecteur (ce qui est faux)
222 if z > 0 :
223 try :
224 vector1[z] += 1
225 except IndexError :
226 pass
227 vector1.reverse()
228 mean_read = float(sum(vector1))/float(len(vector1))
229 cov = [(x/mean_read) for x in vector1]
230 idx_tot = arange(len(cov))
231 ## coverage in extension
232 start_ext = start_extension-40
233 stop_ext = stop_extension+30
234 vector2 = [0]*(stop_ext-start_ext)
235 #for pileupcolumn in aln.pileup( gene['chrom'], start, stop):
236 # vector.append(pileupcolumn.n)
237 for read in aln.fetch(gene['chrom'], start_extension, stop_ext):
238 if read.is_reverse :
239 ## get footprint in P-site
240 #pos = (read.pos+13)-start_ext
241 pos = (read.pos-start_ext) + (read.rlen/2)-1
242 for z in range(pos,(pos+3)) :
243 if z > 0 :
244 try :
245 vector2[z] += 1
246 except IndexError :
247 pass
248 vector2.reverse()
249 #mean_read = float(sum(vector))/float(len(vector))
250 cov_ext = [(x/mean_read) for x in vector2]
251 _max = max(cov_ext[30::])
252 idx_ext = arange(len(cov_ext))
253
254 else :
255 len_ext = stop_extension-gene['start']
256 start = gene['start']-100
257 stop = stop_extension+100
258 vector = [0]*(stop-start)
259 #for pileupcolumn in aln.pileup( gene['chrom'], start, stop):
260 #vector.append(pileupcolumn.n)
261 for read in aln.fetch(gene['chrom'], start, stop):
262 if not read.is_reverse :
263 ## get footprint in P-site
264 #pos = (read.pos+12)-start
265 pos = (read.pos-start) + (read.rlen/2)-1
266 for z in range(pos,(pos+3)) :
267 if z > 0 :
268 try :
269 vector[z] += 1
270 except IndexError :
271 pass
272 mean_read = float(sum(vector))/float(len(vector))
273 cov = [(x/mean_read) for x in vector]
274 idx_tot = arange(len(cov))
275
276 ## coverage in extension
277 start_ext = gene['stop']-30
278 stop_ext = stop_extension+40
279 vector = [0]*(stop-start_ext)
280 for read in aln.fetch(gene['chrom'], start_ext, stop_extension):
281 if not read.is_reverse :
282 ## get footprint in P-site
283 pos = (read.pos-start_ext) + (read.rlen/2)-1
284 for z in range(pos,(pos+3)) :
285 if z > 0 :
286 try :
287 vector[z] += 1
288 except IndexError :
289 pass
290
291 cov_ext = [(x/mean_read) for x in vector]
292 idx_ext = arange(len(cov_ext))
293 _max = max(cov_ext[30::])
294
295 #### PLOT FIGURE ####
296
297 font = {'family' : 'serif','color': 'grey','weight' : 'normal','size' : 16 }
298
299
300 fig = pl.figure(num=1)
301 ## create a big subplot for common y axis on two last subplot
302 ax = fig.add_subplot(2,1,2)
303 ## hide all spines
304 ax.spines['top'].set_visible(False)
305 ax.spines['bottom'].set_visible(False)
306 ax.spines['left'].set_visible(False)
307 ax.spines['right'].set_visible(False)
308 ax.tick_params(labelcolor='w', top='off', bottom='off', left='off', right='off')
309
310 ## plot gene structure
311 ax1 = fig.add_subplot(3,1,1)
312 ax1.set_title(gene['name'])
313 ## hide all spines
314 ax1.spines['right'].set_color('none')
315 ax1.spines['top'].set_color('none')
316 ax1.spines['left'].set_color('none')
317 ax1.spines['bottom'].set_color('none')
318 ax1.set_ylim(0,5)
319 #set xlim as second plot with 100nt switch
320 ax1.set_xlim(-100,len(idx_tot)-100)
321 ## hide ticks
322 pl.yticks([])
323 pl.xticks([])
324 ## if it's a "simple" gene
325 if gene['exon_number'] == 1 :
326 exon = arange(len_gene)
327 x = [3]*len_gene
328 ax1.plot(exon,x,color='#9B9E9C')
329 ax1.fill_between(exon,2,x,color='#9B9E9C')
330 ## if it's a gene with introns
331 else :
332 ## plot a line representing intron
333 intron = arange(len_gene)
334 y = [2.5]*len_gene
335 ax1.plot(intron,y,color='#9B9E9C')
336
337 ## plotting each intron
338 start = gene['start']
339 if strand == '+' :
340 for exon in range(1,gene['exon_number']+1,1) :
341 len_exon = gene['exon'][exon]['stop']-gene['exon'][exon]['start']
342 idx = gene['exon'][exon]['start'] - start
343 exo = arange(idx,idx+len_exon)
344 x = [3]*len_exon
345 ax1.plot(exo,x,color='#9B9E9C')
346 ax1.fill_between(exo,2,x,color='#9B9E9C')
347 else :
348 ## if it's a reverse gene we must reverse exon's position
349 start = gene['start']
350 tabF = [2.5]*len_gene
351 tabR = [2.5]*len_gene
352 for exon in range(1,gene['exon_number']+1,1) :
353 len_exon = gene['exon'][exon]['stop']-gene['exon'][exon]['start']
354 idx = gene['exon'][exon]['start'] - start
355 exo = arange(idx,idx+len_exon)
356 for z in exo :
357 tabF[z] = 3
358 tabR[z] = 2
359 tabF.reverse()
360 tabR.reverse()
361 #pl.ylim(0,5)
362 ax1.plot(tabF,color='#9B9E9C')
363 ax1.plot(tabR,color='#9B9E9C')
364 x = arange(len(tabR))
365 ax1.fill_between(x,tabF,tabR,color='#9B9E9C')
366
367 ## insert arrows (as genome browser representation)
368 yloc = 2.5
369 narrows = 20
370 exonwidth = .8
371 spread = .4 * len_gene / narrows
372 for i in range(narrows):
373 loc = (float(i) * len_gene / narrows)+ (len(idx_tot)/100)*2
374 if strand == '+' :
375 x = [loc - spread, loc, loc - spread]
376 else:
377 x = [loc - spread, loc, loc - spread]
378 y = [yloc - exonwidth / 5, yloc, yloc + exonwidth / 5]
379 ax1.plot(x, y, lw=1.4, color='#676B69')
380
381
382 # plot coverage in all gene + extension
383 ax2 = fig.add_subplot(3,1,2)
384 ## fixe limit to length of coverage for x axis
385 ax2.set_xlim(0,len(idx_tot))
386 ## fixe 4 ticks and associated labels for y axis
387 ax2.set_yticklabels(arange(0,int(max(cov)+20),int((max(cov)+20)/4)))
388 ax2.set_yticks(arange(0,int(max(cov)+20),int((max(cov)+20)/4)))
389 ### add start and stop of gene in axe in place of number
390 ax2.set_xticks([100,(100+len_gene),(100+len_ext)])
391 ax2.set_xticklabels(["{:,}".format(gene['start']),"{:,}".format(gene['stop']),""])
392 ## hide spines and any axis
393 ax2.spines['right'].set_visible(False)
394 ax2.spines['top'].set_visible(False)
395 ax2.yaxis.set_ticks_position('left')
396 ax2.xaxis.set_ticks_position('bottom')
397 ## plot and fill
398 ax2.plot(idx_tot, cov, color="#CC0011")
399 ax2.fill_between(idx_tot, 0,cov,color="#CC0011")
400 ax2.set_title("Genomic coordinates (%s,%s)"% (gene['strand'],gene['chrom']) ,fontdict={'fontsize':'small'})
401
402 # plot zoom coverage in extension
403 ax3 = fig.add_subplot(3,1,3)
404 ## fixe limit to length of coverage for x axis
405 ax3.set_ylim(0,int(_max))
406 # we hide spines
407 ax3.spines['right'].set_visible(False)
408 ax3.spines['top'].set_visible(False)
409 ax3.yaxis.set_ticks_position('left')
410 ax3.xaxis.set_ticks_position('bottom')
411 ## add stop and in-frame stop in x axis
412 pl.xticks([30,( stop_extension-start_extension)+30],["stop codon","next in-frame stop"])
413 ## fixe good position for labels
414 (ax3.get_xticklabels()[0]).set_horizontalalignment('center')
415 (ax3.get_xticklabels()[1]).set_horizontalalignment('left')
416 #if max coverage is lower than 2, we have a illegal division by zero
417 if _max > 2 :
418 ax3.set_yticklabels(arange(0,int(_max+1),int((_max+1)/3)))
419 ax3.set_yticks(arange(0,int(_max+1),int((_max+1)/3)))
420 else :
421 ##
422 ax3.set_ylim(0,_max)
423 ax3.ticklabel_format(style='sci', scilimits=(-2,2), axis='y',useOffset=False)
424
425 ax3.plot(idx_ext, cov_ext, color="#CC0011")
426 ax3.fill_between(idx_ext, 0,cov_ext,color="#CC0011")
427
428
429 ## get scale of subplot 3
430 #ax3.text(ax3.get_xlim()[1]-50,ax3.get_ylim()[1]-1, r'50 nt', fontdict=font)
431 #pl.arrow( ax3.get_xlim()[1]-50, ax3.get_ylim()[1]-2, ax3.get_xlim()[1]-50, 0, fc="grey", ec="grey",lw=2)
432
433 ## set common y label
434 ax.set_ylabel('Normalized footprint counts',labelpad=20)
435
436 ## draw and save plot
437 pl.draw()
438 #pl.show()
439 pl.savefig(dirout+".png",format='png', dpi=300)
440 pl.clf()
441
442
443 ## Make thumbnail for html page
444 infile = dirout+'.png'
445 size = 128, 128
446 im = Image.open(infile)
447 im.thumbnail(size, Image.ANTIALIAS)
448 im.save(dirout+"_thumbnail.png", "PNG")
449
450
451 except Exception, e:
452 stop_err( 'Error during gene plotting : ' + str( e ) )
453
454
455 def compute_analysis(bam, GFF, fasta, gff, dirout) :
456
457 try:
458 background_file = dirout+"/background_sequence.fasta"
459 file_back = open(background_file, 'w')
460 file_context = open(dirout+"/stop_context.fasta", 'w')
461 file_extension = open(dirout+"/extensions.fasta", 'w')
462 ## Opening Bam file with pysam librarie
463 pysam.index(bam)
464 aln = pysam.Samfile(bam, "rb",header=True, check_header = True)
465 ## Opening fasta file in a dict with BioPython
466 SeqDict = SeqIO.to_dict(SeqIO.parse(open(fasta,"r"),"fasta"))
467
468 ## count total read in bam file
469 cmd = "samtools view -c %s " % (bam)
470 proc = subprocess.Popen( args=cmd, shell=True, stdout = subprocess.PIPE)
471 count_tot = int(proc.stdout.read().rstrip())
472 returncode = proc.wait()
473
474 ## opening a GFF reader for check overlapping
475 gff_reader = HTSeq.GFF_Reader(gff)
476
477 with open(dirout+"/readthrough_result.csv","w") as out :
478 myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence']
479 writer = csv.writer(out,delimiter='\t')
480 writer.writerow(myheader)
481 for gene in GFF['order'] :
482 indic = ""
483 # compute rpkm of CDS :
484 len_cds = GFF[gene]['stop']-GFF[gene]['start']
485 count_cds = 0
486 ### count method of pysam doesn't strand information
487 if GFF[gene]['strand'] == '+' :
488 for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) :
489 if not track.is_reverse :
490 count_cds += 1
491 else :
492 for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) :
493 if track.is_reverse :
494 count_cds += 1
495 rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot)
496 ## Only if gene is translate :
497 if rpkm_cds > 0 and count_cds > 128:
498 ## search footprint in UTR3
499 count = 0
500 try :
501 if GFF[gene]['strand'] == '+' :
502 contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6])
503 #print gene, contexte_stop
504 start_extension = GFF[gene]['stop']
505 stop_extension = GFF[gene]['stop']+90
506 for track in aln.fetch(GFF[gene]['chrom'],start_extension+15,stop_extension) :
507 if not track.is_reverse :
508 count += 1
509
510 ## get sequence of extension
511 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension])
512
513 else :
514 contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement())
515 #print gene, contexte_stop
516 start_extension = GFF[gene]['start']-90
517 stop_extension = GFF[gene]['start']
518 for track in aln.fetch(GFF[gene]['chrom'],start_extension,stop_extension-15) :
519 if track.is_reverse :
520 count += 1
521 ## get sequence of extension
522 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement())
523
524 ## if we have coverage after cds stop codon
525 if count > 10 :
526 res = find_stop(seq)
527 if res == -1 :
528 '''
529 Write results with no stop but RPF in extension
530 '''
531 ## check if next in-frame codon is far than 90 nt extension :
532 if GFF[gene]['strand'] == '+' :
533 pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+300,'+')
534 start_extension = pos[0]-1
535 stop_extension = pos[1]
536 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension])
537
538 #print gene,count,pos,'\n',seq
539
540 if (seq):
541 res = find_stop(seq)
542 if res == -1 :
543 mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq]
544 writer.writerow(mylist)
545 else :
546 indic = 'ok'
547 #print res
548 #stop_extension = start_extension + res +3
549 else :
550 pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-300,GFF[gene]['start']-1,'-')
551 start_extension = pos[0]
552 stop_extension = pos[1]+1
553 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement())
554
555 #print gene,count,pos,'\n',seq
556
557 if (seq):
558 res = find_stop(seq)
559 if res == -1 :
560 mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq]
561 writer.writerow(mylist)
562 else :
563 indic = 'ok'
564 #print res
565 #start_extension = stop_extension - res -3
566 else :
567 indic = 'ok'
568
569
570 if indic == 'ok' :
571 ## We save new coordinates
572 if GFF[gene]['strand'] == '+' :
573 stop_extension = start_extension + res +3
574 #print gene, count
575 #print gene,start_extension,stop_extension
576 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension])
577 #print seq
578 count_stop = aln.count(GFF[gene]['chrom'],stop_extension-2,stop_extension+2)
579 if pass_length(start_extension,stop_extension) :
580 count_ext = aln.count(GFF[gene]['chrom'],start_extension+9,stop_extension-15)
581 if stop_extension > GFF[gene]['stop']+9 :
582 stop_ok = 1
583 else :
584 stop_ok = 0
585 else :
586 start_extension = stop_extension - res - 3
587 #print gene, count
588 #print gene,start_extension,stop_extension
589 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension-1:stop_extension-1].reverse_complement())
590 #print seq
591 count_stop = aln.count(GFF[gene]['chrom'],start_extension-2,start_extension+2)
592 if pass_length(start_extension,stop_extension) :
593 count_ext = aln.count(GFF[gene]['chrom'],start_extension+15,stop_extension-9)
594 if start_extension < GFF[gene]['start']-9 :
595 stop_ok = 1
596 else :
597 stop_ok = 0
598 ## if we are no methionine in 5 codons following stop of CDS
599 if (not check_met(seq) ):
600 ## if we have footprint in stop codon extension and stop extension is further than cds_stop+9
601 if count_stop > 2 and stop_ok == 1 :
602 homo_cov = check_homo_coverage(gene,GFF,start_extension,stop_extension,aln)
603 if (homo_cov) :
604 '''
605 write result witch corresponding to readthrough
606 '''
607 ##if length of extension upper than 25 we can compute rpkm
608 if (not pass_length(start_extension,stop_extension)) :
609 len_ext = stop_extension-start_extension
610 rpkm_ext = 'nan'
611 ratio = 'nan'
612 else :
613 len_ext = stop_extension-start_extension
614 rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot)
615 ## compute ratio between extension coverage and cds coverage (rpkm)
616 ratio = rpkm_ext/rpkm_cds
617 #print gene,ratio
618 #print start_extension,stop_extension
619 mylist = [gene,GFF[gene]['name'],'-',contexte_stop,GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq]
620 writer.writerow(mylist)
621 file_context.write('>'+gene+'\n'+contexte_stop+'\n')
622 file_extension.write('>'+gene+'\n'+seq+'\n')
623 else :
624 '''
625 write result witch corresponding to readthrough but with no homogeneous coverage
626 '''
627 if (not pass_length(start_extension,stop_extension)) :
628 len_ext = stop_extension-start_extension
629 rpkm_ext = 'nan'
630 ratio = 'nan'
631 else :
632 len_ext = stop_extension-start_extension
633 rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot)
634 ## compute ratio between extension coverage and cds coverage (rpkm)
635 ratio = rpkm_ext/rpkm_cds
636 mylist = [gene,GFF[gene]['name'],'hetero cov',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq]
637 writer.writerow(mylist)
638 file_context.write('>'+gene+'\n'+contexte_stop+'\n')
639 file_extension.write('>'+gene+'\n'+seq+'\n')
640 #print ">"+gene+"\n"+contexte_stop
641
642 ## plot gene :
643 plot_gene(aln, GFF[gene], start_extension, stop_extension, dirout+"/"+gene)
644
645
646
647 else :
648 '''
649 write result with no footprint in stop codon of extension
650 '''
651 mylist = [gene,GFF[gene]['name'],'no RPF in stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq]
652 writer.writerow(mylist)
653 file_context.write('>'+gene+'\n'+contexte_stop+'\n')
654 file_extension.write('>'+gene+'\n'+seq+'\n')
655 #print ">"+gene+"\n"+contexte_stop
656 else :
657 '''
658 write result with RPF maybe result of reinitiation on a start codon
659 '''
660 if pass_length(start_extension,stop_extension) :
661 mylist = [gene,GFF[gene]['name'],'Met after stop', contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq]
662 writer.writerow(mylist)
663 file_context.write('>'+gene+'\n'+contexte_stop+'\n')
664 file_extension.write('>'+gene+'\n'+seq+'\n')
665 #print ">"+gene+"\n"+contexte_stop
666 else :
667 ## if its not a interesting case, we get stop context of genes without readthrough
668 if GFF[gene]['strand'] == '+' :
669 contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6])
670 file_back.write(contexte_stop+'\n')
671 else :
672 contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement())
673 file_back.write(contexte_stop+'\n')
674
675 ## excluded UT with incorrect positions
676 except ValueError:
677 pass
678
679
680 file_context.close()
681 file_back.close()
682 file_extension.close()
683 except Exception,e:
684 stop_err( 'Error during computing analysis : ' + str( e ) )
685
686
687 def cleaning_bam(bam):
688 '''
689 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12
690 '''
691 try :
692 header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE)
693 #header = results.split('\n')
694
695 # check mapper for define multiple tag
696 if re.search('bowtie', header):
697 multi_tag = "XS:i:"
698 elif re.search('bwa', header):
699 multi_tag = "XT:A:M"
700 else :
701 stop_err("No PG tag find in"+bam+". Please use bowtie or bwa for mapping")
702
703 tmp_sam = tempfile.mktemp()
704 cmd = "samtools view %s > %s" % (bam, tmp_sam)
705 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
706 returncode = proc.wait()
707
708
709 with open(tempfile.mktemp(),'w') as out :
710 out.write(header)
711 with open(tmp_sam,'r') as sam :
712 for line in sam :
713 if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 :
714 if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 :
715 out.write(line)
716 bamfile = tempfile.mktemp()+'.bam'
717 cmd = "samtools view -hSb %s > %s" % (out.name,bamfile)
718 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
719 returncode = proc.wait()
720
721 return bamfile
722
723 except Exception,e:
724 stop_err( 'Error during cleaning bam : ' + str( e ) )
725
726 def write_html_page(html,subfolder) :
727
728
729 try :
730
731 gene_table = ''
732 gene_table += '<table>'
733 gene_table += '<thead><tr><th data-sort="string">Gene</th><th>Plot</th><th data-sort="string">Name</th><th>Stop context</th><th>Coordinates</th><th>RPKM CDS</th><th>RPKM extension</th><th data-sort="float">ratio</th><th>Extension</th></tr></thead><tbody>'
734
735 with open(os.path.join(subfolder,'readthrough_result.csv'), 'rb') as csvfile:
736 spamreader = csv.reader(csvfile, delimiter='\t')
737 ## skip the header
738 next(spamreader, None)
739 for row in spamreader:
740 if row[2] == '-' :
741 gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s:%s-%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(row[0], row[0], row[0], row[0], row[11], row[1], row[3], row[4], row[5], row[6], row[8], row[9], row[10], row[12])
742
743 gene_table += '</tbody></table>'
744
745
746
747 html_str = """
748 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
749 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
750
751 <html xmlns="http://www.w3.org/1999/xhtml">
752 <head>
753 <link rel="stylesheet" href="http://code.jquery.com/ui/1.10.3/themes/smoothness/jquery-ui.css" />
754 <script src="http://code.jquery.com/jquery-1.10.2.min.js"></script>
755 <script src="http://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script>
756 <script src="http://code.jquery.com/ui/1.10.3/jquery-ui.js"></script>
757 <script src="lightbox/js/jquery-1.10.2.min.js"></script>
758 <script src="lightbox/js/lightbox-2.6.min.js"></script>
759 <link href="lightbox/css/lightbox.css" rel="stylesheet" />
760 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
761 <title>Dual coding result file</title>
762 <link rel="stylesheet" href="http://code.jquery.com/ui/1.10.3/themes/smoothness/jquery-ui.css" />
763 <script>
764 (function(d){d.fn.stupidtable=function(b){return this.each(function(){var a=d(this);b=b||{};b=d.extend({},d.fn.stupidtable.default_sort_fns,b);var n=function(a,b){for(var f=[],c=0,e=a.slice(0).sort(b),h=0;h<a.length;h++){for(c=d.inArray(a[h],e);-1!=d.inArray(c,f);)c++;f.push(c)}return f},q=function(a,b){for(var d=a.slice(0),c=0,e=0;e<b.length;e++)c=b[e],d[c]=a[e];return d};a.on("click","th",function(){var m=a.children("tbody").children("tr"),g=d(this),f=0,c=d.fn.stupidtable.dir;a.find("th").slice(0, g.index()).each(function(){var a=d(this).attr("colspan")||1;f+=parseInt(a,10)});var e=g.data("sort-default")||c.ASC;g.data("sort-dir")&&(e=g.data("sort-dir")===c.ASC?c.DESC:c.ASC);var h=g.data("sort")||null;null!==h&&(a.trigger("beforetablesort",{column:f,direction:e}),a.css("display"),setTimeout(function(){var l=[],p=b[h];m.each(function(a,b){var c=d(b).children().eq(f),e=c.data("sort-value"),c="undefined"!==typeof e?e:c.text();l.push(c)});var k;k=e==c.ASC?n(l,p):n(l,function(a,b){return-p(a,b)}); a.find("th").data("sort-dir",null).removeClass("sorting-desc sorting-asc");g.data("sort-dir",e).addClass("sorting-"+e);k=d(q(m,k));a.children("tbody").remove();a.append("<tbody />").append(k);a.trigger("aftertablesort",{column:f,direction:e});a.css("display")},10))})})};d.fn.stupidtable.dir={ASC:"asc",DESC:"desc"};d.fn.stupidtable.default_sort_fns={"int":function(b,a){return parseInt(b,10)-parseInt(a,10)},"float":function(b,a){return parseFloat(b)-parseFloat(a)},string:function(b,a){return b<a?-1: b>a?1:0},"string-ins":function(b,a){b=b.toLowerCase();a=a.toLowerCase();return b<a?-1:b>a?1:0}}})(jQuery);
765 (function($) {
766
767 $.fn.stupidtable = function(sortFns) {
768 return this.each(function() {
769 var $table = $(this);
770 sortFns = sortFns || {};
771
772 // ==================================================== //
773 // Utility functions //
774 // ==================================================== //
775
776 // Merge sort functions with some default sort functions.
777 sortFns = $.extend({}, $.fn.stupidtable.default_sort_fns, sortFns);
778
779 // Return the resulting indexes of a sort so we can apply
780 // this result elsewhere. This returns an array of index numbers.
781 // return[0] = x means "arr's 0th element is now at x"
782 var sort_map = function(arr, sort_function) {
783 var map = [];
784 var index = 0;
785 var sorted = arr.slice(0).sort(sort_function);
786 for (var i=0; i<arr.length; i++) {
787 index = $.inArray(arr[i], sorted);
788
789 // If this index is already in the map, look for the next index.
790 // This handles the case of duplicate entries.
791 while ($.inArray(index, map) != -1) {
792 index++;
793 }
794 map.push(index);
795 }
796
797 return map;
798 };
799
800 // Apply a sort map to the array.
801 var apply_sort_map = function(arr, map) {
802 var clone = arr.slice(0),
803 newIndex = 0;
804 for (var i=0; i<map.length; i++) {
805 newIndex = map[i];
806 clone[newIndex] = arr[i];
807 }
808 return clone;
809 };
810
811 // ==================================================== //
812 // Begin execution! //
813 // ==================================================== //
814
815 // Do sorting when THs are clicked
816 $table.on("click", "th", function() {
817 var trs = $table.children("tbody").children("tr");
818 var $this = $(this);
819 var th_index = 0;
820 var dir = $.fn.stupidtable.dir;
821
822 $table.find("th").slice(0, $this.index()).each(function() {
823 var cols = $(this).attr("colspan") || 1;
824 th_index += parseInt(cols,10);
825 });
826
827 // Determine (and/or reverse) sorting direction, default `asc`
828 var sort_dir = $this.data("sort-default") || dir.ASC;
829 if ($this.data("sort-dir"))
830 sort_dir = $this.data("sort-dir") === dir.ASC ? dir.DESC : dir.ASC;
831
832 // Choose appropriate sorting function.
833 var type = $this.data("sort") || null;
834
835 // Prevent sorting if no type defined
836 if (type === null) {
837 return;
838 }
839
840 // Trigger `beforetablesort` event that calling scripts can hook into;
841 // pass parameters for sorted column index and sorting direction
842 $table.trigger("beforetablesort", {column: th_index, direction: sort_dir});
843 // More reliable method of forcing a redraw
844 $table.css("display");
845
846 // Run sorting asynchronously on a timout to force browser redraw after
847 // `beforetablesort` callback. Also avoids locking up the browser too much.
848 setTimeout(function() {
849 // Gather the elements for this column
850 var column = [];
851 var sortMethod = sortFns[type];
852
853 // Push either the value of the `data-order-by` attribute if specified
854 // or just the text() value in this column to column[] for comparison.
855 trs.each(function(index,tr) {
856 var $e = $(tr).children().eq(th_index);
857 var sort_val = $e.data("sort-value");
858 var order_by = typeof(sort_val) !== "undefined" ? sort_val : $e.text();
859 column.push(order_by);
860 });
861
862 // Create the sort map. This column having a sort-dir implies it was
863 // the last column sorted. As long as no data-sort-desc is specified,
864 // we're free to just reverse the column.
865 var theMap;
866 if (sort_dir == dir.ASC)
867 theMap = sort_map(column, sortMethod);
868 else
869 theMap = sort_map(column, function(a, b) { return -sortMethod(a, b); });
870
871 // Reset siblings
872 $table.find("th").data("sort-dir", null).removeClass("sorting-desc sorting-asc");
873 $this.data("sort-dir", sort_dir).addClass("sorting-"+sort_dir);
874
875 var sortedTRs = $(apply_sort_map(trs, theMap));
876 $table.children("tbody").remove();
877 $table.append("<tbody />").append(sortedTRs);
878
879 // Trigger `aftertablesort` event. Similar to `beforetablesort`
880 $table.trigger("aftertablesort", {column: th_index, direction: sort_dir});
881 // More reliable method of forcing a redraw
882 $table.css("display");
883 }, 10);
884 });
885 });
886 };
887
888 // Enum containing sorting directions
889 $.fn.stupidtable.dir = {ASC: "asc", DESC: "desc"};
890
891 $.fn.stupidtable.default_sort_fns = {
892 "int": function(a, b) {
893 return parseInt(a, 10) - parseInt(b, 10);
894 },
895 "float": function(a, b) {
896 return parseFloat(a) - parseFloat(b);
897 },
898 "string": function(a, b) {
899 if (a < b) return -1;
900 if (a > b) return +1;
901 return 0;
902 },
903 "string-ins": function(a, b) {
904 a = a.toLowerCase();
905 b = b.toLowerCase();
906 if (a < b) return -1;
907 if (a > b) return +1;
908 return 0;
909 }
910 };
911
912 })(jQuery);
913 $(function(){
914 var table = $("table").stupidtable();
915
916 table.on("beforetablesort", function (event, data) {
917 // data.column - the index of the column sorted after a click
918 // data.direction - the sorting direction (either asc or desc)
919 $("#msg").text("Sorting index " + data.column)
920 });
921 table.on("aftertablesort", function (event, data) {
922 var th = $(this).find("th");
923 th.find(".arrow").remove();
924 var dir = $.fn.stupidtable.dir;
925 var arrow = data.direction === dir.ASC ? "&uarr;" : "&darr;";
926 th.eq(data.column).append('<span class="arrow">' + arrow +'</span>');
927 });
928 });
929 </script>
930 <style type="text/css">
931 label {
932 display: inline-block;
933 width: 5em;
934 }
935 table {
936 border-collapse: collapse;
937 }
938 th, td {
939 padding: 5px 10px;
940 border: 1px solid #999;
941 }
942 th {
943 background-color: #a7d3ff;
944 }
945 th[data-sort]{
946 cursor:pointer;
947 }
948 a[data-lightbox]{
949 cursor:zoom-in;
950 }
951 #msg {
952 color: green;
953 }
954 </style>
955 </head>
956 <body>
957 <h1>Readthrough analyse results</h1>
958 %s
959 </body>
960 </html> """ % (gene_table)
961
962 html_file = open(html,"w")
963 html_file.write(html_str)
964 html_file.close()
965
966
967 except Exception, e :
968 stop_err('Error during html page creation : ' + str( e ) )
969
970
971 def __main__():
972
973 ## python metagene_readthrough.py -g Saccer3.gff -f Saccer3.fa -b B1_sorted.bam -o Bstrain_readthrough.csv
974 #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /data/Mapping/read_mapped_unique_psiP_sorted.bam -o /home/rlegendre/Documents/Translecture/plop_py.csv -d /home/rlegendre/Documents/Translecture/out_trans/
975 #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /home/rlegendre/Documents/PDE2S/PDE2S_trim_no_rRNA_sorted.bam -o /home/rlegendre/Documents/PDE2S/Readthrough_pde2s.csv -d /home/rlegendre/Documents/PDE2S/out_trans
976
977
978 #Parse command line options
979 parser = optparse.OptionParser()
980 parser.add_option("-g", "--gff", dest="gff", type= "string",
981 help="GFF annotation file", metavar="FILE")
982
983 parser.add_option("-f", "--fasta", dest="fasta", type= "string",
984 help="Fasta file ", metavar="FILE")
985
986 parser.add_option("-b", "--bam", dest="bamfile", type= "string",
987 help="Bam Ribo-Seq alignments ", metavar="FILE")
988
989 parser.add_option("-d", "--dirout", dest="dirout", type= "string",
990 help="write report in this html file and in associated directory", metavar="STR,STR")
991
992 parser.add_option("-q", "--quiet",
993 action="store_false", dest="verbose", default=True,
994 help="don't print status messages to stdout")
995
996 (options, args) = parser.parse_args()
997 sys.stdout.write("Begin readthrough analysis at %s\n" % time.asctime( time.localtime(time.time())))
998
999 try:
1000 (html_file, subfolder ) = options.dirout.split(",")
1001 if os.path.exists(subfolder):
1002 raise
1003 try:
1004 os.mkdir(subfolder)
1005 except:
1006 raise
1007
1008 GFF = store_gff(options.gff)
1009 clean_file = cleaning_bam(options.bamfile)
1010 compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder)
1011 if os.path.exists( clean_file ):
1012 os.remove( clean_file )
1013
1014 write_html_page(html_file,subfolder)
1015 ##paste jquery script in result directory :
1016 jq_src = os.path.join(os.path.dirname(__file__),'lightbox')
1017 shutil.copytree(jq_src,os.path.join(subfolder,'lightbox'))
1018
1019
1020 sys.stdout.write("Finish readthrough analysis at %s\n" % time.asctime( time.localtime(time.time())))
1021 except Exception, e:
1022 stop_err( 'Error running metagene readthrough analysis : ' + str( e ) )
1023
1024
1025 if __name__=="__main__":
1026 __main__()