comparison metagene_readthrough.py @ 10:707807fee542

(none)
author rlegendre
date Thu, 22 Jan 2015 14:34:38 +0100
parents 29c9c86e17e1
children 7c944fd9907e
comparison
equal deleted inserted replaced
9:d7739f797a26 10:707807fee542
19 matplotlib.use('Agg') 19 matplotlib.use('Agg')
20 import matplotlib.pyplot as pl 20 import matplotlib.pyplot as pl
21 from numpy import arange 21 from numpy import arange
22 from matplotlib import ticker as t 22 from matplotlib import ticker as t
23 from PIL import Image 23 from PIL import Image
24 import ribo_functions
25 ## suppress matplotlib warnings
26 import warnings
27
28
24 29
25 def stop_err( msg ): 30 def stop_err( msg ):
26 sys.stderr.write( "%s\n" % msg ) 31 sys.stderr.write( "%s\n" % msg )
27 sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) 32 sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
28 sys.exit() 33 sys.exit()
29 34
30
31 def store_gff(gff):
32 '''
33 parse and store gff file in a dictionnary
34 '''
35 try:
36 GFF = {}
37 with open(gff, 'r') as f_gff :
38 GFF['order'] = []
39 for line in f_gff:
40 ## switch commented lines
41 line = line.split("#")[0]
42 if line != "" :
43 feature = (line.split('\t')[8]).split(';')
44 # first line is already gene line :
45 if line.split('\t')[2] == 'gene' :
46 gene = feature[0].replace("ID=","")
47 if re.search('gene',feature[2]) :
48 Name = feature[2].replace("gene=","")
49 else :
50 Name = "Unknown"
51 ##get annotation
52 note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
53 note = unquote(str(note)).replace("\n","")
54 ## store gene information
55 GFF['order'].append(gene)
56 GFF[gene] = {}
57 GFF[gene]['chrom'] = line.split('\t')[0]
58 GFF[gene]['start'] = int(line.split('\t')[3])
59 GFF[gene]['stop'] = int(line.split('\t')[4])
60 GFF[gene]['strand'] = line.split('\t')[6]
61 GFF[gene]['name'] = Name
62 GFF[gene]['note'] = note
63 GFF[gene]['exon'] = {}
64 GFF[gene]['exon_number'] = 0
65 #print Name
66 elif line.split('\t')[2] == 'CDS' :
67 gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0])
68 if GFF.has_key(gene) :
69 GFF[gene]['exon_number'] += 1
70 exon_number = GFF[gene]['exon_number']
71 GFF[gene]['exon'][exon_number] = {}
72 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
73 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
74 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
75
76 ## if there is a five prim UTR intron, we change start of gene
77 elif line.split('\t')[2] == 'five_prime_UTR_intron' :
78 if GFF[gene]['strand'] == "+" :
79 GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
80 else :
81 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
82 return GFF
83 except Exception,e:
84 stop_err( 'Error during gff storage : ' + str( e ) )
85
86
87
88 35
89 def compute_rpkm(length,count_gene,count_tot) : 36 def compute_rpkm(length,count_gene,count_tot) :
90 37
91 try : 38 try :
92 rpkm = "{0:.4f}".format(count_gene*1000000.0/(count_tot*length)) 39 rpkm = "{0:.4f}".format(count_gene*1000000.0/(count_tot*length))
133 Note that all positions should be given as 0-based value! 80 Note that all positions should be given as 0-based value!
134 end: The end of the interval. Following Python convention for 81 end: The end of the interval. Following Python convention for
135 ranges, this in one more than the coordinate of the last base 82 ranges, this in one more than the coordinate of the last base
136 that is considered part of the sequence. 83 that is considered part of the sequence.
137 strand: The strand, as a single character, '+' or '-'. '.' indicates 84 strand: The strand, as a single character, '+' or '-'. '.' indicates
138 that the strand is irrelavant. (Alternatively, pass a Strand object.) 85 that the strand is irrelevant. (Alternatively, pass a Strand object.)
139 length: The length of the interval, i.e., end - start 86 length: The length of the interval, i.e., end - start
140 start_d: The "directional start" position. This is the position of the 87 start_d: The "directional start" position. This is the position of the
141 first base of the interval, taking the strand into account. Hence, 88 first base of the interval, taking the strand into account. Hence,
142 this is the same as 'start' except when strand == '-', in which 89 this is the same as 'start' except when strand == '-', in which
143 case it is end-1. 90 case it is end-1.
144 end_d: The "directional end": Usually, the same as 'end', but for 91 end_d: The "directional end": Usually, the same as 'end', but for
145 strand=='-1', it is start-1. 92 strand=='-1', it is start-1.
146 93
147 ''' 94 '''
148 def check_overlapping(gff_reader,chrm,start,stop,strand): 95 def check_overlapping(gff_reader,chrm,start,stop,strand, name):
149 96
150 #### probleme avec les genes completement inclu... 97 #### probleme avec les genes completement inclu...
151 98
152 iv2 = HTSeq.GenomicInterval(chrm,start,stop,strand) 99 iv2 = HTSeq.GenomicInterval(chrm,start,stop,strand)
153 for feature in gff_reader: 100 for feature in gff_reader:
154 if feature.type == "gene" : 101 if feature.type == "gene" :
155 if feature.iv.overlaps(iv2) : 102 if feature.iv.overlaps(iv2) and feature.attr.get('gene_name') != name :
156 ## if its a reverse gene, we replace start of extension by start of previous gene 103 ## if its a reverse gene, we replace start of extension by start of previous gene
157 if strand == '-' : 104 if strand == '-' :
158 return (feature.iv.end+3,stop) 105 return (feature.iv.end+3,stop)
159 ## else we replace stop of extension by start of following gene 106 ## else we replace stop of extension by start of following gene
160 else : 107 else :
198 else : 145 else :
199 return True 146 return True
200 147
201 def plot_gene ( aln, gene, start_extension, stop_extension, dirout ) : 148 def plot_gene ( aln, gene, start_extension, stop_extension, dirout ) :
202 149
150
151 ### ignore matplotlib warnings for galaxy
152 warnings.filterwarnings('ignore')
203 try: 153 try:
204 strand = gene['strand'] 154 strand = gene['strand']
205 len_gene = gene['stop']-gene['start'] 155 len_gene = gene['stop']-gene['start']
206 if strand is "-" : 156 if strand is "-" :
207 len_ext = gene['stop']-start_extension 157 len_ext = gene['stop']-start_extension
444 infile = dirout+'.png' 394 infile = dirout+'.png'
445 size = 128, 128 395 size = 128, 128
446 im = Image.open(infile) 396 im = Image.open(infile)
447 im.thumbnail(size, Image.ANTIALIAS) 397 im.thumbnail(size, Image.ANTIALIAS)
448 im.save(dirout+"_thumbnail.png", "PNG") 398 im.save(dirout+"_thumbnail.png", "PNG")
449 399 warnings.resetwarnings()
450 400
451 except Exception, e: 401 except Exception, e:
452 stop_err( 'Error during gene plotting : ' + str( e ) ) 402 stop_err( 'Error during gene plotting : ' + str( e ) )
453 403
454 404
455 def compute_analysis(bam, GFF, fasta, gff, dirout) : 405 def compute_analysis(bam, GFF, fasta, gff, dirout, extend) :
456 406
457 try: 407 try:
458 background_file = dirout+"/background_sequence.fasta" 408 background_file = dirout+"/background_sequence.fasta"
459 file_back = open(background_file, 'w') 409 file_back = open(background_file, 'w')
460 file_context = open(dirout+"/stop_context.fasta", 'w') 410 file_context = open(dirout+"/stop_context.fasta", 'w')
477 with open(dirout+"/readthrough_result.csv","w") as out : 427 with open(dirout+"/readthrough_result.csv","w") as out :
478 myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence'] 428 myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence']
479 writer = csv.writer(out,delimiter='\t') 429 writer = csv.writer(out,delimiter='\t')
480 writer.writerow(myheader) 430 writer.writerow(myheader)
481 for gene in GFF['order'] : 431 for gene in GFF['order'] :
432 #print GFF[gene]
433 ## maybe no start position in GTF file so we must to check and replace
434 exon_number = GFF[gene]['exon_number']
435 try : GFF[gene]['start']
436 except :
437 if GFF[gene]['strand'] == '+' :
438 GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
439 else :
440 GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop']
441 ## also for stop coordinates
442 try : GFF[gene]['stop']
443 except :
444 if GFF[gene]['strand'] == '+' :
445 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
446
447 else :
448 GFF[gene]['stop'] = GFF[gene]['exon'][1]['start']
449
450
482 indic = "" 451 indic = ""
483 # compute rpkm of CDS : 452 # compute rpkm of CDS :
484 len_cds = GFF[gene]['stop']-GFF[gene]['start'] 453 len_cds = GFF[gene]['stop']-GFF[gene]['start']
485 count_cds = 0 454 count_cds = 0
486 ### count method of pysam doesn't strand information 455 rpkm_cds = 0
487 if GFF[gene]['strand'] == '+' : 456 count_cds = 0
488 for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) : 457 try :
489 if not track.is_reverse : 458 ### count method of pysam doesn't strand information
490 count_cds += 1 459 if GFF[gene]['strand'] == '+' :
491 else : 460 for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) :
492 for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) : 461 if not track.is_reverse :
493 if track.is_reverse : 462 count_cds += 1
494 count_cds += 1 463 else :
495 rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot) 464 for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) :
465 if track.is_reverse :
466 count_cds += 1
467 rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot)
468 except ValueError:
469 ## genere warning about gtf coordinates
470 #warnings.warn("Please check coordinates in GFF : maybe stop or start codon are missing" )
471 pass
496 ## Only if gene is translate : 472 ## Only if gene is translate :
497 if rpkm_cds > 0 and count_cds > 128: 473 if rpkm_cds > 0 and count_cds > 128:
498 ## search footprint in UTR3 474 ## search footprint in UTR3
499 count = 0 475 count = 0
500 try : 476 try :
518 for track in aln.fetch(GFF[gene]['chrom'],start_extension,stop_extension-15) : 494 for track in aln.fetch(GFF[gene]['chrom'],start_extension,stop_extension-15) :
519 if track.is_reverse : 495 if track.is_reverse :
520 count += 1 496 count += 1
521 ## get sequence of extension 497 ## get sequence of extension
522 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) 498 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement())
523 499
500
524 ## if we have coverage after cds stop codon 501 ## if we have coverage after cds stop codon
525 if count > 10 : 502 if count > 10 :
526 res = find_stop(seq) 503 res = find_stop(seq)
527 if res == -1 : 504 if res == -1 :
528 ''' 505 '''
529 Write results with no stop but RPF in extension 506 Write results with no stop but RPF in extension
530 ''' 507 '''
531 ## check if next in-frame codon is far than 90 nt extension : 508 ## check if next in-frame codon is far than 90 nt extension :
532 if GFF[gene]['strand'] == '+' : 509 if GFF[gene]['strand'] == '+' :
533 pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+300,'+') 510 pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+extend,'+',GFF[gene]['name'])
534 start_extension = pos[0]-1 511 start_extension = pos[0]-1
535 stop_extension = pos[1] 512 stop_extension = pos[1]
513 #start_extension = GFF[gene]['stop']
514 #stop_extension = GFF[gene]['stop']+extend
536 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension]) 515 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension])
537 516
538 #print gene,count,pos,'\n',seq 517 #print gene,count,pos,'\n',seq
539 518
540 if (seq): 519 if (seq):
545 else : 524 else :
546 indic = 'ok' 525 indic = 'ok'
547 #print res 526 #print res
548 #stop_extension = start_extension + res +3 527 #stop_extension = start_extension + res +3
549 else : 528 else :
550 pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-300,GFF[gene]['start']-1,'-') 529 pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-extend,GFF[gene]['start']-1,'-',GFF[gene]['name'])
551 start_extension = pos[0] 530 start_extension = pos[0]
552 stop_extension = pos[1]+1 531 stop_extension = pos[1]+1
532 #start_extension = GFF[gene]['start']-extend
533 #stop_extension = GFF[gene]['start']
553 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) 534 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement())
554 535
555 #print gene,count,pos,'\n',seq 536 #print gene,count,pos,'\n',seq
556 537
557 if (seq): 538 if (seq):
968 stop_err('Error during html page creation : ' + str( e ) ) 949 stop_err('Error during html page creation : ' + str( e ) )
969 950
970 951
971 def __main__(): 952 def __main__():
972 953
973 ## python metagene_readthrough.py -g Saccer3.gff -f Saccer3.fa -b B1_sorted.bam -o Bstrain_readthrough.csv
974 #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /data/Mapping/read_mapped_unique_psiP_sorted.bam -o /home/rlegendre/Documents/Translecture/plop_py.csv -d /home/rlegendre/Documents/Translecture/out_trans/
975 #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /home/rlegendre/Documents/PDE2S/PDE2S_trim_no_rRNA_sorted.bam -o /home/rlegendre/Documents/PDE2S/Readthrough_pde2s.csv -d /home/rlegendre/Documents/PDE2S/out_trans
976
977
978 #Parse command line options 954 #Parse command line options
979 parser = optparse.OptionParser() 955 parser = optparse.OptionParser()
980 parser.add_option("-g", "--gff", dest="gff", type= "string", 956 parser.add_option("-g", "--gff", dest="gff", type= "string",
981 help="GFF annotation file", metavar="FILE") 957 help="GFF annotation file", metavar="FILE")
982 958
985 961
986 parser.add_option("-b", "--bam", dest="bamfile", type= "string", 962 parser.add_option("-b", "--bam", dest="bamfile", type= "string",
987 help="Bam Ribo-Seq alignments ", metavar="FILE") 963 help="Bam Ribo-Seq alignments ", metavar="FILE")
988 964
989 parser.add_option("-d", "--dirout", dest="dirout", type= "string", 965 parser.add_option("-d", "--dirout", dest="dirout", type= "string",
990 help="write report in this html file and in associated directory", metavar="STR,STR") 966 help="write report in this html file and in associated directory", metavar="STR,STR")
967
968 parser.add_option("-e", "--extend", dest="extend", type= "int",
969 help="Lenght of extension after stop in number of base pairs (depends on your organisme)", metavar="INT")
991 970
992 parser.add_option("-q", "--quiet", 971 parser.add_option("-q", "--quiet",
993 action="store_false", dest="verbose", default=True, 972 action="store_false", dest="verbose", default=True,
994 help="don't print status messages to stdout") 973 help="don't print status messages to stdout")
995 974
1002 raise 981 raise
1003 try: 982 try:
1004 os.mkdir(subfolder) 983 os.mkdir(subfolder)
1005 except: 984 except:
1006 raise 985 raise
1007 986 ## identify GFF or GTF format from 9th column
1008 GFF = store_gff(options.gff) 987 with open (options.gff,"r") as gffile :
988 for line in gffile :
989 if '#' in line :
990 ## skip header
991 gffile.next()
992 elif 'gene_id' in line :
993 ## launch gtf reader :
994 GFF = ribo_functions.store_gtf(options.gff)
995 break
996 elif 'ID=' in line :
997 ## launch gff reader
998 GFF = ribo_functions.store_gff(options.gff)
999 break
1000 else :
1001 stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
1002
1003 #GFF = store_gff(options.gff)
1004 #GFF = ribo_functions.store_gtf(options.gff)
1005 ## check gff reading
1006 if not GFF['order'] :
1007 stop_err( 'Incorrect GFF file' + str( e ) )
1009 clean_file = cleaning_bam(options.bamfile) 1008 clean_file = cleaning_bam(options.bamfile)
1010 compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder) 1009 compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend)
1011 if os.path.exists( clean_file ): 1010 if os.path.exists( clean_file ):
1012 os.remove( clean_file ) 1011 os.remove( clean_file )
1013 1012
1014 write_html_page(html_file,subfolder) 1013 write_html_page(html_file,subfolder)
1015 ##paste jquery script in result directory : 1014 ##paste jquery script in result directory :