Mercurial > repos > rlegendre > ribo_tools
comparison metagene_readthrough.py @ 10:707807fee542
(none)
author | rlegendre |
---|---|
date | Thu, 22 Jan 2015 14:34:38 +0100 |
parents | 29c9c86e17e1 |
children | 7c944fd9907e |
comparison
equal
deleted
inserted
replaced
9:d7739f797a26 | 10:707807fee542 |
---|---|
19 matplotlib.use('Agg') | 19 matplotlib.use('Agg') |
20 import matplotlib.pyplot as pl | 20 import matplotlib.pyplot as pl |
21 from numpy import arange | 21 from numpy import arange |
22 from matplotlib import ticker as t | 22 from matplotlib import ticker as t |
23 from PIL import Image | 23 from PIL import Image |
24 import ribo_functions | |
25 ## suppress matplotlib warnings | |
26 import warnings | |
27 | |
28 | |
24 | 29 |
25 def stop_err( msg ): | 30 def stop_err( msg ): |
26 sys.stderr.write( "%s\n" % msg ) | 31 sys.stderr.write( "%s\n" % msg ) |
27 sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) | 32 sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) |
28 sys.exit() | 33 sys.exit() |
29 | 34 |
30 | |
31 def store_gff(gff): | |
32 ''' | |
33 parse and store gff file in a dictionnary | |
34 ''' | |
35 try: | |
36 GFF = {} | |
37 with open(gff, 'r') as f_gff : | |
38 GFF['order'] = [] | |
39 for line in f_gff: | |
40 ## switch commented lines | |
41 line = line.split("#")[0] | |
42 if line != "" : | |
43 feature = (line.split('\t')[8]).split(';') | |
44 # first line is already gene line : | |
45 if line.split('\t')[2] == 'gene' : | |
46 gene = feature[0].replace("ID=","") | |
47 if re.search('gene',feature[2]) : | |
48 Name = feature[2].replace("gene=","") | |
49 else : | |
50 Name = "Unknown" | |
51 ##get annotation | |
52 note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) | |
53 note = unquote(str(note)).replace("\n","") | |
54 ## store gene information | |
55 GFF['order'].append(gene) | |
56 GFF[gene] = {} | |
57 GFF[gene]['chrom'] = line.split('\t')[0] | |
58 GFF[gene]['start'] = int(line.split('\t')[3]) | |
59 GFF[gene]['stop'] = int(line.split('\t')[4]) | |
60 GFF[gene]['strand'] = line.split('\t')[6] | |
61 GFF[gene]['name'] = Name | |
62 GFF[gene]['note'] = note | |
63 GFF[gene]['exon'] = {} | |
64 GFF[gene]['exon_number'] = 0 | |
65 #print Name | |
66 elif line.split('\t')[2] == 'CDS' : | |
67 gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0]) | |
68 if GFF.has_key(gene) : | |
69 GFF[gene]['exon_number'] += 1 | |
70 exon_number = GFF[gene]['exon_number'] | |
71 GFF[gene]['exon'][exon_number] = {} | |
72 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] | |
73 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) | |
74 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) | |
75 | |
76 ## if there is a five prim UTR intron, we change start of gene | |
77 elif line.split('\t')[2] == 'five_prime_UTR_intron' : | |
78 if GFF[gene]['strand'] == "+" : | |
79 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] | |
80 else : | |
81 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] | |
82 return GFF | |
83 except Exception,e: | |
84 stop_err( 'Error during gff storage : ' + str( e ) ) | |
85 | |
86 | |
87 | |
88 | 35 |
89 def compute_rpkm(length,count_gene,count_tot) : | 36 def compute_rpkm(length,count_gene,count_tot) : |
90 | 37 |
91 try : | 38 try : |
92 rpkm = "{0:.4f}".format(count_gene*1000000.0/(count_tot*length)) | 39 rpkm = "{0:.4f}".format(count_gene*1000000.0/(count_tot*length)) |
133 Note that all positions should be given as 0-based value! | 80 Note that all positions should be given as 0-based value! |
134 end: The end of the interval. Following Python convention for | 81 end: The end of the interval. Following Python convention for |
135 ranges, this in one more than the coordinate of the last base | 82 ranges, this in one more than the coordinate of the last base |
136 that is considered part of the sequence. | 83 that is considered part of the sequence. |
137 strand: The strand, as a single character, '+' or '-'. '.' indicates | 84 strand: The strand, as a single character, '+' or '-'. '.' indicates |
138 that the strand is irrelavant. (Alternatively, pass a Strand object.) | 85 that the strand is irrelevant. (Alternatively, pass a Strand object.) |
139 length: The length of the interval, i.e., end - start | 86 length: The length of the interval, i.e., end - start |
140 start_d: The "directional start" position. This is the position of the | 87 start_d: The "directional start" position. This is the position of the |
141 first base of the interval, taking the strand into account. Hence, | 88 first base of the interval, taking the strand into account. Hence, |
142 this is the same as 'start' except when strand == '-', in which | 89 this is the same as 'start' except when strand == '-', in which |
143 case it is end-1. | 90 case it is end-1. |
144 end_d: The "directional end": Usually, the same as 'end', but for | 91 end_d: The "directional end": Usually, the same as 'end', but for |
145 strand=='-1', it is start-1. | 92 strand=='-1', it is start-1. |
146 | 93 |
147 ''' | 94 ''' |
148 def check_overlapping(gff_reader,chrm,start,stop,strand): | 95 def check_overlapping(gff_reader,chrm,start,stop,strand, name): |
149 | 96 |
150 #### probleme avec les genes completement inclu... | 97 #### probleme avec les genes completement inclu... |
151 | 98 |
152 iv2 = HTSeq.GenomicInterval(chrm,start,stop,strand) | 99 iv2 = HTSeq.GenomicInterval(chrm,start,stop,strand) |
153 for feature in gff_reader: | 100 for feature in gff_reader: |
154 if feature.type == "gene" : | 101 if feature.type == "gene" : |
155 if feature.iv.overlaps(iv2) : | 102 if feature.iv.overlaps(iv2) and feature.attr.get('gene_name') != name : |
156 ## if its a reverse gene, we replace start of extension by start of previous gene | 103 ## if its a reverse gene, we replace start of extension by start of previous gene |
157 if strand == '-' : | 104 if strand == '-' : |
158 return (feature.iv.end+3,stop) | 105 return (feature.iv.end+3,stop) |
159 ## else we replace stop of extension by start of following gene | 106 ## else we replace stop of extension by start of following gene |
160 else : | 107 else : |
198 else : | 145 else : |
199 return True | 146 return True |
200 | 147 |
201 def plot_gene ( aln, gene, start_extension, stop_extension, dirout ) : | 148 def plot_gene ( aln, gene, start_extension, stop_extension, dirout ) : |
202 | 149 |
150 | |
151 ### ignore matplotlib warnings for galaxy | |
152 warnings.filterwarnings('ignore') | |
203 try: | 153 try: |
204 strand = gene['strand'] | 154 strand = gene['strand'] |
205 len_gene = gene['stop']-gene['start'] | 155 len_gene = gene['stop']-gene['start'] |
206 if strand is "-" : | 156 if strand is "-" : |
207 len_ext = gene['stop']-start_extension | 157 len_ext = gene['stop']-start_extension |
444 infile = dirout+'.png' | 394 infile = dirout+'.png' |
445 size = 128, 128 | 395 size = 128, 128 |
446 im = Image.open(infile) | 396 im = Image.open(infile) |
447 im.thumbnail(size, Image.ANTIALIAS) | 397 im.thumbnail(size, Image.ANTIALIAS) |
448 im.save(dirout+"_thumbnail.png", "PNG") | 398 im.save(dirout+"_thumbnail.png", "PNG") |
449 | 399 warnings.resetwarnings() |
450 | 400 |
451 except Exception, e: | 401 except Exception, e: |
452 stop_err( 'Error during gene plotting : ' + str( e ) ) | 402 stop_err( 'Error during gene plotting : ' + str( e ) ) |
453 | 403 |
454 | 404 |
455 def compute_analysis(bam, GFF, fasta, gff, dirout) : | 405 def compute_analysis(bam, GFF, fasta, gff, dirout, extend) : |
456 | 406 |
457 try: | 407 try: |
458 background_file = dirout+"/background_sequence.fasta" | 408 background_file = dirout+"/background_sequence.fasta" |
459 file_back = open(background_file, 'w') | 409 file_back = open(background_file, 'w') |
460 file_context = open(dirout+"/stop_context.fasta", 'w') | 410 file_context = open(dirout+"/stop_context.fasta", 'w') |
477 with open(dirout+"/readthrough_result.csv","w") as out : | 427 with open(dirout+"/readthrough_result.csv","w") as out : |
478 myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence'] | 428 myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence'] |
479 writer = csv.writer(out,delimiter='\t') | 429 writer = csv.writer(out,delimiter='\t') |
480 writer.writerow(myheader) | 430 writer.writerow(myheader) |
481 for gene in GFF['order'] : | 431 for gene in GFF['order'] : |
432 #print GFF[gene] | |
433 ## maybe no start position in GTF file so we must to check and replace | |
434 exon_number = GFF[gene]['exon_number'] | |
435 try : GFF[gene]['start'] | |
436 except : | |
437 if GFF[gene]['strand'] == '+' : | |
438 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] | |
439 else : | |
440 GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop'] | |
441 ## also for stop coordinates | |
442 try : GFF[gene]['stop'] | |
443 except : | |
444 if GFF[gene]['strand'] == '+' : | |
445 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] | |
446 | |
447 else : | |
448 GFF[gene]['stop'] = GFF[gene]['exon'][1]['start'] | |
449 | |
450 | |
482 indic = "" | 451 indic = "" |
483 # compute rpkm of CDS : | 452 # compute rpkm of CDS : |
484 len_cds = GFF[gene]['stop']-GFF[gene]['start'] | 453 len_cds = GFF[gene]['stop']-GFF[gene]['start'] |
485 count_cds = 0 | 454 count_cds = 0 |
486 ### count method of pysam doesn't strand information | 455 rpkm_cds = 0 |
487 if GFF[gene]['strand'] == '+' : | 456 count_cds = 0 |
488 for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) : | 457 try : |
489 if not track.is_reverse : | 458 ### count method of pysam doesn't strand information |
490 count_cds += 1 | 459 if GFF[gene]['strand'] == '+' : |
491 else : | 460 for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) : |
492 for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) : | 461 if not track.is_reverse : |
493 if track.is_reverse : | 462 count_cds += 1 |
494 count_cds += 1 | 463 else : |
495 rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot) | 464 for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) : |
465 if track.is_reverse : | |
466 count_cds += 1 | |
467 rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot) | |
468 except ValueError: | |
469 ## genere warning about gtf coordinates | |
470 #warnings.warn("Please check coordinates in GFF : maybe stop or start codon are missing" ) | |
471 pass | |
496 ## Only if gene is translate : | 472 ## Only if gene is translate : |
497 if rpkm_cds > 0 and count_cds > 128: | 473 if rpkm_cds > 0 and count_cds > 128: |
498 ## search footprint in UTR3 | 474 ## search footprint in UTR3 |
499 count = 0 | 475 count = 0 |
500 try : | 476 try : |
518 for track in aln.fetch(GFF[gene]['chrom'],start_extension,stop_extension-15) : | 494 for track in aln.fetch(GFF[gene]['chrom'],start_extension,stop_extension-15) : |
519 if track.is_reverse : | 495 if track.is_reverse : |
520 count += 1 | 496 count += 1 |
521 ## get sequence of extension | 497 ## get sequence of extension |
522 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) | 498 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) |
523 | 499 |
500 | |
524 ## if we have coverage after cds stop codon | 501 ## if we have coverage after cds stop codon |
525 if count > 10 : | 502 if count > 10 : |
526 res = find_stop(seq) | 503 res = find_stop(seq) |
527 if res == -1 : | 504 if res == -1 : |
528 ''' | 505 ''' |
529 Write results with no stop but RPF in extension | 506 Write results with no stop but RPF in extension |
530 ''' | 507 ''' |
531 ## check if next in-frame codon is far than 90 nt extension : | 508 ## check if next in-frame codon is far than 90 nt extension : |
532 if GFF[gene]['strand'] == '+' : | 509 if GFF[gene]['strand'] == '+' : |
533 pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+300,'+') | 510 pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+extend,'+',GFF[gene]['name']) |
534 start_extension = pos[0]-1 | 511 start_extension = pos[0]-1 |
535 stop_extension = pos[1] | 512 stop_extension = pos[1] |
513 #start_extension = GFF[gene]['stop'] | |
514 #stop_extension = GFF[gene]['stop']+extend | |
536 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension]) | 515 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension]) |
537 | 516 |
538 #print gene,count,pos,'\n',seq | 517 #print gene,count,pos,'\n',seq |
539 | 518 |
540 if (seq): | 519 if (seq): |
545 else : | 524 else : |
546 indic = 'ok' | 525 indic = 'ok' |
547 #print res | 526 #print res |
548 #stop_extension = start_extension + res +3 | 527 #stop_extension = start_extension + res +3 |
549 else : | 528 else : |
550 pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-300,GFF[gene]['start']-1,'-') | 529 pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-extend,GFF[gene]['start']-1,'-',GFF[gene]['name']) |
551 start_extension = pos[0] | 530 start_extension = pos[0] |
552 stop_extension = pos[1]+1 | 531 stop_extension = pos[1]+1 |
532 #start_extension = GFF[gene]['start']-extend | |
533 #stop_extension = GFF[gene]['start'] | |
553 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) | 534 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) |
554 | 535 |
555 #print gene,count,pos,'\n',seq | 536 #print gene,count,pos,'\n',seq |
556 | 537 |
557 if (seq): | 538 if (seq): |
968 stop_err('Error during html page creation : ' + str( e ) ) | 949 stop_err('Error during html page creation : ' + str( e ) ) |
969 | 950 |
970 | 951 |
971 def __main__(): | 952 def __main__(): |
972 | 953 |
973 ## python metagene_readthrough.py -g Saccer3.gff -f Saccer3.fa -b B1_sorted.bam -o Bstrain_readthrough.csv | |
974 #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /data/Mapping/read_mapped_unique_psiP_sorted.bam -o /home/rlegendre/Documents/Translecture/plop_py.csv -d /home/rlegendre/Documents/Translecture/out_trans/ | |
975 #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /home/rlegendre/Documents/PDE2S/PDE2S_trim_no_rRNA_sorted.bam -o /home/rlegendre/Documents/PDE2S/Readthrough_pde2s.csv -d /home/rlegendre/Documents/PDE2S/out_trans | |
976 | |
977 | |
978 #Parse command line options | 954 #Parse command line options |
979 parser = optparse.OptionParser() | 955 parser = optparse.OptionParser() |
980 parser.add_option("-g", "--gff", dest="gff", type= "string", | 956 parser.add_option("-g", "--gff", dest="gff", type= "string", |
981 help="GFF annotation file", metavar="FILE") | 957 help="GFF annotation file", metavar="FILE") |
982 | 958 |
985 | 961 |
986 parser.add_option("-b", "--bam", dest="bamfile", type= "string", | 962 parser.add_option("-b", "--bam", dest="bamfile", type= "string", |
987 help="Bam Ribo-Seq alignments ", metavar="FILE") | 963 help="Bam Ribo-Seq alignments ", metavar="FILE") |
988 | 964 |
989 parser.add_option("-d", "--dirout", dest="dirout", type= "string", | 965 parser.add_option("-d", "--dirout", dest="dirout", type= "string", |
990 help="write report in this html file and in associated directory", metavar="STR,STR") | 966 help="write report in this html file and in associated directory", metavar="STR,STR") |
967 | |
968 parser.add_option("-e", "--extend", dest="extend", type= "int", | |
969 help="Lenght of extension after stop in number of base pairs (depends on your organisme)", metavar="INT") | |
991 | 970 |
992 parser.add_option("-q", "--quiet", | 971 parser.add_option("-q", "--quiet", |
993 action="store_false", dest="verbose", default=True, | 972 action="store_false", dest="verbose", default=True, |
994 help="don't print status messages to stdout") | 973 help="don't print status messages to stdout") |
995 | 974 |
1002 raise | 981 raise |
1003 try: | 982 try: |
1004 os.mkdir(subfolder) | 983 os.mkdir(subfolder) |
1005 except: | 984 except: |
1006 raise | 985 raise |
1007 | 986 ## identify GFF or GTF format from 9th column |
1008 GFF = store_gff(options.gff) | 987 with open (options.gff,"r") as gffile : |
988 for line in gffile : | |
989 if '#' in line : | |
990 ## skip header | |
991 gffile.next() | |
992 elif 'gene_id' in line : | |
993 ## launch gtf reader : | |
994 GFF = ribo_functions.store_gtf(options.gff) | |
995 break | |
996 elif 'ID=' in line : | |
997 ## launch gff reader | |
998 GFF = ribo_functions.store_gff(options.gff) | |
999 break | |
1000 else : | |
1001 stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) | |
1002 | |
1003 #GFF = store_gff(options.gff) | |
1004 #GFF = ribo_functions.store_gtf(options.gff) | |
1005 ## check gff reading | |
1006 if not GFF['order'] : | |
1007 stop_err( 'Incorrect GFF file' + str( e ) ) | |
1009 clean_file = cleaning_bam(options.bamfile) | 1008 clean_file = cleaning_bam(options.bamfile) |
1010 compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder) | 1009 compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend) |
1011 if os.path.exists( clean_file ): | 1010 if os.path.exists( clean_file ): |
1012 os.remove( clean_file ) | 1011 os.remove( clean_file ) |
1013 | 1012 |
1014 write_html_page(html_file,subfolder) | 1013 write_html_page(html_file,subfolder) |
1015 ##paste jquery script in result directory : | 1014 ##paste jquery script in result directory : |