Mercurial > repos > rlegendre > ribo_tools
comparison metagene_readthrough.py @ 19:385fc64fa988 draft default tip
Uploaded
author | rlegendre |
---|---|
date | Fri, 12 Jun 2015 11:32:59 -0400 |
parents | c87c40e642af |
children |
comparison
equal
deleted
inserted
replaced
18:a121cce43f90 | 19:385fc64fa988 |
---|---|
14 import csv | 14 import csv |
15 import pysam | 15 import pysam |
16 import HTSeq | 16 import HTSeq |
17 #from matplotlib import pyplot as pl | 17 #from matplotlib import pyplot as pl |
18 import matplotlib | 18 import matplotlib |
19 from jinja2.nodes import Pos | |
19 matplotlib.use('Agg') | 20 matplotlib.use('Agg') |
20 import matplotlib.pyplot as pl | 21 import matplotlib.pyplot as pl |
21 from numpy import arange | 22 from numpy import arange |
22 from matplotlib import ticker as t | 23 from matplotlib import ticker as t |
23 from PIL import Image | 24 from PIL import Image |
402 warnings.resetwarnings() | 403 warnings.resetwarnings() |
403 | 404 |
404 except Exception, e: | 405 except Exception, e: |
405 stop_err( 'Error during gene plotting : ' + str( e ) ) | 406 stop_err( 'Error during gene plotting : ' + str( e ) ) |
406 | 407 |
407 | 408 |
408 def compute_analysis(bam, GFF, fasta, gff, dirout, extend) : | 409 def __percent__(prop): |
410 | |
411 if sum(prop) != 0 : | |
412 perc = [0,0,0] | |
413 if prop[0] != 0 : | |
414 perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop))) | |
415 if prop[1] != 0 : | |
416 perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop))) | |
417 if prop[2] != 0 : | |
418 perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop))) | |
419 return perc | |
420 else : | |
421 return prop | |
422 | |
423 def compute_phasing(chrom, start, stop, aln, kmer, strand): | |
424 | |
425 phasing = [0,0,0] | |
426 for track in aln.fetch(chrom, start, stop): | |
427 if strand == '-': | |
428 if track.is_reverse : | |
429 if len(track.seq) == kmer : | |
430 pos = stop - (track.pos + track.rlen - 12) | |
431 if pos > 0: | |
432 phasing[pos%3] += 1 | |
433 else : | |
434 if not track.is_reverse : | |
435 if len(track.seq) == kmer : | |
436 pos = (track.pos + 12) - start | |
437 if pos > 0: | |
438 phasing[pos%3] += 1 | |
439 | |
440 #return __percent__(phasing) | |
441 return phasing | |
442 | |
443 | |
444 def compute_analysis(bam, GFF, fasta, gff, dirout, extend, kmer) : | |
409 | 445 |
410 try: | 446 try: |
411 background_file = dirout+"/background_sequence.fasta" | 447 #background_file = dirout+"/background_sequence.fasta" |
412 file_back = open(background_file, 'w') | 448 #file_back = open(background_file, 'w') |
413 file_context = open(dirout+"/stop_context.fasta", 'w') | 449 #file_context = open(dirout+"/stop_context.fasta", 'w') |
414 file_extension = open(dirout+"/extensions.fasta", 'w') | 450 #file_extension = open(dirout+"/extensions.fasta", 'w') |
415 ## Opening Bam file with pysam librarie | 451 ## Opening Bam file with pysam librarie |
416 pysam.index(bam) | 452 pysam.index(bam) |
417 aln = pysam.Samfile(bam, "rb",header=True, check_header = True) | 453 aln = pysam.Samfile(bam, "rb",header=True, check_header = True) |
418 ## Opening fasta file in a dict with BioPython | 454 ## Opening fasta file in a dict with BioPython |
419 SeqDict = SeqIO.to_dict(SeqIO.parse(open(fasta,"r"),"fasta")) | 455 SeqDict = SeqIO.to_dict(SeqIO.parse(open(fasta,"r"),"fasta")) |
427 ## opening a GFF reader for check overlapping | 463 ## opening a GFF reader for check overlapping |
428 gff_reader = HTSeq.GFF_Reader(gff) | 464 gff_reader = HTSeq.GFF_Reader(gff) |
429 | 465 |
430 with open(dirout+"/readthrough_result.csv","w") as out : | 466 with open(dirout+"/readthrough_result.csv","w") as out : |
431 myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence'] | 467 myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence'] |
432 writer = csv.writer(out,delimiter='\t') | 468 writer = csv.writer(out,delimiter=',') |
433 writer.writerow(myheader) | 469 writer.writerow(myheader) |
434 for gene in GFF['order'] : | 470 for gene in GFF['order'] : |
435 #print GFF[gene] | 471 #print GFF[gene] |
436 ## maybe no start position in GTF file so we must to check and replace | 472 ## maybe no start position in GTF file so we must to check and replace |
437 exon_number = GFF[gene]['exon_number'] | 473 exon_number = GFF[gene]['exon_number'] |
520 #print gene,count,pos,'\n',seq | 556 #print gene,count,pos,'\n',seq |
521 | 557 |
522 if (seq): | 558 if (seq): |
523 res = find_stop(seq) | 559 res = find_stop(seq) |
524 if res == -1 : | 560 if res == -1 : |
525 mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] | 561 mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']] |
526 writer.writerow(mylist) | 562 writer.writerow(mylist) |
527 else : | 563 else : |
528 indic = 'ok' | 564 indic = 'ok' |
529 #print res | 565 #print res |
530 #stop_extension = start_extension + res +3 | 566 #stop_extension = start_extension + res +3 |
539 #print gene,count,pos,'\n',seq | 575 #print gene,count,pos,'\n',seq |
540 | 576 |
541 if (seq): | 577 if (seq): |
542 res = find_stop(seq) | 578 res = find_stop(seq) |
543 if res == -1 : | 579 if res == -1 : |
544 mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] | 580 mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']] |
545 writer.writerow(mylist) | 581 writer.writerow(mylist) |
546 else : | 582 else : |
547 indic = 'ok' | 583 indic = 'ok' |
548 #print res | 584 #print res |
549 #start_extension = stop_extension - res -3 | 585 #start_extension = stop_extension - res -3 |
582 ## if we are no methionine in 5 codons following stop of CDS | 618 ## if we are no methionine in 5 codons following stop of CDS |
583 if (not check_met(seq) ): | 619 if (not check_met(seq) ): |
584 ## if we have footprint in stop codon extension and stop extension is further than cds_stop+9 | 620 ## if we have footprint in stop codon extension and stop extension is further than cds_stop+9 |
585 if count_stop > 2 and stop_ok == 1 : | 621 if count_stop > 2 and stop_ok == 1 : |
586 homo_cov = check_homo_coverage(gene,GFF,start_extension,stop_extension,aln) | 622 homo_cov = check_homo_coverage(gene,GFF,start_extension,stop_extension,aln) |
623 # phasing = compute_phasing(GFF[gene]['chrom'],start_extension,stop_extension, aln, kmer,GFF[gene]['strand']) | |
624 # print gene, phasing | |
625 # count_after_stop = 0 | |
626 # try : | |
627 # ### count method of pysam doesn't strand information | |
628 # if GFF[gene]['strand'] == '+' : | |
629 # for track in aln.fetch(GFF[gene]['chrom'],stop_extension+15,stop_extension+40) : | |
630 # if not track.is_reverse : | |
631 # count_after_stop += 1 | |
632 # else : | |
633 # for track in aln.fetch(GFF[gene]['chrom'],start_extension-40,start_extension-15) : | |
634 # if track.is_reverse : | |
635 # count_after_stop += 1 | |
636 # except ValueError: | |
637 # ## genere warning about gtf coordinates | |
638 # warnings.warn("Please check coordinates in GFF : maybe stop or start codon are missing" ) | |
639 # pass | |
640 # print gene+"\t"+str(count_stop)+"\t"+str(count_after_stop) | |
641 # if (count_stop*5)/100 > count_after_stop : | |
642 # print "moins de 5%...\n" | |
587 if (homo_cov) : | 643 if (homo_cov) : |
588 ''' | 644 ''' |
589 write result witch corresponding to readthrough | 645 write result witch corresponding to readthrough |
590 ''' | 646 ''' |
591 ##if length of extension upper than 25 we can compute rpkm | 647 ##if length of extension upper than 25 we can compute rpkm |
598 rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot) | 654 rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot) |
599 ## compute ratio between extension coverage and cds coverage (rpkm) | 655 ## compute ratio between extension coverage and cds coverage (rpkm) |
600 ratio = rpkm_ext/rpkm_cds | 656 ratio = rpkm_ext/rpkm_cds |
601 #print gene,ratio | 657 #print gene,ratio |
602 #print start_extension,stop_extension | 658 #print start_extension,stop_extension |
603 mylist = [gene,GFF[gene]['name'],'-',contexte_stop,GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq] | 659 mylist = [gene,GFF[gene]['name'],'-',contexte_stop,GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,seq,GFF[gene]['note']] |
604 writer.writerow(mylist) | 660 writer.writerow(mylist) |
605 file_context.write('>'+gene+'\n'+contexte_stop+'\n') | 661 #file_context.write('>'+gene+'\n'+contexte_stop+'\n') |
606 file_extension.write('>'+gene+'\n'+seq+'\n') | 662 #file_extension.write('>'+gene+'\n'+seq+'\n') |
607 else : | 663 else : |
608 ''' | 664 ''' |
609 write result witch corresponding to readthrough but with no homogeneous coverage | 665 write result witch corresponding to readthrough but with no homogeneous coverage |
610 ''' | 666 ''' |
611 if (not pass_length(start_extension,stop_extension)) : | 667 if (not pass_length(start_extension,stop_extension)) : |
615 else : | 671 else : |
616 len_ext = stop_extension-start_extension | 672 len_ext = stop_extension-start_extension |
617 rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot) | 673 rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot) |
618 ## compute ratio between extension coverage and cds coverage (rpkm) | 674 ## compute ratio between extension coverage and cds coverage (rpkm) |
619 ratio = rpkm_ext/rpkm_cds | 675 ratio = rpkm_ext/rpkm_cds |
620 mylist = [gene,GFF[gene]['name'],'hetero cov',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq] | 676 mylist = [gene,GFF[gene]['name'],'hetero cov',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,seq,GFF[gene]['note']] |
621 writer.writerow(mylist) | 677 writer.writerow(mylist) |
622 file_context.write('>'+gene+'\n'+contexte_stop+'\n') | 678 #file_context.write('>'+gene+'\n'+contexte_stop+'\n') |
623 file_extension.write('>'+gene+'\n'+seq+'\n') | 679 #file_extension.write('>'+gene+'\n'+seq+'\n') |
624 #print ">"+gene+"\n"+contexte_stop | 680 #print ">"+gene+"\n"+contexte_stop |
625 | 681 |
626 ## plot gene : | 682 ## plot gene : |
627 plot_gene(aln, GFF[gene], start_extension, stop_extension, dirout+"/"+gene) | 683 plot_gene(aln, GFF[gene], start_extension, stop_extension, dirout+"/"+gene) |
628 | 684 |
630 | 686 |
631 else : | 687 else : |
632 ''' | 688 ''' |
633 write result with no footprint in stop codon of extension | 689 write result with no footprint in stop codon of extension |
634 ''' | 690 ''' |
635 mylist = [gene,GFF[gene]['name'],'no RPF in stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] | 691 mylist = [gene,GFF[gene]['name'],'no RPF in stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']] |
636 writer.writerow(mylist) | 692 writer.writerow(mylist) |
637 file_context.write('>'+gene+'\n'+contexte_stop+'\n') | 693 #file_context.write('>'+gene+'\n'+contexte_stop+'\n') |
638 file_extension.write('>'+gene+'\n'+seq+'\n') | 694 #file_extension.write('>'+gene+'\n'+seq+'\n') |
639 #print ">"+gene+"\n"+contexte_stop | 695 #print ">"+gene+"\n"+contexte_stop |
640 else : | 696 else : |
641 ''' | 697 ''' |
642 write result with RPF maybe result of reinitiation on a start codon | 698 write result with RPF maybe result of reinitiation on a start codon |
643 ''' | 699 ''' |
644 if pass_length(start_extension,stop_extension) : | 700 if pass_length(start_extension,stop_extension) : |
645 mylist = [gene,GFF[gene]['name'],'Met after stop', contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] | 701 mylist = [gene,GFF[gene]['name'],'Met after stop', contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']] |
646 writer.writerow(mylist) | 702 writer.writerow(mylist) |
647 file_context.write('>'+gene+'\n'+contexte_stop+'\n') | 703 #file_context.write('>'+gene+'\n'+contexte_stop+'\n') |
648 file_extension.write('>'+gene+'\n'+seq+'\n') | 704 #file_extension.write('>'+gene+'\n'+seq+'\n') |
649 #print ">"+gene+"\n"+contexte_stop | 705 #print ">"+gene+"\n"+contexte_stop |
650 else : | 706 # else : |
651 ## if its not a interesting case, we get stop context of genes without readthrough | 707 # ## if its not a interesting case, we get stop context of genes without readthrough |
652 if GFF[gene]['strand'] == '+' : | 708 # if GFF[gene]['strand'] == '+' : |
653 contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6]) | 709 # contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6]) |
654 file_back.write(contexte_stop+'\n') | 710 # file_back.write(contexte_stop+'\n') |
655 else : | 711 # else : |
656 contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement()) | 712 # contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement()) |
657 file_back.write(contexte_stop+'\n') | 713 # file_back.write(contexte_stop+'\n') |
658 | 714 # |
659 ## excluded UT with incorrect positions | 715 ## excluded UT with incorrect positions |
660 except ValueError: | 716 except ValueError: |
661 pass | 717 pass |
662 | 718 |
663 | 719 |
664 file_context.close() | 720 #file_context.close() |
665 file_back.close() | 721 #file_back.close() |
666 file_extension.close() | 722 #file_extension.close() |
667 except Exception,e: | 723 except Exception,e: |
668 stop_err( 'Error during computing analysis : ' + str( e ) ) | 724 stop_err( 'Error during computing analysis : ' + str( e ) ) |
669 | 725 |
670 | 726 |
671 | 727 |
677 gene_table = '' | 733 gene_table = '' |
678 gene_table += '<table>' | 734 gene_table += '<table>' |
679 gene_table += '<thead><tr><th data-sort="string">Gene</th><th>Plot</th><th data-sort="string">Name</th><th>Stop context</th><th>Coordinates</th><th>RPKM CDS</th><th>RPKM extension</th><th data-sort="float">ratio</th><th>Extension</th></tr></thead><tbody>' | 735 gene_table += '<thead><tr><th data-sort="string">Gene</th><th>Plot</th><th data-sort="string">Name</th><th>Stop context</th><th>Coordinates</th><th>RPKM CDS</th><th>RPKM extension</th><th data-sort="float">ratio</th><th>Extension</th></tr></thead><tbody>' |
680 | 736 |
681 with open(os.path.join(subfolder,'readthrough_result.csv'), 'rb') as csvfile: | 737 with open(os.path.join(subfolder,'readthrough_result.csv'), 'rb') as csvfile: |
682 spamreader = csv.reader(csvfile, delimiter='\t') | 738 spamreader = csv.reader(csvfile, delimiter=',') |
683 ## skip the header | 739 ## skip the header |
684 next(spamreader, None) | 740 next(spamreader, None) |
685 ##test if file is empty or not | 741 ##test if file is empty or not |
686 if next(spamreader, None): | 742 if next(spamreader, None): |
687 for row in spamreader: | 743 for row in spamreader: |
688 if row[2] == '-' : | 744 if row[2] == '-' : |
689 gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s:%s-%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(row[0], row[0], row[0], row[0], row[11], row[1], row[3], row[4], row[5], row[6], row[8], row[9], row[10], row[12]) | 745 gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s:%s-%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(row[0], row[0], row[0], row[0], row[11], row[1], row[3], row[4], row[5], row[6], row[8], row[9], row[10], row[11]) |
690 | 746 |
691 gene_table += '</tbody></table>' | 747 gene_table += '</tbody></table>' |
692 else : | 748 else : |
693 gene_table = 'Sorry, there are no stop codon readthrough genes in your data\n' | 749 gene_table = 'Sorry, there are no stop codon readthrough genes in your data\n' |
694 | 750 |
929 | 985 |
930 parser.add_option("-b", "--bam", dest="bamfile", type= "string", | 986 parser.add_option("-b", "--bam", dest="bamfile", type= "string", |
931 help="Bam Ribo-Seq alignments ", metavar="FILE") | 987 help="Bam Ribo-Seq alignments ", metavar="FILE") |
932 | 988 |
933 parser.add_option("-d", "--dirout", dest="dirout", type= "string", | 989 parser.add_option("-d", "--dirout", dest="dirout", type= "string", |
934 help="write report in this html file and in associated directory", metavar="STR,STR") | 990 help="write report in this html file and in associated directory", metavar="STR,STR") |
991 | |
992 parser.add_option("-k", "--kmer", dest="kmer", type= "int",default = 28 , | |
993 help="Longer of your phasing reads (Default is 28)", metavar="INT") | |
935 | 994 |
936 parser.add_option("-e", "--extend", dest="extend", type= "int",default = 300 , | 995 parser.add_option("-e", "--extend", dest="extend", type= "int",default = 300 , |
937 help="Lenght of extension after stop in number of base pairs (depends on your organisme)", metavar="INT") | 996 help="Lenght of extension after stop in number of base pairs (depends on your organisme)", metavar="INT") |
938 | 997 |
939 parser.add_option("-q", "--quiet", | 998 parser.add_option("-q", "--quiet", |
970 #GFF = store_gff(options.gff) | 1029 #GFF = store_gff(options.gff) |
971 #GFF = ribo_functions.store_gtf(options.gff) | 1030 #GFF = ribo_functions.store_gtf(options.gff) |
972 ## check gff reading | 1031 ## check gff reading |
973 if not GFF['order'] : | 1032 if not GFF['order'] : |
974 stop_err( 'Incorrect GFF file' ) | 1033 stop_err( 'Incorrect GFF file' ) |
1034 for gene in GFF['order']: | |
1035 if not GFF[gene]['exon'] : | |
1036 del GFF[gene] | |
1037 GFF['order'].remove(gene) | |
1038 | |
975 clean_file = ribo_functions.cleaning_bam(options.bamfile) | 1039 clean_file = ribo_functions.cleaning_bam(options.bamfile) |
976 compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend) | 1040 compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend, options.kmer) |
977 if os.path.exists( clean_file ): | 1041 if os.path.exists( clean_file ): |
978 os.remove( clean_file ) | 1042 os.remove( clean_file ) |
979 | 1043 |
980 write_html_page(html_file,subfolder) | 1044 write_html_page(html_file,subfolder) |
981 ##paste jquery script in result directory : | 1045 ##paste jquery script in result directory : |