comparison metagene_readthrough.py @ 19:385fc64fa988 draft default tip

Uploaded
author rlegendre
date Fri, 12 Jun 2015 11:32:59 -0400
parents c87c40e642af
children
comparison
equal deleted inserted replaced
18:a121cce43f90 19:385fc64fa988
14 import csv 14 import csv
15 import pysam 15 import pysam
16 import HTSeq 16 import HTSeq
17 #from matplotlib import pyplot as pl 17 #from matplotlib import pyplot as pl
18 import matplotlib 18 import matplotlib
19 from jinja2.nodes import Pos
19 matplotlib.use('Agg') 20 matplotlib.use('Agg')
20 import matplotlib.pyplot as pl 21 import matplotlib.pyplot as pl
21 from numpy import arange 22 from numpy import arange
22 from matplotlib import ticker as t 23 from matplotlib import ticker as t
23 from PIL import Image 24 from PIL import Image
402 warnings.resetwarnings() 403 warnings.resetwarnings()
403 404
404 except Exception, e: 405 except Exception, e:
405 stop_err( 'Error during gene plotting : ' + str( e ) ) 406 stop_err( 'Error during gene plotting : ' + str( e ) )
406 407
407 408
408 def compute_analysis(bam, GFF, fasta, gff, dirout, extend) : 409 def __percent__(prop):
410
411 if sum(prop) != 0 :
412 perc = [0,0,0]
413 if prop[0] != 0 :
414 perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
415 if prop[1] != 0 :
416 perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
417 if prop[2] != 0 :
418 perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
419 return perc
420 else :
421 return prop
422
423 def compute_phasing(chrom, start, stop, aln, kmer, strand):
424
425 phasing = [0,0,0]
426 for track in aln.fetch(chrom, start, stop):
427 if strand == '-':
428 if track.is_reverse :
429 if len(track.seq) == kmer :
430 pos = stop - (track.pos + track.rlen - 12)
431 if pos > 0:
432 phasing[pos%3] += 1
433 else :
434 if not track.is_reverse :
435 if len(track.seq) == kmer :
436 pos = (track.pos + 12) - start
437 if pos > 0:
438 phasing[pos%3] += 1
439
440 #return __percent__(phasing)
441 return phasing
442
443
444 def compute_analysis(bam, GFF, fasta, gff, dirout, extend, kmer) :
409 445
410 try: 446 try:
411 background_file = dirout+"/background_sequence.fasta" 447 #background_file = dirout+"/background_sequence.fasta"
412 file_back = open(background_file, 'w') 448 #file_back = open(background_file, 'w')
413 file_context = open(dirout+"/stop_context.fasta", 'w') 449 #file_context = open(dirout+"/stop_context.fasta", 'w')
414 file_extension = open(dirout+"/extensions.fasta", 'w') 450 #file_extension = open(dirout+"/extensions.fasta", 'w')
415 ## Opening Bam file with pysam librarie 451 ## Opening Bam file with pysam librarie
416 pysam.index(bam) 452 pysam.index(bam)
417 aln = pysam.Samfile(bam, "rb",header=True, check_header = True) 453 aln = pysam.Samfile(bam, "rb",header=True, check_header = True)
418 ## Opening fasta file in a dict with BioPython 454 ## Opening fasta file in a dict with BioPython
419 SeqDict = SeqIO.to_dict(SeqIO.parse(open(fasta,"r"),"fasta")) 455 SeqDict = SeqIO.to_dict(SeqIO.parse(open(fasta,"r"),"fasta"))
427 ## opening a GFF reader for check overlapping 463 ## opening a GFF reader for check overlapping
428 gff_reader = HTSeq.GFF_Reader(gff) 464 gff_reader = HTSeq.GFF_Reader(gff)
429 465
430 with open(dirout+"/readthrough_result.csv","w") as out : 466 with open(dirout+"/readthrough_result.csv","w") as out :
431 myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence'] 467 myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence']
432 writer = csv.writer(out,delimiter='\t') 468 writer = csv.writer(out,delimiter=',')
433 writer.writerow(myheader) 469 writer.writerow(myheader)
434 for gene in GFF['order'] : 470 for gene in GFF['order'] :
435 #print GFF[gene] 471 #print GFF[gene]
436 ## maybe no start position in GTF file so we must to check and replace 472 ## maybe no start position in GTF file so we must to check and replace
437 exon_number = GFF[gene]['exon_number'] 473 exon_number = GFF[gene]['exon_number']
520 #print gene,count,pos,'\n',seq 556 #print gene,count,pos,'\n',seq
521 557
522 if (seq): 558 if (seq):
523 res = find_stop(seq) 559 res = find_stop(seq)
524 if res == -1 : 560 if res == -1 :
525 mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] 561 mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
526 writer.writerow(mylist) 562 writer.writerow(mylist)
527 else : 563 else :
528 indic = 'ok' 564 indic = 'ok'
529 #print res 565 #print res
530 #stop_extension = start_extension + res +3 566 #stop_extension = start_extension + res +3
539 #print gene,count,pos,'\n',seq 575 #print gene,count,pos,'\n',seq
540 576
541 if (seq): 577 if (seq):
542 res = find_stop(seq) 578 res = find_stop(seq)
543 if res == -1 : 579 if res == -1 :
544 mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] 580 mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
545 writer.writerow(mylist) 581 writer.writerow(mylist)
546 else : 582 else :
547 indic = 'ok' 583 indic = 'ok'
548 #print res 584 #print res
549 #start_extension = stop_extension - res -3 585 #start_extension = stop_extension - res -3
582 ## if we are no methionine in 5 codons following stop of CDS 618 ## if we are no methionine in 5 codons following stop of CDS
583 if (not check_met(seq) ): 619 if (not check_met(seq) ):
584 ## if we have footprint in stop codon extension and stop extension is further than cds_stop+9 620 ## if we have footprint in stop codon extension and stop extension is further than cds_stop+9
585 if count_stop > 2 and stop_ok == 1 : 621 if count_stop > 2 and stop_ok == 1 :
586 homo_cov = check_homo_coverage(gene,GFF,start_extension,stop_extension,aln) 622 homo_cov = check_homo_coverage(gene,GFF,start_extension,stop_extension,aln)
623 # phasing = compute_phasing(GFF[gene]['chrom'],start_extension,stop_extension, aln, kmer,GFF[gene]['strand'])
624 # print gene, phasing
625 # count_after_stop = 0
626 # try :
627 # ### count method of pysam doesn't strand information
628 # if GFF[gene]['strand'] == '+' :
629 # for track in aln.fetch(GFF[gene]['chrom'],stop_extension+15,stop_extension+40) :
630 # if not track.is_reverse :
631 # count_after_stop += 1
632 # else :
633 # for track in aln.fetch(GFF[gene]['chrom'],start_extension-40,start_extension-15) :
634 # if track.is_reverse :
635 # count_after_stop += 1
636 # except ValueError:
637 # ## genere warning about gtf coordinates
638 # warnings.warn("Please check coordinates in GFF : maybe stop or start codon are missing" )
639 # pass
640 # print gene+"\t"+str(count_stop)+"\t"+str(count_after_stop)
641 # if (count_stop*5)/100 > count_after_stop :
642 # print "moins de 5%...\n"
587 if (homo_cov) : 643 if (homo_cov) :
588 ''' 644 '''
589 write result witch corresponding to readthrough 645 write result witch corresponding to readthrough
590 ''' 646 '''
591 ##if length of extension upper than 25 we can compute rpkm 647 ##if length of extension upper than 25 we can compute rpkm
598 rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot) 654 rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot)
599 ## compute ratio between extension coverage and cds coverage (rpkm) 655 ## compute ratio between extension coverage and cds coverage (rpkm)
600 ratio = rpkm_ext/rpkm_cds 656 ratio = rpkm_ext/rpkm_cds
601 #print gene,ratio 657 #print gene,ratio
602 #print start_extension,stop_extension 658 #print start_extension,stop_extension
603 mylist = [gene,GFF[gene]['name'],'-',contexte_stop,GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq] 659 mylist = [gene,GFF[gene]['name'],'-',contexte_stop,GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,seq,GFF[gene]['note']]
604 writer.writerow(mylist) 660 writer.writerow(mylist)
605 file_context.write('>'+gene+'\n'+contexte_stop+'\n') 661 #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
606 file_extension.write('>'+gene+'\n'+seq+'\n') 662 #file_extension.write('>'+gene+'\n'+seq+'\n')
607 else : 663 else :
608 ''' 664 '''
609 write result witch corresponding to readthrough but with no homogeneous coverage 665 write result witch corresponding to readthrough but with no homogeneous coverage
610 ''' 666 '''
611 if (not pass_length(start_extension,stop_extension)) : 667 if (not pass_length(start_extension,stop_extension)) :
615 else : 671 else :
616 len_ext = stop_extension-start_extension 672 len_ext = stop_extension-start_extension
617 rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot) 673 rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot)
618 ## compute ratio between extension coverage and cds coverage (rpkm) 674 ## compute ratio between extension coverage and cds coverage (rpkm)
619 ratio = rpkm_ext/rpkm_cds 675 ratio = rpkm_ext/rpkm_cds
620 mylist = [gene,GFF[gene]['name'],'hetero cov',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq] 676 mylist = [gene,GFF[gene]['name'],'hetero cov',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,seq,GFF[gene]['note']]
621 writer.writerow(mylist) 677 writer.writerow(mylist)
622 file_context.write('>'+gene+'\n'+contexte_stop+'\n') 678 #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
623 file_extension.write('>'+gene+'\n'+seq+'\n') 679 #file_extension.write('>'+gene+'\n'+seq+'\n')
624 #print ">"+gene+"\n"+contexte_stop 680 #print ">"+gene+"\n"+contexte_stop
625 681
626 ## plot gene : 682 ## plot gene :
627 plot_gene(aln, GFF[gene], start_extension, stop_extension, dirout+"/"+gene) 683 plot_gene(aln, GFF[gene], start_extension, stop_extension, dirout+"/"+gene)
628 684
630 686
631 else : 687 else :
632 ''' 688 '''
633 write result with no footprint in stop codon of extension 689 write result with no footprint in stop codon of extension
634 ''' 690 '''
635 mylist = [gene,GFF[gene]['name'],'no RPF in stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] 691 mylist = [gene,GFF[gene]['name'],'no RPF in stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
636 writer.writerow(mylist) 692 writer.writerow(mylist)
637 file_context.write('>'+gene+'\n'+contexte_stop+'\n') 693 #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
638 file_extension.write('>'+gene+'\n'+seq+'\n') 694 #file_extension.write('>'+gene+'\n'+seq+'\n')
639 #print ">"+gene+"\n"+contexte_stop 695 #print ">"+gene+"\n"+contexte_stop
640 else : 696 else :
641 ''' 697 '''
642 write result with RPF maybe result of reinitiation on a start codon 698 write result with RPF maybe result of reinitiation on a start codon
643 ''' 699 '''
644 if pass_length(start_extension,stop_extension) : 700 if pass_length(start_extension,stop_extension) :
645 mylist = [gene,GFF[gene]['name'],'Met after stop', contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] 701 mylist = [gene,GFF[gene]['name'],'Met after stop', contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
646 writer.writerow(mylist) 702 writer.writerow(mylist)
647 file_context.write('>'+gene+'\n'+contexte_stop+'\n') 703 #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
648 file_extension.write('>'+gene+'\n'+seq+'\n') 704 #file_extension.write('>'+gene+'\n'+seq+'\n')
649 #print ">"+gene+"\n"+contexte_stop 705 #print ">"+gene+"\n"+contexte_stop
650 else : 706 # else :
651 ## if its not a interesting case, we get stop context of genes without readthrough 707 # ## if its not a interesting case, we get stop context of genes without readthrough
652 if GFF[gene]['strand'] == '+' : 708 # if GFF[gene]['strand'] == '+' :
653 contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6]) 709 # contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6])
654 file_back.write(contexte_stop+'\n') 710 # file_back.write(contexte_stop+'\n')
655 else : 711 # else :
656 contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement()) 712 # contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement())
657 file_back.write(contexte_stop+'\n') 713 # file_back.write(contexte_stop+'\n')
658 714 #
659 ## excluded UT with incorrect positions 715 ## excluded UT with incorrect positions
660 except ValueError: 716 except ValueError:
661 pass 717 pass
662 718
663 719
664 file_context.close() 720 #file_context.close()
665 file_back.close() 721 #file_back.close()
666 file_extension.close() 722 #file_extension.close()
667 except Exception,e: 723 except Exception,e:
668 stop_err( 'Error during computing analysis : ' + str( e ) ) 724 stop_err( 'Error during computing analysis : ' + str( e ) )
669 725
670 726
671 727
677 gene_table = '' 733 gene_table = ''
678 gene_table += '<table>' 734 gene_table += '<table>'
679 gene_table += '<thead><tr><th data-sort="string">Gene</th><th>Plot</th><th data-sort="string">Name</th><th>Stop context</th><th>Coordinates</th><th>RPKM CDS</th><th>RPKM extension</th><th data-sort="float">ratio</th><th>Extension</th></tr></thead><tbody>' 735 gene_table += '<thead><tr><th data-sort="string">Gene</th><th>Plot</th><th data-sort="string">Name</th><th>Stop context</th><th>Coordinates</th><th>RPKM CDS</th><th>RPKM extension</th><th data-sort="float">ratio</th><th>Extension</th></tr></thead><tbody>'
680 736
681 with open(os.path.join(subfolder,'readthrough_result.csv'), 'rb') as csvfile: 737 with open(os.path.join(subfolder,'readthrough_result.csv'), 'rb') as csvfile:
682 spamreader = csv.reader(csvfile, delimiter='\t') 738 spamreader = csv.reader(csvfile, delimiter=',')
683 ## skip the header 739 ## skip the header
684 next(spamreader, None) 740 next(spamreader, None)
685 ##test if file is empty or not 741 ##test if file is empty or not
686 if next(spamreader, None): 742 if next(spamreader, None):
687 for row in spamreader: 743 for row in spamreader:
688 if row[2] == '-' : 744 if row[2] == '-' :
689 gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s:%s-%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(row[0], row[0], row[0], row[0], row[11], row[1], row[3], row[4], row[5], row[6], row[8], row[9], row[10], row[12]) 745 gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s:%s-%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(row[0], row[0], row[0], row[0], row[11], row[1], row[3], row[4], row[5], row[6], row[8], row[9], row[10], row[11])
690 746
691 gene_table += '</tbody></table>' 747 gene_table += '</tbody></table>'
692 else : 748 else :
693 gene_table = 'Sorry, there are no stop codon readthrough genes in your data\n' 749 gene_table = 'Sorry, there are no stop codon readthrough genes in your data\n'
694 750
929 985
930 parser.add_option("-b", "--bam", dest="bamfile", type= "string", 986 parser.add_option("-b", "--bam", dest="bamfile", type= "string",
931 help="Bam Ribo-Seq alignments ", metavar="FILE") 987 help="Bam Ribo-Seq alignments ", metavar="FILE")
932 988
933 parser.add_option("-d", "--dirout", dest="dirout", type= "string", 989 parser.add_option("-d", "--dirout", dest="dirout", type= "string",
934 help="write report in this html file and in associated directory", metavar="STR,STR") 990 help="write report in this html file and in associated directory", metavar="STR,STR")
991
992 parser.add_option("-k", "--kmer", dest="kmer", type= "int",default = 28 ,
993 help="Longer of your phasing reads (Default is 28)", metavar="INT")
935 994
936 parser.add_option("-e", "--extend", dest="extend", type= "int",default = 300 , 995 parser.add_option("-e", "--extend", dest="extend", type= "int",default = 300 ,
937 help="Lenght of extension after stop in number of base pairs (depends on your organisme)", metavar="INT") 996 help="Lenght of extension after stop in number of base pairs (depends on your organisme)", metavar="INT")
938 997
939 parser.add_option("-q", "--quiet", 998 parser.add_option("-q", "--quiet",
970 #GFF = store_gff(options.gff) 1029 #GFF = store_gff(options.gff)
971 #GFF = ribo_functions.store_gtf(options.gff) 1030 #GFF = ribo_functions.store_gtf(options.gff)
972 ## check gff reading 1031 ## check gff reading
973 if not GFF['order'] : 1032 if not GFF['order'] :
974 stop_err( 'Incorrect GFF file' ) 1033 stop_err( 'Incorrect GFF file' )
1034 for gene in GFF['order']:
1035 if not GFF[gene]['exon'] :
1036 del GFF[gene]
1037 GFF['order'].remove(gene)
1038
975 clean_file = ribo_functions.cleaning_bam(options.bamfile) 1039 clean_file = ribo_functions.cleaning_bam(options.bamfile)
976 compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend) 1040 compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend, options.kmer)
977 if os.path.exists( clean_file ): 1041 if os.path.exists( clean_file ):
978 os.remove( clean_file ) 1042 os.remove( clean_file )
979 1043
980 write_html_page(html_file,subfolder) 1044 write_html_page(html_file,subfolder)
981 ##paste jquery script in result directory : 1045 ##paste jquery script in result directory :