Mercurial > repos > rlegendre > ribo_tools
changeset 6:29c9c86e17e1
Uploaded
| author   | rlegendre                                            |
|----------|------------------------------------------------------|
| date     | Mon, 20 Oct 2014 11:07:40 -0400                      |
| parents  | 0b8fa8b7f356                                         |
| children | 015db5db052c                                         |
| files    | metagene_readthrough.py                              |
| diffstat | 1 files changed, 1026 insertions(+), 0 deletions(-)  |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/metagene_readthrough.py	Mon Oct 20 11:07:40 2014 -0400
@@ -0,0 +1,1026 @@
#!/usr/bin/env python2.7
#-*- coding: utf-8 -*-

'''
    Created on Dec. 2013
    @author: rachel legendre
    @copyright: rachel.legendre@igmors.u-psud.fr
    @license: GPL v3
'''

import os, sys, time, optparse, shutil, re, urllib, subprocess, tempfile
from urllib import unquote
from Bio import SeqIO
import csv
import pysam
import HTSeq
#from matplotlib import pyplot as pl
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as pl
from numpy import arange
from matplotlib import ticker as t
from PIL import Image


def stop_err(msg):
    sys.stderr.write("%s\n" % msg)
    sys.stderr.write("Program aborted at %s\n" % time.asctime(time.localtime(time.time())))
    sys.exit()


def store_gff(gff):
    '''
    Parse a GFF file and store it in a dictionary.
    '''
    try:
        GFF = {}
        with open(gff, 'r') as f_gff:
            GFF['order'] = []
            for line in f_gff:
                ## strip comments
                line = line.split("#")[0]
                if line != "":
                    feature = (line.split('\t')[8]).split(';')
                    # 'gene' feature lines:
                    if line.split('\t')[2] == 'gene':
                        gene = feature[0].replace("ID=", "")
                        if re.search('gene', feature[2]):
                            Name = feature[2].replace("gene=", "")
                        else:
                            Name = "Unknown"
                        ## get annotation
                        note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
                        note = unquote(str(note)).replace("\n", "")
                        ## store gene information
                        GFF['order'].append(gene)
                        GFF[gene] = {}
                        GFF[gene]['chrom'] = line.split('\t')[0]
                        GFF[gene]['start'] = int(line.split('\t')[3])
                        GFF[gene]['stop'] = int(line.split('\t')[4])
                        GFF[gene]['strand'] = line.split('\t')[6]
                        GFF[gene]['name'] = Name
                        GFF[gene]['note'] = note
                        GFF[gene]['exon'] = {}
                        GFF[gene]['exon_number'] = 0
                        #print Name
                    elif line.split('\t')[2] == 'CDS':
                        gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0])
                        if GFF.has_key(gene):
                            GFF[gene]['exon_number'] += 1
                            exon_number = GFF[gene]['exon_number']
                            GFF[gene]['exon'][exon_number] = {}
                            GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
                            GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
                            GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])

                    ## if there is a five prime UTR intron, shift the gene start accordingly
                    elif line.split('\t')[2] == 'five_prime_UTR_intron':
                        if GFF[gene]['strand'] == "+":
                            GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
                        else:
                            GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
        return GFF
    except Exception, e:
        stop_err('Error during gff storage : ' + str(e))


def compute_rpkm(length, count_gene, count_tot):
    try:
        rpkm = "{0:.4f}".format(count_gene*1000000.0/(count_tot*length))
    except ArithmeticError:
        stop_err('Illegal division by zero')
    return float(rpkm)


def find_stop(seq):
    '''
    Find the first in-frame stop codon in a sequence and return the position of its first nucleotide.
    '''
    stop_codon = ['TAA', 'TAG', 'TGA']
    for nt in range(0, len(seq)-3, 3):
        codon = seq[nt:nt+3]
        if codon in stop_codon:
            return nt

    return -1


def check_met(seq):
    '''
    Test whether a methionine (ATG) occurs in the 5 codons following the stop codon.
    '''
    met = 'ATG'
    for pos in range(0, 15, 3):
        codon = seq[pos:pos+3]
        if codon in met:
            return True

    return False


'''
    feature.iv is a GenomicInterval object.
    A GenomicInterval object has the following slots, some of which
    are calculated from the others:

    chrom: The name of a sequence (i.e., chromosome, contig, or the like).
    start: The start of the interval. Even on the reverse strand,
        this is always the smaller of the two values 'start' and 'end'.
        Note that all positions should be given as 0-based values!
    end: The end of the interval. Following Python convention for
        ranges, this is one more than the coordinate of the last base
        that is considered part of the sequence.
    strand: The strand, as a single character, '+' or '-'. '.' indicates
        that the strand is irrelevant. (Alternatively, pass a Strand object.)
    length: The length of the interval, i.e., end - start
    start_d: The "directional start" position. This is the position of the
        first base of the interval, taking the strand into account. Hence,
        this is the same as 'start' except when strand == '-', in which
        case it is end-1.
    end_d: The "directional end": usually the same as 'end', but for
        strand == '-', it is start-1.
'''


def check_overlapping(gff_reader, chrm, start, stop, strand):

    #### NOTE: genes completely included in the interval remain a problem...

    iv2 = HTSeq.GenomicInterval(chrm, start, stop, strand)
    for feature in gff_reader:
        if feature.type == "gene":
            if feature.iv.overlaps(iv2):
                ## if it's a reverse gene, replace the start of the extension by the end of the previous gene
                if strand == '-':
                    return (feature.iv.end+3, stop)
                ## else replace the stop of the extension by the start of the following gene
                else:
                    return (start, feature.iv.start-3)
    ## if no overlap is found, return the original coordinates
    return (start, stop)


def pass_length(start, stop):

    if (stop-start) > 25:
        return True
    else:
        return False


def check_homo_coverage(gene, GFF, start, stop, aln):

    chrom = GFF[gene]['chrom']
    ## count nucleotides in the CDS with a coverage equal to zero
    nt_cds_cov = 0
    nt_cds_num = 0
    for i in range(1, GFF[gene]['exon_number']+1):
        for z in range(GFF[gene]['exon'][i]['start'], GFF[gene]['exon'][i]['stop']):
            nt_cds_num += 1
            if aln.count(chrom, z, z+1) == 0:
                nt_cds_cov += 1

    ## compute the percentage of the CDS that is not covered
    percent = nt_cds_cov*100/nt_cds_num

    ## count nucleotides with no coverage in the extension
    nt_no_cover = 0
    for pos in range(start, stop, 1):
        if aln.count(chrom, pos, pos+1) == 0:
            nt_no_cover += 1
    #print gene, nt_cds_cov, percent, nt_no_cover
    #percent10 = (stop-start)*50/100
    if (nt_no_cover*100)/(stop-start) > percent:
        return False
    else:
        return True


def plot_gene(aln, gene, start_extension, stop_extension, dirout):

    try:
        strand = gene['strand']
        len_gene = gene['stop']-gene['start']
        if strand == "-":
            len_ext = gene['stop']-start_extension
            ## coverage over the whole gene + extension
            start = start_extension-100
            stop = gene['stop']+100
            vector1 = [0]*(stop-start)
            #for pileupcolumn in aln.pileup( gene['chrom'], start, stop):
            #    vector.append(pileupcolumn.n)
            for read in aln.fetch(gene['chrom'], start, stop):
                if read.is_reverse:
                    ## to estimate the footprint P-site we take 3 nt in the middle of the read
                    #pos = (read.pos+13)-start
                    pos = (read.pos-start) + (read.rlen/2)-1
                    for z in range(pos, (pos+3)):
                        ## the fetch also returns reads that overlap the last 30 nt of the gene but whose
                        ## P-site lies before our vector; z would become negative and the coverage would
                        ## wrongly pile up at the end of the vector
                        if z > 0:
                            try:
                                vector1[z] += 1
                            except IndexError:
                                pass
            vector1.reverse()
            mean_read = float(sum(vector1))/float(len(vector1))
            cov = [(x/mean_read) for x in vector1]
            idx_tot
= arange(len(cov)) + ## coverage in extension + start_ext = start_extension-40 + stop_ext = stop_extension+30 + vector2 = [0]*(stop_ext-start_ext) + #for pileupcolumn in aln.pileup( gene['chrom'], start, stop): + # vector.append(pileupcolumn.n) + for read in aln.fetch(gene['chrom'], start_extension, stop_ext): + if read.is_reverse : + ## get footprint in P-site + #pos = (read.pos+13)-start_ext + pos = (read.pos-start_ext) + (read.rlen/2)-1 + for z in range(pos,(pos+3)) : + if z > 0 : + try : + vector2[z] += 1 + except IndexError : + pass + vector2.reverse() + #mean_read = float(sum(vector))/float(len(vector)) + cov_ext = [(x/mean_read) for x in vector2] + _max = max(cov_ext[30::]) + idx_ext = arange(len(cov_ext)) + + else : + len_ext = stop_extension-gene['start'] + start = gene['start']-100 + stop = stop_extension+100 + vector = [0]*(stop-start) + #for pileupcolumn in aln.pileup( gene['chrom'], start, stop): + #vector.append(pileupcolumn.n) + for read in aln.fetch(gene['chrom'], start, stop): + if not read.is_reverse : + ## get footprint in P-site + #pos = (read.pos+12)-start + pos = (read.pos-start) + (read.rlen/2)-1 + for z in range(pos,(pos+3)) : + if z > 0 : + try : + vector[z] += 1 + except IndexError : + pass + mean_read = float(sum(vector))/float(len(vector)) + cov = [(x/mean_read) for x in vector] + idx_tot = arange(len(cov)) + + ## coverage in extension + start_ext = gene['stop']-30 + stop_ext = stop_extension+40 + vector = [0]*(stop-start_ext) + for read in aln.fetch(gene['chrom'], start_ext, stop_extension): + if not read.is_reverse : + ## get footprint in P-site + pos = (read.pos-start_ext) + (read.rlen/2)-1 + for z in range(pos,(pos+3)) : + if z > 0 : + try : + vector[z] += 1 + except IndexError : + pass + + cov_ext = [(x/mean_read) for x in vector] + idx_ext = arange(len(cov_ext)) + _max = max(cov_ext[30::]) + + #### PLOT FIGURE #### + + font = {'family' : 'serif','color': 'grey','weight' : 'normal','size' : 16 } + + + fig = pl.figure(num=1) + ## create a big subplot for common y axis on two last subplot + ax = fig.add_subplot(2,1,2) + ## hide all spines + ax.spines['top'].set_visible(False) + ax.spines['bottom'].set_visible(False) + ax.spines['left'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.tick_params(labelcolor='w', top='off', bottom='off', left='off', right='off') + + ## plot gene structure + ax1 = fig.add_subplot(3,1,1) + ax1.set_title(gene['name']) + ## hide all spines + ax1.spines['right'].set_color('none') + ax1.spines['top'].set_color('none') + ax1.spines['left'].set_color('none') + ax1.spines['bottom'].set_color('none') + ax1.set_ylim(0,5) + #set xlim as second plot with 100nt switch + ax1.set_xlim(-100,len(idx_tot)-100) + ## hide ticks + pl.yticks([]) + pl.xticks([]) + ## if it's a "simple" gene + if gene['exon_number'] == 1 : + exon = arange(len_gene) + x = [3]*len_gene + ax1.plot(exon,x,color='#9B9E9C') + ax1.fill_between(exon,2,x,color='#9B9E9C') + ## if it's a gene with introns + else : + ## plot a line representing intron + intron = arange(len_gene) + y = [2.5]*len_gene + ax1.plot(intron,y,color='#9B9E9C') + + ## plotting each intron + start = gene['start'] + if strand == '+' : + for exon in range(1,gene['exon_number']+1,1) : + len_exon = gene['exon'][exon]['stop']-gene['exon'][exon]['start'] + idx = gene['exon'][exon]['start'] - start + exo = arange(idx,idx+len_exon) + x = [3]*len_exon + ax1.plot(exo,x,color='#9B9E9C') + ax1.fill_between(exo,2,x,color='#9B9E9C') + else : + ## if it's a reverse gene we must reverse exon's position + 
start = gene['start'] + tabF = [2.5]*len_gene + tabR = [2.5]*len_gene + for exon in range(1,gene['exon_number']+1,1) : + len_exon = gene['exon'][exon]['stop']-gene['exon'][exon]['start'] + idx = gene['exon'][exon]['start'] - start + exo = arange(idx,idx+len_exon) + for z in exo : + tabF[z] = 3 + tabR[z] = 2 + tabF.reverse() + tabR.reverse() + #pl.ylim(0,5) + ax1.plot(tabF,color='#9B9E9C') + ax1.plot(tabR,color='#9B9E9C') + x = arange(len(tabR)) + ax1.fill_between(x,tabF,tabR,color='#9B9E9C') + + ## insert arrows (as genome browser representation) + yloc = 2.5 + narrows = 20 + exonwidth = .8 + spread = .4 * len_gene / narrows + for i in range(narrows): + loc = (float(i) * len_gene / narrows)+ (len(idx_tot)/100)*2 + if strand == '+' : + x = [loc - spread, loc, loc - spread] + else: + x = [loc - spread, loc, loc - spread] + y = [yloc - exonwidth / 5, yloc, yloc + exonwidth / 5] + ax1.plot(x, y, lw=1.4, color='#676B69') + + + # plot coverage in all gene + extension + ax2 = fig.add_subplot(3,1,2) + ## fixe limit to length of coverage for x axis + ax2.set_xlim(0,len(idx_tot)) + ## fixe 4 ticks and associated labels for y axis + ax2.set_yticklabels(arange(0,int(max(cov)+20),int((max(cov)+20)/4))) + ax2.set_yticks(arange(0,int(max(cov)+20),int((max(cov)+20)/4))) + ### add start and stop of gene in axe in place of number + ax2.set_xticks([100,(100+len_gene),(100+len_ext)]) + ax2.set_xticklabels(["{:,}".format(gene['start']),"{:,}".format(gene['stop']),""]) + ## hide spines and any axis + ax2.spines['right'].set_visible(False) + ax2.spines['top'].set_visible(False) + ax2.yaxis.set_ticks_position('left') + ax2.xaxis.set_ticks_position('bottom') + ## plot and fill + ax2.plot(idx_tot, cov, color="#CC0011") + ax2.fill_between(idx_tot, 0,cov,color="#CC0011") + ax2.set_title("Genomic coordinates (%s,%s)"% (gene['strand'],gene['chrom']) ,fontdict={'fontsize':'small'}) + + # plot zoom coverage in extension + ax3 = fig.add_subplot(3,1,3) + ## fixe limit to length of coverage for x axis + ax3.set_ylim(0,int(_max)) + # we hide spines + ax3.spines['right'].set_visible(False) + ax3.spines['top'].set_visible(False) + ax3.yaxis.set_ticks_position('left') + ax3.xaxis.set_ticks_position('bottom') + ## add stop and in-frame stop in x axis + pl.xticks([30,( stop_extension-start_extension)+30],["stop codon","next in-frame stop"]) + ## fixe good position for labels + (ax3.get_xticklabels()[0]).set_horizontalalignment('center') + (ax3.get_xticklabels()[1]).set_horizontalalignment('left') + #if max coverage is lower than 2, we have a illegal division by zero + if _max > 2 : + ax3.set_yticklabels(arange(0,int(_max+1),int((_max+1)/3))) + ax3.set_yticks(arange(0,int(_max+1),int((_max+1)/3))) + else : + ## + ax3.set_ylim(0,_max) + ax3.ticklabel_format(style='sci', scilimits=(-2,2), axis='y',useOffset=False) + + ax3.plot(idx_ext, cov_ext, color="#CC0011") + ax3.fill_between(idx_ext, 0,cov_ext,color="#CC0011") + + + ## get scale of subplot 3 + #ax3.text(ax3.get_xlim()[1]-50,ax3.get_ylim()[1]-1, r'50 nt', fontdict=font) + #pl.arrow( ax3.get_xlim()[1]-50, ax3.get_ylim()[1]-2, ax3.get_xlim()[1]-50, 0, fc="grey", ec="grey",lw=2) + + ## set common y label + ax.set_ylabel('Normalized footprint counts',labelpad=20) + + ## draw and save plot + pl.draw() + #pl.show() + pl.savefig(dirout+".png",format='png', dpi=300) + pl.clf() + + + ## Make thumbnail for html page + infile = dirout+'.png' + size = 128, 128 + im = Image.open(infile) + im.thumbnail(size, Image.ANTIALIAS) + im.save(dirout+"_thumbnail.png", "PNG") + + + except Exception, e: 
+ stop_err( 'Error during gene plotting : ' + str( e ) ) + + +def compute_analysis(bam, GFF, fasta, gff, dirout) : + + try: + background_file = dirout+"/background_sequence.fasta" + file_back = open(background_file, 'w') + file_context = open(dirout+"/stop_context.fasta", 'w') + file_extension = open(dirout+"/extensions.fasta", 'w') + ## Opening Bam file with pysam librarie + pysam.index(bam) + aln = pysam.Samfile(bam, "rb",header=True, check_header = True) + ## Opening fasta file in a dict with BioPython + SeqDict = SeqIO.to_dict(SeqIO.parse(open(fasta,"r"),"fasta")) + + ## count total read in bam file + cmd = "samtools view -c %s " % (bam) + proc = subprocess.Popen( args=cmd, shell=True, stdout = subprocess.PIPE) + count_tot = int(proc.stdout.read().rstrip()) + returncode = proc.wait() + + ## opening a GFF reader for check overlapping + gff_reader = HTSeq.GFF_Reader(gff) + + with open(dirout+"/readthrough_result.csv","w") as out : + myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence'] + writer = csv.writer(out,delimiter='\t') + writer.writerow(myheader) + for gene in GFF['order'] : + indic = "" + # compute rpkm of CDS : + len_cds = GFF[gene]['stop']-GFF[gene]['start'] + count_cds = 0 + ### count method of pysam doesn't strand information + if GFF[gene]['strand'] == '+' : + for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) : + if not track.is_reverse : + count_cds += 1 + else : + for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) : + if track.is_reverse : + count_cds += 1 + rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot) + ## Only if gene is translate : + if rpkm_cds > 0 and count_cds > 128: + ## search footprint in UTR3 + count = 0 + try : + if GFF[gene]['strand'] == '+' : + contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6]) + #print gene, contexte_stop + start_extension = GFF[gene]['stop'] + stop_extension = GFF[gene]['stop']+90 + for track in aln.fetch(GFF[gene]['chrom'],start_extension+15,stop_extension) : + if not track.is_reverse : + count += 1 + + ## get sequence of extension + seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension]) + + else : + contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement()) + #print gene, contexte_stop + start_extension = GFF[gene]['start']-90 + stop_extension = GFF[gene]['start'] + for track in aln.fetch(GFF[gene]['chrom'],start_extension,stop_extension-15) : + if track.is_reverse : + count += 1 + ## get sequence of extension + seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) + + ## if we have coverage after cds stop codon + if count > 10 : + res = find_stop(seq) + if res == -1 : + ''' + Write results with no stop but RPF in extension + ''' + ## check if next in-frame codon is far than 90 nt extension : + if GFF[gene]['strand'] == '+' : + pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+300,'+') + start_extension = pos[0]-1 + stop_extension = pos[1] + seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension]) + + #print gene,count,pos,'\n',seq + + if (seq): + res = find_stop(seq) + if res == -1 : + mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, 
stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] + writer.writerow(mylist) + else : + indic = 'ok' + #print res + #stop_extension = start_extension + res +3 + else : + pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-300,GFF[gene]['start']-1,'-') + start_extension = pos[0] + stop_extension = pos[1]+1 + seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) + + #print gene,count,pos,'\n',seq + + if (seq): + res = find_stop(seq) + if res == -1 : + mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] + writer.writerow(mylist) + else : + indic = 'ok' + #print res + #start_extension = stop_extension - res -3 + else : + indic = 'ok' + + + if indic == 'ok' : + ## We save new coordinates + if GFF[gene]['strand'] == '+' : + stop_extension = start_extension + res +3 + #print gene, count + #print gene,start_extension,stop_extension + seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension]) + #print seq + count_stop = aln.count(GFF[gene]['chrom'],stop_extension-2,stop_extension+2) + if pass_length(start_extension,stop_extension) : + count_ext = aln.count(GFF[gene]['chrom'],start_extension+9,stop_extension-15) + if stop_extension > GFF[gene]['stop']+9 : + stop_ok = 1 + else : + stop_ok = 0 + else : + start_extension = stop_extension - res - 3 + #print gene, count + #print gene,start_extension,stop_extension + seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension-1:stop_extension-1].reverse_complement()) + #print seq + count_stop = aln.count(GFF[gene]['chrom'],start_extension-2,start_extension+2) + if pass_length(start_extension,stop_extension) : + count_ext = aln.count(GFF[gene]['chrom'],start_extension+15,stop_extension-9) + if start_extension < GFF[gene]['start']-9 : + stop_ok = 1 + else : + stop_ok = 0 + ## if we are no methionine in 5 codons following stop of CDS + if (not check_met(seq) ): + ## if we have footprint in stop codon extension and stop extension is further than cds_stop+9 + if count_stop > 2 and stop_ok == 1 : + homo_cov = check_homo_coverage(gene,GFF,start_extension,stop_extension,aln) + if (homo_cov) : + ''' + write result witch corresponding to readthrough + ''' + ##if length of extension upper than 25 we can compute rpkm + if (not pass_length(start_extension,stop_extension)) : + len_ext = stop_extension-start_extension + rpkm_ext = 'nan' + ratio = 'nan' + else : + len_ext = stop_extension-start_extension + rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot) + ## compute ratio between extension coverage and cds coverage (rpkm) + ratio = rpkm_ext/rpkm_cds + #print gene,ratio + #print start_extension,stop_extension + mylist = [gene,GFF[gene]['name'],'-',contexte_stop,GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq] + writer.writerow(mylist) + file_context.write('>'+gene+'\n'+contexte_stop+'\n') + file_extension.write('>'+gene+'\n'+seq+'\n') + else : + ''' + write result witch corresponding to readthrough but with no homogeneous coverage + ''' + if (not pass_length(start_extension,stop_extension)) : + len_ext = stop_extension-start_extension + rpkm_ext = 'nan' + ratio = 'nan' + else : + len_ext = stop_extension-start_extension + rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot) + ## compute ratio between extension coverage and cds coverage (rpkm) + 
ratio = rpkm_ext/rpkm_cds + mylist = [gene,GFF[gene]['name'],'hetero cov',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq] + writer.writerow(mylist) + file_context.write('>'+gene+'\n'+contexte_stop+'\n') + file_extension.write('>'+gene+'\n'+seq+'\n') + #print ">"+gene+"\n"+contexte_stop + + ## plot gene : + plot_gene(aln, GFF[gene], start_extension, stop_extension, dirout+"/"+gene) + + + + else : + ''' + write result with no footprint in stop codon of extension + ''' + mylist = [gene,GFF[gene]['name'],'no RPF in stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] + writer.writerow(mylist) + file_context.write('>'+gene+'\n'+contexte_stop+'\n') + file_extension.write('>'+gene+'\n'+seq+'\n') + #print ">"+gene+"\n"+contexte_stop + else : + ''' + write result with RPF maybe result of reinitiation on a start codon + ''' + if pass_length(start_extension,stop_extension) : + mylist = [gene,GFF[gene]['name'],'Met after stop', contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] + writer.writerow(mylist) + file_context.write('>'+gene+'\n'+contexte_stop+'\n') + file_extension.write('>'+gene+'\n'+seq+'\n') + #print ">"+gene+"\n"+contexte_stop + else : + ## if its not a interesting case, we get stop context of genes without readthrough + if GFF[gene]['strand'] == '+' : + contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6]) + file_back.write(contexte_stop+'\n') + else : + contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement()) + file_back.write(contexte_stop+'\n') + + ## excluded UT with incorrect positions + except ValueError: + pass + + + file_context.close() + file_back.close() + file_extension.close() + except Exception,e: + stop_err( 'Error during computing analysis : ' + str( e ) ) + + +def cleaning_bam(bam): + ''' + Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12 + ''' + try : + header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE) + #header = results.split('\n') + + # check mapper for define multiple tag + if re.search('bowtie', header): + multi_tag = "XS:i:" + elif re.search('bwa', header): + multi_tag = "XT:A:M" + else : + stop_err("No PG tag find in"+bam+". 
Please use bowtie or bwa for mapping") + + tmp_sam = tempfile.mktemp() + cmd = "samtools view %s > %s" % (bam, tmp_sam) + proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) + returncode = proc.wait() + + + with open(tempfile.mktemp(),'w') as out : + out.write(header) + with open(tmp_sam,'r') as sam : + for line in sam : + if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : + if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 : + out.write(line) + bamfile = tempfile.mktemp()+'.bam' + cmd = "samtools view -hSb %s > %s" % (out.name,bamfile) + proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) + returncode = proc.wait() + + return bamfile + + except Exception,e: + stop_err( 'Error during cleaning bam : ' + str( e ) ) + +def write_html_page(html,subfolder) : + + + try : + + gene_table = '' + gene_table += '<table>' + gene_table += '<thead><tr><th data-sort="string">Gene</th><th>Plot</th><th data-sort="string">Name</th><th>Stop context</th><th>Coordinates</th><th>RPKM CDS</th><th>RPKM extension</th><th data-sort="float">ratio</th><th>Extension</th></tr></thead><tbody>' + + with open(os.path.join(subfolder,'readthrough_result.csv'), 'rb') as csvfile: + spamreader = csv.reader(csvfile, delimiter='\t') + ## skip the header + next(spamreader, None) + for row in spamreader: + if row[2] == '-' : + gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s:%s-%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(row[0], row[0], row[0], row[0], row[11], row[1], row[3], row[4], row[5], row[6], row[8], row[9], row[10], row[12]) + + gene_table += '</tbody></table>' + + + + html_str = """ +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> + +<html xmlns="http://www.w3.org/1999/xhtml"> + <head> + <link rel="stylesheet" href="http://code.jquery.com/ui/1.10.3/themes/smoothness/jquery-ui.css" /> + <script src="http://code.jquery.com/jquery-1.10.2.min.js"></script> + <script src="http://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script> + <script src="http://code.jquery.com/ui/1.10.3/jquery-ui.js"></script> + <script src="lightbox/js/jquery-1.10.2.min.js"></script> + <script src="lightbox/js/lightbox-2.6.min.js"></script> + <link href="lightbox/css/lightbox.css" rel="stylesheet" /> + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> + <title>Dual coding result file</title> + <link rel="stylesheet" href="http://code.jquery.com/ui/1.10.3/themes/smoothness/jquery-ui.css" /> + <script> + (function(d){d.fn.stupidtable=function(b){return this.each(function(){var a=d(this);b=b||{};b=d.extend({},d.fn.stupidtable.default_sort_fns,b);var n=function(a,b){for(var f=[],c=0,e=a.slice(0).sort(b),h=0;h<a.length;h++){for(c=d.inArray(a[h],e);-1!=d.inArray(c,f);)c++;f.push(c)}return f},q=function(a,b){for(var d=a.slice(0),c=0,e=0;e<b.length;e++)c=b[e],d[c]=a[e];return d};a.on("click","th",function(){var m=a.children("tbody").children("tr"),g=d(this),f=0,c=d.fn.stupidtable.dir;a.find("th").slice(0, g.index()).each(function(){var a=d(this).attr("colspan")||1;f+=parseInt(a,10)});var e=g.data("sort-default")||c.ASC;g.data("sort-dir")&&(e=g.data("sort-dir")===c.ASC?c.DESC:c.ASC);var h=g.data("sort")||null;null!==h&&(a.trigger("beforetablesort",{column:f,direction:e}),a.css("display"),setTimeout(function(){var 
l=[],p=b[h];m.each(function(a,b){var c=d(b).children().eq(f),e=c.data("sort-value"),c="undefined"!==typeof e?e:c.text();l.push(c)});var k;k=e==c.ASC?n(l,p):n(l,function(a,b){return-p(a,b)}); a.find("th").data("sort-dir",null).removeClass("sorting-desc sorting-asc");g.data("sort-dir",e).addClass("sorting-"+e);k=d(q(m,k));a.children("tbody").remove();a.append("<tbody />").append(k);a.trigger("aftertablesort",{column:f,direction:e});a.css("display")},10))})})};d.fn.stupidtable.dir={ASC:"asc",DESC:"desc"};d.fn.stupidtable.default_sort_fns={"int":function(b,a){return parseInt(b,10)-parseInt(a,10)},"float":function(b,a){return parseFloat(b)-parseFloat(a)},string:function(b,a){return b<a?-1: b>a?1:0},"string-ins":function(b,a){b=b.toLowerCase();a=a.toLowerCase();return b<a?-1:b>a?1:0}}})(jQuery); + (function($) { + + $.fn.stupidtable = function(sortFns) { + return this.each(function() { + var $table = $(this); + sortFns = sortFns || {}; + + // ==================================================== // + // Utility functions // + // ==================================================== // + + // Merge sort functions with some default sort functions. + sortFns = $.extend({}, $.fn.stupidtable.default_sort_fns, sortFns); + + // Return the resulting indexes of a sort so we can apply + // this result elsewhere. This returns an array of index numbers. + // return[0] = x means "arr's 0th element is now at x" + var sort_map = function(arr, sort_function) { + var map = []; + var index = 0; + var sorted = arr.slice(0).sort(sort_function); + for (var i=0; i<arr.length; i++) { + index = $.inArray(arr[i], sorted); + + // If this index is already in the map, look for the next index. + // This handles the case of duplicate entries. + while ($.inArray(index, map) != -1) { + index++; + } + map.push(index); + } + + return map; + }; + + // Apply a sort map to the array. + var apply_sort_map = function(arr, map) { + var clone = arr.slice(0), + newIndex = 0; + for (var i=0; i<map.length; i++) { + newIndex = map[i]; + clone[newIndex] = arr[i]; + } + return clone; + }; + + // ==================================================== // + // Begin execution! // + // ==================================================== // + + // Do sorting when THs are clicked + $table.on("click", "th", function() { + var trs = $table.children("tbody").children("tr"); + var $this = $(this); + var th_index = 0; + var dir = $.fn.stupidtable.dir; + + $table.find("th").slice(0, $this.index()).each(function() { + var cols = $(this).attr("colspan") || 1; + th_index += parseInt(cols,10); + }); + + // Determine (and/or reverse) sorting direction, default `asc` + var sort_dir = $this.data("sort-default") || dir.ASC; + if ($this.data("sort-dir")) + sort_dir = $this.data("sort-dir") === dir.ASC ? dir.DESC : dir.ASC; + + // Choose appropriate sorting function. + var type = $this.data("sort") || null; + + // Prevent sorting if no type defined + if (type === null) { + return; + } + + // Trigger `beforetablesort` event that calling scripts can hook into; + // pass parameters for sorted column index and sorting direction + $table.trigger("beforetablesort", {column: th_index, direction: sort_dir}); + // More reliable method of forcing a redraw + $table.css("display"); + + // Run sorting asynchronously on a timout to force browser redraw after + // `beforetablesort` callback. Also avoids locking up the browser too much. 
+ setTimeout(function() { + // Gather the elements for this column + var column = []; + var sortMethod = sortFns[type]; + + // Push either the value of the `data-order-by` attribute if specified + // or just the text() value in this column to column[] for comparison. + trs.each(function(index,tr) { + var $e = $(tr).children().eq(th_index); + var sort_val = $e.data("sort-value"); + var order_by = typeof(sort_val) !== "undefined" ? sort_val : $e.text(); + column.push(order_by); + }); + + // Create the sort map. This column having a sort-dir implies it was + // the last column sorted. As long as no data-sort-desc is specified, + // we're free to just reverse the column. + var theMap; + if (sort_dir == dir.ASC) + theMap = sort_map(column, sortMethod); + else + theMap = sort_map(column, function(a, b) { return -sortMethod(a, b); }); + + // Reset siblings + $table.find("th").data("sort-dir", null).removeClass("sorting-desc sorting-asc"); + $this.data("sort-dir", sort_dir).addClass("sorting-"+sort_dir); + + var sortedTRs = $(apply_sort_map(trs, theMap)); + $table.children("tbody").remove(); + $table.append("<tbody />").append(sortedTRs); + + // Trigger `aftertablesort` event. Similar to `beforetablesort` + $table.trigger("aftertablesort", {column: th_index, direction: sort_dir}); + // More reliable method of forcing a redraw + $table.css("display"); + }, 10); + }); + }); + }; + + // Enum containing sorting directions + $.fn.stupidtable.dir = {ASC: "asc", DESC: "desc"}; + + $.fn.stupidtable.default_sort_fns = { + "int": function(a, b) { + return parseInt(a, 10) - parseInt(b, 10); + }, + "float": function(a, b) { + return parseFloat(a) - parseFloat(b); + }, + "string": function(a, b) { + if (a < b) return -1; + if (a > b) return +1; + return 0; + }, + "string-ins": function(a, b) { + a = a.toLowerCase(); + b = b.toLowerCase(); + if (a < b) return -1; + if (a > b) return +1; + return 0; + } + }; + + })(jQuery); + $(function(){ + var table = $("table").stupidtable(); + + table.on("beforetablesort", function (event, data) { + // data.column - the index of the column sorted after a click + // data.direction - the sorting direction (either asc or desc) + $("#msg").text("Sorting index " + data.column) + }); + table.on("aftertablesort", function (event, data) { + var th = $(this).find("th"); + th.find(".arrow").remove(); + var dir = $.fn.stupidtable.dir; + var arrow = data.direction === dir.ASC ? 
"↑" : "↓"; + th.eq(data.column).append('<span class="arrow">' + arrow +'</span>'); + }); + }); + </script> + <style type="text/css"> + label { + display: inline-block; + width: 5em; + } + table { + border-collapse: collapse; + } + th, td { + padding: 5px 10px; + border: 1px solid #999; + } + th { + background-color: #a7d3ff; + } + th[data-sort]{ + cursor:pointer; + } + a[data-lightbox]{ + cursor:zoom-in; + } + #msg { + color: green; + } + </style> + </head> + <body> + <h1>Readthrough analyse results</h1> + %s + </body> +</html> """ % (gene_table) + + html_file = open(html,"w") + html_file.write(html_str) + html_file.close() + + + except Exception, e : + stop_err('Error during html page creation : ' + str( e ) ) + + +def __main__(): + + ## python metagene_readthrough.py -g Saccer3.gff -f Saccer3.fa -b B1_sorted.bam -o Bstrain_readthrough.csv + #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /data/Mapping/read_mapped_unique_psiP_sorted.bam -o /home/rlegendre/Documents/Translecture/plop_py.csv -d /home/rlegendre/Documents/Translecture/out_trans/ + #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /home/rlegendre/Documents/PDE2S/PDE2S_trim_no_rRNA_sorted.bam -o /home/rlegendre/Documents/PDE2S/Readthrough_pde2s.csv -d /home/rlegendre/Documents/PDE2S/out_trans + + + #Parse command line options + parser = optparse.OptionParser() + parser.add_option("-g", "--gff", dest="gff", type= "string", + help="GFF annotation file", metavar="FILE") + + parser.add_option("-f", "--fasta", dest="fasta", type= "string", + help="Fasta file ", metavar="FILE") + + parser.add_option("-b", "--bam", dest="bamfile", type= "string", + help="Bam Ribo-Seq alignments ", metavar="FILE") + + parser.add_option("-d", "--dirout", dest="dirout", type= "string", + help="write report in this html file and in associated directory", metavar="STR,STR") + + parser.add_option("-q", "--quiet", + action="store_false", dest="verbose", default=True, + help="don't print status messages to stdout") + + (options, args) = parser.parse_args() + sys.stdout.write("Begin readthrough analysis at %s\n" % time.asctime( time.localtime(time.time()))) + + try: + (html_file, subfolder ) = options.dirout.split(",") + if os.path.exists(subfolder): + raise + try: + os.mkdir(subfolder) + except: + raise + + GFF = store_gff(options.gff) + clean_file = cleaning_bam(options.bamfile) + compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder) + if os.path.exists( clean_file ): + os.remove( clean_file ) + + write_html_page(html_file,subfolder) + ##paste jquery script in result directory : + jq_src = os.path.join(os.path.dirname(__file__),'lightbox') + shutil.copytree(jq_src,os.path.join(subfolder,'lightbox')) + + + sys.stdout.write("Finish readthrough analysis at %s\n" % time.asctime( time.localtime(time.time()))) + except Exception, e: + stop_err( 'Error running metagene readthrough analysis : ' + str( e ) ) + + +if __name__=="__main__": + __main__()