Mercurial > repos > rlegendre > ribo_tools
view metagene_readthrough.py @ 17:c87c40e642af draft
Uploaded
author | rlegendre |
---|---|
date | Fri, 29 May 2015 09:17:29 -0400 |
parents | 702e60e819c2 |
children | 385fc64fa988 |
line wrap: on
line source
#!/usr/bin/env python2.7.3 #-*- coding: utf-8 -*- ''' Created on Dec. 2013 @author: rachel legendre @copyright: rachel.legendre@igmors.u-psud.fr @license: GPL v3 ''' import os, sys, time, optparse, shutil, re, urllib, subprocess, tempfile from urllib import unquote from Bio import SeqIO import csv import pysam import HTSeq #from matplotlib import pyplot as pl import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as pl from numpy import arange from matplotlib import ticker as t from PIL import Image import ribo_functions ## suppress matplotlib warnings import warnings def stop_err( msg ): sys.stderr.write( "%s\n" % msg ) sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) sys.exit() def compute_rpkm(length,count_gene,count_tot) : try : rpkm = "{0:.4f}".format(count_gene*1000000.0/(count_tot*length)) except ArithmeticError : stop_err( 'Illegal division by zero during computing RPKM') return float(rpkm) def find_stop(seq) : ''' Find stop codon in a sequence and return position of first nucleotide in stop ''' stop_codon = ['TAA','TAG','TGA'] for nt in range(0,len(seq)-3,3) : codon = seq[nt:nt+3] if codon in stop_codon : return nt return -1 def check_met(seq): ''' Boolean function for testing presence or absence of methionine in 5 codons following stop codon ''' met = 'ATG' for pos in range(0,15,3) : codon = seq[pos:pos+3] if codon in met : return True return False ''' feature.iv is a GenomicInterval object : A GenomicInterval object has the following slots, some of which are calculated from the other: chrom: The name of a sequence (i.e., chromosome, contig, or the like). start: The start of the interval. Even on the reverse strand, this is always the smaller of the two values 'start' and 'end'. Note that all positions should be given as 0-based value! end: The end of the interval. Following Python convention for ranges, this in one more than the coordinate of the last base that is considered part of the sequence. strand: The strand, as a single character, '+' or '-'. '.' indicates that the strand is irrelevant. (Alternatively, pass a Strand object.) length: The length of the interval, i.e., end - start start_d: The "directional start" position. This is the position of the first base of the interval, taking the strand into account. Hence, this is the same as 'start' except when strand == '-', in which case it is end-1. end_d: The "directional end": Usually, the same as 'end', but for strand=='-1', it is start-1. ''' def check_overlapping(gff_reader,chrm,start,stop,strand, name): #### probleme avec les genes completement inclu... iv2 = HTSeq.GenomicInterval(chrm,start,stop,strand) for feature in gff_reader: if feature.type == "gene" : if feature.iv.overlaps(iv2) and feature.attr.get('gene_name') != name : ## if its a reverse gene, we replace start of extension by start of previous gene if strand == '-' : return (feature.iv.end+3,stop) ## else we replace stop of extension by start of following gene else : return (start,feature.iv.start-3) ## if no overlap are find, we return -1 return (start,stop) def pass_length(start,stop) : if (stop-start) > 25 : return True else : return False def check_homo_coverage(gene,GFF,start,stop, aln) : chrom = GFF[gene]['chrom'] ## compute number of nucleotides in CDS with a coverage equal to zero nt_cds_cov = 0 nt_cds_num = 0 for i in range(1,GFF[gene]['exon_number']+1) : for z in range(GFF[gene]['exon'][i]['start'],GFF[gene]['exon'][i]['stop']): nt_cds_num += 1 if aln.count(chrom,z,z+1) == 0 : nt_cds_cov += 1 ## compute percent of CDS no covering if nt_cds_cov == 0 or nt_cds_num == 0: percent = 0 else: percent = nt_cds_cov*100/nt_cds_num ## compute number of nucleotides with no coverage in extension nt_no_cover = 0 for pos in range(start,stop,1) : if aln.count(chrom,pos,pos+1) == 0 : nt_no_cover += 1 #print gene, nt_cds_cov, percent, nt_no_cover #percent10 = (stop-start)*50/100 if (nt_no_cover*100)/(stop-start) > percent : return False else : return True def plot_gene ( aln, gene, start_extension, stop_extension, dirout ) : ### ignore matplotlib warnings for galaxy warnings.filterwarnings('ignore') try: strand = gene['strand'] len_gene = gene['stop']-gene['start'] if strand is "-" : len_ext = gene['stop']-start_extension ## coverage in all gene + extension start = start_extension-100 stop = gene['stop']+100 vector1 = [0]*(stop-start) #for pileupcolumn in aln.pileup( gene['chrom'], start, stop): # vector.append(pileupcolumn.n) for read in aln.fetch(gene['chrom'], start, stop): if read.is_reverse : ## for get footprint in P-site (estimate) we take 3 nt in middle of read #pos = (read.pos+13)-start pos = (read.pos-start) + (read.rlen/2)-1 for z in range(pos,(pos+3)) : ## le fetch contient des reads qui chevauchent les 30 nt de la fin du gene mais dont le site P ## se trouve avant notre vector, le z devient negatif et la couverture augmente en fin de vecteur (ce qui est faux) if z > 0 : try : vector1[z] += 1 except IndexError : pass vector1.reverse() mean_read = float(sum(vector1))/float(len(vector1)) cov = [(x/mean_read) for x in vector1] idx_tot = arange(len(cov)) ## coverage in extension start_ext = start_extension-40 stop_ext = stop_extension+30 vector2 = [0]*(stop_ext-start_ext) #for pileupcolumn in aln.pileup( gene['chrom'], start, stop): # vector.append(pileupcolumn.n) for read in aln.fetch(gene['chrom'], start_extension, stop_ext): if read.is_reverse : ## get footprint in P-site #pos = (read.pos+13)-start_ext pos = (read.pos-start_ext) + (read.rlen/2)-1 for z in range(pos,(pos+3)) : if z > 0 : try : vector2[z] += 1 except IndexError : pass vector2.reverse() #mean_read = float(sum(vector))/float(len(vector)) cov_ext = [(x/mean_read) for x in vector2] _max = max(cov_ext[30::]) idx_ext = arange(len(cov_ext)) else : len_ext = stop_extension-gene['start'] start = gene['start']-100 stop = stop_extension+100 vector = [0]*(stop-start) #for pileupcolumn in aln.pileup( gene['chrom'], start, stop): #vector.append(pileupcolumn.n) for read in aln.fetch(gene['chrom'], start, stop): if not read.is_reverse : ## get footprint in P-site #pos = (read.pos+12)-start pos = (read.pos-start) + (read.rlen/2)-1 for z in range(pos,(pos+3)) : if z > 0 : try : vector[z] += 1 except IndexError : pass mean_read = float(sum(vector))/float(len(vector)) cov = [(x/mean_read) for x in vector] idx_tot = arange(len(cov)) ## coverage in extension start_ext = gene['stop']-30 stop_ext = stop_extension+40 vector = [0]*(stop-start_ext) for read in aln.fetch(gene['chrom'], start_ext, stop_extension): if not read.is_reverse : ## get footprint in P-site pos = (read.pos-start_ext) + (read.rlen/2)-1 for z in range(pos,(pos+3)) : if z > 0 : try : vector[z] += 1 except IndexError : pass cov_ext = [(x/mean_read) for x in vector] idx_ext = arange(len(cov_ext)) _max = max(cov_ext[30::]) #### PLOT FIGURE #### font = {'family' : 'serif','color': 'grey','weight' : 'normal','size' : 16 } fig = pl.figure(num=1) ## create a big subplot for common y axis on two last subplot ax = fig.add_subplot(2,1,2) ## hide all spines ax.spines['top'].set_visible(False) ax.spines['bottom'].set_visible(False) ax.spines['left'].set_visible(False) ax.spines['right'].set_visible(False) ax.tick_params(labelcolor='w', top='off', bottom='off', left='off', right='off') ## plot gene structure ax1 = fig.add_subplot(3,1,1) ax1.set_title(gene['name']) ## hide all spines ax1.spines['right'].set_color('none') ax1.spines['top'].set_color('none') ax1.spines['left'].set_color('none') ax1.spines['bottom'].set_color('none') ax1.set_ylim(0,5) #set xlim as second plot with 100nt switch ax1.set_xlim(-100,len(idx_tot)-100) ## hide ticks pl.yticks([]) pl.xticks([]) ## if it's a "simple" gene if gene['exon_number'] == 1 : exon = arange(len_gene) x = [3]*len_gene ax1.plot(exon,x,color='#9B9E9C') ax1.fill_between(exon,2,x,color='#9B9E9C') ## if it's a gene with introns else : ## plot a line representing intron intron = arange(len_gene) y = [2.5]*len_gene ax1.plot(intron,y,color='#9B9E9C') ## plotting each intron start = gene['start'] if strand == '+' : for exon in range(1,gene['exon_number']+1,1) : len_exon = gene['exon'][exon]['stop']-gene['exon'][exon]['start'] idx = gene['exon'][exon]['start'] - start exo = arange(idx,idx+len_exon) x = [3]*len_exon ax1.plot(exo,x,color='#9B9E9C') ax1.fill_between(exo,2,x,color='#9B9E9C') else : ## if it's a reverse gene we must reverse exon's position start = gene['start'] tabF = [2.5]*len_gene tabR = [2.5]*len_gene for exon in range(1,gene['exon_number']+1,1) : len_exon = gene['exon'][exon]['stop']-gene['exon'][exon]['start'] idx = gene['exon'][exon]['start'] - start exo = arange(idx,idx+len_exon) for z in exo : tabF[z] = 3 tabR[z] = 2 tabF.reverse() tabR.reverse() #pl.ylim(0,5) ax1.plot(tabF,color='#9B9E9C') ax1.plot(tabR,color='#9B9E9C') x = arange(len(tabR)) ax1.fill_between(x,tabF,tabR,color='#9B9E9C') ## insert arrows (as genome browser representation) yloc = 2.5 narrows = 20 exonwidth = .8 spread = .4 * len_gene / narrows for i in range(narrows): loc = (float(i) * len_gene / narrows)+ (len(idx_tot)/100)*2 if strand == '+' : x = [loc - spread, loc, loc - spread] else: x = [loc - spread, loc, loc - spread] y = [yloc - exonwidth / 5, yloc, yloc + exonwidth / 5] ax1.plot(x, y, lw=1.4, color='#676B69') # plot coverage in all gene + extension ax2 = fig.add_subplot(3,1,2) ## fixe limit to length of coverage for x axis ax2.set_xlim(0,len(idx_tot)) ## fixe 4 ticks and associated labels for y axis ax2.set_yticklabels(arange(0,int(max(cov)+20),int((max(cov)+20)/4))) ax2.set_yticks(arange(0,int(max(cov)+20),int((max(cov)+20)/4))) ### add start and stop of gene in axe in place of number ax2.set_xticks([100,(100+len_gene),(100+len_ext)]) ax2.set_xticklabels(["{:,}".format(gene['start']),"{:,}".format(gene['stop']),""]) ## hide spines and any axis ax2.spines['right'].set_visible(False) ax2.spines['top'].set_visible(False) ax2.yaxis.set_ticks_position('left') ax2.xaxis.set_ticks_position('bottom') ## plot and fill ax2.plot(idx_tot, cov, color="#CC0011") ax2.fill_between(idx_tot, 0,cov,color="#CC0011") ax2.set_title("Genomic coordinates (%s,%s)"% (gene['strand'],gene['chrom']) ,fontdict={'fontsize':'small'}) # plot zoom coverage in extension ax3 = fig.add_subplot(3,1,3) ## fixe limit to length of coverage for x axis ax3.set_ylim(0,int(_max)) # we hide spines ax3.spines['right'].set_visible(False) ax3.spines['top'].set_visible(False) ax3.yaxis.set_ticks_position('left') ax3.xaxis.set_ticks_position('bottom') ## add stop and in-frame stop in x axis pl.xticks([30,( stop_extension-start_extension)+30],["stop codon","next in-frame stop"]) ## fixe good position for labels (ax3.get_xticklabels()[0]).set_horizontalalignment('center') (ax3.get_xticklabels()[1]).set_horizontalalignment('left') #if max coverage is lower than 2, we have a illegal division by zero if _max > 2 : ax3.set_yticklabels(arange(0,int(_max+1),int((_max+1)/3))) ax3.set_yticks(arange(0,int(_max+1),int((_max+1)/3))) else : ## ax3.set_ylim(0,_max) ax3.ticklabel_format(style='sci', scilimits=(-2,2), axis='y',useOffset=False) ax3.plot(idx_ext, cov_ext, color="#CC0011") ax3.fill_between(idx_ext, 0,cov_ext,color="#CC0011") ## get scale of subplot 3 #ax3.text(ax3.get_xlim()[1]-50,ax3.get_ylim()[1]-1, r'50 nt', fontdict=font) #pl.arrow( ax3.get_xlim()[1]-50, ax3.get_ylim()[1]-2, ax3.get_xlim()[1]-50, 0, fc="grey", ec="grey",lw=2) ## set common y label ax.set_ylabel('Normalized footprint counts',labelpad=20) ## draw and save plot pl.draw() #pl.show() pl.savefig(dirout+".png",format='png', dpi=300) pl.clf() ## Make thumbnail for html page infile = dirout+'.png' size = 128, 128 im = Image.open(infile) im.thumbnail(size, Image.ANTIALIAS) im.save(dirout+"_thumbnail.png", "PNG") warnings.resetwarnings() except Exception, e: stop_err( 'Error during gene plotting : ' + str( e ) ) def compute_analysis(bam, GFF, fasta, gff, dirout, extend) : try: background_file = dirout+"/background_sequence.fasta" file_back = open(background_file, 'w') file_context = open(dirout+"/stop_context.fasta", 'w') file_extension = open(dirout+"/extensions.fasta", 'w') ## Opening Bam file with pysam librarie pysam.index(bam) aln = pysam.Samfile(bam, "rb",header=True, check_header = True) ## Opening fasta file in a dict with BioPython SeqDict = SeqIO.to_dict(SeqIO.parse(open(fasta,"r"),"fasta")) ## count total read in bam file cmd = "samtools view -c %s " % (bam) proc = subprocess.Popen( args=cmd, shell=True, stdout = subprocess.PIPE) count_tot = int(proc.stdout.read().rstrip()) returncode = proc.wait() ## opening a GFF reader for check overlapping gff_reader = HTSeq.GFF_Reader(gff) with open(dirout+"/readthrough_result.csv","w") as out : myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence'] writer = csv.writer(out,delimiter='\t') writer.writerow(myheader) for gene in GFF['order'] : #print GFF[gene] ## maybe no start position in GTF file so we must to check and replace exon_number = GFF[gene]['exon_number'] try : GFF[gene]['start'] except : if GFF[gene]['strand'] == '+' : GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] else : GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop'] ## also for stop coordinates try : GFF[gene]['stop'] except : if GFF[gene]['strand'] == '+' : GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] else : GFF[gene]['stop'] = GFF[gene]['exon'][1]['start'] indic = "" # compute rpkm of CDS : len_cds = GFF[gene]['stop']-GFF[gene]['start'] count_cds = 0 rpkm_cds = 0 count_cds = 0 try : ### count method of pysam doesn't strand information if GFF[gene]['strand'] == '+' : for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) : if not track.is_reverse : count_cds += 1 else : for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) : if track.is_reverse : count_cds += 1 rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot) except ValueError: ## genere warning about gtf coordinates #warnings.warn("Please check coordinates in GFF : maybe stop or start codon are missing" ) pass ## Only if gene is translate : if rpkm_cds > 0 and count_cds > 128: ## search footprint in UTR3 count = 0 try : if GFF[gene]['strand'] == '+' : contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6]) #print gene, contexte_stop start_extension = GFF[gene]['stop'] stop_extension = GFF[gene]['stop']+90 for track in aln.fetch(GFF[gene]['chrom'],start_extension+15,stop_extension) : if not track.is_reverse : count += 1 ## get sequence of extension seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension]) else : contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement()) #print gene, contexte_stop start_extension = GFF[gene]['start']-90 stop_extension = GFF[gene]['start'] for track in aln.fetch(GFF[gene]['chrom'],start_extension,stop_extension-15) : if track.is_reverse : count += 1 ## get sequence of extension seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) ## if we have coverage after cds stop codon if count > 10 : res = find_stop(seq) if res == -1 : ''' Write results with no stop but RPF in extension ''' ## check if next in-frame codon is far than 90 nt extension : if GFF[gene]['strand'] == '+' : pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+extend,'+',GFF[gene]['name']) start_extension = pos[0]-1 stop_extension = pos[1] #start_extension = GFF[gene]['stop'] #stop_extension = GFF[gene]['stop']+extend seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension]) #print gene,count,pos,'\n',seq if (seq): res = find_stop(seq) if res == -1 : mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] writer.writerow(mylist) else : indic = 'ok' #print res #stop_extension = start_extension + res +3 else : pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-extend,GFF[gene]['start']-1,'-',GFF[gene]['name']) start_extension = pos[0] stop_extension = pos[1]+1 #start_extension = GFF[gene]['start']-extend #stop_extension = GFF[gene]['start'] seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) #print gene,count,pos,'\n',seq if (seq): res = find_stop(seq) if res == -1 : mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] writer.writerow(mylist) else : indic = 'ok' #print res #start_extension = stop_extension - res -3 else : indic = 'ok' if indic == 'ok' : ## We save new coordinates if GFF[gene]['strand'] == '+' : stop_extension = start_extension + res +3 #print gene, count #print gene,start_extension,stop_extension seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension]) #print seq count_stop = aln.count(GFF[gene]['chrom'],stop_extension-2,stop_extension+2) if pass_length(start_extension,stop_extension) : count_ext = aln.count(GFF[gene]['chrom'],start_extension+9,stop_extension-15) if stop_extension > GFF[gene]['stop']+9 : stop_ok = 1 else : stop_ok = 0 else : start_extension = stop_extension - res - 3 #print gene, count #print gene,start_extension,stop_extension seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension-1:stop_extension-1].reverse_complement()) #print seq count_stop = aln.count(GFF[gene]['chrom'],start_extension-2,start_extension+2) if pass_length(start_extension,stop_extension) : count_ext = aln.count(GFF[gene]['chrom'],start_extension+15,stop_extension-9) if start_extension < GFF[gene]['start']-9 : stop_ok = 1 else : stop_ok = 0 ## if we are no methionine in 5 codons following stop of CDS if (not check_met(seq) ): ## if we have footprint in stop codon extension and stop extension is further than cds_stop+9 if count_stop > 2 and stop_ok == 1 : homo_cov = check_homo_coverage(gene,GFF,start_extension,stop_extension,aln) if (homo_cov) : ''' write result witch corresponding to readthrough ''' ##if length of extension upper than 25 we can compute rpkm if (not pass_length(start_extension,stop_extension)) : len_ext = stop_extension-start_extension rpkm_ext = 'nan' ratio = 'nan' else : len_ext = stop_extension-start_extension rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot) ## compute ratio between extension coverage and cds coverage (rpkm) ratio = rpkm_ext/rpkm_cds #print gene,ratio #print start_extension,stop_extension mylist = [gene,GFF[gene]['name'],'-',contexte_stop,GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq] writer.writerow(mylist) file_context.write('>'+gene+'\n'+contexte_stop+'\n') file_extension.write('>'+gene+'\n'+seq+'\n') else : ''' write result witch corresponding to readthrough but with no homogeneous coverage ''' if (not pass_length(start_extension,stop_extension)) : len_ext = stop_extension-start_extension rpkm_ext = 'nan' ratio = 'nan' else : len_ext = stop_extension-start_extension rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot) ## compute ratio between extension coverage and cds coverage (rpkm) ratio = rpkm_ext/rpkm_cds mylist = [gene,GFF[gene]['name'],'hetero cov',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq] writer.writerow(mylist) file_context.write('>'+gene+'\n'+contexte_stop+'\n') file_extension.write('>'+gene+'\n'+seq+'\n') #print ">"+gene+"\n"+contexte_stop ## plot gene : plot_gene(aln, GFF[gene], start_extension, stop_extension, dirout+"/"+gene) else : ''' write result with no footprint in stop codon of extension ''' mylist = [gene,GFF[gene]['name'],'no RPF in stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] writer.writerow(mylist) file_context.write('>'+gene+'\n'+contexte_stop+'\n') file_extension.write('>'+gene+'\n'+seq+'\n') #print ">"+gene+"\n"+contexte_stop else : ''' write result with RPF maybe result of reinitiation on a start codon ''' if pass_length(start_extension,stop_extension) : mylist = [gene,GFF[gene]['name'],'Met after stop', contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] writer.writerow(mylist) file_context.write('>'+gene+'\n'+contexte_stop+'\n') file_extension.write('>'+gene+'\n'+seq+'\n') #print ">"+gene+"\n"+contexte_stop else : ## if its not a interesting case, we get stop context of genes without readthrough if GFF[gene]['strand'] == '+' : contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6]) file_back.write(contexte_stop+'\n') else : contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement()) file_back.write(contexte_stop+'\n') ## excluded UT with incorrect positions except ValueError: pass file_context.close() file_back.close() file_extension.close() except Exception,e: stop_err( 'Error during computing analysis : ' + str( e ) ) def write_html_page(html,subfolder) : try : gene_table = '' gene_table += '<table>' gene_table += '<thead><tr><th data-sort="string">Gene</th><th>Plot</th><th data-sort="string">Name</th><th>Stop context</th><th>Coordinates</th><th>RPKM CDS</th><th>RPKM extension</th><th data-sort="float">ratio</th><th>Extension</th></tr></thead><tbody>' with open(os.path.join(subfolder,'readthrough_result.csv'), 'rb') as csvfile: spamreader = csv.reader(csvfile, delimiter='\t') ## skip the header next(spamreader, None) ##test if file is empty or not if next(spamreader, None): for row in spamreader: if row[2] == '-' : gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s:%s-%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(row[0], row[0], row[0], row[0], row[11], row[1], row[3], row[4], row[5], row[6], row[8], row[9], row[10], row[12]) gene_table += '</tbody></table>' else : gene_table = 'Sorry, there are no stop codon readthrough genes in your data\n' html_str = """ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <link rel="stylesheet" href="http://code.jquery.com/ui/1.10.3/themes/smoothness/jquery-ui.css" /> <script src="http://code.jquery.com/jquery-1.10.2.min.js"></script> <script src="http://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script> <script src="http://code.jquery.com/ui/1.10.3/jquery-ui.js"></script> <script src="lightbox/js/jquery-1.10.2.min.js"></script> <script src="lightbox/js/lightbox-2.6.min.js"></script> <link href="lightbox/css/lightbox.css" rel="stylesheet" /> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <title>Dual coding result file</title> <link rel="stylesheet" href="http://code.jquery.com/ui/1.10.3/themes/smoothness/jquery-ui.css" /> <script> (function(d){d.fn.stupidtable=function(b){return this.each(function(){var a=d(this);b=b||{};b=d.extend({},d.fn.stupidtable.default_sort_fns,b);var n=function(a,b){for(var f=[],c=0,e=a.slice(0).sort(b),h=0;h<a.length;h++){for(c=d.inArray(a[h],e);-1!=d.inArray(c,f);)c++;f.push(c)}return f},q=function(a,b){for(var d=a.slice(0),c=0,e=0;e<b.length;e++)c=b[e],d[c]=a[e];return d};a.on("click","th",function(){var m=a.children("tbody").children("tr"),g=d(this),f=0,c=d.fn.stupidtable.dir;a.find("th").slice(0, g.index()).each(function(){var a=d(this).attr("colspan")||1;f+=parseInt(a,10)});var e=g.data("sort-default")||c.ASC;g.data("sort-dir")&&(e=g.data("sort-dir")===c.ASC?c.DESC:c.ASC);var h=g.data("sort")||null;null!==h&&(a.trigger("beforetablesort",{column:f,direction:e}),a.css("display"),setTimeout(function(){var l=[],p=b[h];m.each(function(a,b){var c=d(b).children().eq(f),e=c.data("sort-value"),c="undefined"!==typeof e?e:c.text();l.push(c)});var k;k=e==c.ASC?n(l,p):n(l,function(a,b){return-p(a,b)}); a.find("th").data("sort-dir",null).removeClass("sorting-desc sorting-asc");g.data("sort-dir",e).addClass("sorting-"+e);k=d(q(m,k));a.children("tbody").remove();a.append("<tbody />").append(k);a.trigger("aftertablesort",{column:f,direction:e});a.css("display")},10))})})};d.fn.stupidtable.dir={ASC:"asc",DESC:"desc"};d.fn.stupidtable.default_sort_fns={"int":function(b,a){return parseInt(b,10)-parseInt(a,10)},"float":function(b,a){return parseFloat(b)-parseFloat(a)},string:function(b,a){return b<a?-1: b>a?1:0},"string-ins":function(b,a){b=b.toLowerCase();a=a.toLowerCase();return b<a?-1:b>a?1:0}}})(jQuery); (function($) { $.fn.stupidtable = function(sortFns) { return this.each(function() { var $table = $(this); sortFns = sortFns || {}; // ==================================================== // // Utility functions // // ==================================================== // // Merge sort functions with some default sort functions. sortFns = $.extend({}, $.fn.stupidtable.default_sort_fns, sortFns); // Return the resulting indexes of a sort so we can apply // this result elsewhere. This returns an array of index numbers. // return[0] = x means "arr's 0th element is now at x" var sort_map = function(arr, sort_function) { var map = []; var index = 0; var sorted = arr.slice(0).sort(sort_function); for (var i=0; i<arr.length; i++) { index = $.inArray(arr[i], sorted); // If this index is already in the map, look for the next index. // This handles the case of duplicate entries. while ($.inArray(index, map) != -1) { index++; } map.push(index); } return map; }; // Apply a sort map to the array. var apply_sort_map = function(arr, map) { var clone = arr.slice(0), newIndex = 0; for (var i=0; i<map.length; i++) { newIndex = map[i]; clone[newIndex] = arr[i]; } return clone; }; // ==================================================== // // Begin execution! // // ==================================================== // // Do sorting when THs are clicked $table.on("click", "th", function() { var trs = $table.children("tbody").children("tr"); var $this = $(this); var th_index = 0; var dir = $.fn.stupidtable.dir; $table.find("th").slice(0, $this.index()).each(function() { var cols = $(this).attr("colspan") || 1; th_index += parseInt(cols,10); }); // Determine (and/or reverse) sorting direction, default `asc` var sort_dir = $this.data("sort-default") || dir.ASC; if ($this.data("sort-dir")) sort_dir = $this.data("sort-dir") === dir.ASC ? dir.DESC : dir.ASC; // Choose appropriate sorting function. var type = $this.data("sort") || null; // Prevent sorting if no type defined if (type === null) { return; } // Trigger `beforetablesort` event that calling scripts can hook into; // pass parameters for sorted column index and sorting direction $table.trigger("beforetablesort", {column: th_index, direction: sort_dir}); // More reliable method of forcing a redraw $table.css("display"); // Run sorting asynchronously on a timout to force browser redraw after // `beforetablesort` callback. Also avoids locking up the browser too much. setTimeout(function() { // Gather the elements for this column var column = []; var sortMethod = sortFns[type]; // Push either the value of the `data-order-by` attribute if specified // or just the text() value in this column to column[] for comparison. trs.each(function(index,tr) { var $e = $(tr).children().eq(th_index); var sort_val = $e.data("sort-value"); var order_by = typeof(sort_val) !== "undefined" ? sort_val : $e.text(); column.push(order_by); }); // Create the sort map. This column having a sort-dir implies it was // the last column sorted. As long as no data-sort-desc is specified, // we're free to just reverse the column. var theMap; if (sort_dir == dir.ASC) theMap = sort_map(column, sortMethod); else theMap = sort_map(column, function(a, b) { return -sortMethod(a, b); }); // Reset siblings $table.find("th").data("sort-dir", null).removeClass("sorting-desc sorting-asc"); $this.data("sort-dir", sort_dir).addClass("sorting-"+sort_dir); var sortedTRs = $(apply_sort_map(trs, theMap)); $table.children("tbody").remove(); $table.append("<tbody />").append(sortedTRs); // Trigger `aftertablesort` event. Similar to `beforetablesort` $table.trigger("aftertablesort", {column: th_index, direction: sort_dir}); // More reliable method of forcing a redraw $table.css("display"); }, 10); }); }); }; // Enum containing sorting directions $.fn.stupidtable.dir = {ASC: "asc", DESC: "desc"}; $.fn.stupidtable.default_sort_fns = { "int": function(a, b) { return parseInt(a, 10) - parseInt(b, 10); }, "float": function(a, b) { return parseFloat(a) - parseFloat(b); }, "string": function(a, b) { if (a < b) return -1; if (a > b) return +1; return 0; }, "string-ins": function(a, b) { a = a.toLowerCase(); b = b.toLowerCase(); if (a < b) return -1; if (a > b) return +1; return 0; } }; })(jQuery); $(function(){ var table = $("table").stupidtable(); table.on("beforetablesort", function (event, data) { // data.column - the index of the column sorted after a click // data.direction - the sorting direction (either asc or desc) $("#msg").text("Sorting index " + data.column) }); table.on("aftertablesort", function (event, data) { var th = $(this).find("th"); th.find(".arrow").remove(); var dir = $.fn.stupidtable.dir; var arrow = data.direction === dir.ASC ? "↑" : "↓"; th.eq(data.column).append('<span class="arrow">' + arrow +'</span>'); }); }); </script> <style type="text/css"> label { display: inline-block; width: 5em; } table { border-collapse: collapse; } th, td { padding: 5px 10px; border: 1px solid #999; } th { background-color: #a7d3ff; } th[data-sort]{ cursor:pointer; } a[data-lightbox]{ cursor:zoom-in; } #msg { color: green; } </style> </head> <body> <h1>Readthrough analyse results</h1> %s </body> </html> """ % (gene_table) html_file = open(html,"w") html_file.write(html_str) html_file.close() except Exception, e : stop_err('Error during html page creation : ' + str( e ) ) def __main__(): #Parse command line options parser = optparse.OptionParser() parser.add_option("-g", "--gff", dest="gff", type= "string", help="GFF annotation file", metavar="FILE") parser.add_option("-f", "--fasta", dest="fasta", type= "string", help="Fasta file ", metavar="FILE") parser.add_option("-b", "--bam", dest="bamfile", type= "string", help="Bam Ribo-Seq alignments ", metavar="FILE") parser.add_option("-d", "--dirout", dest="dirout", type= "string", help="write report in this html file and in associated directory", metavar="STR,STR") parser.add_option("-e", "--extend", dest="extend", type= "int",default = 300 , help="Lenght of extension after stop in number of base pairs (depends on your organisme)", metavar="INT") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") (options, args) = parser.parse_args() sys.stdout.write("Begin readthrough analysis at %s\n" % time.asctime( time.localtime(time.time()))) try: (html_file, subfolder ) = options.dirout.split(",") if not os.path.exists(subfolder): try: os.mkdir(subfolder) except Exception, e : stop_err('Error running make directory : ' + str(e)) ## identify GFF or GTF format from 9th column with open (options.gff,"r") as gffile : for line in gffile : if '#' in line : ## skip header gffile.next() elif 'gene_id' in line : ## launch gtf reader : GFF = ribo_functions.store_gtf(options.gff) break elif 'ID=' in line : ## launch gff reader GFF = ribo_functions.store_gff(options.gff) break else : stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) #GFF = store_gff(options.gff) #GFF = ribo_functions.store_gtf(options.gff) ## check gff reading if not GFF['order'] : stop_err( 'Incorrect GFF file' ) clean_file = ribo_functions.cleaning_bam(options.bamfile) compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend) if os.path.exists( clean_file ): os.remove( clean_file ) write_html_page(html_file,subfolder) ##paste jquery script in result directory : jq_src = os.path.join(os.path.dirname(__file__),'lightbox') shutil.copytree(jq_src,os.path.join(subfolder,'lightbox')) sys.stdout.write("Finish readthrough analysis at %s\n" % time.asctime( time.localtime(time.time()))) except Exception, e: stop_err( 'Error running metagene readthrough analysis : ' + str( e ) ) if __name__=="__main__": __main__()