# HG changeset patch # User rlegendre # Date 1413817660 14400 # Node ID 29c9c86e17e1431c26f4548ab17f59eb6e210643 # Parent 0b8fa8b7f35613fa7911200115e2cc2bc32f9e8e Uploaded diff -r 0b8fa8b7f356 -r 29c9c86e17e1 metagene_readthrough.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metagene_readthrough.py Mon Oct 20 11:07:40 2014 -0400 @@ -0,0 +1,1026 @@ +#!/usr/bin/env python2.7.3 +#-*- coding: utf-8 -*- + +''' + Created on Dec. 2013 + @author: rachel legendre + @copyright: rachel.legendre@igmors.u-psud.fr + @license: GPL v3 +''' + +import os, sys, time, optparse, shutil, re, urllib, subprocess, tempfile +from urllib import unquote +from Bio import SeqIO +import csv +import pysam +import HTSeq +#from matplotlib import pyplot as pl +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as pl +from numpy import arange +from matplotlib import ticker as t +from PIL import Image + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) + sys.exit() + + +def store_gff(gff): + ''' + parse and store gff file in a dictionnary + ''' + try: + GFF = {} + with open(gff, 'r') as f_gff : + GFF['order'] = [] + for line in f_gff: + ## switch commented lines + line = line.split("#")[0] + if line != "" : + feature = (line.split('\t')[8]).split(';') + # first line is already gene line : + if line.split('\t')[2] == 'gene' : + gene = feature[0].replace("ID=","") + if re.search('gene',feature[2]) : + Name = feature[2].replace("gene=","") + else : + Name = "Unknown" + ##get annotation + note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) + note = unquote(str(note)).replace("\n","") + ## store gene information + GFF['order'].append(gene) + GFF[gene] = {} + GFF[gene]['chrom'] = line.split('\t')[0] + GFF[gene]['start'] = int(line.split('\t')[3]) + GFF[gene]['stop'] = int(line.split('\t')[4]) + GFF[gene]['strand'] = line.split('\t')[6] + GFF[gene]['name'] = Name + GFF[gene]['note'] = note + GFF[gene]['exon'] = {} + GFF[gene]['exon_number'] = 0 + #print Name + elif line.split('\t')[2] == 'CDS' : + gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0]) + if GFF.has_key(gene) : + GFF[gene]['exon_number'] += 1 + exon_number = GFF[gene]['exon_number'] + GFF[gene]['exon'][exon_number] = {} + GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] + GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) + GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) + + ## if there is a five prim UTR intron, we change start of gene + elif line.split('\t')[2] == 'five_prime_UTR_intron' : + if GFF[gene]['strand'] == "+" : + GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] + else : + GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] + return GFF + except Exception,e: + stop_err( 'Error during gff storage : ' + str( e ) ) + + + + +def compute_rpkm(length,count_gene,count_tot) : + + try : + rpkm = "{0:.4f}".format(count_gene*1000000.0/(count_tot*length)) + except ArithmeticError : + stop_err( 'Illegal division by zero') + return float(rpkm) + + +def find_stop(seq) : + ''' + Find stop codon in a sequence and return position of first nucleotide in stop + ''' + stop_codon = ['TAA','TAG','TGA'] + for nt in range(0,len(seq)-3,3) : + codon = seq[nt:nt+3] + if codon in stop_codon : + return nt + + return -1 + + +def check_met(seq): + ''' + Boolean function for testing presence or absence of methionine in 5 codons following stop codon + ''' + met = 'ATG' + for pos in range(0,15,3) : + codon = seq[pos:pos+3] + if codon in met : + return True + + return False + + +''' + feature.iv is a GenomicInterval object : + A GenomicInterval object has the following slots, some of which + are calculated from the other: + + chrom: The name of a sequence (i.e., chromosome, contig, or + the like). + start: The start of the interval. Even on the reverse strand, + this is always the smaller of the two values 'start' and 'end'. + Note that all positions should be given as 0-based value! + end: The end of the interval. Following Python convention for + ranges, this in one more than the coordinate of the last base + that is considered part of the sequence. + strand: The strand, as a single character, '+' or '-'. '.' indicates + that the strand is irrelavant. (Alternatively, pass a Strand object.) + length: The length of the interval, i.e., end - start + start_d: The "directional start" position. This is the position of the + first base of the interval, taking the strand into account. Hence, + this is the same as 'start' except when strand == '-', in which + case it is end-1. + end_d: The "directional end": Usually, the same as 'end', but for + strand=='-1', it is start-1. + +''' +def check_overlapping(gff_reader,chrm,start,stop,strand): + + #### probleme avec les genes completement inclu... + + iv2 = HTSeq.GenomicInterval(chrm,start,stop,strand) + for feature in gff_reader: + if feature.type == "gene" : + if feature.iv.overlaps(iv2) : + ## if its a reverse gene, we replace start of extension by start of previous gene + if strand == '-' : + return (feature.iv.end+3,stop) + ## else we replace stop of extension by start of following gene + else : + return (start,feature.iv.start-3) + ## if no overlap are find, we return -1 + return (start,stop) + + +def pass_length(start,stop) : + + if (stop-start) > 25 : + return True + else : + return False + + +def check_homo_coverage(gene,GFF,start,stop, aln) : + + chrom = GFF[gene]['chrom'] + ## compute nunber of nucleotides in CDS with a coverage equal to zero + nt_cds_cov = 0 + nt_cds_num = 0 + for i in range(1,GFF[gene]['exon_number']+1) : + for z in range(GFF[gene]['exon'][i]['start'],GFF[gene]['exon'][i]['stop']): + nt_cds_num += 1 + if aln.count(chrom,z,z+1) == 0 : + nt_cds_cov += 1 + + ## compute percent of CDS no covering + percent = nt_cds_cov*100/nt_cds_num + + ## compute number of nucleotides with no coverage in extension + nt_no_cover = 0 + for pos in range(start,stop,1) : + if aln.count(chrom,pos,pos+1) == 0 : + nt_no_cover += 1 + #print gene, nt_cds_cov, percent, nt_no_cover + #percent10 = (stop-start)*50/100 + if (nt_no_cover*100)/(stop-start) > percent : + return False + else : + return True + +def plot_gene ( aln, gene, start_extension, stop_extension, dirout ) : + + try: + strand = gene['strand'] + len_gene = gene['stop']-gene['start'] + if strand is "-" : + len_ext = gene['stop']-start_extension + ## coverage in all gene + extension + start = start_extension-100 + stop = gene['stop']+100 + vector1 = [0]*(stop-start) + #for pileupcolumn in aln.pileup( gene['chrom'], start, stop): + # vector.append(pileupcolumn.n) + for read in aln.fetch(gene['chrom'], start, stop): + if read.is_reverse : + ## for get footprint in P-site (estimate) we take 3 nt in middle of read + #pos = (read.pos+13)-start + pos = (read.pos-start) + (read.rlen/2)-1 + for z in range(pos,(pos+3)) : + ## le fetch contient des reads qui chevauchent les 30 nt de la fin du gene mais dont le site P + ## se trouve avant notre vector, le z devient negatif et la couverture augmente en fin de vecteur (ce qui est faux) + if z > 0 : + try : + vector1[z] += 1 + except IndexError : + pass + vector1.reverse() + mean_read = float(sum(vector1))/float(len(vector1)) + cov = [(x/mean_read) for x in vector1] + idx_tot = arange(len(cov)) + ## coverage in extension + start_ext = start_extension-40 + stop_ext = stop_extension+30 + vector2 = [0]*(stop_ext-start_ext) + #for pileupcolumn in aln.pileup( gene['chrom'], start, stop): + # vector.append(pileupcolumn.n) + for read in aln.fetch(gene['chrom'], start_extension, stop_ext): + if read.is_reverse : + ## get footprint in P-site + #pos = (read.pos+13)-start_ext + pos = (read.pos-start_ext) + (read.rlen/2)-1 + for z in range(pos,(pos+3)) : + if z > 0 : + try : + vector2[z] += 1 + except IndexError : + pass + vector2.reverse() + #mean_read = float(sum(vector))/float(len(vector)) + cov_ext = [(x/mean_read) for x in vector2] + _max = max(cov_ext[30::]) + idx_ext = arange(len(cov_ext)) + + else : + len_ext = stop_extension-gene['start'] + start = gene['start']-100 + stop = stop_extension+100 + vector = [0]*(stop-start) + #for pileupcolumn in aln.pileup( gene['chrom'], start, stop): + #vector.append(pileupcolumn.n) + for read in aln.fetch(gene['chrom'], start, stop): + if not read.is_reverse : + ## get footprint in P-site + #pos = (read.pos+12)-start + pos = (read.pos-start) + (read.rlen/2)-1 + for z in range(pos,(pos+3)) : + if z > 0 : + try : + vector[z] += 1 + except IndexError : + pass + mean_read = float(sum(vector))/float(len(vector)) + cov = [(x/mean_read) for x in vector] + idx_tot = arange(len(cov)) + + ## coverage in extension + start_ext = gene['stop']-30 + stop_ext = stop_extension+40 + vector = [0]*(stop-start_ext) + for read in aln.fetch(gene['chrom'], start_ext, stop_extension): + if not read.is_reverse : + ## get footprint in P-site + pos = (read.pos-start_ext) + (read.rlen/2)-1 + for z in range(pos,(pos+3)) : + if z > 0 : + try : + vector[z] += 1 + except IndexError : + pass + + cov_ext = [(x/mean_read) for x in vector] + idx_ext = arange(len(cov_ext)) + _max = max(cov_ext[30::]) + + #### PLOT FIGURE #### + + font = {'family' : 'serif','color': 'grey','weight' : 'normal','size' : 16 } + + + fig = pl.figure(num=1) + ## create a big subplot for common y axis on two last subplot + ax = fig.add_subplot(2,1,2) + ## hide all spines + ax.spines['top'].set_visible(False) + ax.spines['bottom'].set_visible(False) + ax.spines['left'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.tick_params(labelcolor='w', top='off', bottom='off', left='off', right='off') + + ## plot gene structure + ax1 = fig.add_subplot(3,1,1) + ax1.set_title(gene['name']) + ## hide all spines + ax1.spines['right'].set_color('none') + ax1.spines['top'].set_color('none') + ax1.spines['left'].set_color('none') + ax1.spines['bottom'].set_color('none') + ax1.set_ylim(0,5) + #set xlim as second plot with 100nt switch + ax1.set_xlim(-100,len(idx_tot)-100) + ## hide ticks + pl.yticks([]) + pl.xticks([]) + ## if it's a "simple" gene + if gene['exon_number'] == 1 : + exon = arange(len_gene) + x = [3]*len_gene + ax1.plot(exon,x,color='#9B9E9C') + ax1.fill_between(exon,2,x,color='#9B9E9C') + ## if it's a gene with introns + else : + ## plot a line representing intron + intron = arange(len_gene) + y = [2.5]*len_gene + ax1.plot(intron,y,color='#9B9E9C') + + ## plotting each intron + start = gene['start'] + if strand == '+' : + for exon in range(1,gene['exon_number']+1,1) : + len_exon = gene['exon'][exon]['stop']-gene['exon'][exon]['start'] + idx = gene['exon'][exon]['start'] - start + exo = arange(idx,idx+len_exon) + x = [3]*len_exon + ax1.plot(exo,x,color='#9B9E9C') + ax1.fill_between(exo,2,x,color='#9B9E9C') + else : + ## if it's a reverse gene we must reverse exon's position + start = gene['start'] + tabF = [2.5]*len_gene + tabR = [2.5]*len_gene + for exon in range(1,gene['exon_number']+1,1) : + len_exon = gene['exon'][exon]['stop']-gene['exon'][exon]['start'] + idx = gene['exon'][exon]['start'] - start + exo = arange(idx,idx+len_exon) + for z in exo : + tabF[z] = 3 + tabR[z] = 2 + tabF.reverse() + tabR.reverse() + #pl.ylim(0,5) + ax1.plot(tabF,color='#9B9E9C') + ax1.plot(tabR,color='#9B9E9C') + x = arange(len(tabR)) + ax1.fill_between(x,tabF,tabR,color='#9B9E9C') + + ## insert arrows (as genome browser representation) + yloc = 2.5 + narrows = 20 + exonwidth = .8 + spread = .4 * len_gene / narrows + for i in range(narrows): + loc = (float(i) * len_gene / narrows)+ (len(idx_tot)/100)*2 + if strand == '+' : + x = [loc - spread, loc, loc - spread] + else: + x = [loc - spread, loc, loc - spread] + y = [yloc - exonwidth / 5, yloc, yloc + exonwidth / 5] + ax1.plot(x, y, lw=1.4, color='#676B69') + + + # plot coverage in all gene + extension + ax2 = fig.add_subplot(3,1,2) + ## fixe limit to length of coverage for x axis + ax2.set_xlim(0,len(idx_tot)) + ## fixe 4 ticks and associated labels for y axis + ax2.set_yticklabels(arange(0,int(max(cov)+20),int((max(cov)+20)/4))) + ax2.set_yticks(arange(0,int(max(cov)+20),int((max(cov)+20)/4))) + ### add start and stop of gene in axe in place of number + ax2.set_xticks([100,(100+len_gene),(100+len_ext)]) + ax2.set_xticklabels(["{:,}".format(gene['start']),"{:,}".format(gene['stop']),""]) + ## hide spines and any axis + ax2.spines['right'].set_visible(False) + ax2.spines['top'].set_visible(False) + ax2.yaxis.set_ticks_position('left') + ax2.xaxis.set_ticks_position('bottom') + ## plot and fill + ax2.plot(idx_tot, cov, color="#CC0011") + ax2.fill_between(idx_tot, 0,cov,color="#CC0011") + ax2.set_title("Genomic coordinates (%s,%s)"% (gene['strand'],gene['chrom']) ,fontdict={'fontsize':'small'}) + + # plot zoom coverage in extension + ax3 = fig.add_subplot(3,1,3) + ## fixe limit to length of coverage for x axis + ax3.set_ylim(0,int(_max)) + # we hide spines + ax3.spines['right'].set_visible(False) + ax3.spines['top'].set_visible(False) + ax3.yaxis.set_ticks_position('left') + ax3.xaxis.set_ticks_position('bottom') + ## add stop and in-frame stop in x axis + pl.xticks([30,( stop_extension-start_extension)+30],["stop codon","next in-frame stop"]) + ## fixe good position for labels + (ax3.get_xticklabels()[0]).set_horizontalalignment('center') + (ax3.get_xticklabels()[1]).set_horizontalalignment('left') + #if max coverage is lower than 2, we have a illegal division by zero + if _max > 2 : + ax3.set_yticklabels(arange(0,int(_max+1),int((_max+1)/3))) + ax3.set_yticks(arange(0,int(_max+1),int((_max+1)/3))) + else : + ## + ax3.set_ylim(0,_max) + ax3.ticklabel_format(style='sci', scilimits=(-2,2), axis='y',useOffset=False) + + ax3.plot(idx_ext, cov_ext, color="#CC0011") + ax3.fill_between(idx_ext, 0,cov_ext,color="#CC0011") + + + ## get scale of subplot 3 + #ax3.text(ax3.get_xlim()[1]-50,ax3.get_ylim()[1]-1, r'50 nt', fontdict=font) + #pl.arrow( ax3.get_xlim()[1]-50, ax3.get_ylim()[1]-2, ax3.get_xlim()[1]-50, 0, fc="grey", ec="grey",lw=2) + + ## set common y label + ax.set_ylabel('Normalized footprint counts',labelpad=20) + + ## draw and save plot + pl.draw() + #pl.show() + pl.savefig(dirout+".png",format='png', dpi=300) + pl.clf() + + + ## Make thumbnail for html page + infile = dirout+'.png' + size = 128, 128 + im = Image.open(infile) + im.thumbnail(size, Image.ANTIALIAS) + im.save(dirout+"_thumbnail.png", "PNG") + + + except Exception, e: + stop_err( 'Error during gene plotting : ' + str( e ) ) + + +def compute_analysis(bam, GFF, fasta, gff, dirout) : + + try: + background_file = dirout+"/background_sequence.fasta" + file_back = open(background_file, 'w') + file_context = open(dirout+"/stop_context.fasta", 'w') + file_extension = open(dirout+"/extensions.fasta", 'w') + ## Opening Bam file with pysam librarie + pysam.index(bam) + aln = pysam.Samfile(bam, "rb",header=True, check_header = True) + ## Opening fasta file in a dict with BioPython + SeqDict = SeqIO.to_dict(SeqIO.parse(open(fasta,"r"),"fasta")) + + ## count total read in bam file + cmd = "samtools view -c %s " % (bam) + proc = subprocess.Popen( args=cmd, shell=True, stdout = subprocess.PIPE) + count_tot = int(proc.stdout.read().rstrip()) + returncode = proc.wait() + + ## opening a GFF reader for check overlapping + gff_reader = HTSeq.GFF_Reader(gff) + + with open(dirout+"/readthrough_result.csv","w") as out : + myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence'] + writer = csv.writer(out,delimiter='\t') + writer.writerow(myheader) + for gene in GFF['order'] : + indic = "" + # compute rpkm of CDS : + len_cds = GFF[gene]['stop']-GFF[gene]['start'] + count_cds = 0 + ### count method of pysam doesn't strand information + if GFF[gene]['strand'] == '+' : + for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) : + if not track.is_reverse : + count_cds += 1 + else : + for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) : + if track.is_reverse : + count_cds += 1 + rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot) + ## Only if gene is translate : + if rpkm_cds > 0 and count_cds > 128: + ## search footprint in UTR3 + count = 0 + try : + if GFF[gene]['strand'] == '+' : + contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6]) + #print gene, contexte_stop + start_extension = GFF[gene]['stop'] + stop_extension = GFF[gene]['stop']+90 + for track in aln.fetch(GFF[gene]['chrom'],start_extension+15,stop_extension) : + if not track.is_reverse : + count += 1 + + ## get sequence of extension + seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension]) + + else : + contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement()) + #print gene, contexte_stop + start_extension = GFF[gene]['start']-90 + stop_extension = GFF[gene]['start'] + for track in aln.fetch(GFF[gene]['chrom'],start_extension,stop_extension-15) : + if track.is_reverse : + count += 1 + ## get sequence of extension + seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) + + ## if we have coverage after cds stop codon + if count > 10 : + res = find_stop(seq) + if res == -1 : + ''' + Write results with no stop but RPF in extension + ''' + ## check if next in-frame codon is far than 90 nt extension : + if GFF[gene]['strand'] == '+' : + pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+300,'+') + start_extension = pos[0]-1 + stop_extension = pos[1] + seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension]) + + #print gene,count,pos,'\n',seq + + if (seq): + res = find_stop(seq) + if res == -1 : + mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] + writer.writerow(mylist) + else : + indic = 'ok' + #print res + #stop_extension = start_extension + res +3 + else : + pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-300,GFF[gene]['start']-1,'-') + start_extension = pos[0] + stop_extension = pos[1]+1 + seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) + + #print gene,count,pos,'\n',seq + + if (seq): + res = find_stop(seq) + if res == -1 : + mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] + writer.writerow(mylist) + else : + indic = 'ok' + #print res + #start_extension = stop_extension - res -3 + else : + indic = 'ok' + + + if indic == 'ok' : + ## We save new coordinates + if GFF[gene]['strand'] == '+' : + stop_extension = start_extension + res +3 + #print gene, count + #print gene,start_extension,stop_extension + seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension]) + #print seq + count_stop = aln.count(GFF[gene]['chrom'],stop_extension-2,stop_extension+2) + if pass_length(start_extension,stop_extension) : + count_ext = aln.count(GFF[gene]['chrom'],start_extension+9,stop_extension-15) + if stop_extension > GFF[gene]['stop']+9 : + stop_ok = 1 + else : + stop_ok = 0 + else : + start_extension = stop_extension - res - 3 + #print gene, count + #print gene,start_extension,stop_extension + seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension-1:stop_extension-1].reverse_complement()) + #print seq + count_stop = aln.count(GFF[gene]['chrom'],start_extension-2,start_extension+2) + if pass_length(start_extension,stop_extension) : + count_ext = aln.count(GFF[gene]['chrom'],start_extension+15,stop_extension-9) + if start_extension < GFF[gene]['start']-9 : + stop_ok = 1 + else : + stop_ok = 0 + ## if we are no methionine in 5 codons following stop of CDS + if (not check_met(seq) ): + ## if we have footprint in stop codon extension and stop extension is further than cds_stop+9 + if count_stop > 2 and stop_ok == 1 : + homo_cov = check_homo_coverage(gene,GFF,start_extension,stop_extension,aln) + if (homo_cov) : + ''' + write result witch corresponding to readthrough + ''' + ##if length of extension upper than 25 we can compute rpkm + if (not pass_length(start_extension,stop_extension)) : + len_ext = stop_extension-start_extension + rpkm_ext = 'nan' + ratio = 'nan' + else : + len_ext = stop_extension-start_extension + rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot) + ## compute ratio between extension coverage and cds coverage (rpkm) + ratio = rpkm_ext/rpkm_cds + #print gene,ratio + #print start_extension,stop_extension + mylist = [gene,GFF[gene]['name'],'-',contexte_stop,GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq] + writer.writerow(mylist) + file_context.write('>'+gene+'\n'+contexte_stop+'\n') + file_extension.write('>'+gene+'\n'+seq+'\n') + else : + ''' + write result witch corresponding to readthrough but with no homogeneous coverage + ''' + if (not pass_length(start_extension,stop_extension)) : + len_ext = stop_extension-start_extension + rpkm_ext = 'nan' + ratio = 'nan' + else : + len_ext = stop_extension-start_extension + rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot) + ## compute ratio between extension coverage and cds coverage (rpkm) + ratio = rpkm_ext/rpkm_cds + mylist = [gene,GFF[gene]['name'],'hetero cov',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq] + writer.writerow(mylist) + file_context.write('>'+gene+'\n'+contexte_stop+'\n') + file_extension.write('>'+gene+'\n'+seq+'\n') + #print ">"+gene+"\n"+contexte_stop + + ## plot gene : + plot_gene(aln, GFF[gene], start_extension, stop_extension, dirout+"/"+gene) + + + + else : + ''' + write result with no footprint in stop codon of extension + ''' + mylist = [gene,GFF[gene]['name'],'no RPF in stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] + writer.writerow(mylist) + file_context.write('>'+gene+'\n'+contexte_stop+'\n') + file_extension.write('>'+gene+'\n'+seq+'\n') + #print ">"+gene+"\n"+contexte_stop + else : + ''' + write result with RPF maybe result of reinitiation on a start codon + ''' + if pass_length(start_extension,stop_extension) : + mylist = [gene,GFF[gene]['name'],'Met after stop', contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] + writer.writerow(mylist) + file_context.write('>'+gene+'\n'+contexte_stop+'\n') + file_extension.write('>'+gene+'\n'+seq+'\n') + #print ">"+gene+"\n"+contexte_stop + else : + ## if its not a interesting case, we get stop context of genes without readthrough + if GFF[gene]['strand'] == '+' : + contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6]) + file_back.write(contexte_stop+'\n') + else : + contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement()) + file_back.write(contexte_stop+'\n') + + ## excluded UT with incorrect positions + except ValueError: + pass + + + file_context.close() + file_back.close() + file_extension.close() + except Exception,e: + stop_err( 'Error during computing analysis : ' + str( e ) ) + + +def cleaning_bam(bam): + ''' + Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12 + ''' + try : + header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE) + #header = results.split('\n') + + # check mapper for define multiple tag + if re.search('bowtie', header): + multi_tag = "XS:i:" + elif re.search('bwa', header): + multi_tag = "XT:A:M" + else : + stop_err("No PG tag find in"+bam+". Please use bowtie or bwa for mapping") + + tmp_sam = tempfile.mktemp() + cmd = "samtools view %s > %s" % (bam, tmp_sam) + proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) + returncode = proc.wait() + + + with open(tempfile.mktemp(),'w') as out : + out.write(header) + with open(tmp_sam,'r') as sam : + for line in sam : + if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : + if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 : + out.write(line) + bamfile = tempfile.mktemp()+'.bam' + cmd = "samtools view -hSb %s > %s" % (out.name,bamfile) + proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) + returncode = proc.wait() + + return bamfile + + except Exception,e: + stop_err( 'Error during cleaning bam : ' + str( e ) ) + +def write_html_page(html,subfolder) : + + + try : + + gene_table = '' + gene_table += '
Gene | Plot | Name | Stop context | Coordinates | RPKM CDS | RPKM extension | ratio | Extension |
---|---|---|---|---|---|---|---|---|
%s | ![]() | %s | %s | %s:%s-%s | %s | %s | %s | %s |