# HG changeset patch # User rlegendre # Date 1413817639 14400 # Node ID eea5fec46e5c97544aec049890c3115df8de28ea # Parent 8d8552899d20a9c4ba368cdb347afa7d8e5069ee Uploaded diff -r 8d8552899d20 -r eea5fec46e5c metagene_frameshift_analysis.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metagene_frameshift_analysis.py Mon Oct 20 11:07:19 2014 -0400 @@ -0,0 +1,884 @@ +#!/usr/bin/env python2.7 +#-*- coding: utf-8 -*- +''' + Created on sep. 2013 + @author: rachel legendre + @copyright: rachel.legendre@igmors.u-psud.fr + @license: GPL v3 +''' + + +import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time, csv +import math +from numpy import arange +#from matplotlib import pyplot as pl +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as pl +import itertools +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.Alphabet import IUPAC +from PIL import Image +##libraries for debugg +#import cPickle +#import pdb + + +##Setting +MIN_COV_REQUIREMENT = 500 +total_mapped_read = 0 +_MAX = 30 +whole_phasing_table = [[],[],[]] + + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) + sys.exit() + + +def split_bam(bamfile,tmpdir): + ''' + split bam by chromosome and write sam file in tmpdir + ''' + try: + #get header + results = subprocess.check_output(['samtools', 'view', '-H',bamfile]) + header = results.split('\n') + + #define genome size + genome = [] + for line in header: + result = re.search('SN', line) + if result : + #print line + feat = line.split('\t') + chrom = re.split(":", feat[1]) + #print feat[1] + genome.append(chrom[1]) + + #split sam by chrom + n = 0 + for chrm in genome: + with open(tmpdir+'/'+chrm+'.sam', 'w') as f : + #write header correctly for each chromosome + f.write(header[0]+'\n') + expr = re.compile(chrm+'\t') + el =[elem for elem in header if expr.search(elem)][0] + f.write(el+'\n') + f.write(header[-2]+'\n') + #write all reads for each chromosome + reads = subprocess.check_output(["samtools", "view", bamfile, chrm]) + f.write(reads) + # calculate number of reads + n += reads.count(chrm) + + sys.stdout.write("%d reads are presents in your bam file\n" % n) + + except Exception, e: + stop_err( 'Error during bam file splitting : ' + str( e ) ) + + + +def get_first_base(tmpdir, kmer): + ''' + write footprint coverage file for each sam file in tmpdir + ''' + global total_mapped_read + try: + file_array = (commands.getoutput('ls '+tmpdir)).split('\n') + ##write coverage for each sam file in tmpdir + for samfile in file_array: + with open(tmpdir+'/'+samfile, 'r') as sam : + ##get chromosome name + chrom = samfile.split(".sam")[0] + + for line in sam: + #initialize dictionnary + if '@SQ' in line : + size = int(line.split('LN:')[1]) + genomeF = [0]*size + genomeR = [0]*size + # define multiple reads keys from mapper + elif '@PG' in line : + if 'bowtie' in line: + multi_tag = "XS:i:" + elif 'bwa' in line: + multi_tag = "XT:A:R" + else : + stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") + + # get footprint + elif re.search('^[^@].+', line) : + #print line.rstrip() + read_pos = int(line.split('\t')[3]) + read_sens = int(line.split('\t')[1]) + len_read = len(line.split('\t')[9]) + read_qual = int(line.split('\t')[4]) + if len_read == kmer and multi_tag not in line and read_qual > 20 : + ###if line.split('\t')[5] == '28M' : + # print line.rstrip() + total_mapped_read +=1 + #if it's a forward read + if read_sens == 0 : + #get P-site : start read + 12 nt + read_pos += 15 + genomeF[read_pos] += 1 + #if it's a reverse read + elif read_sens == 16 : + #get P-site + read_pos += 12 + genomeR[read_pos] += 1 + + #try: + #write coverage in files + with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov : + for i in range(0,size): + cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n') + #except Exception,e: + # stop_err( 'Error during coverage file writting : ' + str( e ) ) + del genomeF + del genomeR + sys.stdout.write("%d reads are used for frame analysis\n" % total_mapped_read) + except Exception, e: + stop_err( 'Error during footprinting : ' + str( e ) ) + + + + + +def store_gff(gff): + ''' + parse and store gff file in a dictionnary + ''' + try: + GFF = {} + with open(gff, 'r') as f_gff : + GFF['order'] = [] + for line in f_gff: + ## switch commented lines + line = line.split("#")[0] + if line != "" : + feature = (line.split('\t')[8]).split(';') + # first line is already gene line : + if line.split('\t')[2] == 'gene' : + gene = feature[0].replace("ID=","") + if re.search('gene',feature[2]) : + Name = feature[2].replace("gene=","") + else : + Name = "Unknown" + ##get annotation + note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) + note = urllib.unquote(str(note)).replace("\n","") + ## store gene information + GFF['order'].append(gene) + GFF[gene] = {} + GFF[gene]['chrom'] = line.split('\t')[0] + GFF[gene]['start'] = int(line.split('\t')[3]) + GFF[gene]['stop'] = int(line.split('\t')[4]) + GFF[gene]['strand'] = line.split('\t')[6] + GFF[gene]['name'] = Name + GFF[gene]['note'] = note + GFF[gene]['exon'] = {} + GFF[gene]['exon_number'] = 0 + #print Name + elif line.split('\t')[2] == 'CDS' : + gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0]) + if GFF.has_key(gene) : + GFF[gene]['exon_number'] += 1 + exon_number = GFF[gene]['exon_number'] + GFF[gene]['exon'][exon_number] = {} + GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] + GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) + GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) + + ## if there is a five prim UTR intron, we change start of gene + elif line.split('\t')[2] == 'five_prime_UTR_intron' : + if GFF[gene]['strand'] == "+" : + GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] + else : + GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] + return GFF + except Exception,e: + stop_err( 'Error during gff storage : ' + str( e ) ) + + +#chrI SGD gene 87286 87752 . + . ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689 +#7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra +#ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr +#om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified +#chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified +#chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified +#chrI SGD intron 87388 87500 . + . Parent=YAL030W_mRNA;Name=YAL030W_intron;orf_classification=Verified + + +def __percent__(prop): + + if sum(prop) != 0 : + perc = [0,0,0] + if prop[0] != 0 : + perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop))) + if prop[1] != 0 : + perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop))) + if prop[2] != 0 : + perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop))) + return perc + else : + return prop + + +def frame_analysis(tmpdir,subfolder,cutoff,GFF,box_file): + ''' + This function take footprint and annotation (gff) for analyse reading frame in each gene + ''' + global total_mapped_read + global whole_phasing_table + try: + chrom = "" # initializing chromosome + nb_gene = 0 # number of analysed genes + myheader = ['Gene','Name','Total_proportion','Dual_coding','RPKM','Segment_RPKM','Note'] + + with open(subfolder+'/dual_coding.tab',"w") as out : + #out.write('Gene\tName\tTotal_proportion\tDual_coding\tRPKM\tSegment_RPKM\tNote\n') + writer = csv.writer(out,delimiter='\t') + writer.writerow(myheader) + whole_phasing = [0,0,0] + for gene in GFF['order']: + cov = [] + ##first chromosome : we open corresponding file + if chrom == "" : + chrom = GFF[gene]['chrom'] + with open(tmpdir+"/assoCov_"+chrom+".txt") as f : + data = f.readlines() + ##if we change chrosomosome + elif chrom != GFF[gene]['chrom'] : + chrom = GFF[gene]['chrom'] + with open(tmpdir+"/assoCov_"+chrom+".txt") as f : + data = f.readlines() + + ## if a gene without intron : + if GFF[gene]['exon_number'] == 1: + + ## get coverage for each gene + if GFF[gene]['strand'] == "+": + for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): + cov.append(int((data[i].rstrip()).split("\t")[0])) + else : + for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): + cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) + cov.reverse() + else : + ## For each gene, get coverage and sum of exon size + if GFF[gene]['strand'] == "+": + + for exon in range(1,GFF[gene]['exon_number']+1) : + for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): + cov.append(int((data[i].rstrip()).split("\t")[0])) + else : + + for exon in range(1,GFF[gene]['exon_number']+1) : + for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): + cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) + cov.reverse() + + len_cov = len(cov) + #segmentation by gene size + if len_cov < 1000 : + nb_segment = 3 + elif len_cov > 1000 and len_cov < 2000: + nb_segment = 6 + else : + nb_segment = 9 + ## number of aa in gene : + nb_aa = len_cov/3 + len_frag = len_cov/nb_segment + prop = [0,0,0] + seg_prop = [] + tot_prop = prop + tot_rpkm = 0 + seg_rpkm = [] + window = 0 + GFF[gene]['dual_coding'] = '-' + ''' segmentation et total proportions ''' + if sum(cov) > MIN_COV_REQUIREMENT: + nb_gene += 1 + for nuc in range(0,len_cov-2,3) : + window += 1 + tot_rpkm += sum(cov[nuc:nuc+2]) + ## Calculate proportion + prop[0] += cov[nuc] + prop[1] += cov[nuc+1] + prop[2] += cov[nuc+2] + #segmented proportion + if window == nb_aa/nb_segment: + seg_prop.append(__percent__(prop)) + whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop))) + tot_prop = (map(lambda (x, y): x + y, zip(tot_prop, prop))) + seg_rpkm.append("{0:.2f}".format((tot_rpkm*1000000.0)/(total_mapped_read*len_frag))) + window = 0 + tot_rpkm = 0 + prop = [0,0,0] + + GFF[gene]['total_prop'] = __percent__(tot_prop) + GFF[gene]['seg_prop'] = seg_prop + GFF[gene]['seg_rpkm'] = seg_rpkm + tot_rpkm = "{0:.2f}".format(sum(cov)*1000000.0/(total_mapped_read*len_cov)) + GFF[gene]['total_rpkm'] = tot_rpkm + ## If gene has non standard proportion + idx = 0 + for seg in GFF[gene]['seg_prop'] : + if (seg[0] < cutoff) and (sum(seg) != 0 and GFF[gene]['seg_rpkm'][idx] != 0.0) : + GFF[gene]['dual_coding'] = 'yes' + idx += 1 + for z in range(0,3) : + whole_phasing_table[z].append(__percent__(tot_prop)[z]) + else : + for nuc in range(0,len_cov-2,3) : + ## Calculate proportion + prop[0] += cov[nuc] + prop[1] += cov[nuc+1] + prop[2] += cov[nuc+2] + whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop))) + #for z in range(0,3) : + # whole_phasing_table[z].append(__percent__(tot_prop)[z]) + + if GFF[gene]['dual_coding'] == 'yes' : + mylist = [gene,GFF[gene]['name'],GFF[gene]['total_prop'],GFF[gene]['seg_prop'],GFF[gene]['total_rpkm'],GFF[gene]['seg_rpkm'],GFF[gene]['note']] + #out.write(gene+'\t'+GFF[gene]['name']+'\t'+str(GFF[gene]['total_prop'])+'\t'+str(GFF[gene]['seg_prop'])+'\t'+str(GFF[gene]['total_rpkm'])+'\t'+GFF[gene]['note']+'\n') + writer.writerow(mylist) + GFF[gene]['coverage'] = cov + else : + del GFF[gene] + + del GFF['order'] + + + whole_phasing = __percent__(whole_phasing) + sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing) + sys.stdout.write("%d genes are used for frame analysis\n" % nb_gene) + + ##produce boxplot + pl.boxplot(whole_phasing_table,notch=True,sym='ro') + pl.ylabel('Total number of footprints (percent)') + pl.draw() + pl.savefig(subfolder+'/boxplot.png',format='png', dpi=640) + pl.savefig(box_file, format='png', dpi=640) + pl.clf() + return GFF + + except Exception, e: + stop_err( 'Error during frame analysis : ' + str( e ) ) + +def get_orf(fasta,gene): + ''' + This function compute ORF in each gene and return multiple lists of methionines and stop in each frame + ''' + try : + SeqDict = SeqIO.to_dict(SeqIO.parse(open(fasta,"r"),"fasta")) + if gene['exon_number'] == 1 : + if gene['strand'] == '-' : + gene_seq = str(SeqDict[gene['chrom']].seq[gene['start']-2:gene['stop']+1].reverse_complement()) + else : + gene_seq = str(SeqDict[gene['chrom']].seq[gene['start']-2:gene['stop']+1]) + else : + ## get sequence of gene with intron with -1 and +1 nt for detect all phases + gene_seq = str(SeqDict[gene['chrom']].seq[gene['start']-2:gene['start']-1]) + for exon in range(1,gene['exon_number']+1) : + gene_seq = gene_seq+str(SeqDict[gene['chrom']].seq[gene['exon'][exon]['start']-1:gene['exon'][exon]['stop']]) + gene_seq = gene_seq+str(SeqDict[gene['chrom']].seq[gene['stop']:gene['stop']+1]) + if gene['strand'] == '-' : + my_seq = Seq(gene_seq,IUPAC.unambiguous_dna).reverse_complement() + #gene_seq = my_seq.tostring() DEPRECATED + gene_seq = str(my_seq) + # initialize array of start/stop array_1 is fame 0, array 2 is frame +1, array_3 is frame -1 + stop_1 = [] + stop_2 = [] + stop_3 = [] + met_1 = [] + met_2 = [] + met_3 = [] + stops = ['TAA','TGA','TAG'] + for z in range(0,len(gene_seq),3): + # Frame -1 + if gene_seq[z:z+3] == 'ATG' : + met_3.append(z) + elif gene_seq[z:z+3] in stops : + stop_3.append(z) + # Frame 0 + if gene_seq[z+1:z+4] == 'ATG' : + met_1.append(z+1) + elif gene_seq[z+1:z+4] in stops : + stop_1.append(z+1) + # Frame +1 + if gene_seq[z+2:z+5] == 'ATG' : + met_2.append(z+2) + elif gene_seq[z+2:z+5] in stops : + stop_2.append(z+2) + + + return gene_seq,met_1,stop_1,met_2,stop_2,met_3,stop_3 + + except Exception, e: + stop_err( 'Error during ORF computing : ' + str( e ) ) + +def plot_subprofile(GFF, directory,fasta) : + ''' + This function plot subprofiles of footprint and ORFs for each gene in GFF dict + ''' + global _MAX + try : + + for gene in GFF.iterkeys(): + nb_aa = len(GFF[gene]['coverage'])/3 + subprofile_1 = [] + subprofile_2 = [] + subprofile_3 = [] + cov = GFF[gene]['coverage'] + + for z in range(0,len(cov)-2,3): + for i in range(3): + subprofile_1.append(cov[z]) + subprofile_2.append(cov[z+1]) + subprofile_3.append(cov[z+2]) + + ## Normalisation of density plot + tot = max(cov) + + subprofile_1 = [(x * _MAX/tot) for x in subprofile_1] + subprofile_2 = [(x * _MAX/tot) for x in subprofile_2] + subprofile_3 = [(x * _MAX/tot) for x in subprofile_3] + + + if len(subprofile_1) == len(subprofile_2) and len(subprofile_1) == len(subprofile_3) : + + ind = arange(len(subprofile_1)) + fig = pl.figure(num=1) + #sub profile frame 0 + pl.subplot(4,1,1) + ax1 = fig.add_subplot(4,1,1) + pl.title( "%s (%s)\n$(%s RPKM)$"% (gene, GFF[gene]['name'], GFF[gene]['total_rpkm'])) + pl.ylim(0,_MAX) + pl.xlim(0,len(subprofile_1)) + pl.yticks([0, _MAX/2, _MAX]) + ax1.plot(ind, subprofile_1, 'b-', lw = 3,label = 'Frame 0') + ax1.fill_between(ind, 0,subprofile_1,color='b') + ax1.legend(loc=1,prop={'size':8}) + pl.ylabel('#RPF') + + ##sub profile frame +1 + pl.subplot(4,1,2) + ax2 = fig.add_subplot(4,1,2) + pl.ylim(0,_MAX) + pl.xlim(0,len(subprofile_2)) + pl.yticks([0, _MAX/2, _MAX]) + ax2.plot(ind,subprofile_2,'r-',lw = 3,label = 'Frame +1') + ax2.fill_between(ind, 0,subprofile_2,color='r') + ax2.legend(loc=1,prop={'size':8}) + pl.ylabel('#RPF') + + ##sub profile frame -1 + pl.subplot(4,1,3) + ax3 = fig.add_subplot(4,1,3) + pl.ylim(0,_MAX) + pl.yticks([0, _MAX/2, _MAX]) + pl.xlim(0,len(subprofile_3)) + ax3.plot(ind,subprofile_3,'g-',lw =3,label = 'Frame -1') + ax3.fill_between(ind, 0,subprofile_3,color='g') + ax3.legend(loc=1,prop={'size':8}) + pl.ylabel('#RPF') + + ## get orf for each frame + gene_seq,met_1,stop_1,met_2,stop_2,met_3,stop_3 = get_orf(fasta,GFF[gene]) + ## ORF plot + idc = arange(len(gene_seq)) + ##define axe for frame : + x = [0]*len(gene_seq) + y = [1]*len(gene_seq) + z = [2]*len(gene_seq) + ## define last subplot + #pdb.set_trace() + pl.subplot(4,1,4) + ## modify axes setting + ax4 = fig.add_subplot(4,1,4) + ax4.spines['right'].set_color('none') + ax4.spines['top'].set_color('none') + ax4.spines['left'].set_color('none') + ax4.spines['bottom'].set_color('none') + ax4.tick_params(bottom = 'off',top='off',left='off', right='off') + pl.ylim(0,3) + pl.xlim(0,len(gene_seq)) + pl.yticks([0.5,1.5,2.5],[r'-1',r'+1',r'0']) + pl.xticks([]) + ## plot line for delimiting frame and arrow corresponding to start and stop + pl.plot(idc,x,'k',lw=2) + for val in met_1: + pl.arrow( val, 2, 0.0, 0.5, fc="g", ec="g",lw=2) + for val in stop_1: + pl.arrow( val, 2, 0.0, 1, fc="r", ec="r",lw=2) + pl.plot(idc,y,'k') + for val in met_2: + pl.arrow( val, 1, 0.0, 0.5, fc="g", ec="g",lw=2) + for val in stop_2: + pl.arrow( val, 1, 0.0, 1, fc="r", ec="r",lw=2) + pl.plot(idc,z,'k') + for val in met_3: + pl.arrow( val, 0, 0.0, 0.5, fc="g", ec="g",lw=2) + for val in stop_3: + pl.arrow( val, 0, 0.0, 1, fc="r", ec="r",lw=2) + + + pl.ylabel('ORF',fontsize='small') + pl.draw() + pl.savefig(directory+'/'+gene+'.png',format='png', dpi=600) + pl.clf() + ##pl.show() + + ## Make thumbnail for html page + infile = directory+'/'+gene+'.png' + size = 128, 128 + im = Image.open(infile) + im.thumbnail(size, Image.ANTIALIAS) + im.save(directory+'/'+gene + "_thumbnail.png", "PNG") + + + else : + stop_err( 'Error during plot : lenght of subprofile are different ') + + except Exception, e: + stop_err( 'Error during subprofiles plotting : ' + str( e ) ) + +def write_html_page(html,subfolder,GFF) : + + + try : + gene_table = '' + gene_table += '
Gene | Plot | Name | Total Proportion | Segment proportion | RPKM | Segment RPKM |
---|---|---|---|---|---|---|
%s | ![]() | %s | %s | %s | %s | %s |