Mercurial > repos > rlegendre > ribo_tools
changeset 18:a121cce43f90 draft
Uploaded
author | rlegendre |
---|---|
date | Tue, 09 Jun 2015 09:06:17 -0400 |
parents | c87c40e642af |
children | 385fc64fa988 |
files | kmer_analysis.py metagene_frameshift_analysis.py ribo_functions.py |
diffstat | 3 files changed, 209 insertions(+), 179 deletions(-) [+] |
line wrap: on
line diff
--- a/kmer_analysis.py Fri May 29 09:17:29 2015 -0400 +++ b/kmer_analysis.py Tue Jun 09 09:06:17 2015 -0400 @@ -367,12 +367,11 @@ try: - if os.path.exists(options.dirout): - raise - try: - os.mkdir(options.dirout) - except: - raise + if not os.path.exists(options.dirout): + try: + os.mkdir(options.dirout) + except Exception, e : + stop_err('Error running make directory : ' + str(e)) ##testing indexed bam file if os.path.isfile(options.bamfile+".bai") : @@ -404,7 +403,10 @@ ## check gff reading if not GFF['order'] : stop_err( 'Incorrect GFF file' ) - + for gene in GFF['order']: + if not GFF[gene]['exon'] : + del GFF[gene] + GFF['order'].remove(gene) ## split bam split_bam(options.bamfile,tmpdir) ###################################
--- a/metagene_frameshift_analysis.py Fri May 29 09:17:29 2015 -0400 +++ b/metagene_frameshift_analysis.py Tue Jun 09 09:06:17 2015 -0400 @@ -204,32 +204,37 @@ data = f.readlines() except IOError : print tmpdir+"/assoCov_"+chrom+".txt doesn't exist" - - ## if a gene without intron : - if GFF[gene]['exon_number'] == 1: - - ## get coverage for each gene - if GFF[gene]['strand'] == "+": - for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): - cov.append(int((data[i].rstrip()).split("\t")[0])) + + try : + ## if a gene without intron : + if GFF[gene]['exon_number'] == 1: + + ## get coverage for each gene + if GFF[gene]['strand'] == "+": + for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): + cov.append(int((data[i].rstrip()).split("\t")[0])) + else : + for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): + cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) + cov.reverse() else : - for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): - cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) - cov.reverse() - else : - ## For each gene, get coverage and sum of exon size - if GFF[gene]['strand'] == "+": - - for exon in range(1,GFF[gene]['exon_number']+1) : - for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): - cov.append(int((data[i].rstrip()).split("\t")[0])) - else : - - for exon in range(1,GFF[gene]['exon_number']+1) : - for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): - cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) - cov.reverse() + ## For each gene, get coverage and sum of exon size + if GFF[gene]['strand'] == "+": + for exon in range(1,GFF[gene]['exon_number']+1) : + for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): + cov.append(int((data[i].rstrip()).split("\t")[0])) + else : + + for exon in range(1,GFF[gene]['exon_number']+1) : + for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): + cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) + cov.reverse() + except : + print gene+" could not be analysed.\n" + del GFF[gene] + continue + len_cov = len(cov) ## segmentation by orf_length nb_segment = int(len_cov/orf_length) @@ -237,14 +242,6 @@ if len_cov < orf_length : nb_segment = 1 -# #segmentation by gene size -# if len_cov < 1000 : -# nb_segment = 3 -# elif len_cov > 1000 and len_cov < 2000: -# nb_segment = 6 -# else : -# nb_segment = 9 - ## number of aa in gene : nb_aa = len_cov/3 len_frag = len_cov/nb_segment @@ -255,7 +252,7 @@ seg_rpkm = [] window = 0 GFF[gene]['dual_coding'] = '-' - ''' segmentation et total proportions ''' + ''' segmentation and total proportions ''' if sum(cov) > MIN_COV_REQUIREMENT: ## compute mean of ORF length mean_orf += len_cov @@ -290,16 +287,16 @@ idx += 1 for z in range(0,3) : whole_phasing_table[z].append(__percent__(tot_prop)[z]) - else : - for nuc in range(0,len_cov-2,3) : - ## Calculate proportion - prop[0] += cov[nuc] - prop[1] += cov[nuc+1] - prop[2] += cov[nuc+2] - whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop))) - #for z in range(0,3) : - # whole_phasing_table[z].append(__percent__(tot_prop)[z]) - +# else : +# for nuc in range(0,len_cov-2,3) : +# ## Calculate proportion +# prop[0] += cov[nuc] +# prop[1] += cov[nuc+1] +# prop[2] += cov[nuc+2] +# whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop))) +# for z in range(0,3) : +# whole_phasing_table[z].append(__percent__(tot_prop)[z]) + if GFF[gene]['dual_coding'] == 'yes' : mylist = [gene,GFF[gene]['name'],GFF[gene]['total_prop'],GFF[gene]['seg_prop'],GFF[gene]['total_rpkm'],GFF[gene]['seg_rpkm'],GFF[gene]['note']] #out.write(gene+'\t'+GFF[gene]['name']+'\t'+str(GFF[gene]['total_prop'])+'\t'+str(GFF[gene]['seg_prop'])+'\t'+str(GFF[gene]['total_rpkm'])+'\t'+GFF[gene]['note']+'\n') @@ -309,20 +306,23 @@ del GFF[gene] del GFF['order'] - - + whole_phasing = __percent__(whole_phasing) sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing) - sys.stdout.write("Mean length of ORF in your organism : %s\n" % int(mean_orf/nb_gene)) - sys.stdout.write("%d genes are used for frame analysis\n" % nb_gene) + if nb_gene == 0: + sys.stdout.write("%d gene are used for frame analysis: each gene require %d reads at least to be analysed.\n" % (nb_gene, MIN_COV_REQUIREMENT)) + else : + sys.stdout.write("Mean length of ORF in your organism : %s\n" % int(mean_orf/nb_gene)) + sys.stdout.write("%d genes are used for frame analysis\n" % nb_gene) ##produce boxplot pl.boxplot(whole_phasing_table,notch=True,sym='ro') pl.ylabel('Total number of footprints (percent)') + pl.ylim(0,100) pl.draw() pl.savefig(subfolder+'/boxplot.png',format='png', dpi=640) pl.savefig(box_file, format='png', dpi=640) - + pl.clf() return GFF @@ -381,6 +381,8 @@ except Exception, e: stop_err( 'Error during ORF computing : ' + str( e ) ) + + def plot_subprofile(GFF, directory,fasta) : ''' This function plot subprofiles of footprint and ORFs for each gene in GFF dict @@ -389,116 +391,119 @@ try : for gene in GFF.iterkeys(): - nb_aa = len(GFF[gene]['coverage'])/3 - subprofile_1 = [] - subprofile_2 = [] - subprofile_3 = [] - cov = GFF[gene]['coverage'] + if GFF[gene]['coverage'] : + nb_aa = len(GFF[gene]['coverage'])/3 + subprofile_1 = [] + subprofile_2 = [] + subprofile_3 = [] + cov = GFF[gene]['coverage'] + - for z in range(0,len(cov)-2,3): - for i in range(3): - subprofile_1.append(cov[z]) - subprofile_2.append(cov[z+1]) - subprofile_3.append(cov[z+2]) - - ## Normalisation of density plot - tot = max(cov) - - subprofile_1 = [(x * _MAX/tot) for x in subprofile_1] - subprofile_2 = [(x * _MAX/tot) for x in subprofile_2] - subprofile_3 = [(x * _MAX/tot) for x in subprofile_3] - - - if len(subprofile_1) == len(subprofile_2) and len(subprofile_1) == len(subprofile_3) : - ind = arange(len(subprofile_1)) - fig = pl.figure(num=1) - #sub profile frame 0 - pl.subplot(4,1,1) - ax1 = fig.add_subplot(4,1,1) - pl.title( "%s (%s)\n$(%s RPKM)$"% (gene, GFF[gene]['name'], GFF[gene]['total_rpkm'])) - pl.ylim(0,_MAX) - pl.xlim(0,len(subprofile_1)) - pl.yticks([0, _MAX/2, _MAX]) - ax1.plot(ind, subprofile_1, 'b-', lw = 3,label = 'Frame 0') - ax1.fill_between(ind, 0,subprofile_1,color='b') - ax1.legend(loc=1,prop={'size':8}) - pl.ylabel('#RPF') + for z in range(0,len(cov)-2,3): + for i in range(3): + subprofile_1.append(cov[z]) + subprofile_2.append(cov[z+1]) + subprofile_3.append(cov[z+2]) - ##sub profile frame +1 - pl.subplot(4,1,2) - ax2 = fig.add_subplot(4,1,2) - pl.ylim(0,_MAX) - pl.xlim(0,len(subprofile_2)) - pl.yticks([0, _MAX/2, _MAX]) - ax2.plot(ind,subprofile_2,'r-',lw = 3,label = 'Frame +1') - ax2.fill_between(ind, 0,subprofile_2,color='r') - ax2.legend(loc=1,prop={'size':8}) - pl.ylabel('#RPF') + ## Normalisation of density plot + tot = max(cov) + + subprofile_1 = [(x * _MAX/tot) for x in subprofile_1] + subprofile_2 = [(x * _MAX/tot) for x in subprofile_2] + subprofile_3 = [(x * _MAX/tot) for x in subprofile_3] + - ##sub profile frame -1 - pl.subplot(4,1,3) - ax3 = fig.add_subplot(4,1,3) - pl.ylim(0,_MAX) - pl.yticks([0, _MAX/2, _MAX]) - pl.xlim(0,len(subprofile_3)) - ax3.plot(ind,subprofile_3,'g-',lw =3,label = 'Frame -1') - ax3.fill_between(ind, 0,subprofile_3,color='g') - ax3.legend(loc=1,prop={'size':8}) - pl.ylabel('#RPF') - - ## get orf for each frame - gene_seq,met_1,stop_1,met_2,stop_2,met_3,stop_3 = get_orf(fasta,GFF[gene]) - ## ORF plot - idc = arange(len(gene_seq)) - ##define axe for frame : - x = [0]*len(gene_seq) - y = [1]*len(gene_seq) - z = [2]*len(gene_seq) - ## define last subplot - #pdb.set_trace() - pl.subplot(4,1,4) - ## modify axes setting - ax4 = fig.add_subplot(4,1,4) - ax4.spines['right'].set_color('none') - ax4.spines['top'].set_color('none') - ax4.spines['left'].set_color('none') - ax4.spines['bottom'].set_color('none') - ax4.tick_params(bottom = 'off',top='off',left='off', right='off') - pl.ylim(0,3) - pl.xlim(0,len(gene_seq)) - pl.yticks([0.5,1.5,2.5],[r'-1',r'+1',r'0']) - pl.xticks([]) - ## plot line for delimiting frame and arrow corresponding to start and stop - pl.plot(idc,x,'k',lw=2) - for val in met_1: - pl.arrow( val, 2, 0.0, 0.5, fc="g", ec="g",lw=2) - for val in stop_1: - pl.arrow( val, 2, 0.0, 1, fc="r", ec="r",lw=2) - pl.plot(idc,y,'k') - for val in met_2: - pl.arrow( val, 1, 0.0, 0.5, fc="g", ec="g",lw=2) - for val in stop_2: - pl.arrow( val, 1, 0.0, 1, fc="r", ec="r",lw=2) - pl.plot(idc,z,'k') - for val in met_3: - pl.arrow( val, 0, 0.0, 0.5, fc="g", ec="g",lw=2) - for val in stop_3: - pl.arrow( val, 0, 0.0, 1, fc="r", ec="r",lw=2) - - - pl.ylabel('ORF',fontsize='small') - pl.draw() - pl.savefig(directory+'/'+gene+'.png',format='png', dpi=600) - pl.clf() - ##pl.show() - - ## Make thumbnail for html page - infile = directory+'/'+gene+'.png' - size = 128, 128 - im = Image.open(infile) - im.thumbnail(size, Image.ANTIALIAS) - im.save(directory+'/'+gene + "_thumbnail.png", "PNG") + if len(subprofile_1) == len(subprofile_2) and len(subprofile_1) == len(subprofile_3) : + + ind = arange(len(subprofile_1)) + fig = pl.figure(num=1) + #sub profile frame 0 + pl.subplot(4,1,1) + ax1 = fig.add_subplot(4,1,1) + pl.title( "%s (%s)\n$(%s RPKM)$"% (gene, GFF[gene]['name'], GFF[gene]['total_rpkm'])) + pl.ylim(0,_MAX) + pl.xlim(0,len(subprofile_1)) + pl.yticks([0, _MAX/2, _MAX]) + ax1.plot(ind, subprofile_1, 'b-', lw = 3,label = 'Frame 0') + ax1.fill_between(ind, 0,subprofile_1,color='b') + ax1.legend(loc=1,prop={'size':8}) + pl.ylabel('#RPF') + + ##sub profile frame +1 + pl.subplot(4,1,2) + ax2 = fig.add_subplot(4,1,2) + pl.ylim(0,_MAX) + pl.xlim(0,len(subprofile_2)) + pl.yticks([0, _MAX/2, _MAX]) + ax2.plot(ind,subprofile_2,'r-',lw = 3,label = 'Frame +1') + ax2.fill_between(ind, 0,subprofile_2,color='r') + ax2.legend(loc=1,prop={'size':8}) + pl.ylabel('#RPF') + + ##sub profile frame -1 + pl.subplot(4,1,3) + ax3 = fig.add_subplot(4,1,3) + pl.ylim(0,_MAX) + pl.yticks([0, _MAX/2, _MAX]) + pl.xlim(0,len(subprofile_3)) + ax3.plot(ind,subprofile_3,'g-',lw =3,label = 'Frame -1') + ax3.fill_between(ind, 0,subprofile_3,color='g') + ax3.legend(loc=1,prop={'size':8}) + pl.ylabel('#RPF') + + ## get orf for each frame + gene_seq,met_1,stop_1,met_2,stop_2,met_3,stop_3 = get_orf(fasta,GFF[gene]) + ## ORF plot + idc = arange(len(gene_seq)) + ##define axe for frame : + x = [0]*len(gene_seq) + y = [1]*len(gene_seq) + z = [2]*len(gene_seq) + ## define last subplot + #pdb.set_trace() + pl.subplot(4,1,4) + ## modify axes setting + ax4 = fig.add_subplot(4,1,4) + ax4.spines['right'].set_color('none') + ax4.spines['top'].set_color('none') + ax4.spines['left'].set_color('none') + ax4.spines['bottom'].set_color('none') + ax4.tick_params(bottom = 'off',top='off',left='off', right='off') + pl.ylim(0,3) + pl.xlim(0,len(gene_seq)) + pl.yticks([0.5,1.5,2.5],[r'-1',r'+1',r'0']) + pl.xticks([]) + ## plot line for delimiting frame and arrow corresponding to start and stop + pl.plot(idc,x,'k',lw=2) + for val in met_1: + pl.arrow( val, 2, 0.0, 0.5, fc="g", ec="g",lw=2) + for val in stop_1: + pl.arrow( val, 2, 0.0, 1, fc="r", ec="r",lw=2) + pl.plot(idc,y,'k') + for val in met_2: + pl.arrow( val, 1, 0.0, 0.5, fc="g", ec="g",lw=2) + for val in stop_2: + pl.arrow( val, 1, 0.0, 1, fc="r", ec="r",lw=2) + pl.plot(idc,z,'k') + for val in met_3: + pl.arrow( val, 0, 0.0, 0.5, fc="g", ec="g",lw=2) + for val in stop_3: + pl.arrow( val, 0, 0.0, 1, fc="r", ec="r",lw=2) + + + pl.ylabel('ORF',fontsize='small') + pl.draw() + pl.savefig(directory+'/'+gene+'.png',format='png', dpi=600) + pl.clf() + ##pl.show() + + ## Make thumbnail for html page + infile = directory+'/'+gene+'.png' + size = 128, 128 + im = Image.open(infile) + im.thumbnail(size, Image.ANTIALIAS) + im.save(directory+'/'+gene + "_thumbnail.png", "PNG") else : @@ -511,12 +516,15 @@ try : - gene_table = '' - gene_table += '<table>' - gene_table += '<thead><tr><th data-sort="string">Gene</th><th>Plot</th><th data-sort="string">Name</th><th>Total Proportion</th><th>Segment proportion</th><th data-sort="float">RPKM</th><th>Segment RPKM</th></tr></thead><tbody>' - for gene in GFF.iterkeys(): - gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(gene,gene,gene,gene,GFF[gene]['note'],GFF[gene]['name'],GFF[gene]['total_prop'],GFF[gene]['seg_prop'],GFF[gene]['total_rpkm'],GFF[gene]['seg_rpkm']) - gene_table += '</tbody></table>' + if not GFF: + gene_table = 'Sorry, there are no dual coding events in your data\n' + else : + gene_table = '' + gene_table += '<table>' + gene_table += '<thead><tr><th data-sort="string">Gene</th><th>Plot</th><th data-sort="string">Name</th><th>Total Proportion</th><th>Segment proportion</th><th data-sort="float">RPKM</th><th>Segment RPKM</th></tr></thead><tbody>' + for gene in GFF.iterkeys(): + gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(gene,gene,gene,gene,GFF[gene]['note'],GFF[gene]['name'],GFF[gene]['total_prop'],GFF[gene]['seg_prop'],GFF[gene]['total_rpkm'],GFF[gene]['seg_rpkm']) + gene_table += '</tbody></table>' html_str = """ @@ -797,12 +805,11 @@ proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) returncode = proc.wait() - if os.path.exists(subfolder): - raise - try: - os.mkdir(subfolder) - except: - raise + if not os.path.exists(subfolder): + try: + os.mkdir(subfolder) + except Exception, e : + stop_err('Error running make directory : ' + str(e)) ## check frame arg if options.frame not in [1,2,3]: @@ -831,7 +838,11 @@ ## check gff reading if not GFF['order'] : stop_err( 'Incorrect GFF file') - # + for gene in GFF['order']: + if not GFF[gene]['exon'] : + del GFF[gene] + GFF['order'].remove(gene) + #### cPickles for Test #### #if os.path.isfile("/home/rlegendre/Documents/FrameShift/OutOfFrameWithIntron/pyt_dict") : # with open("/home/rlegendre/Documents/FrameShift/OutOfFrameWithIntron/pyt_dict",'rb') as fp: @@ -861,7 +872,7 @@ stop_err( 'Error running metagene analysis : ' + str( e ) ) if os.path.exists( tmp_dir ): - shutil.rmtree( tmp_dir ) + shutil.rmtree( tmp_dir )
--- a/ribo_functions.py Fri May 29 09:17:29 2015 -0400 +++ b/ribo_functions.py Tue Jun 09 09:06:17 2015 -0400 @@ -141,6 +141,7 @@ ''' try: GFF = {} + mRNA = {} with open(gff, 'r') as f_gff : GFF['order'] = [] @@ -153,8 +154,9 @@ # first line is already gene line : if line.split('\t')[2] == 'gene' : gene = feature[0].replace("ID=","") + curent_gene = gene if 'Name' in line : - regex = re.compile('(Name=)([^;]*);') + regex = re.compile('(Name=)([^;]*);?') res = regex.search(line.split('\t')[8]) Name = res.group(2) Name = Name.rstrip() @@ -162,7 +164,7 @@ Name = "Unknown" ##get annotation if 'Note' in line : - regex = re.compile('(Note=)([^;]*);') + regex = re.compile('(Note=)([^;]*);?') res = regex.search(line.split('\t')[8]) note = res.group(2) note = urllib.unquote(str(note)).replace("\n","") @@ -180,12 +182,27 @@ GFF[gene]['exon'] = {} GFF[gene]['exon_number'] = 0 #print Name + elif line.split('\t')[2] == 'mRNA' : + regex = re.compile('(Parent=)([^;]*);?') + res = regex.search(line.split('\t')[8]) + gene_name = res.group(2) + regex = re.compile('(ID=)([^;]*);?') + res = regex.search(line.split('\t')[8]) + mRNA_name = res.group(2) + if gene not in mRNA.viewvalues() and gene_name == curent_gene : + mRNA[mRNA_name] = gene_name + elif line.split('\t')[2] == 'CDS' : - regex = re.compile('(Parent=)([^;]*);') + regex = re.compile('(Parent=)([^;]*);?') res = regex.search(line.split('\t')[8]) gene = res.group(2) + if 'mRNA' in gene: gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene) + if mRNA.has_key(gene) and GFF.has_key(mRNA[gene]): + + gene = gene_name + if GFF.has_key(gene) : GFF[gene]['exon_number'] += 1 exon_number = GFF[gene]['exon_number'] @@ -193,7 +210,7 @@ GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) - + ## if there is a five prim UTR intron, we change start of gene elif line.split('\t')[2] == 'five_prime_UTR_intron' : if GFF[gene]['strand'] == "+" :