Mercurial > repos > rlegendre > ribo_tools
changeset 19:385fc64fa988 draft default tip
Uploaded
author | rlegendre |
---|---|
date | Fri, 12 Jun 2015 11:32:59 -0400 |
parents | a121cce43f90 |
children | |
files | get_codon_frequency.py kmer_analysis.py metagene_frameshift_analysis.py metagene_readthrough.py ribo_functions.pyc |
diffstat | 5 files changed, 140 insertions(+), 73 deletions(-) [+] |
line wrap: on
line diff
--- a/get_codon_frequency.py Tue Jun 09 09:06:17 2015 -0400 +++ b/get_codon_frequency.py Fri Jun 12 11:32:59 2015 -0400 @@ -368,15 +368,15 @@ # write result with open(outfile, 'w') as out : - out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC(Mut/WT)\n') + out.write('Codon,Raw_' + c1 + ',Raw_' + c2 + ',Norm_' + c1 + ',Norm_' + c2 + ',FC(Mut/WT)\n') for i in codon_sorted: ## if global foldchange is equal to zero if cond1_norm[i] == 0 and cond2_norm[i] == 0: - out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t1.0\n') + out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',1.0\n') elif cond1_norm[i] == 0 : - out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n') + out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',0.0\n') else: - out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n') + out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',' + str(cond2_norm[i] / cond1_norm[i]) + '\n') with errstate(all='ignore'): chi = stats.chisquare(observed, expected) out.write('Khi2 test\n') @@ -478,14 +478,14 @@ # write result with open(outfile, 'w') as out : - out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC(Mut/WT)\n') + out.write('Codon,Raw_' + c1 + ',Raw_' + c2 + ',Norm_' + c1 + ',Norm_' + c2 + ',FC(Mut/WT)\n') for i in codon_sorted: if cond1_norm[i] == 0 and cond2_norm[i] == 0: - out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t1.0\n') + out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',1.0\n') elif cond1_norm[i] == 0 : - out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n') + out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',0.0\n') else: - out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n') + out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',' + str(cond2_norm[i] / cond1_norm[i]) + '\n') out.write('Khi2 test\n') with errstate(all='ignore'): chi = stats.chisquare(observed, expected)
--- a/kmer_analysis.py Tue Jun 09 09:06:17 2015 -0400 +++ b/kmer_analysis.py Fri Jun 12 11:32:59 2015 -0400 @@ -210,34 +210,37 @@ except IOError : print tmpdir+"/assoCov_"+chrom+".txt doesn't exist" - - ## if a gene without intron : - if GFF[gene]['exon_number'] == 1: - - ## get coverage for each gene - if GFF[gene]['strand'] == "+": - for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): - cov.append(int((data[i].rstrip()).split("\t")[0])) + try: + ## if a gene without intron : + if GFF[gene]['exon_number'] == 1: + + ## get coverage for each gene + if GFF[gene]['strand'] == "+": + for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): + cov.append(int((data[i].rstrip()).split("\t")[0])) + else : + for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): + cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) + cov.reverse() else : - for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): - cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) - cov.reverse() - else : - ## For each gene, get coverage and sum of exon size - if GFF[gene]['strand'] == "+": - - for exon in range(1,GFF[gene]['exon_number']+1) : - for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): - #if i <= GFF[gene]['stop'] : - cov.append(int((data[i].rstrip()).split("\t")[0])) - else : - - for exon in range(1,GFF[gene]['exon_number']+1) : - for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): - #if i <= GFF[gene]['start'] : - cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) - cov.reverse() + ## For each gene, get coverage and sum of exon size + if GFF[gene]['strand'] == "+": + for exon in range(1,GFF[gene]['exon_number']+1) : + for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): + #if i <= GFF[gene]['stop'] : + cov.append(int((data[i].rstrip()).split("\t")[0])) + else : + + for exon in range(1,GFF[gene]['exon_number']+1) : + for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): + #if i <= GFF[gene]['start'] : + cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) + cov.reverse() + except : + #print gene+" could not be analysed." + #del GFF[gene] + continue len_cov = len(cov) prop = [0,0,0] for nuc in range(0,len_cov-2,3) :
--- a/metagene_frameshift_analysis.py Tue Jun 09 09:06:17 2015 -0400 +++ b/metagene_frameshift_analysis.py Fri Jun 12 11:32:59 2015 -0400 @@ -184,9 +184,9 @@ mean_orf = 0 myheader = ['Gene','Name','Total_proportion','Dual_coding','RPKM','Segment_RPKM','Note'] - with open(subfolder+'/dual_coding.tab',"w") as out : + with open(subfolder+'/dual_coding.csv',"w") as out : #out.write('Gene\tName\tTotal_proportion\tDual_coding\tRPKM\tSegment_RPKM\tNote\n') - writer = csv.writer(out,delimiter='\t') + writer = csv.writer(out,delimiter=',') writer.writerow(myheader) whole_phasing = [0,0,0] for gene in GFF['order']: @@ -231,7 +231,7 @@ cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) cov.reverse() except : - print gene+" could not be analysed.\n" + print gene+" could not be analysed." del GFF[gene] continue
--- a/metagene_readthrough.py Tue Jun 09 09:06:17 2015 -0400 +++ b/metagene_readthrough.py Fri Jun 12 11:32:59 2015 -0400 @@ -16,6 +16,7 @@ import HTSeq #from matplotlib import pyplot as pl import matplotlib +from jinja2.nodes import Pos matplotlib.use('Agg') import matplotlib.pyplot as pl from numpy import arange @@ -404,14 +405,49 @@ except Exception, e: stop_err( 'Error during gene plotting : ' + str( e ) ) + +def __percent__(prop): + + if sum(prop) != 0 : + perc = [0,0,0] + if prop[0] != 0 : + perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop))) + if prop[1] != 0 : + perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop))) + if prop[2] != 0 : + perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop))) + return perc + else : + return prop + +def compute_phasing(chrom, start, stop, aln, kmer, strand): -def compute_analysis(bam, GFF, fasta, gff, dirout, extend) : + phasing = [0,0,0] + for track in aln.fetch(chrom, start, stop): + if strand == '-': + if track.is_reverse : + if len(track.seq) == kmer : + pos = stop - (track.pos + track.rlen - 12) + if pos > 0: + phasing[pos%3] += 1 + else : + if not track.is_reverse : + if len(track.seq) == kmer : + pos = (track.pos + 12) - start + if pos > 0: + phasing[pos%3] += 1 + + #return __percent__(phasing) + return phasing + + +def compute_analysis(bam, GFF, fasta, gff, dirout, extend, kmer) : try: - background_file = dirout+"/background_sequence.fasta" - file_back = open(background_file, 'w') - file_context = open(dirout+"/stop_context.fasta", 'w') - file_extension = open(dirout+"/extensions.fasta", 'w') + #background_file = dirout+"/background_sequence.fasta" + #file_back = open(background_file, 'w') + #file_context = open(dirout+"/stop_context.fasta", 'w') + #file_extension = open(dirout+"/extensions.fasta", 'w') ## Opening Bam file with pysam librarie pysam.index(bam) aln = pysam.Samfile(bam, "rb",header=True, check_header = True) @@ -429,7 +465,7 @@ with open(dirout+"/readthrough_result.csv","w") as out : myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence'] - writer = csv.writer(out,delimiter='\t') + writer = csv.writer(out,delimiter=',') writer.writerow(myheader) for gene in GFF['order'] : #print GFF[gene] @@ -522,7 +558,7 @@ if (seq): res = find_stop(seq) if res == -1 : - mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] + mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']] writer.writerow(mylist) else : indic = 'ok' @@ -541,7 +577,7 @@ if (seq): res = find_stop(seq) if res == -1 : - mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] + mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']] writer.writerow(mylist) else : indic = 'ok' @@ -584,6 +620,26 @@ ## if we have footprint in stop codon extension and stop extension is further than cds_stop+9 if count_stop > 2 and stop_ok == 1 : homo_cov = check_homo_coverage(gene,GFF,start_extension,stop_extension,aln) + # phasing = compute_phasing(GFF[gene]['chrom'],start_extension,stop_extension, aln, kmer,GFF[gene]['strand']) + # print gene, phasing +# count_after_stop = 0 +# try : +# ### count method of pysam doesn't strand information +# if GFF[gene]['strand'] == '+' : +# for track in aln.fetch(GFF[gene]['chrom'],stop_extension+15,stop_extension+40) : +# if not track.is_reverse : +# count_after_stop += 1 +# else : +# for track in aln.fetch(GFF[gene]['chrom'],start_extension-40,start_extension-15) : +# if track.is_reverse : +# count_after_stop += 1 +# except ValueError: +# ## genere warning about gtf coordinates +# warnings.warn("Please check coordinates in GFF : maybe stop or start codon are missing" ) +# pass +# print gene+"\t"+str(count_stop)+"\t"+str(count_after_stop) +# if (count_stop*5)/100 > count_after_stop : +# print "moins de 5%...\n" if (homo_cov) : ''' write result witch corresponding to readthrough @@ -600,10 +656,10 @@ ratio = rpkm_ext/rpkm_cds #print gene,ratio #print start_extension,stop_extension - mylist = [gene,GFF[gene]['name'],'-',contexte_stop,GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq] + mylist = [gene,GFF[gene]['name'],'-',contexte_stop,GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,seq,GFF[gene]['note']] writer.writerow(mylist) - file_context.write('>'+gene+'\n'+contexte_stop+'\n') - file_extension.write('>'+gene+'\n'+seq+'\n') + #file_context.write('>'+gene+'\n'+contexte_stop+'\n') + #file_extension.write('>'+gene+'\n'+seq+'\n') else : ''' write result witch corresponding to readthrough but with no homogeneous coverage @@ -617,10 +673,10 @@ rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot) ## compute ratio between extension coverage and cds coverage (rpkm) ratio = rpkm_ext/rpkm_cds - mylist = [gene,GFF[gene]['name'],'hetero cov',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq] + mylist = [gene,GFF[gene]['name'],'hetero cov',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,seq,GFF[gene]['note']] writer.writerow(mylist) - file_context.write('>'+gene+'\n'+contexte_stop+'\n') - file_extension.write('>'+gene+'\n'+seq+'\n') + #file_context.write('>'+gene+'\n'+contexte_stop+'\n') + #file_extension.write('>'+gene+'\n'+seq+'\n') #print ">"+gene+"\n"+contexte_stop ## plot gene : @@ -632,38 +688,38 @@ ''' write result with no footprint in stop codon of extension ''' - mylist = [gene,GFF[gene]['name'],'no RPF in stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] + mylist = [gene,GFF[gene]['name'],'no RPF in stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']] writer.writerow(mylist) - file_context.write('>'+gene+'\n'+contexte_stop+'\n') - file_extension.write('>'+gene+'\n'+seq+'\n') + #file_context.write('>'+gene+'\n'+contexte_stop+'\n') + #file_extension.write('>'+gene+'\n'+seq+'\n') #print ">"+gene+"\n"+contexte_stop else : ''' write result with RPF maybe result of reinitiation on a start codon ''' if pass_length(start_extension,stop_extension) : - mylist = [gene,GFF[gene]['name'],'Met after stop', contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq] + mylist = [gene,GFF[gene]['name'],'Met after stop', contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']] writer.writerow(mylist) - file_context.write('>'+gene+'\n'+contexte_stop+'\n') - file_extension.write('>'+gene+'\n'+seq+'\n') + #file_context.write('>'+gene+'\n'+contexte_stop+'\n') + #file_extension.write('>'+gene+'\n'+seq+'\n') #print ">"+gene+"\n"+contexte_stop - else : - ## if its not a interesting case, we get stop context of genes without readthrough - if GFF[gene]['strand'] == '+' : - contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6]) - file_back.write(contexte_stop+'\n') - else : - contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement()) - file_back.write(contexte_stop+'\n') - +# else : +# ## if its not a interesting case, we get stop context of genes without readthrough +# if GFF[gene]['strand'] == '+' : +# contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6]) +# file_back.write(contexte_stop+'\n') +# else : +# contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement()) +# file_back.write(contexte_stop+'\n') +# ## excluded UT with incorrect positions except ValueError: pass - file_context.close() - file_back.close() - file_extension.close() + #file_context.close() + #file_back.close() + #file_extension.close() except Exception,e: stop_err( 'Error during computing analysis : ' + str( e ) ) @@ -679,14 +735,14 @@ gene_table += '<thead><tr><th data-sort="string">Gene</th><th>Plot</th><th data-sort="string">Name</th><th>Stop context</th><th>Coordinates</th><th>RPKM CDS</th><th>RPKM extension</th><th data-sort="float">ratio</th><th>Extension</th></tr></thead><tbody>' with open(os.path.join(subfolder,'readthrough_result.csv'), 'rb') as csvfile: - spamreader = csv.reader(csvfile, delimiter='\t') + spamreader = csv.reader(csvfile, delimiter=',') ## skip the header next(spamreader, None) ##test if file is empty or not if next(spamreader, None): for row in spamreader: if row[2] == '-' : - gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s:%s-%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(row[0], row[0], row[0], row[0], row[11], row[1], row[3], row[4], row[5], row[6], row[8], row[9], row[10], row[12]) + gene_table += '<tr><td>%s</td><td><a href="%s.png" data-lightbox="%s"><img src="%s_thumbnail.png" /></a></td><td><a title="%s">%s</a></td><td>%s</td><td>%s:%s-%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' %(row[0], row[0], row[0], row[0], row[11], row[1], row[3], row[4], row[5], row[6], row[8], row[9], row[10], row[11]) gene_table += '</tbody></table>' else : @@ -931,7 +987,10 @@ help="Bam Ribo-Seq alignments ", metavar="FILE") parser.add_option("-d", "--dirout", dest="dirout", type= "string", - help="write report in this html file and in associated directory", metavar="STR,STR") + help="write report in this html file and in associated directory", metavar="STR,STR") + + parser.add_option("-k", "--kmer", dest="kmer", type= "int",default = 28 , + help="Longer of your phasing reads (Default is 28)", metavar="INT") parser.add_option("-e", "--extend", dest="extend", type= "int",default = 300 , help="Lenght of extension after stop in number of base pairs (depends on your organisme)", metavar="INT") @@ -972,8 +1031,13 @@ ## check gff reading if not GFF['order'] : stop_err( 'Incorrect GFF file' ) + for gene in GFF['order']: + if not GFF[gene]['exon'] : + del GFF[gene] + GFF['order'].remove(gene) + clean_file = ribo_functions.cleaning_bam(options.bamfile) - compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend) + compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend, options.kmer) if os.path.exists( clean_file ): os.remove( clean_file )