# HG changeset patch
# User rlegendre
# Date 1434123179 14400
# Node ID 385fc64fa988d6da767d254e2043603191255502
# Parent a121cce43f909ef88a0b761667a6fdf722450d7c
Uploaded
diff -r a121cce43f90 -r 385fc64fa988 get_codon_frequency.py
--- a/get_codon_frequency.py Tue Jun 09 09:06:17 2015 -0400
+++ b/get_codon_frequency.py Fri Jun 12 11:32:59 2015 -0400
@@ -368,15 +368,15 @@
# write result
with open(outfile, 'w') as out :
- out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC(Mut/WT)\n')
+ out.write('Codon,Raw_' + c1 + ',Raw_' + c2 + ',Norm_' + c1 + ',Norm_' + c2 + ',FC(Mut/WT)\n')
for i in codon_sorted:
## if global foldchange is equal to zero
if cond1_norm[i] == 0 and cond2_norm[i] == 0:
- out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t1.0\n')
+ out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',1.0\n')
elif cond1_norm[i] == 0 :
- out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n')
+ out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',0.0\n')
else:
- out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
+ out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
with errstate(all='ignore'):
chi = stats.chisquare(observed, expected)
out.write('Khi2 test\n')
@@ -478,14 +478,14 @@
# write result
with open(outfile, 'w') as out :
- out.write('Codon\tRaw_' + c1 + '\tRaw_' + c2 + '\tNorm_' + c1 + '\tNorm_' + c2 + '\tFC(Mut/WT)\n')
+ out.write('Codon,Raw_' + c1 + ',Raw_' + c2 + ',Norm_' + c1 + ',Norm_' + c2 + ',FC(Mut/WT)\n')
for i in codon_sorted:
if cond1_norm[i] == 0 and cond2_norm[i] == 0:
- out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t1.0\n')
+ out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',1.0\n')
elif cond1_norm[i] == 0 :
- out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n')
+ out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',0.0\n')
else:
- out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
+ out.write(i + ',' + str(cond1[i]) + ',' + str(cond2[i]) + ',' + str(cond1_norm[i]) + ',' + str(cond2_norm[i]) + ',' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
out.write('Khi2 test\n')
with errstate(all='ignore'):
chi = stats.chisquare(observed, expected)
diff -r a121cce43f90 -r 385fc64fa988 kmer_analysis.py
--- a/kmer_analysis.py Tue Jun 09 09:06:17 2015 -0400
+++ b/kmer_analysis.py Fri Jun 12 11:32:59 2015 -0400
@@ -210,34 +210,37 @@
except IOError :
print tmpdir+"/assoCov_"+chrom+".txt doesn't exist"
-
- ## if a gene without intron :
- if GFF[gene]['exon_number'] == 1:
-
- ## get coverage for each gene
- if GFF[gene]['strand'] == "+":
- for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
- cov.append(int((data[i].rstrip()).split("\t")[0]))
+ try:
+ ## if a gene without intron :
+ if GFF[gene]['exon_number'] == 1:
+
+ ## get coverage for each gene
+ if GFF[gene]['strand'] == "+":
+ for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
+ cov.append(int((data[i].rstrip()).split("\t")[0]))
+ else :
+ for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
+ cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+ cov.reverse()
else :
- for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
- cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
- cov.reverse()
- else :
- ## For each gene, get coverage and sum of exon size
- if GFF[gene]['strand'] == "+":
-
- for exon in range(1,GFF[gene]['exon_number']+1) :
- for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
- #if i <= GFF[gene]['stop'] :
- cov.append(int((data[i].rstrip()).split("\t")[0]))
- else :
-
- for exon in range(1,GFF[gene]['exon_number']+1) :
- for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
- #if i <= GFF[gene]['start'] :
- cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
- cov.reverse()
+ ## For each gene, get coverage and sum of exon size
+ if GFF[gene]['strand'] == "+":
+ for exon in range(1,GFF[gene]['exon_number']+1) :
+ for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
+ #if i <= GFF[gene]['stop'] :
+ cov.append(int((data[i].rstrip()).split("\t")[0]))
+ else :
+
+ for exon in range(1,GFF[gene]['exon_number']+1) :
+ for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
+ #if i <= GFF[gene]['start'] :
+ cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+ cov.reverse()
+ except :
+ #print gene+" could not be analysed."
+ #del GFF[gene]
+ continue
len_cov = len(cov)
prop = [0,0,0]
for nuc in range(0,len_cov-2,3) :
diff -r a121cce43f90 -r 385fc64fa988 metagene_frameshift_analysis.py
--- a/metagene_frameshift_analysis.py Tue Jun 09 09:06:17 2015 -0400
+++ b/metagene_frameshift_analysis.py Fri Jun 12 11:32:59 2015 -0400
@@ -184,9 +184,9 @@
mean_orf = 0
myheader = ['Gene','Name','Total_proportion','Dual_coding','RPKM','Segment_RPKM','Note']
- with open(subfolder+'/dual_coding.tab',"w") as out :
+ with open(subfolder+'/dual_coding.csv',"w") as out :
#out.write('Gene\tName\tTotal_proportion\tDual_coding\tRPKM\tSegment_RPKM\tNote\n')
- writer = csv.writer(out,delimiter='\t')
+ writer = csv.writer(out,delimiter=',')
writer.writerow(myheader)
whole_phasing = [0,0,0]
for gene in GFF['order']:
@@ -231,7 +231,7 @@
cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
cov.reverse()
except :
- print gene+" could not be analysed.\n"
+ print gene+" could not be analysed."
del GFF[gene]
continue
diff -r a121cce43f90 -r 385fc64fa988 metagene_readthrough.py
--- a/metagene_readthrough.py Tue Jun 09 09:06:17 2015 -0400
+++ b/metagene_readthrough.py Fri Jun 12 11:32:59 2015 -0400
@@ -16,6 +16,7 @@
import HTSeq
#from matplotlib import pyplot as pl
import matplotlib
+from jinja2.nodes import Pos
matplotlib.use('Agg')
import matplotlib.pyplot as pl
from numpy import arange
@@ -404,14 +405,49 @@
except Exception, e:
stop_err( 'Error during gene plotting : ' + str( e ) )
+
+def __percent__(prop):
+
+ if sum(prop) != 0 :
+ perc = [0,0,0]
+ if prop[0] != 0 :
+ perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
+ if prop[1] != 0 :
+ perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
+ if prop[2] != 0 :
+ perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
+ return perc
+ else :
+ return prop
+
+def compute_phasing(chrom, start, stop, aln, kmer, strand):
-def compute_analysis(bam, GFF, fasta, gff, dirout, extend) :
+ phasing = [0,0,0]
+ for track in aln.fetch(chrom, start, stop):
+ if strand == '-':
+ if track.is_reverse :
+ if len(track.seq) == kmer :
+ pos = stop - (track.pos + track.rlen - 12)
+ if pos > 0:
+ phasing[pos%3] += 1
+ else :
+ if not track.is_reverse :
+ if len(track.seq) == kmer :
+ pos = (track.pos + 12) - start
+ if pos > 0:
+ phasing[pos%3] += 1
+
+ #return __percent__(phasing)
+ return phasing
+
+
+def compute_analysis(bam, GFF, fasta, gff, dirout, extend, kmer) :
try:
- background_file = dirout+"/background_sequence.fasta"
- file_back = open(background_file, 'w')
- file_context = open(dirout+"/stop_context.fasta", 'w')
- file_extension = open(dirout+"/extensions.fasta", 'w')
+ #background_file = dirout+"/background_sequence.fasta"
+ #file_back = open(background_file, 'w')
+ #file_context = open(dirout+"/stop_context.fasta", 'w')
+ #file_extension = open(dirout+"/extensions.fasta", 'w')
## Opening Bam file with pysam librarie
pysam.index(bam)
aln = pysam.Samfile(bam, "rb",header=True, check_header = True)
@@ -429,7 +465,7 @@
with open(dirout+"/readthrough_result.csv","w") as out :
myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence']
- writer = csv.writer(out,delimiter='\t')
+ writer = csv.writer(out,delimiter=',')
writer.writerow(myheader)
for gene in GFF['order'] :
#print GFF[gene]
@@ -522,7 +558,7 @@
if (seq):
res = find_stop(seq)
if res == -1 :
- mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq]
+ mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
writer.writerow(mylist)
else :
indic = 'ok'
@@ -541,7 +577,7 @@
if (seq):
res = find_stop(seq)
if res == -1 :
- mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq]
+ mylist = [gene,GFF[gene]['name'],'no stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
writer.writerow(mylist)
else :
indic = 'ok'
@@ -584,6 +620,26 @@
## if we have footprint in stop codon extension and stop extension is further than cds_stop+9
if count_stop > 2 and stop_ok == 1 :
homo_cov = check_homo_coverage(gene,GFF,start_extension,stop_extension,aln)
+ # phasing = compute_phasing(GFF[gene]['chrom'],start_extension,stop_extension, aln, kmer,GFF[gene]['strand'])
+ # print gene, phasing
+# count_after_stop = 0
+# try :
+# ### count method of pysam doesn't strand information
+# if GFF[gene]['strand'] == '+' :
+# for track in aln.fetch(GFF[gene]['chrom'],stop_extension+15,stop_extension+40) :
+# if not track.is_reverse :
+# count_after_stop += 1
+# else :
+# for track in aln.fetch(GFF[gene]['chrom'],start_extension-40,start_extension-15) :
+# if track.is_reverse :
+# count_after_stop += 1
+# except ValueError:
+# ## genere warning about gtf coordinates
+# warnings.warn("Please check coordinates in GFF : maybe stop or start codon are missing" )
+# pass
+# print gene+"\t"+str(count_stop)+"\t"+str(count_after_stop)
+# if (count_stop*5)/100 > count_after_stop :
+# print "moins de 5%...\n"
if (homo_cov) :
'''
write result witch corresponding to readthrough
@@ -600,10 +656,10 @@
ratio = rpkm_ext/rpkm_cds
#print gene,ratio
#print start_extension,stop_extension
- mylist = [gene,GFF[gene]['name'],'-',contexte_stop,GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq]
+ mylist = [gene,GFF[gene]['name'],'-',contexte_stop,GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,seq,GFF[gene]['note']]
writer.writerow(mylist)
- file_context.write('>'+gene+'\n'+contexte_stop+'\n')
- file_extension.write('>'+gene+'\n'+seq+'\n')
+ #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
+ #file_extension.write('>'+gene+'\n'+seq+'\n')
else :
'''
write result witch corresponding to readthrough but with no homogeneous coverage
@@ -617,10 +673,10 @@
rpkm_ext = compute_rpkm(len_ext,count_ext,count_tot)
## compute ratio between extension coverage and cds coverage (rpkm)
ratio = rpkm_ext/rpkm_cds
- mylist = [gene,GFF[gene]['name'],'hetero cov',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,GFF[gene]['note'],seq]
+ mylist = [gene,GFF[gene]['name'],'hetero cov',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,rpkm_ext,ratio,seq,GFF[gene]['note']]
writer.writerow(mylist)
- file_context.write('>'+gene+'\n'+contexte_stop+'\n')
- file_extension.write('>'+gene+'\n'+seq+'\n')
+ #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
+ #file_extension.write('>'+gene+'\n'+seq+'\n')
#print ">"+gene+"\n"+contexte_stop
## plot gene :
@@ -632,38 +688,38 @@
'''
write result with no footprint in stop codon of extension
'''
- mylist = [gene,GFF[gene]['name'],'no RPF in stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq]
+ mylist = [gene,GFF[gene]['name'],'no RPF in stop',contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
writer.writerow(mylist)
- file_context.write('>'+gene+'\n'+contexte_stop+'\n')
- file_extension.write('>'+gene+'\n'+seq+'\n')
+ #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
+ #file_extension.write('>'+gene+'\n'+seq+'\n')
#print ">"+gene+"\n"+contexte_stop
else :
'''
write result with RPF maybe result of reinitiation on a start codon
'''
if pass_length(start_extension,stop_extension) :
- mylist = [gene,GFF[gene]['name'],'Met after stop', contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',GFF[gene]['note'],seq]
+ mylist = [gene,GFF[gene]['name'],'Met after stop', contexte_stop, GFF[gene]['chrom'], start_extension, stop_extension,stop_extension-start_extension,rpkm_cds,'-','-',seq,GFF[gene]['note']]
writer.writerow(mylist)
- file_context.write('>'+gene+'\n'+contexte_stop+'\n')
- file_extension.write('>'+gene+'\n'+seq+'\n')
+ #file_context.write('>'+gene+'\n'+contexte_stop+'\n')
+ #file_extension.write('>'+gene+'\n'+seq+'\n')
#print ">"+gene+"\n"+contexte_stop
- else :
- ## if its not a interesting case, we get stop context of genes without readthrough
- if GFF[gene]['strand'] == '+' :
- contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6])
- file_back.write(contexte_stop+'\n')
- else :
- contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement())
- file_back.write(contexte_stop+'\n')
-
+# else :
+# ## if its not a interesting case, we get stop context of genes without readthrough
+# if GFF[gene]['strand'] == '+' :
+# contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['stop']-6:GFF[gene]['stop']+6])
+# file_back.write(contexte_stop+'\n')
+# else :
+# contexte_stop = str(SeqDict[GFF[gene]['chrom']].seq[GFF[gene]['start']-7:GFF[gene]['start']+5].reverse_complement())
+# file_back.write(contexte_stop+'\n')
+#
## excluded UT with incorrect positions
except ValueError:
pass
- file_context.close()
- file_back.close()
- file_extension.close()
+ #file_context.close()
+ #file_back.close()
+ #file_extension.close()
except Exception,e:
stop_err( 'Error during computing analysis : ' + str( e ) )
@@ -679,14 +735,14 @@
gene_table += 'Gene Plot Name Stop context Coordinates RPKM CDS RPKM extension ratio Extension