# HG changeset patch
# User rlegendre
# Date 1433855177 14400
# Node ID a121cce43f909ef88a0b761667a6fdf722450d7c
# Parent c87c40e642af0db2a7ea0649aac193d45eeea27d
Uploaded
diff -r c87c40e642af -r a121cce43f90 kmer_analysis.py
--- a/kmer_analysis.py Fri May 29 09:17:29 2015 -0400
+++ b/kmer_analysis.py Tue Jun 09 09:06:17 2015 -0400
@@ -367,12 +367,11 @@
try:
- if os.path.exists(options.dirout):
- raise
- try:
- os.mkdir(options.dirout)
- except:
- raise
+ if not os.path.exists(options.dirout):
+ try:
+ os.mkdir(options.dirout)
+ except Exception, e :
+ stop_err('Error running make directory : ' + str(e))
##testing indexed bam file
if os.path.isfile(options.bamfile+".bai") :
@@ -404,7 +403,10 @@
## check gff reading
if not GFF['order'] :
stop_err( 'Incorrect GFF file' )
-
+ for gene in GFF['order']:
+ if not GFF[gene]['exon'] :
+ del GFF[gene]
+ GFF['order'].remove(gene)
## split bam
split_bam(options.bamfile,tmpdir)
###################################
diff -r c87c40e642af -r a121cce43f90 metagene_frameshift_analysis.py
--- a/metagene_frameshift_analysis.py Fri May 29 09:17:29 2015 -0400
+++ b/metagene_frameshift_analysis.py Tue Jun 09 09:06:17 2015 -0400
@@ -204,32 +204,37 @@
data = f.readlines()
except IOError :
print tmpdir+"/assoCov_"+chrom+".txt doesn't exist"
-
- ## if a gene without intron :
- if GFF[gene]['exon_number'] == 1:
-
- ## get coverage for each gene
- if GFF[gene]['strand'] == "+":
- for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
- cov.append(int((data[i].rstrip()).split("\t")[0]))
+
+ try :
+ ## if a gene without intron :
+ if GFF[gene]['exon_number'] == 1:
+
+ ## get coverage for each gene
+ if GFF[gene]['strand'] == "+":
+ for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
+ cov.append(int((data[i].rstrip()).split("\t")[0]))
+ else :
+ for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
+ cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+ cov.reverse()
else :
- for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
- cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
- cov.reverse()
- else :
- ## For each gene, get coverage and sum of exon size
- if GFF[gene]['strand'] == "+":
-
- for exon in range(1,GFF[gene]['exon_number']+1) :
- for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
- cov.append(int((data[i].rstrip()).split("\t")[0]))
- else :
-
- for exon in range(1,GFF[gene]['exon_number']+1) :
- for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
- cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
- cov.reverse()
+ ## For each gene, get coverage and sum of exon size
+ if GFF[gene]['strand'] == "+":
+ for exon in range(1,GFF[gene]['exon_number']+1) :
+ for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
+ cov.append(int((data[i].rstrip()).split("\t")[0]))
+ else :
+
+ for exon in range(1,GFF[gene]['exon_number']+1) :
+ for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
+ cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+ cov.reverse()
+ except :
+ print gene+" could not be analysed.\n"
+ del GFF[gene]
+ continue
+
len_cov = len(cov)
## segmentation by orf_length
nb_segment = int(len_cov/orf_length)
@@ -237,14 +242,6 @@
if len_cov < orf_length :
nb_segment = 1
-# #segmentation by gene size
-# if len_cov < 1000 :
-# nb_segment = 3
-# elif len_cov > 1000 and len_cov < 2000:
-# nb_segment = 6
-# else :
-# nb_segment = 9
-
## number of aa in gene :
nb_aa = len_cov/3
len_frag = len_cov/nb_segment
@@ -255,7 +252,7 @@
seg_rpkm = []
window = 0
GFF[gene]['dual_coding'] = '-'
- ''' segmentation et total proportions '''
+ ''' segmentation and total proportions '''
if sum(cov) > MIN_COV_REQUIREMENT:
## compute mean of ORF length
mean_orf += len_cov
@@ -290,16 +287,16 @@
idx += 1
for z in range(0,3) :
whole_phasing_table[z].append(__percent__(tot_prop)[z])
- else :
- for nuc in range(0,len_cov-2,3) :
- ## Calculate proportion
- prop[0] += cov[nuc]
- prop[1] += cov[nuc+1]
- prop[2] += cov[nuc+2]
- whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))
- #for z in range(0,3) :
- # whole_phasing_table[z].append(__percent__(tot_prop)[z])
-
+# else :
+# for nuc in range(0,len_cov-2,3) :
+# ## Calculate proportion
+# prop[0] += cov[nuc]
+# prop[1] += cov[nuc+1]
+# prop[2] += cov[nuc+2]
+# whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))
+# for z in range(0,3) :
+# whole_phasing_table[z].append(__percent__(tot_prop)[z])
+
if GFF[gene]['dual_coding'] == 'yes' :
mylist = [gene,GFF[gene]['name'],GFF[gene]['total_prop'],GFF[gene]['seg_prop'],GFF[gene]['total_rpkm'],GFF[gene]['seg_rpkm'],GFF[gene]['note']]
#out.write(gene+'\t'+GFF[gene]['name']+'\t'+str(GFF[gene]['total_prop'])+'\t'+str(GFF[gene]['seg_prop'])+'\t'+str(GFF[gene]['total_rpkm'])+'\t'+GFF[gene]['note']+'\n')
@@ -309,20 +306,23 @@
del GFF[gene]
del GFF['order']
-
-
+
whole_phasing = __percent__(whole_phasing)
sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing)
- sys.stdout.write("Mean length of ORF in your organism : %s\n" % int(mean_orf/nb_gene))
- sys.stdout.write("%d genes are used for frame analysis\n" % nb_gene)
+ if nb_gene == 0:
+ sys.stdout.write("%d gene are used for frame analysis: each gene require %d reads at least to be analysed.\n" % (nb_gene, MIN_COV_REQUIREMENT))
+ else :
+ sys.stdout.write("Mean length of ORF in your organism : %s\n" % int(mean_orf/nb_gene))
+ sys.stdout.write("%d genes are used for frame analysis\n" % nb_gene)
##produce boxplot
pl.boxplot(whole_phasing_table,notch=True,sym='ro')
pl.ylabel('Total number of footprints (percent)')
+ pl.ylim(0,100)
pl.draw()
pl.savefig(subfolder+'/boxplot.png',format='png', dpi=640)
pl.savefig(box_file, format='png', dpi=640)
-
+ pl.clf()
return GFF
@@ -381,6 +381,8 @@
except Exception, e:
stop_err( 'Error during ORF computing : ' + str( e ) )
+
+
def plot_subprofile(GFF, directory,fasta) :
'''
This function plot subprofiles of footprint and ORFs for each gene in GFF dict
@@ -389,116 +391,119 @@
try :
for gene in GFF.iterkeys():
- nb_aa = len(GFF[gene]['coverage'])/3
- subprofile_1 = []
- subprofile_2 = []
- subprofile_3 = []
- cov = GFF[gene]['coverage']
+ if GFF[gene]['coverage'] :
+ nb_aa = len(GFF[gene]['coverage'])/3
+ subprofile_1 = []
+ subprofile_2 = []
+ subprofile_3 = []
+ cov = GFF[gene]['coverage']
+
- for z in range(0,len(cov)-2,3):
- for i in range(3):
- subprofile_1.append(cov[z])
- subprofile_2.append(cov[z+1])
- subprofile_3.append(cov[z+2])
-
- ## Normalisation of density plot
- tot = max(cov)
-
- subprofile_1 = [(x * _MAX/tot) for x in subprofile_1]
- subprofile_2 = [(x * _MAX/tot) for x in subprofile_2]
- subprofile_3 = [(x * _MAX/tot) for x in subprofile_3]
-
-
- if len(subprofile_1) == len(subprofile_2) and len(subprofile_1) == len(subprofile_3) :
- ind = arange(len(subprofile_1))
- fig = pl.figure(num=1)
- #sub profile frame 0
- pl.subplot(4,1,1)
- ax1 = fig.add_subplot(4,1,1)
- pl.title( "%s (%s)\n$(%s RPKM)$"% (gene, GFF[gene]['name'], GFF[gene]['total_rpkm']))
- pl.ylim(0,_MAX)
- pl.xlim(0,len(subprofile_1))
- pl.yticks([0, _MAX/2, _MAX])
- ax1.plot(ind, subprofile_1, 'b-', lw = 3,label = 'Frame 0')
- ax1.fill_between(ind, 0,subprofile_1,color='b')
- ax1.legend(loc=1,prop={'size':8})
- pl.ylabel('#RPF')
+ for z in range(0,len(cov)-2,3):
+ for i in range(3):
+ subprofile_1.append(cov[z])
+ subprofile_2.append(cov[z+1])
+ subprofile_3.append(cov[z+2])
- ##sub profile frame +1
- pl.subplot(4,1,2)
- ax2 = fig.add_subplot(4,1,2)
- pl.ylim(0,_MAX)
- pl.xlim(0,len(subprofile_2))
- pl.yticks([0, _MAX/2, _MAX])
- ax2.plot(ind,subprofile_2,'r-',lw = 3,label = 'Frame +1')
- ax2.fill_between(ind, 0,subprofile_2,color='r')
- ax2.legend(loc=1,prop={'size':8})
- pl.ylabel('#RPF')
+ ## Normalisation of density plot
+ tot = max(cov)
+
+ subprofile_1 = [(x * _MAX/tot) for x in subprofile_1]
+ subprofile_2 = [(x * _MAX/tot) for x in subprofile_2]
+ subprofile_3 = [(x * _MAX/tot) for x in subprofile_3]
+
- ##sub profile frame -1
- pl.subplot(4,1,3)
- ax3 = fig.add_subplot(4,1,3)
- pl.ylim(0,_MAX)
- pl.yticks([0, _MAX/2, _MAX])
- pl.xlim(0,len(subprofile_3))
- ax3.plot(ind,subprofile_3,'g-',lw =3,label = 'Frame -1')
- ax3.fill_between(ind, 0,subprofile_3,color='g')
- ax3.legend(loc=1,prop={'size':8})
- pl.ylabel('#RPF')
-
- ## get orf for each frame
- gene_seq,met_1,stop_1,met_2,stop_2,met_3,stop_3 = get_orf(fasta,GFF[gene])
- ## ORF plot
- idc = arange(len(gene_seq))
- ##define axe for frame :
- x = [0]*len(gene_seq)
- y = [1]*len(gene_seq)
- z = [2]*len(gene_seq)
- ## define last subplot
- #pdb.set_trace()
- pl.subplot(4,1,4)
- ## modify axes setting
- ax4 = fig.add_subplot(4,1,4)
- ax4.spines['right'].set_color('none')
- ax4.spines['top'].set_color('none')
- ax4.spines['left'].set_color('none')
- ax4.spines['bottom'].set_color('none')
- ax4.tick_params(bottom = 'off',top='off',left='off', right='off')
- pl.ylim(0,3)
- pl.xlim(0,len(gene_seq))
- pl.yticks([0.5,1.5,2.5],[r'-1',r'+1',r'0'])
- pl.xticks([])
- ## plot line for delimiting frame and arrow corresponding to start and stop
- pl.plot(idc,x,'k',lw=2)
- for val in met_1:
- pl.arrow( val, 2, 0.0, 0.5, fc="g", ec="g",lw=2)
- for val in stop_1:
- pl.arrow( val, 2, 0.0, 1, fc="r", ec="r",lw=2)
- pl.plot(idc,y,'k')
- for val in met_2:
- pl.arrow( val, 1, 0.0, 0.5, fc="g", ec="g",lw=2)
- for val in stop_2:
- pl.arrow( val, 1, 0.0, 1, fc="r", ec="r",lw=2)
- pl.plot(idc,z,'k')
- for val in met_3:
- pl.arrow( val, 0, 0.0, 0.5, fc="g", ec="g",lw=2)
- for val in stop_3:
- pl.arrow( val, 0, 0.0, 1, fc="r", ec="r",lw=2)
-
-
- pl.ylabel('ORF',fontsize='small')
- pl.draw()
- pl.savefig(directory+'/'+gene+'.png',format='png', dpi=600)
- pl.clf()
- ##pl.show()
-
- ## Make thumbnail for html page
- infile = directory+'/'+gene+'.png'
- size = 128, 128
- im = Image.open(infile)
- im.thumbnail(size, Image.ANTIALIAS)
- im.save(directory+'/'+gene + "_thumbnail.png", "PNG")
+ if len(subprofile_1) == len(subprofile_2) and len(subprofile_1) == len(subprofile_3) :
+
+ ind = arange(len(subprofile_1))
+ fig = pl.figure(num=1)
+ #sub profile frame 0
+ pl.subplot(4,1,1)
+ ax1 = fig.add_subplot(4,1,1)
+ pl.title( "%s (%s)\n$(%s RPKM)$"% (gene, GFF[gene]['name'], GFF[gene]['total_rpkm']))
+ pl.ylim(0,_MAX)
+ pl.xlim(0,len(subprofile_1))
+ pl.yticks([0, _MAX/2, _MAX])
+ ax1.plot(ind, subprofile_1, 'b-', lw = 3,label = 'Frame 0')
+ ax1.fill_between(ind, 0,subprofile_1,color='b')
+ ax1.legend(loc=1,prop={'size':8})
+ pl.ylabel('#RPF')
+
+ ##sub profile frame +1
+ pl.subplot(4,1,2)
+ ax2 = fig.add_subplot(4,1,2)
+ pl.ylim(0,_MAX)
+ pl.xlim(0,len(subprofile_2))
+ pl.yticks([0, _MAX/2, _MAX])
+ ax2.plot(ind,subprofile_2,'r-',lw = 3,label = 'Frame +1')
+ ax2.fill_between(ind, 0,subprofile_2,color='r')
+ ax2.legend(loc=1,prop={'size':8})
+ pl.ylabel('#RPF')
+
+ ##sub profile frame -1
+ pl.subplot(4,1,3)
+ ax3 = fig.add_subplot(4,1,3)
+ pl.ylim(0,_MAX)
+ pl.yticks([0, _MAX/2, _MAX])
+ pl.xlim(0,len(subprofile_3))
+ ax3.plot(ind,subprofile_3,'g-',lw =3,label = 'Frame -1')
+ ax3.fill_between(ind, 0,subprofile_3,color='g')
+ ax3.legend(loc=1,prop={'size':8})
+ pl.ylabel('#RPF')
+
+ ## get orf for each frame
+ gene_seq,met_1,stop_1,met_2,stop_2,met_3,stop_3 = get_orf(fasta,GFF[gene])
+ ## ORF plot
+ idc = arange(len(gene_seq))
+ ##define axe for frame :
+ x = [0]*len(gene_seq)
+ y = [1]*len(gene_seq)
+ z = [2]*len(gene_seq)
+ ## define last subplot
+ #pdb.set_trace()
+ pl.subplot(4,1,4)
+ ## modify axes setting
+ ax4 = fig.add_subplot(4,1,4)
+ ax4.spines['right'].set_color('none')
+ ax4.spines['top'].set_color('none')
+ ax4.spines['left'].set_color('none')
+ ax4.spines['bottom'].set_color('none')
+ ax4.tick_params(bottom = 'off',top='off',left='off', right='off')
+ pl.ylim(0,3)
+ pl.xlim(0,len(gene_seq))
+ pl.yticks([0.5,1.5,2.5],[r'-1',r'+1',r'0'])
+ pl.xticks([])
+ ## plot line for delimiting frame and arrow corresponding to start and stop
+ pl.plot(idc,x,'k',lw=2)
+ for val in met_1:
+ pl.arrow( val, 2, 0.0, 0.5, fc="g", ec="g",lw=2)
+ for val in stop_1:
+ pl.arrow( val, 2, 0.0, 1, fc="r", ec="r",lw=2)
+ pl.plot(idc,y,'k')
+ for val in met_2:
+ pl.arrow( val, 1, 0.0, 0.5, fc="g", ec="g",lw=2)
+ for val in stop_2:
+ pl.arrow( val, 1, 0.0, 1, fc="r", ec="r",lw=2)
+ pl.plot(idc,z,'k')
+ for val in met_3:
+ pl.arrow( val, 0, 0.0, 0.5, fc="g", ec="g",lw=2)
+ for val in stop_3:
+ pl.arrow( val, 0, 0.0, 1, fc="r", ec="r",lw=2)
+
+
+ pl.ylabel('ORF',fontsize='small')
+ pl.draw()
+ pl.savefig(directory+'/'+gene+'.png',format='png', dpi=600)
+ pl.clf()
+ ##pl.show()
+
+ ## Make thumbnail for html page
+ infile = directory+'/'+gene+'.png'
+ size = 128, 128
+ im = Image.open(infile)
+ im.thumbnail(size, Image.ANTIALIAS)
+ im.save(directory+'/'+gene + "_thumbnail.png", "PNG")
else :
@@ -511,12 +516,15 @@
try :
- gene_table = ''
- gene_table += '
'
- gene_table += 'Gene | Plot | Name | Total Proportion | Segment proportion | RPKM | Segment RPKM |
'
- for gene in GFF.iterkeys():
- gene_table += '%s |  | %s | %s | %s | %s | %s |
' %(gene,gene,gene,gene,GFF[gene]['note'],GFF[gene]['name'],GFF[gene]['total_prop'],GFF[gene]['seg_prop'],GFF[gene]['total_rpkm'],GFF[gene]['seg_rpkm'])
- gene_table += '
'
+ if not GFF:
+ gene_table = 'Sorry, there are no dual coding events in your data\n'
+ else :
+ gene_table = ''
+ gene_table += ''
+ gene_table += 'Gene | Plot | Name | Total Proportion | Segment proportion | RPKM | Segment RPKM |
'
+ for gene in GFF.iterkeys():
+ gene_table += '%s |  | %s | %s | %s | %s | %s |
' %(gene,gene,gene,gene,GFF[gene]['note'],GFF[gene]['name'],GFF[gene]['total_prop'],GFF[gene]['seg_prop'],GFF[gene]['total_rpkm'],GFF[gene]['seg_rpkm'])
+ gene_table += '
'
html_str = """
@@ -797,12 +805,11 @@
proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
returncode = proc.wait()
- if os.path.exists(subfolder):
- raise
- try:
- os.mkdir(subfolder)
- except:
- raise
+ if not os.path.exists(subfolder):
+ try:
+ os.mkdir(subfolder)
+ except Exception, e :
+ stop_err('Error running make directory : ' + str(e))
## check frame arg
if options.frame not in [1,2,3]:
@@ -831,7 +838,11 @@
## check gff reading
if not GFF['order'] :
stop_err( 'Incorrect GFF file')
- #
+ for gene in GFF['order']:
+ if not GFF[gene]['exon'] :
+ del GFF[gene]
+ GFF['order'].remove(gene)
+
#### cPickles for Test ####
#if os.path.isfile("/home/rlegendre/Documents/FrameShift/OutOfFrameWithIntron/pyt_dict") :
# with open("/home/rlegendre/Documents/FrameShift/OutOfFrameWithIntron/pyt_dict",'rb') as fp:
@@ -861,7 +872,7 @@
stop_err( 'Error running metagene analysis : ' + str( e ) )
if os.path.exists( tmp_dir ):
- shutil.rmtree( tmp_dir )
+ shutil.rmtree( tmp_dir )
diff -r c87c40e642af -r a121cce43f90 ribo_functions.py
--- a/ribo_functions.py Fri May 29 09:17:29 2015 -0400
+++ b/ribo_functions.py Tue Jun 09 09:06:17 2015 -0400
@@ -141,6 +141,7 @@
'''
try:
GFF = {}
+ mRNA = {}
with open(gff, 'r') as f_gff :
GFF['order'] = []
@@ -153,8 +154,9 @@
# first line is already gene line :
if line.split('\t')[2] == 'gene' :
gene = feature[0].replace("ID=","")
+ curent_gene = gene
if 'Name' in line :
- regex = re.compile('(Name=)([^;]*);')
+ regex = re.compile('(Name=)([^;]*);?')
res = regex.search(line.split('\t')[8])
Name = res.group(2)
Name = Name.rstrip()
@@ -162,7 +164,7 @@
Name = "Unknown"
##get annotation
if 'Note' in line :
- regex = re.compile('(Note=)([^;]*);')
+ regex = re.compile('(Note=)([^;]*);?')
res = regex.search(line.split('\t')[8])
note = res.group(2)
note = urllib.unquote(str(note)).replace("\n","")
@@ -180,12 +182,27 @@
GFF[gene]['exon'] = {}
GFF[gene]['exon_number'] = 0
#print Name
+ elif line.split('\t')[2] == 'mRNA' :
+ regex = re.compile('(Parent=)([^;]*);?')
+ res = regex.search(line.split('\t')[8])
+ gene_name = res.group(2)
+ regex = re.compile('(ID=)([^;]*);?')
+ res = regex.search(line.split('\t')[8])
+ mRNA_name = res.group(2)
+ if gene not in mRNA.viewvalues() and gene_name == curent_gene :
+ mRNA[mRNA_name] = gene_name
+
elif line.split('\t')[2] == 'CDS' :
- regex = re.compile('(Parent=)([^;]*);')
+ regex = re.compile('(Parent=)([^;]*);?')
res = regex.search(line.split('\t')[8])
gene = res.group(2)
+
if 'mRNA' in gene:
gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene)
+ if mRNA.has_key(gene) and GFF.has_key(mRNA[gene]):
+
+ gene = gene_name
+
if GFF.has_key(gene) :
GFF[gene]['exon_number'] += 1
exon_number = GFF[gene]['exon_number']
@@ -193,7 +210,7 @@
GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
-
+
## if there is a five prim UTR intron, we change start of gene
elif line.split('\t')[2] == 'five_prime_UTR_intron' :
if GFF[gene]['strand'] == "+" :