annotate kmer_analysis.py @ 2:da126b91f9ea

Uploaded
author rlegendre
date Mon, 20 Oct 2014 11:06:53 -0400
parents
children 707807fee542
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
1 #!/usr/bin/env python2.7.3
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
2 #-*- coding: utf-8 -*-
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
3 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
4 Created on Jun. 2014
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
5 @author: rachel legendre
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
6 @copyright: rachel.legendre@igmors.u-psud.fr
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
7 @license: GPL v3
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
8 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
9
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
10 import os, sys, time, optparse, re, urllib, subprocess, tempfile, commands
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
11 import pysam
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
12 #from matplotlib import pyplot as pl
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
13 import matplotlib
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
14 matplotlib.use('Agg')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
15 import matplotlib.pyplot as pl
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
16 from numpy import arange
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
17 import ribo_functions
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
18 from collections import OrderedDict
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
19 #import cPickle
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
20
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
21 total_mapped_read = 0
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
22
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
23
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
24 def stop_err( msg ):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
25 sys.stderr.write( "%s\n" % msg )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
26 sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
27 sys.exit()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
28
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
29
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
30
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
31 def get_first_base(tmpdir, kmer):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
32 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
33 write footprint coverage file for each sam file in tmpdir and get kmer distribution
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
34 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
35 global total_mapped_read
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
36
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
37 ## init kmer dict
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
38 KMER = OrderedDict({})
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
39
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
40 try:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
41 file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
42 ##write coverage for each sam file in tmpdir
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
43 for samfile in file_array:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
44 with open(tmpdir+'/'+samfile, 'r') as sam :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
45 ##get chromosome name
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
46 chrom = samfile.split(".sam")[0]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
47
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
48 for line in sam:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
49 #initialize dictionnary
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
50 if '@SQ' in line :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
51 size = int(line.split('LN:')[1])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
52 genomeF = [0]*size
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
53 genomeR = [0]*size
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
54 # define multiple reads keys from mapper
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
55 elif '@PG' in line :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
56 if 'bowtie' in line:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
57 multi_tag = "XS:i:"
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
58 elif 'bwa' in line:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
59 multi_tag = "XT:A:R"
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
60 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
61 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
62
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
63 # get footprint
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
64 elif re.search('^[^@].+', line) :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
65 len_read = len(line.split('\t')[9])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
66 ##full kmer dict
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
67 if KMER.has_key(len_read):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
68 KMER[len_read] += 1
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
69 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
70 KMER[len_read] = 1
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
71
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
72 #print line.rstrip()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
73 read_pos = int(line.split('\t')[3])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
74 read_sens = int(line.split('\t')[1])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
75 #len_read = len(line.split('\t')[9])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
76 if len_read == kmer and multi_tag not in line:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
77 ###if line.split('\t')[5] == '28M' :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
78 total_mapped_read +=1
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
79 #if it's a forward read
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
80 if read_sens == 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
81 #get 5' base
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
82 genomeF[read_pos] += 1
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
83 #if it's a reverse read
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
84 elif read_sens == 16 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
85 #get 5' base
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
86 read_pos += (len_read-1)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
87 genomeR[read_pos] += 1
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
88
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
89 #try:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
90 #write coverage in files
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
91 with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
92 for i in range(0,size):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
93 cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
94 #except Exception,e:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
95 # stop_err( 'Error during coverage file writting : ' + str( e ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
96
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
97 #sys.stdout.write("%d reads are in your bam file\n" % total_mapped_read)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
98
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
99 return KMER
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
100
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
101 except Exception, e:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
102 stop_err( 'Error during footprinting : ' + str( e ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
103
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
104
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
105 def __percent__(prop):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
106
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
107 if sum(prop) != 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
108 perc = [0,0,0]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
109 if prop[0] != 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
110 perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
111 if prop[1] != 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
112 perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
113 if prop[2] != 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
114 perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
115 return perc
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
116 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
117 return prop
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
118
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
119
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
120 def frame_analysis(tmpdir,GFF):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
121 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
122 This function take footprint and annotation (gff) for analyse reading frame in each gene
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
123 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
124 global total_mapped_read
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
125 try:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
126 chrom = "" # initializing chromosome
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
127 nb_gene = 0 # number of analysed genes
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
128 whole_phasing = [0,0,0]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
129 for gene in GFF['order']:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
130 ## maybe no start position in GTF file so we must to check and replace
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
131 try : GFF[gene]['start']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
132 except :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
133 if GFF[gene]['strand'] == '+' :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
134 GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
135 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
136 GFF[gene]['start'] = GFF[gene]['exon'][1]['stop']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
137 ## also for stop coordinates
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
138 try : GFF[gene]['stop']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
139 except :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
140 exon_number = GFF[gene]['exon_number']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
141 if GFF[gene]['strand'] == '+' :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
142 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
143
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
144 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
145 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['start']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
146 cov = []
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
147 ##first chromosome : we open corresponding file
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
148 if chrom == "" :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
149 chrom = GFF[gene]['chrom']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
150 with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
151 data = f.readlines()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
152 ##if we change chrosomosome
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
153 elif chrom != GFF[gene]['chrom'] :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
154 chrom = GFF[gene]['chrom']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
155 with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
156 data = f.readlines()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
157
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
158 ## if a gene without intron :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
159 if GFF[gene]['exon_number'] == 1:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
160
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
161 ## get coverage for each gene
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
162 if GFF[gene]['strand'] == "+":
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
163 for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
164 cov.append(int((data[i].rstrip()).split("\t")[0]))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
165 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
166 for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
167 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
168 cov.reverse()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
169 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
170 ## For each gene, get coverage and sum of exon size
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
171 if GFF[gene]['strand'] == "+":
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
172
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
173 for exon in range(1,GFF[gene]['exon_number']+1) :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
174 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
175 if i <= GFF[gene]['stop'] :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
176 cov.append(int((data[i].rstrip()).split("\t")[0]))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
177 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
178
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
179 for exon in range(1,GFF[gene]['exon_number']+1) :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
180 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
181 if i <= GFF[gene]['start'] :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
182 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
183 cov.reverse()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
184
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
185 len_cov = len(cov)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
186 prop = [0,0,0]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
187 for nuc in range(0,len_cov-2,3) :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
188 ## Calculate proportion
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
189 prop[0] += cov[nuc]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
190 prop[1] += cov[nuc+1]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
191 prop[2] += cov[nuc+2]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
192 whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
193
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
194 whole_phasing = __percent__(whole_phasing)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
195 #sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
196 return whole_phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
197
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
198 except Exception, e:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
199 stop_err( 'Error during frame analysis : ' + str( e ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
200
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
201
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
202
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
203 def make_report(html_file, dirout, kmer, results) :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
204
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
205 array = sorted(kmer.items(), key=lambda x: x[0])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
206 values = []
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
207 label = []
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
208 for x,y in array :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
209 values.append(y)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
210 label.append(x)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
211 index = arange(len(label))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
212 bar_width = 0.35
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
213 axis_font = {'size':'10'}
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
214 fig, ax = pl.subplots()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
215 pl.bar(index ,values, color='LightsteelBlue')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
216 pl.xlabel('kmer value', **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
217 pl.ylabel('Number of reads', **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
218 pl.title('Number of reads for each k-mer')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
219 pl.xticks(index + bar_width, label, **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
220 #pl.show()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
221 pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
222 pl.clf()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
223
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
224 for key, phase in results.iteritems() :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
225 fig = pl.figure(num=1)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
226 frame = phase
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
227 index = arange(3)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
228 bar_width = 0.5
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
229 pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue'])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
230 pl.xlabel('Frame in gene', **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
231 pl.ylabel('Percent of read', **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
232 pl.title('Proportion of reads in each frame for '+str(key)+'-mer')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
233 pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
234 pl.tight_layout()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
235 pl.ylim(0,100)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
236 pl.draw()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
237 #pl.show()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
238 pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
239 pl.clf()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
240
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
241 kmer_summary = ''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
242 kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0]))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
243 for key, number in kmer_sorted.iteritems() :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
244 if number > 100 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
245 kmer_summary += '<li>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
246 kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
247 kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
248 kmer_summary += 'Phasing of '+str(key)+'-mer is : '+str(results[key])+'<br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
249 kmer_summary += '<img border="0" src="'+str(key)+'_phasing.png" width="50%%" height="50%%"/><br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
250 kmer_summary += '</li>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
251 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
252 kmer_summary += '<li>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
253 kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
254 kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
255 kmer_summary += '</li>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
256
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
257 html_str = """
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
258 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
259 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
260
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
261 <html xmlns="http://www.w3.org/1999/xhtml">
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
262 <head>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
263 <link href="lightbox/css/lightbox.css" rel="stylesheet" />
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
264 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
265 <title>kmer analysis</title>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
266 </head>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
267 <body>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
268 <h1>k-mer analysis results</h1>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
269 <p>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
270 <ul>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
271 <li>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
272 Number of reads for each k-mer in your sample : <br />
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
273 <img border="0" src="kmer_proportion.png" width="50%%" height="50%%"/>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
274 </li>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
275 %s
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
276 </ul>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
277 </p>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
278 </body>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
279 </html>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
280 """
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
281 all = html_str % kmer_summary
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
282
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
283 html = open(html_file,"w")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
284 html.write(all)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
285 html.close()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
286
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
287 def __main__():
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
288
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
289 #Parse command line options
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
290 parser = optparse.OptionParser()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
291 parser.add_option("-g", "--gff", dest="gfffile", type= "string",
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
292 help="GFF annotation file", metavar="FILE")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
293
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
294 parser.add_option("-b", "--bam", dest="bamfile", type= "string",
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
295 help="Bam Ribo-Seq alignments ", metavar="FILE")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
296
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
297 parser.add_option("-o", "--out", dest="html_file", type= "string",
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
298 help="write report to FILE", metavar="FILE")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
299
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
300 parser.add_option("-d", "--dirout", dest="dirout", type= "string",
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
301 help="write report to PNG files", metavar="FILE")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
302
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
303 parser.add_option("-q", "--quiet",
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
304 action="store_false", dest="verbose", default=True,
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
305 help="don't print status messages to stdout")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
306
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
307 (options, args) = parser.parse_args()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
308 sys.stdout.write("Begin kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
309
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
310 try:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
311
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
312 if os.path.exists(options.dirout):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
313 raise
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
314 try:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
315 os.mkdir(options.dirout)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
316 except:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
317 raise
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
318
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
319 ##testing indexed bam file
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
320 if os.path.isfile(options.bamfile+".bai") :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
321 pass
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
322 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
323 cmd = "samtools index %s " % (options.bamfile)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
324 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
325 returncode = proc.wait()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
326
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
327 tmpdir = tempfile.mkdtemp()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
328 GFF = ribo_functions.store_gff(options.gfffile)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
329 ## split bam
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
330 ribo_functions.split_bam(options.bamfile,tmpdir)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
331 ###################################
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
332 ## First analysis with 28mer :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
333 ###################################
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
334 ## compute coverage and distribution kmer
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
335 kmer = get_first_base(tmpdir, 28)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
336 ## compute phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
337 whole_phasing = frame_analysis(tmpdir,GFF)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
338 ## save phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
339 results = {}
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
340 results[28] = whole_phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
341 ## compute analysis with other kmer
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
342 for keys in kmer.iterkeys() :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
343 if keys != 28 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
344 ## remove all txt files in tmp directory
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
345 if os.system("rm "+tmpdir+"/*.txt") != 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
346 stop_err( 'Error during tmp directory cleaning : ' + str( e ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
347
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
348 ## If not enought reads in this kmer :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
349 if kmer[keys] > 100 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
350 ## compute coverage and distribution kmer
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
351 tmp = get_first_base(tmpdir, keys)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
352 ## compute phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
353 whole_phasing = frame_analysis(tmpdir,GFF)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
354 results[keys] = whole_phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
355 ## get report
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
356 make_report(options.html_file, options.dirout, kmer, results)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
357 #=======================================================================
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
358 # ############
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
359 # # tests
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
360 # ############
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
361 # with open("kmer_dict",'rb') as km:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
362 # kmer = cPickle.load(km)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
363 # with open("results_dict",'rb') as res:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
364 # results = cPickle.load(res)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
365 # with open("kmer_dict",'wb') as km:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
366 # cPickle.dump(kmer,km)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
367 # with open("results_dict",'wb') as res:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
368 # cPickle.dump(results,res)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
369 # make_report(options.html_file, options.dirout, kmer, results)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
370 #=======================================================================
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
371
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
372 sys.stdout.write("Finish kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
373
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
374 except Exception, e:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
375 stop_err( 'Error running kmer and phasing analysis : ' + str( e ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
376
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
377
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
378 if __name__=="__main__":
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
379 __main__()