annotate kmer_analysis.py @ 10:707807fee542

(none)
author rlegendre
date Thu, 22 Jan 2015 14:34:38 +0100
parents da126b91f9ea
children 7c944fd9907e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
1 #!/usr/bin/env python2.7.3
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
2 #-*- coding: utf-8 -*-
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
3 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
4 Created on Jun. 2014
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
5 @author: rachel legendre
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
6 @copyright: rachel.legendre@igmors.u-psud.fr
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
7 @license: GPL v3
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
8 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
9
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
10 import os, sys, time, optparse, re, urllib, subprocess, tempfile, commands
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
11 import pysam
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
12 #from matplotlib import pyplot as pl
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
13 import matplotlib
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
14 matplotlib.use('Agg')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
15 import matplotlib.pyplot as pl
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
16 from numpy import arange
10
rlegendre
parents: 2
diff changeset
17 from collections import OrderedDict
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
18 import ribo_functions
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
19 #import cPickle
10
rlegendre
parents: 2
diff changeset
20 ## suppress matplotlib warnings
rlegendre
parents: 2
diff changeset
21 import warnings
rlegendre
parents: 2
diff changeset
22 warnings.filterwarnings('ignore')
rlegendre
parents: 2
diff changeset
23
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
24
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
25 total_mapped_read = 0
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
26
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
27
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
28 def stop_err( msg ):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
29 sys.stderr.write( "%s\n" % msg )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
30 sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
31 sys.exit()
10
rlegendre
parents: 2
diff changeset
32
rlegendre
parents: 2
diff changeset
33
rlegendre
parents: 2
diff changeset
34 def split_bam(bamfile,tmpdir):
rlegendre
parents: 2
diff changeset
35 '''
rlegendre
parents: 2
diff changeset
36 split bam by chromosome and write sam file in tmpdir
rlegendre
parents: 2
diff changeset
37 '''
rlegendre
parents: 2
diff changeset
38 try:
rlegendre
parents: 2
diff changeset
39 #get header
rlegendre
parents: 2
diff changeset
40 results = subprocess.check_output(['samtools', 'view', '-H',bamfile])
rlegendre
parents: 2
diff changeset
41 header = results.split('\n')
rlegendre
parents: 2
diff changeset
42
rlegendre
parents: 2
diff changeset
43 #define genome size
rlegendre
parents: 2
diff changeset
44 genome = []
rlegendre
parents: 2
diff changeset
45 for line in header:
rlegendre
parents: 2
diff changeset
46 result = re.search('SN', line)
rlegendre
parents: 2
diff changeset
47 if result :
rlegendre
parents: 2
diff changeset
48 #print line
rlegendre
parents: 2
diff changeset
49 feat = line.split('\t')
rlegendre
parents: 2
diff changeset
50 chrom = re.split(":", feat[1])
rlegendre
parents: 2
diff changeset
51 #print feat[1]
rlegendre
parents: 2
diff changeset
52 genome.append(chrom[1])
rlegendre
parents: 2
diff changeset
53
rlegendre
parents: 2
diff changeset
54 #split sam by chrom
rlegendre
parents: 2
diff changeset
55 n = 0
rlegendre
parents: 2
diff changeset
56 for chrm in genome:
rlegendre
parents: 2
diff changeset
57 with open(tmpdir+'/'+chrm+'.sam', 'w') as f :
rlegendre
parents: 2
diff changeset
58 #write header correctly for each chromosome
rlegendre
parents: 2
diff changeset
59 f.write(header[0]+'\n')
rlegendre
parents: 2
diff changeset
60 expr = re.compile(chrm+'\t')
rlegendre
parents: 2
diff changeset
61 el =[elem for elem in header if expr.search(elem)][0]
rlegendre
parents: 2
diff changeset
62 f.write(el+'\n')
rlegendre
parents: 2
diff changeset
63 f.write(header[-2]+'\n')
rlegendre
parents: 2
diff changeset
64 #write all reads for each chromosome
rlegendre
parents: 2
diff changeset
65 reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
rlegendre
parents: 2
diff changeset
66 f.write(reads)
rlegendre
parents: 2
diff changeset
67 # calculate number of reads
rlegendre
parents: 2
diff changeset
68 n += reads.count(chrm)
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
69
10
rlegendre
parents: 2
diff changeset
70 sys.stdout.write("%d reads are presents in your bam file\n" % n)
rlegendre
parents: 2
diff changeset
71
rlegendre
parents: 2
diff changeset
72 except Exception, e:
rlegendre
parents: 2
diff changeset
73 stop_err( 'Error during bam file splitting : ' + str( e ) )
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
74
10
rlegendre
parents: 2
diff changeset
75
rlegendre
parents: 2
diff changeset
76
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
77 def get_first_base(tmpdir, kmer):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
78 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
79 write footprint coverage file for each sam file in tmpdir and get kmer distribution
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
80 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
81 global total_mapped_read
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
82
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
83 ## init kmer dict
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
84 KMER = OrderedDict({})
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
85
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
86 try:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
87 file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
88 ##write coverage for each sam file in tmpdir
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
89 for samfile in file_array:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
90 with open(tmpdir+'/'+samfile, 'r') as sam :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
91 ##get chromosome name
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
92 chrom = samfile.split(".sam")[0]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
93
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
94 for line in sam:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
95 #initialize dictionnary
10
rlegendre
parents: 2
diff changeset
96 if '@SQ\tSN:' in line :
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
97 size = int(line.split('LN:')[1])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
98 genomeF = [0]*size
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
99 genomeR = [0]*size
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
100 # define multiple reads keys from mapper
10
rlegendre
parents: 2
diff changeset
101 elif '@PG\tID' in line :
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
102 if 'bowtie' in line:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
103 multi_tag = "XS:i:"
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
104 elif 'bwa' in line:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
105 multi_tag = "XT:A:R"
10
rlegendre
parents: 2
diff changeset
106 #elif 'TopHat' in line:
rlegendre
parents: 2
diff changeset
107 # multi_tag = "NH:i:1"
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
108 else :
10
rlegendre
parents: 2
diff changeset
109 stop_err("No PG tag find in "+samfile+". Please use bowtie or bwa for mapping")
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
110
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
111 # get footprint
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
112 elif re.search('^[^@].+', line) :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
113 len_read = len(line.split('\t')[9])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
114 ##full kmer dict
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
115 if KMER.has_key(len_read):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
116 KMER[len_read] += 1
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
117 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
118 KMER[len_read] = 1
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
119
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
120 #print line.rstrip()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
121 read_pos = int(line.split('\t')[3])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
122 read_sens = int(line.split('\t')[1])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
123 #len_read = len(line.split('\t')[9])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
124 if len_read == kmer and multi_tag not in line:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
125 ###if line.split('\t')[5] == '28M' :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
126 total_mapped_read +=1
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
127 #if it's a forward read
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
128 if read_sens == 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
129 #get 5' base
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
130 genomeF[read_pos] += 1
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
131 #if it's a reverse read
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
132 elif read_sens == 16 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
133 #get 5' base
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
134 read_pos += (len_read-1)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
135 genomeR[read_pos] += 1
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
136
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
137 #try:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
138 #write coverage in files
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
139 with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
140 for i in range(0,size):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
141 cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
142 #except Exception,e:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
143 # stop_err( 'Error during coverage file writting : ' + str( e ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
144
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
145 #sys.stdout.write("%d reads are in your bam file\n" % total_mapped_read)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
146
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
147 return KMER
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
148
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
149 except Exception, e:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
150 stop_err( 'Error during footprinting : ' + str( e ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
151
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
152
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
153 def __percent__(prop):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
154
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
155 if sum(prop) != 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
156 perc = [0,0,0]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
157 if prop[0] != 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
158 perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
159 if prop[1] != 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
160 perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
161 if prop[2] != 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
162 perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
163 return perc
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
164 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
165 return prop
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
166
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
167
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
168 def frame_analysis(tmpdir,GFF):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
169 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
170 This function take footprint and annotation (gff) for analyse reading frame in each gene
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
171 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
172 global total_mapped_read
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
173 try:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
174 chrom = "" # initializing chromosome
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
175 nb_gene = 0 # number of analysed genes
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
176 whole_phasing = [0,0,0]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
177 for gene in GFF['order']:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
178 ## maybe no start position in GTF file so we must to check and replace
10
rlegendre
parents: 2
diff changeset
179 exon_number = GFF[gene]['exon_number']
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
180 try : GFF[gene]['start']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
181 except :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
182 if GFF[gene]['strand'] == '+' :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
183 GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
184 else :
10
rlegendre
parents: 2
diff changeset
185 GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop']
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
186 ## also for stop coordinates
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
187 try : GFF[gene]['stop']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
188 except :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
189 if GFF[gene]['strand'] == '+' :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
190 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
191
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
192 else :
10
rlegendre
parents: 2
diff changeset
193 GFF[gene]['stop'] = GFF[gene]['exon'][1]['start']
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
194 cov = []
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
195 ##first chromosome : we open corresponding file
10
rlegendre
parents: 2
diff changeset
196 try:
rlegendre
parents: 2
diff changeset
197 if chrom == "" :
rlegendre
parents: 2
diff changeset
198 chrom = GFF[gene]['chrom']
rlegendre
parents: 2
diff changeset
199 with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
rlegendre
parents: 2
diff changeset
200 data = f.readlines()
rlegendre
parents: 2
diff changeset
201 ##if we change chromosome
rlegendre
parents: 2
diff changeset
202 elif chrom != GFF[gene]['chrom'] :
rlegendre
parents: 2
diff changeset
203 chrom = GFF[gene]['chrom']
rlegendre
parents: 2
diff changeset
204 with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
rlegendre
parents: 2
diff changeset
205 data = f.readlines()
rlegendre
parents: 2
diff changeset
206 except IOError :
rlegendre
parents: 2
diff changeset
207 print tmpdir+"/assoCov_"+chrom+".txt doesn't exist"
rlegendre
parents: 2
diff changeset
208
rlegendre
parents: 2
diff changeset
209
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
210 ## if a gene without intron :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
211 if GFF[gene]['exon_number'] == 1:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
212
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
213 ## get coverage for each gene
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
214 if GFF[gene]['strand'] == "+":
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
215 for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
216 cov.append(int((data[i].rstrip()).split("\t")[0]))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
217 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
218 for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
219 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
220 cov.reverse()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
221 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
222 ## For each gene, get coverage and sum of exon size
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
223 if GFF[gene]['strand'] == "+":
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
224
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
225 for exon in range(1,GFF[gene]['exon_number']+1) :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
226 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
10
rlegendre
parents: 2
diff changeset
227 #if i <= GFF[gene]['stop'] :
rlegendre
parents: 2
diff changeset
228 cov.append(int((data[i].rstrip()).split("\t")[0]))
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
229 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
230
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
231 for exon in range(1,GFF[gene]['exon_number']+1) :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
232 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
10
rlegendre
parents: 2
diff changeset
233 #if i <= GFF[gene]['start'] :
rlegendre
parents: 2
diff changeset
234 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
235 cov.reverse()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
236
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
237 len_cov = len(cov)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
238 prop = [0,0,0]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
239 for nuc in range(0,len_cov-2,3) :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
240 ## Calculate proportion
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
241 prop[0] += cov[nuc]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
242 prop[1] += cov[nuc+1]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
243 prop[2] += cov[nuc+2]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
244 whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
245
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
246 whole_phasing = __percent__(whole_phasing)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
247 #sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
248 return whole_phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
249
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
250 except Exception, e:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
251 stop_err( 'Error during frame analysis : ' + str( e ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
252
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
253
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
254
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
255 def make_report(html_file, dirout, kmer, results) :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
256
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
257 array = sorted(kmer.items(), key=lambda x: x[0])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
258 values = []
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
259 label = []
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
260 for x,y in array :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
261 values.append(y)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
262 label.append(x)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
263 index = arange(len(label))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
264 bar_width = 0.35
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
265 axis_font = {'size':'10'}
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
266 fig, ax = pl.subplots()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
267 pl.bar(index ,values, color='LightsteelBlue')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
268 pl.xlabel('kmer value', **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
269 pl.ylabel('Number of reads', **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
270 pl.title('Number of reads for each k-mer')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
271 pl.xticks(index + bar_width, label, **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
272 #pl.show()
10
rlegendre
parents: 2
diff changeset
273 fig.subplots_adjust()
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
274 pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
275 pl.clf()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
276
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
277 for key, phase in results.iteritems() :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
278 fig = pl.figure(num=1)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
279 frame = phase
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
280 index = arange(3)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
281 bar_width = 0.5
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
282 pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue'])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
283 pl.xlabel('Frame in gene', **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
284 pl.ylabel('Percent of read', **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
285 pl.title('Proportion of reads in each frame for '+str(key)+'-mer')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
286 pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font)
10
rlegendre
parents: 2
diff changeset
287 #pl.tight_layout()
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
288 pl.ylim(0,100)
10
rlegendre
parents: 2
diff changeset
289 fig.subplots_adjust()
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
290 pl.draw()
10
rlegendre
parents: 2
diff changeset
291 pl.show()
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
292 pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
293 pl.clf()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
294
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
295 kmer_summary = ''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
296 kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0]))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
297 for key, number in kmer_sorted.iteritems() :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
298 if number > 100 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
299 kmer_summary += '<li>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
300 kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
301 kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
302 kmer_summary += 'Phasing of '+str(key)+'-mer is : '+str(results[key])+'<br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
303 kmer_summary += '<img border="0" src="'+str(key)+'_phasing.png" width="50%%" height="50%%"/><br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
304 kmer_summary += '</li>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
305 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
306 kmer_summary += '<li>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
307 kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
308 kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
309 kmer_summary += '</li>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
310
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
311 html_str = """
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
312 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
313 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
314
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
315 <html xmlns="http://www.w3.org/1999/xhtml">
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
316 <head>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
317 <link href="lightbox/css/lightbox.css" rel="stylesheet" />
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
318 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
319 <title>kmer analysis</title>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
320 </head>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
321 <body>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
322 <h1>k-mer analysis results</h1>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
323 <p>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
324 <ul>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
325 <li>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
326 Number of reads for each k-mer in your sample : <br />
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
327 <img border="0" src="kmer_proportion.png" width="50%%" height="50%%"/>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
328 </li>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
329 %s
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
330 </ul>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
331 </p>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
332 </body>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
333 </html>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
334 """
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
335 all = html_str % kmer_summary
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
336
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
337 html = open(html_file,"w")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
338 html.write(all)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
339 html.close()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
340
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
341 def __main__():
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
342
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
343 #Parse command line options
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
344 parser = optparse.OptionParser()
10
rlegendre
parents: 2
diff changeset
345 parser.add_option("-g", "--gff", dest="gff", type= "string",
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
346 help="GFF annotation file", metavar="FILE")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
347
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
348 parser.add_option("-b", "--bam", dest="bamfile", type= "string",
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
349 help="Bam Ribo-Seq alignments ", metavar="FILE")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
350
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
351 parser.add_option("-o", "--out", dest="html_file", type= "string",
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
352 help="write report to FILE", metavar="FILE")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
353
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
354 parser.add_option("-d", "--dirout", dest="dirout", type= "string",
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
355 help="write report to PNG files", metavar="FILE")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
356
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
357 parser.add_option("-q", "--quiet",
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
358 action="store_false", dest="verbose", default=True,
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
359 help="don't print status messages to stdout")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
360
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
361 (options, args) = parser.parse_args()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
362 sys.stdout.write("Begin kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
363
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
364 try:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
365
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
366 if os.path.exists(options.dirout):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
367 raise
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
368 try:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
369 os.mkdir(options.dirout)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
370 except:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
371 raise
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
372
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
373 ##testing indexed bam file
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
374 if os.path.isfile(options.bamfile+".bai") :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
375 pass
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
376 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
377 cmd = "samtools index %s " % (options.bamfile)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
378 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
379 returncode = proc.wait()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
380
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
381 tmpdir = tempfile.mkdtemp()
10
rlegendre
parents: 2
diff changeset
382 ## identify GFF or GTF format from 9th column
rlegendre
parents: 2
diff changeset
383 with open (options.gff,"r") as gffile :
rlegendre
parents: 2
diff changeset
384 for line in gffile :
rlegendre
parents: 2
diff changeset
385 if '#' in line :
rlegendre
parents: 2
diff changeset
386 ## skip header
rlegendre
parents: 2
diff changeset
387 gffile.next()
rlegendre
parents: 2
diff changeset
388 elif 'gene_id' in line :
rlegendre
parents: 2
diff changeset
389 ## launch gtf reader :
rlegendre
parents: 2
diff changeset
390 GFF = ribo_functions.store_gtf(options.gff)
rlegendre
parents: 2
diff changeset
391 break
rlegendre
parents: 2
diff changeset
392 elif 'ID=' in line :
rlegendre
parents: 2
diff changeset
393 ## launch gff reader
rlegendre
parents: 2
diff changeset
394 GFF = ribo_functions.store_gff(options.gff)
rlegendre
parents: 2
diff changeset
395 break
rlegendre
parents: 2
diff changeset
396 else :
rlegendre
parents: 2
diff changeset
397 stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
rlegendre
parents: 2
diff changeset
398 #GFF = store_gff(options.gff)
rlegendre
parents: 2
diff changeset
399 #GFF = ribo_functions.store_gtf(options.gff)
rlegendre
parents: 2
diff changeset
400 ## check gff reading
rlegendre
parents: 2
diff changeset
401 if not GFF['order'] :
rlegendre
parents: 2
diff changeset
402 stop_err( 'Incorrect GFF file' + str( e ) )
rlegendre
parents: 2
diff changeset
403
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
404 ## split bam
10
rlegendre
parents: 2
diff changeset
405 split_bam(options.bamfile,tmpdir)
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
406 ###################################
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
407 ## First analysis with 28mer :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
408 ###################################
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
409 ## compute coverage and distribution kmer
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
410 kmer = get_first_base(tmpdir, 28)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
411 ## compute phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
412 whole_phasing = frame_analysis(tmpdir,GFF)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
413 ## save phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
414 results = {}
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
415 results[28] = whole_phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
416 ## compute analysis with other kmer
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
417 for keys in kmer.iterkeys() :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
418 if keys != 28 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
419 ## remove all txt files in tmp directory
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
420 if os.system("rm "+tmpdir+"/*.txt") != 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
421 stop_err( 'Error during tmp directory cleaning : ' + str( e ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
422
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
423 ## If not enought reads in this kmer :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
424 if kmer[keys] > 100 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
425 ## compute coverage and distribution kmer
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
426 tmp = get_first_base(tmpdir, keys)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
427 ## compute phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
428 whole_phasing = frame_analysis(tmpdir,GFF)
10
rlegendre
parents: 2
diff changeset
429
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
430 results[keys] = whole_phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
431 ## get report
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
432 make_report(options.html_file, options.dirout, kmer, results)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
433 #=======================================================================
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
434 # ############
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
435 # # tests
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
436 # ############
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
437 # with open("kmer_dict",'rb') as km:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
438 # kmer = cPickle.load(km)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
439 # with open("results_dict",'rb') as res:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
440 # results = cPickle.load(res)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
441 # with open("kmer_dict",'wb') as km:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
442 # cPickle.dump(kmer,km)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
443 # with open("results_dict",'wb') as res:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
444 # cPickle.dump(results,res)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
445 # make_report(options.html_file, options.dirout, kmer, results)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
446 #=======================================================================
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
447
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
448 sys.stdout.write("Finish kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
449
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
450 except Exception, e:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
451 stop_err( 'Error running kmer and phasing analysis : ' + str( e ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
452
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
453
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
454 if __name__=="__main__":
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
455 __main__()