annotate kmer_analysis.py @ 19:385fc64fa988 draft default tip

Uploaded
author rlegendre
date Fri, 12 Jun 2015 11:32:59 -0400
parents a121cce43f90
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
1 #!/usr/bin/env python2.7.3
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
2 #-*- coding: utf-8 -*-
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
3 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
4 Created on Jun. 2014
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
5 @author: rachel legendre
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
6 @copyright: rachel.legendre@igmors.u-psud.fr
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
7 @license: GPL v3
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
8 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
9
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
10 import os, sys, time, optparse, re, urllib, subprocess, tempfile, commands
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
11 import pysam
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
12 #from matplotlib import pyplot as pl
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
13 import matplotlib
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
14 matplotlib.use('Agg')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
15 import matplotlib.pyplot as pl
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
16 from numpy import arange
10
rlegendre
parents: 2
diff changeset
17 from collections import OrderedDict
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
18 import ribo_functions
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
19 #import cPickle
10
rlegendre
parents: 2
diff changeset
20 ## suppress matplotlib warnings
rlegendre
parents: 2
diff changeset
21 import warnings
rlegendre
parents: 2
diff changeset
22 warnings.filterwarnings('ignore')
rlegendre
parents: 2
diff changeset
23
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
24
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
25 total_mapped_read = 0
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
26
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
27
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
28 def stop_err( msg ):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
29 sys.stderr.write( "%s\n" % msg )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
30 sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
31 sys.exit()
10
rlegendre
parents: 2
diff changeset
32
rlegendre
parents: 2
diff changeset
33
rlegendre
parents: 2
diff changeset
34 def split_bam(bamfile,tmpdir):
rlegendre
parents: 2
diff changeset
35 '''
rlegendre
parents: 2
diff changeset
36 split bam by chromosome and write sam file in tmpdir
rlegendre
parents: 2
diff changeset
37 '''
rlegendre
parents: 2
diff changeset
38 try:
rlegendre
parents: 2
diff changeset
39 #get header
rlegendre
parents: 2
diff changeset
40 results = subprocess.check_output(['samtools', 'view', '-H',bamfile])
rlegendre
parents: 2
diff changeset
41 header = results.split('\n')
rlegendre
parents: 2
diff changeset
42
rlegendre
parents: 2
diff changeset
43 #define genome size
rlegendre
parents: 2
diff changeset
44 genome = []
rlegendre
parents: 2
diff changeset
45 for line in header:
rlegendre
parents: 2
diff changeset
46 result = re.search('SN', line)
rlegendre
parents: 2
diff changeset
47 if result :
rlegendre
parents: 2
diff changeset
48 #print line
rlegendre
parents: 2
diff changeset
49 feat = line.split('\t')
rlegendre
parents: 2
diff changeset
50 chrom = re.split(":", feat[1])
rlegendre
parents: 2
diff changeset
51 #print feat[1]
rlegendre
parents: 2
diff changeset
52 genome.append(chrom[1])
rlegendre
parents: 2
diff changeset
53
rlegendre
parents: 2
diff changeset
54 #split sam by chrom
rlegendre
parents: 2
diff changeset
55 n = 0
rlegendre
parents: 2
diff changeset
56 for chrm in genome:
rlegendre
parents: 2
diff changeset
57 with open(tmpdir+'/'+chrm+'.sam', 'w') as f :
rlegendre
parents: 2
diff changeset
58 #write header correctly for each chromosome
rlegendre
parents: 2
diff changeset
59 f.write(header[0]+'\n')
rlegendre
parents: 2
diff changeset
60 expr = re.compile(chrm+'\t')
rlegendre
parents: 2
diff changeset
61 el =[elem for elem in header if expr.search(elem)][0]
rlegendre
parents: 2
diff changeset
62 f.write(el+'\n')
rlegendre
parents: 2
diff changeset
63 f.write(header[-2]+'\n')
rlegendre
parents: 2
diff changeset
64 #write all reads for each chromosome
rlegendre
parents: 2
diff changeset
65 reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
rlegendre
parents: 2
diff changeset
66 f.write(reads)
rlegendre
parents: 2
diff changeset
67 # calculate number of reads
rlegendre
parents: 2
diff changeset
68 n += reads.count(chrm)
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
69
10
rlegendre
parents: 2
diff changeset
70 sys.stdout.write("%d reads are presents in your bam file\n" % n)
rlegendre
parents: 2
diff changeset
71
rlegendre
parents: 2
diff changeset
72 except Exception, e:
rlegendre
parents: 2
diff changeset
73 stop_err( 'Error during bam file splitting : ' + str( e ) )
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
74
10
rlegendre
parents: 2
diff changeset
75
rlegendre
parents: 2
diff changeset
76
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
77 def get_first_base(tmpdir, kmer):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
78 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
79 write footprint coverage file for each sam file in tmpdir and get kmer distribution
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
80 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
81 global total_mapped_read
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
82
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
83 ## tags by default
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
84 multi_tag = "XS:i:"
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
85 tag = "IH:i:1"
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
86
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
87 ## init kmer dict
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
88 KMER = OrderedDict({})
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
89
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
90 try:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
91 file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
92 ##write coverage for each sam file in tmpdir
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
93 for samfile in file_array:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
94 with open(tmpdir+'/'+samfile, 'r') as sam :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
95 ##get chromosome name
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
96 chrom = samfile.split(".sam")[0]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
97
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
98 for line in sam:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
99 #initialize dictionnary
10
rlegendre
parents: 2
diff changeset
100 if '@SQ\tSN:' in line :
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
101 size = int(line.split('LN:')[1])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
102 genomeF = [0]*size
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
103 genomeR = [0]*size
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
104 # define multiple reads keys from mapper
10
rlegendre
parents: 2
diff changeset
105 elif '@PG\tID' in line :
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
106 if 'bowtie' in line:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
107 multi_tag = "XS:i:"
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
108 elif 'bwa' in line:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
109 multi_tag = "XT:A:R"
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
110 elif 'TopHat' in line:
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
111 tag = "NH:i:1"
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
112 else :
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
113 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
114
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
115 # get footprint
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
116 elif re.search('^[^@].+', line) :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
117 len_read = len(line.split('\t')[9])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
118 ##full kmer dict
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
119 if KMER.has_key(len_read):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
120 KMER[len_read] += 1
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
121 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
122 KMER[len_read] = 1
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
123
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
124 #print line.rstrip()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
125 read_pos = int(line.split('\t')[3])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
126 read_sens = int(line.split('\t')[1])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
127 #len_read = len(line.split('\t')[9])
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
128 if len_read == kmer and (tag in line or multi_tag not in line):
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
129 ###if line.split('\t')[5] == '28M' :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
130 total_mapped_read +=1
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
131 #if it's a forward read
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
132 if read_sens == 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
133 #get 5' base
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
134 genomeF[read_pos] += 1
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
135 #if it's a reverse read
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
136 elif read_sens == 16 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
137 #get 5' base
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
138 read_pos += (len_read-1)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
139 genomeR[read_pos] += 1
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
140
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
141 #try:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
142 #write coverage in files
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
143 with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
144 for i in range(0,size):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
145 cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
146 #except Exception,e:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
147 # stop_err( 'Error during coverage file writting : ' + str( e ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
148
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
149 #sys.stdout.write("%d reads are in your bam file\n" % total_mapped_read)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
150
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
151 return KMER
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
152
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
153 except Exception, e:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
154 stop_err( 'Error during footprinting : ' + str( e ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
155
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
156
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
157 def __percent__(prop):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
158
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
159 if sum(prop) != 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
160 perc = [0,0,0]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
161 if prop[0] != 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
162 perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop)))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
163 if prop[1] != 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
164 perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop)))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
165 if prop[2] != 0 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
166 perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop)))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
167 return perc
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
168 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
169 return prop
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
170
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
171
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
172 def frame_analysis(tmpdir,GFF):
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
173 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
174 This function take footprint and annotation (gff) for analyse reading frame in each gene
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
175 '''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
176 global total_mapped_read
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
177 try:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
178 chrom = "" # initializing chromosome
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
179 nb_gene = 0 # number of analysed genes
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
180 whole_phasing = [0,0,0]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
181 for gene in GFF['order']:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
182 ## maybe no start position in GTF file so we must to check and replace
10
rlegendre
parents: 2
diff changeset
183 exon_number = GFF[gene]['exon_number']
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
184 try : GFF[gene]['start']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
185 except :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
186 if GFF[gene]['strand'] == '+' :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
187 GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
188 else :
10
rlegendre
parents: 2
diff changeset
189 GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop']
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
190 ## also for stop coordinates
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
191 try : GFF[gene]['stop']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
192 except :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
193 if GFF[gene]['strand'] == '+' :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
194 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
195
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
196 else :
10
rlegendre
parents: 2
diff changeset
197 GFF[gene]['stop'] = GFF[gene]['exon'][1]['start']
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
198 cov = []
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
199 ##first chromosome : we open corresponding file
10
rlegendre
parents: 2
diff changeset
200 try:
rlegendre
parents: 2
diff changeset
201 if chrom == "" :
rlegendre
parents: 2
diff changeset
202 chrom = GFF[gene]['chrom']
rlegendre
parents: 2
diff changeset
203 with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
rlegendre
parents: 2
diff changeset
204 data = f.readlines()
rlegendre
parents: 2
diff changeset
205 ##if we change chromosome
rlegendre
parents: 2
diff changeset
206 elif chrom != GFF[gene]['chrom'] :
rlegendre
parents: 2
diff changeset
207 chrom = GFF[gene]['chrom']
rlegendre
parents: 2
diff changeset
208 with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
rlegendre
parents: 2
diff changeset
209 data = f.readlines()
rlegendre
parents: 2
diff changeset
210 except IOError :
rlegendre
parents: 2
diff changeset
211 print tmpdir+"/assoCov_"+chrom+".txt doesn't exist"
rlegendre
parents: 2
diff changeset
212
19
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
213 try:
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
214 ## if a gene without intron :
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
215 if GFF[gene]['exon_number'] == 1:
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
216
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
217 ## get coverage for each gene
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
218 if GFF[gene]['strand'] == "+":
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
219 for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
220 cov.append(int((data[i].rstrip()).split("\t")[0]))
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
221 else :
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
222 for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1):
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
223 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
224 cov.reverse()
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
225 else :
19
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
226 ## For each gene, get coverage and sum of exon size
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
227 if GFF[gene]['strand'] == "+":
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
228
19
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
229 for exon in range(1,GFF[gene]['exon_number']+1) :
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
230 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
231 #if i <= GFF[gene]['stop'] :
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
232 cov.append(int((data[i].rstrip()).split("\t")[0]))
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
233 else :
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
234
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
235 for exon in range(1,GFF[gene]['exon_number']+1) :
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
236 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
237 #if i <= GFF[gene]['start'] :
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
238 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
239 cov.reverse()
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
240 except :
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
241 #print gene+" could not be analysed."
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
242 #del GFF[gene]
385fc64fa988 Uploaded
rlegendre
parents: 18
diff changeset
243 continue
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
244 len_cov = len(cov)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
245 prop = [0,0,0]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
246 for nuc in range(0,len_cov-2,3) :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
247 ## Calculate proportion
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
248 prop[0] += cov[nuc]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
249 prop[1] += cov[nuc+1]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
250 prop[2] += cov[nuc+2]
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
251 whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop)))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
252
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
253 whole_phasing = __percent__(whole_phasing)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
254 #sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
255 return whole_phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
256
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
257 except Exception, e:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
258 stop_err( 'Error during frame analysis : ' + str( e ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
259
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
260
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
261
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
262 def make_report(html_file, dirout, kmer, results) :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
263
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
264 array = sorted(kmer.items(), key=lambda x: x[0])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
265 values = []
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
266 label = []
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
267 for x,y in array :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
268 values.append(y)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
269 label.append(x)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
270 index = arange(len(label))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
271 bar_width = 0.35
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
272 axis_font = {'size':'10'}
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
273 fig, ax = pl.subplots()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
274 pl.bar(index ,values, color='LightsteelBlue')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
275 pl.xlabel('kmer value', **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
276 pl.ylabel('Number of reads', **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
277 pl.title('Number of reads for each k-mer')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
278 pl.xticks(index + bar_width, label, **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
279 #pl.show()
10
rlegendre
parents: 2
diff changeset
280 fig.subplots_adjust()
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
281 pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
282 pl.clf()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
283
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
284 for key, phase in results.iteritems() :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
285 fig = pl.figure(num=1)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
286 frame = phase
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
287 index = arange(3)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
288 bar_width = 0.5
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
289 pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue'])
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
290 pl.xlabel('Frame in gene', **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
291 pl.ylabel('Percent of read', **axis_font)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
292 pl.title('Proportion of reads in each frame for '+str(key)+'-mer')
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
293 pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font)
10
rlegendre
parents: 2
diff changeset
294 #pl.tight_layout()
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
295 pl.ylim(0,100)
10
rlegendre
parents: 2
diff changeset
296 fig.subplots_adjust()
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
297 pl.draw()
10
rlegendre
parents: 2
diff changeset
298 pl.show()
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
299 pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
300 pl.clf()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
301
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
302 kmer_summary = ''
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
303 kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0]))
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
304 for key, number in kmer_sorted.iteritems() :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
305 if number > 100 :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
306 kmer_summary += '<li>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
307 kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
308 kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
309 kmer_summary += 'Phasing of '+str(key)+'-mer is : '+str(results[key])+'<br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
310 kmer_summary += '<img border="0" src="'+str(key)+'_phasing.png" width="50%%" height="50%%"/><br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
311 kmer_summary += '</li>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
312 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
313 kmer_summary += '<li>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
314 kmer_summary += 'Analysis of '+str(key)+'-mer : <br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
315 kmer_summary += 'You have '+str(number)+' '+str(key)+'-mer : <br>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
316 kmer_summary += '</li>'
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
317
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
318 html_str = """
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
319 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
320 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
321
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
322 <html xmlns="http://www.w3.org/1999/xhtml">
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
323 <head>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
324 <link href="lightbox/css/lightbox.css" rel="stylesheet" />
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
325 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
326 <title>kmer analysis</title>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
327 </head>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
328 <body>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
329 <h1>k-mer analysis results</h1>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
330 <p>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
331 <ul>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
332 <li>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
333 Number of reads for each k-mer in your sample : <br />
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
334 <img border="0" src="kmer_proportion.png" width="50%%" height="50%%"/>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
335 </li>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
336 %s
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
337 </ul>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
338 </p>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
339 </body>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
340 </html>
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
341 """
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
342 all = html_str % kmer_summary
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
343
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
344 html = open(html_file,"w")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
345 html.write(all)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
346 html.close()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
347
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
348 def __main__():
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
349
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
350 #Parse command line options
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
351 parser = optparse.OptionParser()
10
rlegendre
parents: 2
diff changeset
352 parser.add_option("-g", "--gff", dest="gff", type= "string",
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
353 help="GFF annotation file", metavar="FILE")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
354
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
355 parser.add_option("-b", "--bam", dest="bamfile", type= "string",
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
356 help="Bam Ribo-Seq alignments ", metavar="FILE")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
357
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
358 parser.add_option("-o", "--out", dest="html_file", type= "string",
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
359 help="write report to FILE", metavar="FILE")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
360
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
361 parser.add_option("-d", "--dirout", dest="dirout", type= "string",
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
362 help="write report to PNG files", metavar="FILE")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
363
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
364 parser.add_option("-q", "--quiet",
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
365 action="store_false", dest="verbose", default=True,
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
366 help="don't print status messages to stdout")
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
367
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
368 (options, args) = parser.parse_args()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
369 sys.stdout.write("Begin kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
370
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
371 try:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
372
18
a121cce43f90 Uploaded
rlegendre
parents: 13
diff changeset
373 if not os.path.exists(options.dirout):
a121cce43f90 Uploaded
rlegendre
parents: 13
diff changeset
374 try:
a121cce43f90 Uploaded
rlegendre
parents: 13
diff changeset
375 os.mkdir(options.dirout)
a121cce43f90 Uploaded
rlegendre
parents: 13
diff changeset
376 except Exception, e :
a121cce43f90 Uploaded
rlegendre
parents: 13
diff changeset
377 stop_err('Error running make directory : ' + str(e))
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
378
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
379 ##testing indexed bam file
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
380 if os.path.isfile(options.bamfile+".bai") :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
381 pass
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
382 else :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
383 cmd = "samtools index %s " % (options.bamfile)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
384 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
385 returncode = proc.wait()
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
386
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
387 tmpdir = tempfile.mkdtemp()
10
rlegendre
parents: 2
diff changeset
388 ## identify GFF or GTF format from 9th column
rlegendre
parents: 2
diff changeset
389 with open (options.gff,"r") as gffile :
rlegendre
parents: 2
diff changeset
390 for line in gffile :
rlegendre
parents: 2
diff changeset
391 if '#' in line :
rlegendre
parents: 2
diff changeset
392 ## skip header
rlegendre
parents: 2
diff changeset
393 gffile.next()
rlegendre
parents: 2
diff changeset
394 elif 'gene_id' in line :
rlegendre
parents: 2
diff changeset
395 ## launch gtf reader :
rlegendre
parents: 2
diff changeset
396 GFF = ribo_functions.store_gtf(options.gff)
rlegendre
parents: 2
diff changeset
397 break
rlegendre
parents: 2
diff changeset
398 elif 'ID=' in line :
rlegendre
parents: 2
diff changeset
399 ## launch gff reader
rlegendre
parents: 2
diff changeset
400 GFF = ribo_functions.store_gff(options.gff)
rlegendre
parents: 2
diff changeset
401 break
rlegendre
parents: 2
diff changeset
402 else :
rlegendre
parents: 2
diff changeset
403 stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
rlegendre
parents: 2
diff changeset
404 #GFF = store_gff(options.gff)
rlegendre
parents: 2
diff changeset
405 #GFF = ribo_functions.store_gtf(options.gff)
rlegendre
parents: 2
diff changeset
406 ## check gff reading
rlegendre
parents: 2
diff changeset
407 if not GFF['order'] :
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
408 stop_err( 'Incorrect GFF file' )
18
a121cce43f90 Uploaded
rlegendre
parents: 13
diff changeset
409 for gene in GFF['order']:
a121cce43f90 Uploaded
rlegendre
parents: 13
diff changeset
410 if not GFF[gene]['exon'] :
a121cce43f90 Uploaded
rlegendre
parents: 13
diff changeset
411 del GFF[gene]
a121cce43f90 Uploaded
rlegendre
parents: 13
diff changeset
412 GFF['order'].remove(gene)
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
413 ## split bam
10
rlegendre
parents: 2
diff changeset
414 split_bam(options.bamfile,tmpdir)
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
415 ###################################
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
416 ## First analysis with 28mer :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
417 ###################################
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
418 ## compute coverage and distribution kmer
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
419 kmer = get_first_base(tmpdir, 28)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
420 ## compute phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
421 whole_phasing = frame_analysis(tmpdir,GFF)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
422 ## save phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
423 results = {}
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
424 results[28] = whole_phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
425 ## compute analysis with other kmer
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
426 for keys in kmer.iterkeys() :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
427 if keys != 28 :
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
428
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
429 ## If not enought reads in this kmer :
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
430 if kmer[keys] > 100 :
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
431 ## remove all txt files in tmp directory
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
432 if os.system("rm "+tmpdir+"/*.txt") != 0 :
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
433 stop_err( 'Error during tmp directory cleaning')
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
434 ## compute coverage and distribution kmer
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
435 tmp = get_first_base(tmpdir, keys)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
436 ## compute phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
437 whole_phasing = frame_analysis(tmpdir,GFF)
10
rlegendre
parents: 2
diff changeset
438
2
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
439 results[keys] = whole_phasing
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
440 ## get report
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
441 make_report(options.html_file, options.dirout, kmer, results)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
442 #=======================================================================
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
443 # ############
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
444 # # tests
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
445 # ############
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
446 # with open("kmer_dict",'rb') as km:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
447 # kmer = cPickle.load(km)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
448 # with open("results_dict",'rb') as res:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
449 # results = cPickle.load(res)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
450 # with open("kmer_dict",'wb') as km:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
451 # cPickle.dump(kmer,km)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
452 # with open("results_dict",'wb') as res:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
453 # cPickle.dump(results,res)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
454 # make_report(options.html_file, options.dirout, kmer, results)
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
455 #=======================================================================
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
456
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
457 sys.stdout.write("Finish kmer and phasing analysis at %s\n" % time.asctime( time.localtime( time.time() ) ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
458
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
459 except Exception, e:
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
460 stop_err( 'Error running kmer and phasing analysis : ' + str( e ) )
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
461
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
462
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
463 if __name__=="__main__":
da126b91f9ea Uploaded
rlegendre
parents:
diff changeset
464 __main__()