comparison kmer_analysis.py @ 10:707807fee542

(none)
author rlegendre
date Thu, 22 Jan 2015 14:34:38 +0100
parents da126b91f9ea
children 7c944fd9907e
comparison
equal deleted inserted replaced
9:d7739f797a26 10:707807fee542
12 #from matplotlib import pyplot as pl 12 #from matplotlib import pyplot as pl
13 import matplotlib 13 import matplotlib
14 matplotlib.use('Agg') 14 matplotlib.use('Agg')
15 import matplotlib.pyplot as pl 15 import matplotlib.pyplot as pl
16 from numpy import arange 16 from numpy import arange
17 from collections import OrderedDict
17 import ribo_functions 18 import ribo_functions
18 from collections import OrderedDict
19 #import cPickle 19 #import cPickle
20 ## suppress matplotlib warnings
21 import warnings
22 warnings.filterwarnings('ignore')
23
20 24
21 total_mapped_read = 0 25 total_mapped_read = 0
22 26
23 27
24 def stop_err( msg ): 28 def stop_err( msg ):
25 sys.stderr.write( "%s\n" % msg ) 29 sys.stderr.write( "%s\n" % msg )
26 sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) ) 30 sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) )
27 sys.exit() 31 sys.exit()
28 32
29 33
30 34 def split_bam(bamfile,tmpdir):
35 '''
36 split bam by chromosome and write sam file in tmpdir
37 '''
38 try:
39 #get header
40 results = subprocess.check_output(['samtools', 'view', '-H',bamfile])
41 header = results.split('\n')
42
43 #define genome size
44 genome = []
45 for line in header:
46 result = re.search('SN', line)
47 if result :
48 #print line
49 feat = line.split('\t')
50 chrom = re.split(":", feat[1])
51 #print feat[1]
52 genome.append(chrom[1])
53
54 #split sam by chrom
55 n = 0
56 for chrm in genome:
57 with open(tmpdir+'/'+chrm+'.sam', 'w') as f :
58 #write header correctly for each chromosome
59 f.write(header[0]+'\n')
60 expr = re.compile(chrm+'\t')
61 el =[elem for elem in header if expr.search(elem)][0]
62 f.write(el+'\n')
63 f.write(header[-2]+'\n')
64 #write all reads for each chromosome
65 reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
66 f.write(reads)
67 # calculate number of reads
68 n += reads.count(chrm)
69
70 sys.stdout.write("%d reads are presents in your bam file\n" % n)
71
72 except Exception, e:
73 stop_err( 'Error during bam file splitting : ' + str( e ) )
74
75
76
31 def get_first_base(tmpdir, kmer): 77 def get_first_base(tmpdir, kmer):
32 ''' 78 '''
33 write footprint coverage file for each sam file in tmpdir and get kmer distribution 79 write footprint coverage file for each sam file in tmpdir and get kmer distribution
34 ''' 80 '''
35 global total_mapped_read 81 global total_mapped_read
45 ##get chromosome name 91 ##get chromosome name
46 chrom = samfile.split(".sam")[0] 92 chrom = samfile.split(".sam")[0]
47 93
48 for line in sam: 94 for line in sam:
49 #initialize dictionnary 95 #initialize dictionnary
50 if '@SQ' in line : 96 if '@SQ\tSN:' in line :
51 size = int(line.split('LN:')[1]) 97 size = int(line.split('LN:')[1])
52 genomeF = [0]*size 98 genomeF = [0]*size
53 genomeR = [0]*size 99 genomeR = [0]*size
54 # define multiple reads keys from mapper 100 # define multiple reads keys from mapper
55 elif '@PG' in line : 101 elif '@PG\tID' in line :
56 if 'bowtie' in line: 102 if 'bowtie' in line:
57 multi_tag = "XS:i:" 103 multi_tag = "XS:i:"
58 elif 'bwa' in line: 104 elif 'bwa' in line:
59 multi_tag = "XT:A:R" 105 multi_tag = "XT:A:R"
106 #elif 'TopHat' in line:
107 # multi_tag = "NH:i:1"
60 else : 108 else :
61 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") 109 stop_err("No PG tag find in "+samfile+". Please use bowtie or bwa for mapping")
62 110
63 # get footprint 111 # get footprint
64 elif re.search('^[^@].+', line) : 112 elif re.search('^[^@].+', line) :
65 len_read = len(line.split('\t')[9]) 113 len_read = len(line.split('\t')[9])
66 ##full kmer dict 114 ##full kmer dict
126 chrom = "" # initializing chromosome 174 chrom = "" # initializing chromosome
127 nb_gene = 0 # number of analysed genes 175 nb_gene = 0 # number of analysed genes
128 whole_phasing = [0,0,0] 176 whole_phasing = [0,0,0]
129 for gene in GFF['order']: 177 for gene in GFF['order']:
130 ## maybe no start position in GTF file so we must to check and replace 178 ## maybe no start position in GTF file so we must to check and replace
179 exon_number = GFF[gene]['exon_number']
131 try : GFF[gene]['start'] 180 try : GFF[gene]['start']
132 except : 181 except :
133 if GFF[gene]['strand'] == '+' : 182 if GFF[gene]['strand'] == '+' :
134 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] 183 GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
135 else : 184 else :
136 GFF[gene]['start'] = GFF[gene]['exon'][1]['stop'] 185 GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop']
137 ## also for stop coordinates 186 ## also for stop coordinates
138 try : GFF[gene]['stop'] 187 try : GFF[gene]['stop']
139 except : 188 except :
140 exon_number = GFF[gene]['exon_number']
141 if GFF[gene]['strand'] == '+' : 189 if GFF[gene]['strand'] == '+' :
142 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] 190 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
143 191
144 else : 192 else :
145 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['start'] 193 GFF[gene]['stop'] = GFF[gene]['exon'][1]['start']
146 cov = [] 194 cov = []
147 ##first chromosome : we open corresponding file 195 ##first chromosome : we open corresponding file
148 if chrom == "" : 196 try:
149 chrom = GFF[gene]['chrom'] 197 if chrom == "" :
150 with open(tmpdir+"/assoCov_"+chrom+".txt") as f : 198 chrom = GFF[gene]['chrom']
151 data = f.readlines() 199 with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
152 ##if we change chrosomosome 200 data = f.readlines()
153 elif chrom != GFF[gene]['chrom'] : 201 ##if we change chromosome
154 chrom = GFF[gene]['chrom'] 202 elif chrom != GFF[gene]['chrom'] :
155 with open(tmpdir+"/assoCov_"+chrom+".txt") as f : 203 chrom = GFF[gene]['chrom']
156 data = f.readlines() 204 with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
157 205 data = f.readlines()
206 except IOError :
207 print tmpdir+"/assoCov_"+chrom+".txt doesn't exist"
208
209
158 ## if a gene without intron : 210 ## if a gene without intron :
159 if GFF[gene]['exon_number'] == 1: 211 if GFF[gene]['exon_number'] == 1:
160 212
161 ## get coverage for each gene 213 ## get coverage for each gene
162 if GFF[gene]['strand'] == "+": 214 if GFF[gene]['strand'] == "+":
170 ## For each gene, get coverage and sum of exon size 222 ## For each gene, get coverage and sum of exon size
171 if GFF[gene]['strand'] == "+": 223 if GFF[gene]['strand'] == "+":
172 224
173 for exon in range(1,GFF[gene]['exon_number']+1) : 225 for exon in range(1,GFF[gene]['exon_number']+1) :
174 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): 226 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
175 if i <= GFF[gene]['stop'] : 227 #if i <= GFF[gene]['stop'] :
176 cov.append(int((data[i].rstrip()).split("\t")[0])) 228 cov.append(int((data[i].rstrip()).split("\t")[0]))
177 else : 229 else :
178 230
179 for exon in range(1,GFF[gene]['exon_number']+1) : 231 for exon in range(1,GFF[gene]['exon_number']+1) :
180 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): 232 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
181 if i <= GFF[gene]['start'] : 233 #if i <= GFF[gene]['start'] :
182 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) 234 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
183 cov.reverse() 235 cov.reverse()
184 236
185 len_cov = len(cov) 237 len_cov = len(cov)
186 prop = [0,0,0] 238 prop = [0,0,0]
187 for nuc in range(0,len_cov-2,3) : 239 for nuc in range(0,len_cov-2,3) :
216 pl.xlabel('kmer value', **axis_font) 268 pl.xlabel('kmer value', **axis_font)
217 pl.ylabel('Number of reads', **axis_font) 269 pl.ylabel('Number of reads', **axis_font)
218 pl.title('Number of reads for each k-mer') 270 pl.title('Number of reads for each k-mer')
219 pl.xticks(index + bar_width, label, **axis_font) 271 pl.xticks(index + bar_width, label, **axis_font)
220 #pl.show() 272 #pl.show()
273 fig.subplots_adjust()
221 pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640) 274 pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640)
222 pl.clf() 275 pl.clf()
223 276
224 for key, phase in results.iteritems() : 277 for key, phase in results.iteritems() :
225 fig = pl.figure(num=1) 278 fig = pl.figure(num=1)
229 pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue']) 282 pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue'])
230 pl.xlabel('Frame in gene', **axis_font) 283 pl.xlabel('Frame in gene', **axis_font)
231 pl.ylabel('Percent of read', **axis_font) 284 pl.ylabel('Percent of read', **axis_font)
232 pl.title('Proportion of reads in each frame for '+str(key)+'-mer') 285 pl.title('Proportion of reads in each frame for '+str(key)+'-mer')
233 pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font) 286 pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font)
234 pl.tight_layout() 287 #pl.tight_layout()
235 pl.ylim(0,100) 288 pl.ylim(0,100)
289 fig.subplots_adjust()
236 pl.draw() 290 pl.draw()
237 #pl.show() 291 pl.show()
238 pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300) 292 pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300)
239 pl.clf() 293 pl.clf()
240 294
241 kmer_summary = '' 295 kmer_summary = ''
242 kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0])) 296 kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0]))
286 340
287 def __main__(): 341 def __main__():
288 342
289 #Parse command line options 343 #Parse command line options
290 parser = optparse.OptionParser() 344 parser = optparse.OptionParser()
291 parser.add_option("-g", "--gff", dest="gfffile", type= "string", 345 parser.add_option("-g", "--gff", dest="gff", type= "string",
292 help="GFF annotation file", metavar="FILE") 346 help="GFF annotation file", metavar="FILE")
293 347
294 parser.add_option("-b", "--bam", dest="bamfile", type= "string", 348 parser.add_option("-b", "--bam", dest="bamfile", type= "string",
295 help="Bam Ribo-Seq alignments ", metavar="FILE") 349 help="Bam Ribo-Seq alignments ", metavar="FILE")
296 350
323 cmd = "samtools index %s " % (options.bamfile) 377 cmd = "samtools index %s " % (options.bamfile)
324 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) 378 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
325 returncode = proc.wait() 379 returncode = proc.wait()
326 380
327 tmpdir = tempfile.mkdtemp() 381 tmpdir = tempfile.mkdtemp()
328 GFF = ribo_functions.store_gff(options.gfffile) 382 ## identify GFF or GTF format from 9th column
383 with open (options.gff,"r") as gffile :
384 for line in gffile :
385 if '#' in line :
386 ## skip header
387 gffile.next()
388 elif 'gene_id' in line :
389 ## launch gtf reader :
390 GFF = ribo_functions.store_gtf(options.gff)
391 break
392 elif 'ID=' in line :
393 ## launch gff reader
394 GFF = ribo_functions.store_gff(options.gff)
395 break
396 else :
397 stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
398 #GFF = store_gff(options.gff)
399 #GFF = ribo_functions.store_gtf(options.gff)
400 ## check gff reading
401 if not GFF['order'] :
402 stop_err( 'Incorrect GFF file' + str( e ) )
403
329 ## split bam 404 ## split bam
330 ribo_functions.split_bam(options.bamfile,tmpdir) 405 split_bam(options.bamfile,tmpdir)
331 ################################### 406 ###################################
332 ## First analysis with 28mer : 407 ## First analysis with 28mer :
333 ################################### 408 ###################################
334 ## compute coverage and distribution kmer 409 ## compute coverage and distribution kmer
335 kmer = get_first_base(tmpdir, 28) 410 kmer = get_first_base(tmpdir, 28)
349 if kmer[keys] > 100 : 424 if kmer[keys] > 100 :
350 ## compute coverage and distribution kmer 425 ## compute coverage and distribution kmer
351 tmp = get_first_base(tmpdir, keys) 426 tmp = get_first_base(tmpdir, keys)
352 ## compute phasing 427 ## compute phasing
353 whole_phasing = frame_analysis(tmpdir,GFF) 428 whole_phasing = frame_analysis(tmpdir,GFF)
429
354 results[keys] = whole_phasing 430 results[keys] = whole_phasing
355 ## get report 431 ## get report
356 make_report(options.html_file, options.dirout, kmer, results) 432 make_report(options.html_file, options.dirout, kmer, results)
357 #======================================================================= 433 #=======================================================================
358 # ############ 434 # ############