comparison conifer/conifer.py @ 0:ca5354286bee draft

Uploaded
author bzonnedda
date Tue, 18 Oct 2016 09:26:52 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ca5354286bee
1 #######################################################################
2 #######################################################################
3 # CoNIFER: Copy Number Inference From Exome Reads
4 # Developed by Niklas Krumm (C) 2012
5 # nkrumm@gmail.com
6 #
7 # homepage: http://conifer.sf.net
8 # This program is described in:
9 # Krumm et al. 2012. Genome Research. doi:10.1101/gr.138115.112
10 #
11 # This file is part of CoNIFER.
12 # CoNIFER is free software: you can redistribute it and/or modify
13 # it under the terms of the GNU General Public License as published by
14 # the Free Software Foundation, either version 3 of the License, or
15 # (at your option) any later version.
16 #
17 # This program is distributed in the hope that it will be useful,
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 # GNU General Public License for more details.
21 #
22 # You should have received a copy of the GNU General Public License
23 # along with this program. If not, see <http://www.gnu.org/licenses/>.
24 #######################################################################
25 #######################################################################
26
27 import argparse
28 import os, sys, copy
29 import glob
30 import csv
31 import conifer_functions as cf
32 import operator
33 from tables import *
34 import numpy as np
35
36 def CF_analyze(args):
37 # do path/file checks:
38 try:
39 # read probes table
40 probe_fn = str(args.probes[0])
41 probes = cf.loadProbeList(probe_fn)
42 num_probes = len(probes)
43 print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)
44 except IOError as e:
45 print '[ERROR] Cannot read probes file: ', probe_fn
46 sys.exit(0)
47
48 try:
49 svd_outfile_fn = str(args.output)
50 h5file_out = openFile(svd_outfile_fn, mode='w')
51 probe_group = h5file_out.createGroup("/","probes","probes")
52 except IOError as e:
53 print '[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn
54 sys.exit(0)
55
56 if args.write_svals != "":
57 sval_f = open(args.write_svals,'w')
58
59 if args.plot_scree != "":
60 try:
61 import matplotlib
62 matplotlib.use('Agg')
63 import matplotlib.pyplot as plt
64 import pylab as P
65 from matplotlib.lines import Line2D
66 from matplotlib.patches import Rectangle
67 except:
68 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
69 sys.exit(0)
70
71 plt.gcf().clear()
72 fig = plt.figure(figsize=(10,5))
73 ax = fig.add_subplot(111)
74
75 rpkm_dir = str(args.rpkm_dir[0])
76 rpkm_files = glob.glob(rpkm_dir + "/*")
77 if len(rpkm_files) == 0:
78 print '[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir
79 sys.exit(0)
80 elif len(rpkm_files) == 1:
81 print '[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.'
82 sys.exit(0)
83 elif len(rpkm_files) < 8:
84 print '[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len(rpkm_files)
85 elif len(rpkm_files) <= int(args.svd):
86 print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (len(rpkm_files), int(args.svd))
87 sys.exit(0)
88 else:
89 print '[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir)
90
91 # read in samples names and generate file list
92 samples = {}
93 for f in rpkm_files:
94 s = '.'.join(f.split('/')[-1].split('.')[0:-1])
95 print "[INIT] Mapping file to sampleID: %s --> %s" % (f, s)
96 samples[s] = f
97
98 #check uniqueness and total # of samples
99 if len(set(samples)) != len(set(rpkm_files)):
100 print '[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!'
101 sys.exit(0)
102
103 # LOAD RPKM DATA
104 RPKM_data = np.zeros([num_probes,len(samples)], dtype=np.float)
105 failed_samples = 0
106
107 for i,s in enumerate(samples.keys()):
108 t = np.loadtxt(samples[s], dtype=np.float, delimiter="\t", skiprows=0, usecols=[2])
109 if len(t) != num_probes:
110 print "[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % (s, samples[s], probe_fn)
111 _ = samples.pop(s)
112 failed_samples += 1
113 else:
114 RPKM_data[:,i] = t
115 print "[INIT] Successfully read RPKM data for sampleID: %s" % s
116
117 RPKM_data = RPKM_data[:,0:len(samples)]
118 print "[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % (len(samples), failed_samples)
119
120 if len(samples) <= int(args.svd):
121 print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (int(args.svd), len(samples))
122 sys.exit(0)
123
124 # BEGIN
125 chrs_to_process = set(map(operator.itemgetter("chr"),probes))
126 chrs_to_process_str = ', '.join([cf.chrInt2Str(c) for c in chrs_to_process])
127 print '[INIT] Attempting to process chromosomes: ', chrs_to_process_str
128
129
130
131 for chr in chrs_to_process:
132 print "[RUNNING: chr%d] Now on: %s" %(chr, cf.chrInt2Str(chr))
133 chr_group_name = "chr%d" % chr
134 chr_group = h5file_out.createGroup("/",chr_group_name,chr_group_name)
135
136 chr_probes = filter(lambda i: i["chr"] == chr, probes)
137 num_chr_probes = len(chr_probes)
138 start_probeID = chr_probes[0]['probeID']
139 stop_probeID = chr_probes[-1]['probeID']
140 print "[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID-1, stop_probeID) # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term
141
142 rpkm = RPKM_data[start_probeID:stop_probeID,:]
143
144 print "[RUNNING: chr%d] Calculating median RPKM" % chr
145 median = np.median(rpkm,1)
146 sd = np.std(rpkm,1)
147 probe_mask = median >= float(args.min_rpkm)
148 print "[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % (chr, np.sum(probe_mask==False), float(args.min_rpkm))
149
150 rpkm = rpkm[probe_mask, :]
151 num_chr_probes = np.sum(probe_mask)
152
153 if num_chr_probes <= len(samples):
154 print "[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file"
155 sys.exit(0)
156
157 probeIDs = np.array(map(operator.itemgetter("probeID"),chr_probes))[probe_mask]
158 probe_starts = np.array(map(operator.itemgetter("start"),chr_probes))[probe_mask]
159 probe_stops = np.array(map(operator.itemgetter("stop"),chr_probes))[probe_mask]
160 gene_names = np.array(map(operator.itemgetter("name"),chr_probes))[probe_mask]
161
162 dt = np.dtype([('probeID',np.uint32),('start',np.uint32),('stop',np.uint32), ('name', np.str_, 20)])
163
164 out_probes = np.empty(num_chr_probes,dtype=dt)
165 out_probes['probeID'] = probeIDs
166 out_probes['start'] = probe_starts
167 out_probes['stop'] = probe_stops
168 out_probes['name'] = gene_names
169 probe_table = h5file_out.createTable(probe_group,"probes_chr%d" % chr,cf.probe,"chr%d" % chr)
170 probe_table.append(out_probes)
171
172 print "[RUNNING: chr%d] Calculating ZRPKM scores..." % chr
173 rpkm = np.apply_along_axis(cf.zrpkm, 0, rpkm, median[probe_mask], sd[probe_mask])
174
175 # svd transform
176 print "[RUNNING: chr%d] SVD decomposition..." % chr
177 components_removed = int(args.svd)
178
179 U, S, Vt = np.linalg.svd(rpkm,full_matrices=False)
180 new_S = np.diag(np.hstack([np.zeros([components_removed]),S[components_removed:]]))
181
182 if args.write_svals != "":
183 sval_f.write('chr' + str(chr) + '\t' + '\t'.join([str(_i) for _i in S]) + "\n")
184
185 if args.plot_scree != "":
186 ax.plot(S, label='chr' + str(chr),lw=0.5)
187
188 # reconstruct data matrix
189 rpkm = np.dot(U, np.dot(new_S, Vt))
190
191
192 # save to HDF5 file
193 print "[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr
194
195 for i,s in enumerate(samples):
196 out_data = np.empty(num_chr_probes,dtype='u4,f8')
197 out_data['f0'] = probeIDs
198 out_data['f1'] = rpkm[:,i]
199 sample_tbl = h5file_out.createTable(chr_group,"sample_" + str(s),cf.rpkm_value,"%s" % str(s))
200 sample_tbl.append(out_data)
201
202
203 print "[RUNNING] Saving sampleIDs to file..."
204 sample_group = h5file_out.createGroup("/","samples","samples")
205 sample_table = h5file_out.createTable(sample_group,"samples",cf.sample,"samples")
206 dt = np.dtype([('sampleID',np.str_,100)])
207 out_samples = np.empty(len(samples.keys()),dtype=dt)
208 out_samples['sampleID'] = np.array(samples.keys())
209 sample_table.append(out_samples)
210
211
212 if args.write_sd != "":
213 print "[RUNNING] Calculating standard deviations for all samples (this can take a while)..."
214
215 sd_file = open(args.write_sd,'w')
216
217 for i,s in enumerate(samples):
218 # collect all SVD-ZRPKM values
219 count = 1
220 for chr in chrs_to_process:
221 if count == 1:
222 sd_out = h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten()
223 else:
224 sd_out = np.hstack([sd_out,out.h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten()])
225
226 sd = np.std(sd_out)
227 sd_file.write("%s\t%f\n" % (s,sd))
228
229 sd_file.close()
230
231 if args.plot_scree != "":
232 plt.title("Scree plot")
233 if len(samples) < 50:
234 plt.xlim([0,len(samples)])
235 plt.xlabel("S values")
236 else:
237 plt.xlim([0,50])
238 plt.xlabel("S values (only first 50 plotted)")
239 plt.ylabel("Relative contributed variance")
240 plt.savefig(args.plot_scree)
241
242 print "[FINISHED]"
243 h5file_out.close()
244 sys.exit(0)
245
246 def CF_export(args):
247 try:
248 h5file_in_fn = str(args.input)
249 h5file_in = openFile(h5file_in_fn, mode='r')
250 except IOError as e:
251 print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn
252 sys.exit(0)
253
254 # read probes
255 probes = {}
256 for probes_chr in h5file_in.root.probes:
257 probes[probes_chr.title] = probes_chr.read()
258
259 if args.sample =='all':
260 all_samples = list(h5file_in.root.samples.samples.read(field="sampleID"))
261
262 out_path = os.path.abspath(args.output)
263
264 print "[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path)
265 for sample in all_samples:
266 try:
267 outfile_fn = out_path + "/" + sample + ".bed"
268 outfile_f = open(outfile_fn,'w')
269 except IOError as e:
270 print '[ERROR] Cannot open output file for writing: ', outfile_fn
271 sys.exit(0)
272 print "[RUNNING] Exporting %s" % sample
273
274 cf.export_sample(h5file_in,sample,probes,outfile_f)
275 outfile_f.close()
276
277 elif len(args.sample) == 1:
278 out_path = os.path.abspath(args.output)
279 sample = args.sample[0]
280 print "[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path)
281 try:
282 if os.path.isdir(out_path):
283 outfile_fn = out_path + "/" + sample + ".bed"
284 else:
285 outfile_fn = out_path
286 outfile_f = open(outfile_fn,'w')
287 except IOError as e:
288 print '[ERROR] Cannot open output file for writing: ', outfile_fn
289 sys.exit(0)
290 print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn)
291
292 cf.export_sample(h5file_in,sample,probes,outfile_f)
293 outfile_f.close()
294
295 else:
296 out_path = os.path.abspath(args.output)
297 print "[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path)
298 for sample in args.sample:
299 try:
300 if os.path.isdir(out_path):
301 outfile_fn = out_path + "/" + sample + ".bed"
302 else:
303 outfile_fn = out_path
304 outfile_f = open(outfile_fn,'w')
305 except IOError as e:
306 print '[ERROR] Cannot open output file for writing: ', outfile_fn
307 sys.exit(0)
308 print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn)
309
310 cf.export_sample(h5file_in,sample,probes,outfile_f)
311 outfile_f.close()
312 sys.exit(0)
313
314 def CF_call(args):
315 try:
316 h5file_in_fn = str(args.input)
317 h5file_in = openFile(h5file_in_fn, mode='r')
318 except IOError as e:
319 print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn
320 sys.exit(0)
321
322 try:
323 callfile_fn = str(args.output)
324 callfile_f = open(callfile_fn, mode='w')
325 except IOError as e:
326 print '[ERROR] Cannot open output file for writing: ', callfile_fn
327 sys.exit(0)
328
329 chrs_to_process = []
330 for chr in h5file_in.root:
331 if chr._v_title not in ('probes','samples'):
332 chrs_to_process.append(chr._v_title.replace("chr",""))
333
334 h5file_in.close()
335
336 print '[INIT] Initializing caller at threshold = %f' % (args.threshold)
337
338 r = cf.rpkm_reader(h5file_in_fn)
339
340 all_calls = []
341
342 for chr in chrs_to_process:
343 print '[RUNNING] Now processing chr%s' % chr
344 data = r.getExonValuesByRegion(chr)
345
346 #raw_data = copy.copy(data)
347 _ = data.smooth()
348
349 mean= np.mean(data.rpkm,axis=1)
350 sd = np.std(data.rpkm,axis=1)
351
352 for sample in r.getSampleList():
353 sample_data = data.getSample([sample]).flatten()
354 #sample_raw_data = raw_data.getSample([sample]).flatten()
355
356 dup_mask = sample_data >= args.threshold
357 del_mask = sample_data <= -1*args.threshold
358
359 dup_bkpoints = cf.getbkpoints(dup_mask) #returns exon coordinates for this chromosome (numpy array coords)
360 del_bkpoints = cf.getbkpoints(del_mask)
361
362
363 dups = []
364 for start,stop in dup_bkpoints:
365 try: new_start = np.max(np.where(sample_data[:start] < (mean[:start] + 3*sd[:start])))
366 except ValueError: new_start = 0
367 try: new_stop = stop + np.min(np.where(sample_data[stop:] < (mean[stop:] + 3*sd[stop:])))
368 except ValueError: new_stop = data.shape[1]-1
369 dups.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "dup"})
370
371 dels = []
372 for start,stop in del_bkpoints:
373 try: new_start = np.max(np.where(sample_data[:start] > (-1*mean[:start] - 3*sd[:start])))
374 except ValueError: new_start = 0
375 try: new_stop = stop + np.min(np.where(sample_data[stop:] > (-1*mean[stop:] - 3*sd[stop:])))
376 except ValueError: new_stop = data.shape[1]-1
377 dels.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "del"})
378
379 dels = cf.mergeCalls(dels) #merges overlapping calls
380 dups = cf.mergeCalls(dups)
381
382 #print sampleID, len(dels), len(dups)
383
384 all_calls.extend(list(dels))
385 all_calls.extend(list(dups))
386
387 # print calls to file
388 header = ['sampleID','chromosome','start','stop','state']
389
390 callfile_f.write('\t'.join(header) + "\n")
391 for call in all_calls:
392 print "%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])
393 callfile_f.write("%s\t%s\t%d\t%d\t%s\n" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]))
394
395 sys.exit(0)
396
397 def CF_plot(args):
398 try:
399 import locale
400 import matplotlib
401 matplotlib.use('Agg')
402 import matplotlib.pyplot as plt
403 import pylab as P
404 from matplotlib.lines import Line2D
405 from matplotlib.patches import Rectangle
406 _ = locale.setlocale(locale.LC_ALL, '')
407 except:
408 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
409 sys.exit(0)
410
411
412 chr, start, stop = cf.parseLocString(args.region)
413
414 r = cf.rpkm_reader(args.input)
415
416 data = r.getExonValuesByRegion(chr,start,stop)
417 _ = data.smooth()
418
419 plt.gcf().clear()
420 fig = plt.figure(figsize=(10,5))
421 ax = fig.add_subplot(111)
422
423
424 ax.plot(data.rpkm, linewidth = 0.3, c='k')
425
426
427 if args.sample != 'none':
428 cnt = 1
429 coloriter = iter(['r','b','g','y'])
430 for sample in args.sample:
431 try:
432 color, sampleID = sample.split(":")
433 except:
434 color =coloriter.next()
435 sampleID = sample
436
437 ax.plot(data.getSample([sampleID]), linewidth = 1, c=color, label = sampleID)
438
439 if cnt == 1:
440 cf.plotRawData(ax, r.getExonValuesByRegion(chr,start,stop,sampleList=[sampleID]).getSample([sampleID]),color=color)
441 cnt +=1
442 plt.legend(prop={'size':10},frameon=False)
443
444 cf.plotGenes(ax, data)
445 cf.plotGenomicCoords(plt,data)
446 plt.xlim(0,data.shape[1])
447 plt.ylim(-3,3)
448
449 plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True)))
450 plt.xlabel("Position")
451 plt.ylabel("SVD-ZRPKM Values")
452
453 plt.savefig(args.output)
454
455 sys.exit(0)
456
457 def CF_plotcalls(args):
458 try:
459 import matplotlib
460 matplotlib.use('Agg')
461 import matplotlib.pyplot as plt
462 import pylab as P
463 from matplotlib.lines import Line2D
464 from matplotlib.patches import Rectangle
465 except:
466 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
467 sys.exit(0)
468
469 import locale
470 try:
471 _ = locale.setlocale(locale.LC_ALL, 'en_US')
472 except:
473 _ = locale.setlocale(locale.LC_ALL, '')
474
475 try:
476 callfile_fn = str(args.calls)
477 callfile_f = open(callfile_fn, mode='r')
478 except IOError as e:
479 print '[ERROR] Cannot open call file for reading: ', callfile_fn
480 sys.exit(0)
481
482 all_calls = []
483 header = callfile_f.readline()
484
485 for line in callfile_f:
486 sampleID, chr, start, stop, state = line.strip().split()
487 chr = cf.chrStr2Int(chr)
488 all_calls.append({"chromosome":int(chr), "start":int(start), "stop":int(stop), "sampleID":sampleID})
489
490 r = cf.rpkm_reader(args.input)
491
492 for call in all_calls:
493 chr = call["chromosome"]
494 start = call["start"]
495 stop = call["stop"]
496 sampleID = call["sampleID"]
497
498 exons = r.getExonIDs(chr,int(start),int(stop))
499
500
501 window_start = max(exons[0]-args.window,0)
502 window_stop = exons[-1]+args.window
503
504 data = r.getExonValuesByExons(chr,window_start, window_stop)
505 _ = data.smooth()
506
507 plt.gcf().clear()
508 fig = plt.figure(figsize=(10,5))
509 ax = fig.add_subplot(111)
510
511
512 ax.plot(data.rpkm, linewidth = 0.3, c='k')
513
514
515 ax.plot(data.getSample([sampleID]), linewidth = 1, c='r', label = sampleID)
516 cf.plotRawData(ax, r.getExonValuesByExons(chr,window_start, window_stop,sampleList=[sampleID]).getSample([sampleID]),color='r')
517
518 plt.legend(prop={'size':10},frameon=False)
519
520 cf.plotGenes(ax, data)
521 cf.plotGenomicCoords(plt,data)
522
523 exon_start = np.where(data.exons["start"] == start)[0][0]
524 exon_stop = np.where(data.exons["stop"] == stop)[0][0]
525 _ = ax.add_line(matplotlib.lines.Line2D([exon_start,exon_stop],[2,2],color='k',lw=6,linestyle='-',alpha=1,solid_capstyle='butt'))
526
527 _ = plt.xlim(0,data.shape[1])
528 _ = plt.ylim(-3,3)
529
530 plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True)))
531 plt.xlabel("Position")
532 plt.ylabel("SVD-ZRPKM Values")
533 outfile = "%s_%d_%d_%s.png" % (cf.chrInt2Str(chr), start, stop, sampleID)
534 plt.savefig(args.outputdir + "/" + outfile)
535
536 def CF_bam2RPKM(args):
537 try:
538 import pysam
539 except:
540 print '[ERROR] Cannot load pysam module! Make sure it is insalled'
541 sys.exit(0)
542 try:
543 # read probes table
544 probe_fn = str(args.probes[0])
545 probes = cf.loadProbeList(probe_fn)
546 num_probes = len(probes)
547 print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)
548 except IOError as e:
549 print '[ERROR] Cannot read probes file: ', probe_fn
550 sys.exit(0)
551
552 try:
553 rpkm_f = open(args.output[0],'w')
554 except IOError as e:
555 print '[ERROR] Cannot open rpkm file for writing: ', args.output
556 sys.exit(0)
557
558 print "[RUNNING] Counting total number of reads in bam file..."
559 total_reads = float(pysam.view("-c", args.input[0])[0].strip("\n"))
560 print "[RUNNING] Found %d reads" % total_reads
561
562 f = pysam.Samfile(args.input[0], "rb" )
563
564 if not f._hasIndex():
565 print "[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % args.input[0]
566 sys.exit(0)
567
568 # will be storing values in these arrays
569 readcount = np.zeros(num_probes)
570 exon_bp = np.zeros(num_probes)
571 probeIDs = np.zeros(num_probes)
572 counter = 0
573
574 # detect contig naming scheme here # TODO, add an optional "contigs.txt" file or automatically handle contig naming
575 bam_contigs = f.references
576 probes_contigs = [str(p) for p in set(map(operator.itemgetter("chr"),probes))]
577
578 probes2contigmap = {}
579
580 for probes_contig in probes_contigs:
581 if probes_contig in bam_contigs:
582 probes2contigmap[probes_contig] = probes_contig
583 elif cf.chrInt2Str(probes_contig) in bam_contigs:
584 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig)
585 elif cf.chrInt2Str(probes_contig).replace("chr","") in bam_contigs:
586 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig).replace("chr","")
587 else:
588 print "[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % (probes_contig, probe_fn)
589 sys.exit(0)
590
591 print "[RUNNING] Calculating RPKM values..."
592
593 # loop through each probe
594 for p in probes:
595
596 # f.fetch is a pysam method and returns an iterator for reads overlapping interval
597
598 p_chr = probes2contigmap[str(p["chr"])]
599
600 p_start = p["start"]
601 p_stop = p["stop"]
602 try:
603 iter = f.fetch(p_chr,p_start,p_stop)
604 except:
605 print "[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % (p_chr,p_start,p_stop)
606 sys.exit(0)
607
608 for i in iter:
609 if i.pos+1 >= p_start: #this checks to make sure a read actually starts in an interval
610 readcount[counter] += 1
611
612 exon_bp[counter] = p_stop-p_start
613 probeIDs[counter] = counter +1 #probeIDs are 1-based
614 counter +=1
615
616 #calcualte RPKM values for all probes
617 rpkm = (10**9*(readcount)/(exon_bp))/(total_reads)
618
619 out = np.vstack([probeIDs,readcount,rpkm])
620
621 np.savetxt(rpkm_f,out.transpose(),delimiter='\t',fmt=['%d','%d','%f'])
622
623 rpkm_f.close()
624
625
626
627 VERSION = "0.2.2"
628 parser = argparse.ArgumentParser(prog="CoNIFER", description="This is CoNIFER %s (Copy Number Inference From Exome Reads), designed to detect and genotype CNVs and CNPs from exome sequence read-depth data. See Krumm et al., Genome Research (2012) doi:10.1101/gr.138115.112 \nNiklas Krumm, 2012\n nkrumm@uw.edu" % VERSION)
629 parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION)
630 subparsers = parser.add_subparsers(help='Command to be run.')
631
632 # rpkm command
633 rpkm_parser= subparsers.add_parser('rpkm', help='Create an RPKM file from a BAM file')
634 rpkm_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,help="Probe definition file")
635 rpkm_parser.add_argument('--input',action='store', required=True, metavar='sample.bam',nargs=1, help="Aligned BAM file")
636 rpkm_parser.add_argument('--output',action='store', required=True, metavar='sample.rpkm.txt',nargs=1, help="RPKM file to write")
637 rpkm_parser.set_defaults(func=CF_bam2RPKM)
638
639 # analyze command
640 analyze_parser= subparsers.add_parser('analyze', help='Basic CoNIFER analysis. Reads a directory of RPKM files and a probe list and outputs a HDF5 file containing SVD-ZRPKM values.')
641 analyze_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,help="Probe definition file")
642 analyze_parser.add_argument('--rpkm_dir',action='store', required=True, metavar='/path/to/folder/containing/rpkm_files/',nargs=1, help="Location of folder containing RPKM files. Folder should contain ONLY contain RPKM files, and all readable RPKM files will used in analysis.")
643 analyze_parser.add_argument('--output','-o', required=True, metavar='/path/to/output_file.hdf5', type=str, help="Output location of HDF5 file to contain SVD-ZRPKM values")
644 analyze_parser.add_argument('--svd', metavar='12', type=int, nargs='?', default = 12,help="Number of components to remove")
645 analyze_parser.add_argument('--min_rpkm', metavar='1.00', type=float, nargs="?", default = 1.00,help="Minimum population median RPKM per probe.")
646 analyze_parser.add_argument('--write_svals', metavar='SingularValues.txt', type=str, default= "", help="Optional file to write singular values (S-values). Used for Scree Plot.")
647 analyze_parser.add_argument('--plot_scree', metavar='ScreePlot.png', type=str, default= "", help="Optional graphical scree plot. Requires matplotlib.")
648 analyze_parser.add_argument('--write_sd', metavar='StandardDeviations.txt', type=str, default= "", help="Optional file with sample SVD-ZRPKM standard deviations. Used to filter noisy samples.")
649 analyze_parser.set_defaults(func=CF_analyze)
650
651 # export command
652 export_parser= subparsers.add_parser('export', help='Export SVD-ZRPKM values from a HDF5 file to bed or vcf format.')
653 export_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
654 export_parser.add_argument('--output','-o',action='store', required=False, default='.', metavar='output.bed',help="Location of output file[s]. When exporting multiple samples, top-level directory of this path will be used.")
655 export_parser.add_argument('--sample','-s',action='store', required=False, metavar='sampleID', default='all', nargs="+",help="SampleID or comma-separated list of sampleIDs to export. Default: export all samples")
656 export_parser.set_defaults(func=CF_export)
657
658 # plot command
659 plot_parser= subparsers.add_parser('plot', help='Plot SVD-ZRPKM values using matplotlib')
660 plot_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
661 plot_parser.add_argument('--region',action='store', required=True, metavar='chr#:start-stop',help="Region to plot")
662 plot_parser.add_argument('--output',action='store', required=True, metavar='image.png',help="Output path and filetype. PDF, PNG, PS, EPS, and SVG are supported.")
663 plot_parser.add_argument('--sample',action='store', required=False, metavar='sampleID',nargs="+",default='none',help="Sample[s] to highlight. The following optional color spec can be used: <color>:<sampleID>. Available colors are r,b,g,y,c,m,y,k. The unsmoothed SVD-ZRPKM values for the first sample in this list will be drawn. Default: No samples highlighted.")
664 plot_parser.set_defaults(func=CF_plot)
665
666 # make calls command
667 call_parser= subparsers.add_parser('call', help='Very rudimentary caller for CNVs using SVD-ZRPKM thresholding.')
668 call_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
669 call_parser.add_argument('--output',action='store', required=True, metavar='calls.txt',help="Output file for calls")
670 call_parser.add_argument('--threshold', metavar='1.50', type=float, nargs='?', required=False, default = 1.50,help="+/- Threshold for calling (minimum SVD-ZRPKM)")
671 call_parser.set_defaults(func=CF_call)
672
673 # plotcalls command
674 plotcalls_parser= subparsers.add_parser('plotcalls', help='Make basic plots from call file from "call" command.')
675 plotcalls_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
676 plotcalls_parser.add_argument('--calls',action='store', required=True, metavar='calls.txt',help="File with calls from 'call' command.")
677 plotcalls_parser.add_argument('--outputdir',action='store', required=True, metavar='/path/to/directory',help="Output directory for plots")
678 plotcalls_parser.add_argument('--window',action='store', required=False, metavar='50',default=50,help="In exons, the amount of padding to plot around each call")
679 plotcalls_parser.set_defaults(func=CF_plotcalls)
680
681 args = parser.parse_args()
682 args.func(args)