comparison conifer.py @ 1:54973c4a1125 draft

Uploaded
author bzonnedda
date Mon, 06 Feb 2017 10:55:27 -0500
parents
children
comparison
equal deleted inserted replaced
0:9e24f334dc80 1:54973c4a1125
1 #######################################################################
2 #######################################################################
3 # CoNIFER: Copy Number Inference From Exome Reads
4 # Developed by Niklas Krumm (C) 2012
5 # nkrumm@gmail.com
6 #
7 # homepage: http://conifer.sf.net
8 # This program is described in:
9 # Krumm et al. 2012. Genome Research. doi:10.1101/gr.138115.112
10 #
11 # This file is part of CoNIFER.
12 # CoNIFER is free software: you can redistribute it and/or modify
13 # it under the terms of the GNU General Public License as published by
14 # the Free Software Foundation, either version 3 of the License, or
15 # (at your option) any later version.
16 #
17 # This program is distributed in the hope that it will be useful,
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 # GNU General Public License for more details.
21 #
22 # You should have received a copy of the GNU General Public License
23 # along with this program. If not, see <http://www.gnu.org/licenses/>.
24 #######################################################################
25 #######################################################################
26
27 import argparse
28 import os, sys, copy
29 import glob
30 import csv
31 import conifer_functions as cf
32 import operator
33 from tables import *
34 import numpy as np
35
36 def CF_analyze(args):
37 # do path/file checks:
38 try:
39 # read probes table
40 probe_fn = str(args.probes[0])
41 probes = cf.loadProbeList(probe_fn)
42 num_probes = len(probes)
43 print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)
44 except IOError as e:
45 print '[ERROR] Cannot read probes file: ', probe_fn
46 sys.exit(0)
47
48 try:
49 svd_outfile_fn = str(args.output)
50 # h5file_out = openFile(svd_outfile_fn, mode='w')
51 h5file_out = open_file(svd_outfile_fn, mode='w')
52 # probe_group = h5file_out.createGroup("/","probes","probes")
53 probe_group = h5file_out.create_group("/","probes","probes")
54 except IOError as e:
55 print '[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn
56 sys.exit(0)
57
58 if args.write_svals != "":
59 sval_f = open(args.write_svals,'w')
60
61 if args.plot_scree != "":
62 try:
63 import matplotlib
64 matplotlib.use('Agg')
65 import matplotlib.pyplot as plt
66 import pylab as P
67 from matplotlib.lines import Line2D
68 from matplotlib.patches import Rectangle
69 except:
70 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
71 sys.exit(0)
72
73 plt.gcf().clear()
74 fig = plt.figure(figsize=(10,5))
75 ax = fig.add_subplot(111)
76
77 rpkm_dir = str(args.rpkm_dir[0])
78 rpkm_files = glob.glob(rpkm_dir + "/*")
79 if len(rpkm_files) == 0:
80 print '[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir
81 sys.exit(0)
82 elif len(rpkm_files) == 1:
83 print '[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.'
84 sys.exit(0)
85 elif len(rpkm_files) < 8:
86 print '[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len(rpkm_files)
87 elif len(rpkm_files) <= int(args.svd):
88 print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (len(rpkm_files), int(args.svd))
89 sys.exit(0)
90 else:
91 print '[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir)
92
93 # read in samples names and generate file list
94 samples = {}
95 for f in rpkm_files:
96 s = '.'.join(f.split('/')[-1].split('.')[0:-1])
97 print "[INIT] Mapping file to sampleID: %s --> %s" % (f, s)
98 samples[s] = f
99
100 #check uniqueness and total # of samples
101 if len(set(samples)) != len(set(rpkm_files)):
102 print '[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!'
103 sys.exit(0)
104
105 # LOAD RPKM DATA
106 RPKM_data = np.zeros([num_probes,len(samples)], dtype=np.float)
107 failed_samples = 0
108
109 for i,s in enumerate(samples.keys()):
110 t = np.loadtxt(samples[s], dtype=np.float, delimiter="\t", skiprows=0, usecols=[2])
111 if len(t) != num_probes:
112 print "[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % (s, samples[s], probe_fn)
113 _ = samples.pop(s)
114 failed_samples += 1
115 else:
116 RPKM_data[:,i] = t
117 print "[INIT] Successfully read RPKM data for sampleID: %s" % s
118
119 RPKM_data = RPKM_data[:,0:len(samples)]
120 print "[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % (len(samples), failed_samples)
121
122 if len(samples) <= int(args.svd):
123 print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (int(args.svd), len(samples))
124 sys.exit(0)
125
126 # BEGIN
127 chrs_to_process = set(map(operator.itemgetter("chr"),probes))
128 chrs_to_process_str = ', '.join([cf.chrInt2Str(c) for c in chrs_to_process])
129 print '[INIT] Attempting to process chromosomes: ', chrs_to_process_str
130
131
132
133 for chr in chrs_to_process:
134 print "[RUNNING: chr%d] Now on: %s" %(chr, cf.chrInt2Str(chr))
135 chr_group_name = "chr%d" % chr
136 # chr_group = h5file_out.createGroup("/",chr_group_name,chr_group_name)
137 chr_group = h5file_out.create_group("/",chr_group_name,chr_group_name)
138
139 chr_probes = filter(lambda i: i["chr"] == chr, probes)
140 num_chr_probes = len(chr_probes)
141 start_probeID = chr_probes[0]['probeID']
142 stop_probeID = chr_probes[-1]['probeID']
143 print "[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID-1, stop_probeID) # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term
144
145 rpkm = RPKM_data[start_probeID:stop_probeID,:]
146
147 print "[RUNNING: chr%d] Calculating median RPKM" % chr
148 median = np.median(rpkm,1)
149 sd = np.std(rpkm,1)
150 probe_mask = median >= float(args.min_rpkm)
151 print "[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % (chr, np.sum(probe_mask==False), float(args.min_rpkm))
152
153 rpkm = rpkm[probe_mask, :]
154 num_chr_probes = np.sum(probe_mask)
155
156 if num_chr_probes <= len(samples):
157 print "[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file"
158 sys.exit(0)
159
160 probeIDs = np.array(map(operator.itemgetter("probeID"),chr_probes))[probe_mask]
161 probe_starts = np.array(map(operator.itemgetter("start"),chr_probes))[probe_mask]
162 probe_stops = np.array(map(operator.itemgetter("stop"),chr_probes))[probe_mask]
163 gene_names = np.array(map(operator.itemgetter("name"),chr_probes))[probe_mask]
164
165 dt = np.dtype([('probeID',np.uint32),('start',np.uint32),('stop',np.uint32), ('name', np.str_, 20)])
166
167 out_probes = np.empty(num_chr_probes,dtype=dt)
168 out_probes['probeID'] = probeIDs
169 out_probes['start'] = probe_starts
170 out_probes['stop'] = probe_stops
171 out_probes['name'] = gene_names
172 # probe_table = h5file_out.createTable(probe_group,"probes_chr%d" % chr,cf.probe,"chr%d" % chr)
173 probe_table = h5file_out.create_table(probe_group,"probes_chr%d" % chr,cf.probe,"chr%d" % chr)
174 probe_table.append(out_probes)
175
176 print "[RUNNING: chr%d] Calculating ZRPKM scores..." % chr
177 rpkm = np.apply_along_axis(cf.zrpkm, 0, rpkm, median[probe_mask], sd[probe_mask])
178
179 # svd transform
180 print "[RUNNING: chr%d] SVD decomposition..." % chr
181 components_removed = int(args.svd)
182
183 U, S, Vt = np.linalg.svd(rpkm,full_matrices=False)
184 new_S = np.diag(np.hstack([np.zeros([components_removed]),S[components_removed:]]))
185
186 if args.write_svals != "":
187 sval_f.write('chr' + str(chr) + '\t' + '\t'.join([str(_i) for _i in S]) + "\n")
188
189 if args.plot_scree != "":
190 ax.plot(S, label='chr' + str(chr),lw=0.5)
191
192 # reconstruct data matrix
193 rpkm = np.dot(U, np.dot(new_S, Vt))
194
195
196 # save to HDF5 file
197 print "[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr
198
199 for i,s in enumerate(samples):
200 out_data = np.empty(num_chr_probes,dtype='u4,f8')
201 out_data['f0'] = probeIDs
202 out_data['f1'] = rpkm[:,i]
203 # sample_tbl = h5file_out.createTable(chr_group,"sample_" + str(s),cf.rpkm_value,"%s" % str(s))
204 sample_tbl = h5file_out.create_table(chr_group,"sample_" + str(s),cf.rpkm_value,"%s" % str(s))
205 sample_tbl.append(out_data)
206
207
208 print "[RUNNING] Saving sampleIDs to file..."
209 # sample_group = h5file_out.createGroup("/","samples","samples")
210 # sample_table = h5file_out.createTable(sample_group,"samples",cf.sample,"samples")
211 sample_group = h5file_out.create_group("/","samples","samples")
212 sample_table = h5file_out.create_table(sample_group,"samples",cf.sample,"samples")
213 dt = np.dtype([('sampleID',np.str_,100)])
214 out_samples = np.empty(len(samples.keys()),dtype=dt)
215 out_samples['sampleID'] = np.array(samples.keys())
216 sample_table.append(out_samples)
217
218
219 if args.write_sd != "":
220 print "[RUNNING] Calculating standard deviations for all samples (this can take a while)..."
221
222 sd_file = open(args.write_sd,'w')
223
224 for i,s in enumerate(samples):
225 # collect all SVD-ZRPKM values
226 count = 1
227 for chr in chrs_to_process:
228 if count == 1:
229 sd_out = h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten()
230 else:
231 sd_out = np.hstack([sd_out,out.h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten()])
232
233 sd = np.std(sd_out)
234 sd_file.write("%s\t%f\n" % (s,sd))
235
236 sd_file.close()
237
238 if args.plot_scree != "":
239 plt.title("Scree plot")
240 if len(samples) < 50:
241 plt.xlim([0,len(samples)])
242 plt.xlabel("S values")
243 else:
244 plt.xlim([0,50])
245 plt.xlabel("S values (only first 50 plotted)")
246 plt.ylabel("Relative contributed variance")
247 plt.savefig(args.plot_scree)
248
249 print "[FINISHED]"
250 h5file_out.close()
251 sys.exit(0)
252
253 def CF_export(args):
254 try:
255 h5file_in_fn = str(args.input)
256 # h5file_in = openFile(h5file_in_fn, mode='r')
257 h5file_in = open_file(h5file_in_fn, mode='r')
258 except IOError as e:
259 print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn
260 sys.exit(0)
261
262 # read probes
263 probes = {}
264 for probes_chr in h5file_in.root.probes:
265 probes[probes_chr.title] = probes_chr.read()
266
267 if args.sample =='all':
268 all_samples = list(h5file_in.root.samples.samples.read(field="sampleID"))
269
270 out_path = os.path.abspath(args.output)
271
272 print "[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path)
273 for sample in all_samples:
274 try:
275 outfile_fn = out_path + "/" + sample + ".bed"
276 outfile_f = open(outfile_fn,'w')
277 except IOError as e:
278 print '[ERROR] Cannot open output file for writing: ', outfile_fn
279 sys.exit(0)
280 print "[RUNNING] Exporting %s" % sample
281
282 cf.export_sample(h5file_in,sample,probes,outfile_f)
283 outfile_f.close()
284
285 elif len(args.sample) == 1:
286 out_path = os.path.abspath(args.output)
287 sample = args.sample[0]
288 print "[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path)
289 try:
290 if os.path.isdir(out_path):
291 outfile_fn = out_path + "/" + sample + ".bed"
292 else:
293 outfile_fn = out_path
294 outfile_f = open(outfile_fn,'w')
295 except IOError as e:
296 print '[ERROR] Cannot open output file for writing: ', outfile_fn
297 sys.exit(0)
298 print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn)
299
300 cf.export_sample(h5file_in,sample,probes,outfile_f)
301 outfile_f.close()
302
303 else:
304 out_path = os.path.abspath(args.output)
305 print "[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path)
306 for sample in args.sample:
307 try:
308 if os.path.isdir(out_path):
309 outfile_fn = out_path + "/" + sample + ".bed"
310 else:
311 outfile_fn = out_path
312 outfile_f = open(outfile_fn,'w')
313 except IOError as e:
314 print '[ERROR] Cannot open output file for writing: ', outfile_fn
315 sys.exit(0)
316 print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn)
317
318 cf.export_sample(h5file_in,sample,probes,outfile_f)
319 outfile_f.close()
320 sys.exit(0)
321
322 def CF_call(args):
323 try:
324 h5file_in_fn = str(args.input)
325 # h5file_in = openFile(h5file_in_fn, mode='r')
326 h5file_in = open_file(h5file_in_fn, mode='r')
327 except IOError as e:
328 print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn
329 sys.exit(0)
330
331 try:
332 callfile_fn = str(args.output)
333 callfile_f = open(callfile_fn, mode='w')
334 except IOError as e:
335 print '[ERROR] Cannot open output file for writing: ', callfile_fn
336 sys.exit(0)
337
338 chrs_to_process = []
339 for chr in h5file_in.root:
340 if chr._v_title not in ('probes','samples'):
341 chrs_to_process.append(chr._v_title.replace("chr",""))
342
343 h5file_in.close()
344
345 print '[INIT] Initializing caller at threshold = %f' % (args.threshold)
346
347 r = cf.rpkm_reader(h5file_in_fn)
348
349 all_calls = []
350
351 for chr in chrs_to_process:
352 print '[RUNNING] Now processing chr%s' % chr
353 data = r.getExonValuesByRegion(chr)
354
355 #raw_data = copy.copy(data)
356 _ = data.smooth()
357
358 mean= np.mean(data.rpkm,axis=1)
359 sd = np.std(data.rpkm,axis=1)
360
361 for sample in r.getSampleList():
362 sample_data = data.getSample([sample]).flatten()
363 #sample_raw_data = raw_data.getSample([sample]).flatten()
364
365 dup_mask = sample_data >= args.threshold
366 del_mask = sample_data <= -1*args.threshold
367
368 dup_bkpoints = cf.getbkpoints(dup_mask) #returns exon coordinates for this chromosome (numpy array coords)
369 del_bkpoints = cf.getbkpoints(del_mask)
370
371
372 dups = []
373 for start,stop in dup_bkpoints:
374 try: new_start = np.max(np.where(sample_data[:start] < (mean[:start] + 3*sd[:start])))
375 except ValueError: new_start = 0
376 try: new_stop = stop + np.min(np.where(sample_data[stop:] < (mean[stop:] + 3*sd[stop:])))
377 except ValueError: new_stop = data.shape[1]-1
378 dups.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "dup"})
379
380 dels = []
381 for start,stop in del_bkpoints:
382 try: new_start = np.max(np.where(sample_data[:start] > (-1*mean[:start] - 3*sd[:start])))
383 except ValueError: new_start = 0
384 try: new_stop = stop + np.min(np.where(sample_data[stop:] > (-1*mean[stop:] - 3*sd[stop:])))
385 except ValueError: new_stop = data.shape[1]-1
386 dels.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "del"})
387
388 dels = cf.mergeCalls(dels) #merges overlapping calls
389 dups = cf.mergeCalls(dups)
390
391 #print sampleID, len(dels), len(dups)
392
393 all_calls.extend(list(dels))
394 all_calls.extend(list(dups))
395
396 # print calls to file
397 header = ['sampleID','chromosome','start','stop','state']
398
399 callfile_f.write('\t'.join(header) + "\n")
400 for call in all_calls:
401 print "%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])
402 callfile_f.write("%s\t%s\t%d\t%d\t%s\n" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]))
403
404 sys.exit(0)
405
406 def CF_plot(args):
407 try:
408 import locale
409 import matplotlib
410 matplotlib.use('Agg')
411 import matplotlib.pyplot as plt
412 import pylab as P
413 from matplotlib.lines import Line2D
414 from matplotlib.patches import Rectangle
415 _ = locale.setlocale(locale.LC_ALL, '')
416 except:
417 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
418 sys.exit(0)
419
420
421 chr, start, stop = cf.parseLocString(args.region)
422
423 r = cf.rpkm_reader(args.input)
424
425 data = r.getExonValuesByRegion(chr,start,stop)
426 _ = data.smooth()
427
428 plt.gcf().clear()
429 fig = plt.figure(figsize=(10,5))
430 ax = fig.add_subplot(111)
431
432
433 ax.plot(data.rpkm, linewidth = 0.3, c='k')
434
435
436 if args.sample != 'none':
437 cnt = 1
438 coloriter = iter(['r','b','g','y'])
439 for sample in args.sample:
440 try:
441 color, sampleID = sample.split(":")
442 except:
443 color =coloriter.next()
444 sampleID = sample
445
446 ax.plot(data.getSample([sampleID]), linewidth = 1, c=color, label = sampleID)
447
448 if cnt == 1:
449 cf.plotRawData(ax, r.getExonValuesByRegion(chr,start,stop,sampleList=[sampleID]).getSample([sampleID]),color=color)
450 cnt +=1
451 plt.legend(prop={'size':10},frameon=False)
452
453 cf.plotGenes(ax, data)
454 cf.plotGenomicCoords(plt,data)
455 plt.xlim(0,data.shape[1])
456 plt.ylim(-3,3)
457
458 plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True)))
459 plt.xlabel("Position")
460 plt.ylabel("SVD-ZRPKM Values")
461
462 plt.savefig(args.output)
463
464 sys.exit(0)
465
466 def CF_plotcalls(args):
467 try:
468 import matplotlib
469 matplotlib.use('Agg')
470 import matplotlib.pyplot as plt
471 import pylab as P
472 from matplotlib.lines import Line2D
473 from matplotlib.patches import Rectangle
474 except:
475 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
476 sys.exit(0)
477
478 import locale
479 try:
480 _ = locale.setlocale(locale.LC_ALL, 'en_US')
481 except:
482 _ = locale.setlocale(locale.LC_ALL, '')
483
484 try:
485 callfile_fn = str(args.calls)
486 callfile_f = open(callfile_fn, mode='r')
487 except IOError as e:
488 print '[ERROR] Cannot open call file for reading: ', callfile_fn
489 sys.exit(0)
490
491 all_calls = []
492 header = callfile_f.readline()
493
494 for line in callfile_f:
495 sampleID, chr, start, stop, state = line.strip().split()
496 chr = cf.chrStr2Int(chr)
497 all_calls.append({"chromosome":int(chr), "start":int(start), "stop":int(stop), "sampleID":sampleID})
498
499 r = cf.rpkm_reader(args.input)
500
501 for call in all_calls:
502 chr = call["chromosome"]
503 start = call["start"]
504 stop = call["stop"]
505 sampleID = call["sampleID"]
506
507 exons = r.getExonIDs(chr,int(start),int(stop))
508
509
510 window_start = max(exons[0]-args.window,0)
511 window_stop = exons[-1]+args.window
512
513 data = r.getExonValuesByExons(chr,window_start, window_stop)
514 _ = data.smooth()
515
516 plt.gcf().clear()
517 fig = plt.figure(figsize=(10,5))
518 ax = fig.add_subplot(111)
519
520
521 ax.plot(data.rpkm, linewidth = 0.3, c='k')
522
523
524 ax.plot(data.getSample([sampleID]), linewidth = 1, c='r', label = sampleID)
525 cf.plotRawData(ax, r.getExonValuesByExons(chr,window_start, window_stop,sampleList=[sampleID]).getSample([sampleID]),color='r')
526
527 plt.legend(prop={'size':10},frameon=False)
528
529 cf.plotGenes(ax, data)
530 cf.plotGenomicCoords(plt,data)
531
532 exon_start = np.where(data.exons["start"] == start)[0][0]
533 exon_stop = np.where(data.exons["stop"] == stop)[0][0]
534 _ = ax.add_line(matplotlib.lines.Line2D([exon_start,exon_stop],[2,2],color='k',lw=6,linestyle='-',alpha=1,solid_capstyle='butt'))
535
536 _ = plt.xlim(0,data.shape[1])
537 _ = plt.ylim(-3,3)
538
539 plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True)))
540 plt.xlabel("Position")
541 plt.ylabel("SVD-ZRPKM Values")
542 outfile = "%s_%d_%d_%s.png" % (cf.chrInt2Str(chr), start, stop, sampleID)
543 plt.savefig(args.outputdir + "/" + outfile)
544
545 def CF_bam2RPKM(args):
546 try:
547 import pysam
548 except:
549 print '[ERROR] Cannot load pysam module! Make sure it is insalled'
550 sys.exit(0)
551 try:
552 # read probes table
553 probe_fn = str(args.probes[0])
554 probes = cf.loadProbeList(probe_fn)
555 num_probes = len(probes)
556 print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)
557 except IOError as e:
558 print '[ERROR] Cannot read probes file: ', probe_fn
559 sys.exit(0)
560
561 try:
562 rpkm_f = open(args.output[0],'w')
563 except IOError as e:
564 print '[ERROR] Cannot open rpkm file for writing: ', args.output
565 sys.exit(0)
566
567 print "[RUNNING] Counting total number of reads in bam file..."
568 total_reads = float(pysam.view("-c", args.input[0])[0].strip("\n"))
569 print "[RUNNING] Found %d reads" % total_reads
570
571 f = pysam.Samfile(args.input[0], "rb" )
572
573 # if not f._hasIndex():
574 if not f.has_index():
575 print "[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % args.input[0]
576 sys.exit(0)
577
578 # will be storing values in these arrays
579 readcount = np.zeros(num_probes)
580 exon_bp = np.zeros(num_probes)
581 probeIDs = np.zeros(num_probes)
582 counter = 0
583
584 # detect contig naming scheme here # TODO, add an optional "contigs.txt" file or automatically handle contig naming
585 bam_contigs = f.references
586 probes_contigs = [str(p) for p in set(map(operator.itemgetter("chr"),probes))]
587
588 probes2contigmap = {}
589
590 for probes_contig in probes_contigs:
591 if probes_contig in bam_contigs:
592 probes2contigmap[probes_contig] = probes_contig
593 elif cf.chrInt2Str(probes_contig) in bam_contigs:
594 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig)
595 elif cf.chrInt2Str(probes_contig).replace("chr","") in bam_contigs:
596 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig).replace("chr","")
597 else:
598 print "[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % (probes_contig, probe_fn)
599 sys.exit(0)
600
601 print "[RUNNING] Calculating RPKM values..."
602
603 # loop through each probe
604 for p in probes:
605
606 # f.fetch is a pysam method and returns an iterator for reads overlapping interval
607
608 p_chr = probes2contigmap[str(p["chr"])]
609
610 p_start = p["start"]
611 p_stop = p["stop"]
612 try:
613 iter = f.fetch(p_chr,p_start,p_stop)
614 except:
615 print "[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % (p_chr,p_start,p_stop)
616 sys.exit(0)
617
618 for i in iter:
619 if i.pos+1 >= p_start: #this checks to make sure a read actually starts in an interval
620 readcount[counter] += 1
621
622 exon_bp[counter] = p_stop-p_start
623 probeIDs[counter] = counter +1 #probeIDs are 1-based
624 counter +=1
625
626 #calcualte RPKM values for all probes
627 rpkm = (10**9*(readcount)/(exon_bp))/(total_reads)
628
629 out = np.vstack([probeIDs,readcount,rpkm])
630
631 np.savetxt(rpkm_f,out.transpose(),delimiter='\t',fmt=['%d','%d','%f'])
632
633 rpkm_f.close()
634
635
636
637 VERSION = "0.2.2"
638 parser = argparse.ArgumentParser(prog="CoNIFER", description="This is CoNIFER %s (Copy Number Inference From Exome Reads), designed to detect and genotype CNVs and CNPs from exome sequence read-depth data. See Krumm et al., Genome Research (2012) doi:10.1101/gr.138115.112 \nNiklas Krumm, 2012\n nkrumm@uw.edu" % VERSION)
639 parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION)
640 subparsers = parser.add_subparsers(help='Command to be run.')
641
642 # rpkm command
643 rpkm_parser= subparsers.add_parser('rpkm', help='Create an RPKM file from a BAM file')
644 rpkm_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,help="Probe definition file")
645 rpkm_parser.add_argument('--input',action='store', required=True, metavar='sample.bam',nargs=1, help="Aligned BAM file")
646 rpkm_parser.add_argument('--output',action='store', required=True, metavar='sample.rpkm.txt',nargs=1, help="RPKM file to write")
647 rpkm_parser.set_defaults(func=CF_bam2RPKM)
648
649 # analyze command
650 analyze_parser= subparsers.add_parser('analyze', help='Basic CoNIFER analysis. Reads a directory of RPKM files and a probe list and outputs a HDF5 file containing SVD-ZRPKM values.')
651 analyze_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,help="Probe definition file")
652 analyze_parser.add_argument('--rpkm_dir',action='store', required=True, metavar='/path/to/folder/containing/rpkm_files/',nargs=1, help="Location of folder containing RPKM files. Folder should contain ONLY contain RPKM files, and all readable RPKM files will used in analysis.")
653 analyze_parser.add_argument('--output','-o', required=True, metavar='/path/to/output_file.hdf5', type=str, help="Output location of HDF5 file to contain SVD-ZRPKM values")
654 analyze_parser.add_argument('--svd', metavar='12', type=int, nargs='?', default = 12,help="Number of components to remove")
655 analyze_parser.add_argument('--min_rpkm', metavar='1.00', type=float, nargs="?", default = 1.00,help="Minimum population median RPKM per probe.")
656 analyze_parser.add_argument('--write_svals', metavar='SingularValues.txt', type=str, default= "", help="Optional file to write singular values (S-values). Used for Scree Plot.")
657 analyze_parser.add_argument('--plot_scree', metavar='ScreePlot.png', type=str, default= "", help="Optional graphical scree plot. Requires matplotlib.")
658 analyze_parser.add_argument('--write_sd', metavar='StandardDeviations.txt', type=str, default= "", help="Optional file with sample SVD-ZRPKM standard deviations. Used to filter noisy samples.")
659 analyze_parser.set_defaults(func=CF_analyze)
660
661 # export command
662 export_parser= subparsers.add_parser('export', help='Export SVD-ZRPKM values from a HDF5 file to bed or vcf format.')
663 export_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
664 export_parser.add_argument('--output','-o',action='store', required=False, default='.', metavar='output.bed',help="Location of output file[s]. When exporting multiple samples, top-level directory of this path will be used.")
665 export_parser.add_argument('--sample','-s',action='store', required=False, metavar='sampleID', default='all', nargs="+",help="SampleID or comma-separated list of sampleIDs to export. Default: export all samples")
666 export_parser.set_defaults(func=CF_export)
667
668 # plot command
669 plot_parser= subparsers.add_parser('plot', help='Plot SVD-ZRPKM values using matplotlib')
670 plot_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
671 plot_parser.add_argument('--region',action='store', required=True, metavar='chr#:start-stop',help="Region to plot")
672 plot_parser.add_argument('--output',action='store', required=True, metavar='image.png',help="Output path and filetype. PDF, PNG, PS, EPS, and SVG are supported.")
673 plot_parser.add_argument('--sample',action='store', required=False, metavar='sampleID',nargs="+",default='none',help="Sample[s] to highlight. The following optional color spec can be used: <color>:<sampleID>. Available colors are r,b,g,y,c,m,y,k. The unsmoothed SVD-ZRPKM values for the first sample in this list will be drawn. Default: No samples highlighted.")
674 plot_parser.set_defaults(func=CF_plot)
675
676 # make calls command
677 call_parser= subparsers.add_parser('call', help='Very rudimentary caller for CNVs using SVD-ZRPKM thresholding.')
678 call_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
679 call_parser.add_argument('--output',action='store', required=True, metavar='calls.txt',help="Output file for calls")
680 call_parser.add_argument('--threshold', metavar='1.50', type=float, nargs='?', required=False, default = 1.50,help="+/- Threshold for calling (minimum SVD-ZRPKM)")
681 call_parser.set_defaults(func=CF_call)
682
683 # plotcalls command
684 plotcalls_parser= subparsers.add_parser('plotcalls', help='Make basic plots from call file from "call" command.')
685 plotcalls_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
686 plotcalls_parser.add_argument('--calls',action='store', required=True, metavar='calls.txt',help="File with calls from 'call' command.")
687 plotcalls_parser.add_argument('--outputdir',action='store', required=True, metavar='/path/to/directory',help="Output directory for plots")
688 plotcalls_parser.add_argument('--window',action='store', required=False, metavar='50',default=50,help="In exons, the amount of padding to plot around each call")
689 plotcalls_parser.set_defaults(func=CF_plotcalls)
690
691 args = parser.parse_args()
692 args.func(args)