comparison conifer.py @ 25:abdc05fae16e draft

Uploaded
author bzonnedda
date Thu, 06 Jul 2017 06:28:33 -0400
parents 9196f299dc5e
children
comparison
equal deleted inserted replaced
24:9c5a978ce25d 25:abdc05fae16e
28 import os, sys, copy 28 import os, sys, copy
29 import glob 29 import glob
30 import csv 30 import csv
31 import conifer_functions as cf 31 import conifer_functions as cf
32 import operator 32 import operator
33 #from tables import * 33 from tables import *
34 import pysam
34 import numpy as np 35 import numpy as np
35 36
37
36 def CF_analyze(args): 38 def CF_analyze(args):
37 # do path/file checks: 39 # do path/file checks:
38 try: 40 try:
39 # read probes table 41 # read probes table
40 probe_fn = str(args.probes[0]) 42 probe_fn = str(args.probes[0])
41 probes = cf.loadProbeList(probe_fn) 43 probes = cf.loadProbeList(probe_fn)
42 num_probes = len(probes) 44 num_probes = len(probes)
43 print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn) 45 print('[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn))
44 except IOError as e: 46 except IOError as e:
45 print '[ERROR] Cannot read probes file: ', probe_fn 47 print('[ERROR] Cannot read probes file: ', probe_fn)
46 sys.exit(0) 48 sys.exit(0)
47 49
48 try: 50 try:
49 svd_outfile_fn = str(args.output) 51 svd_outfile_fn = str(args.output)
50 h5file_out = openFile(svd_outfile_fn, mode='w') 52 h5file_out = OpenFile(svd_outfile_fn, mode='w')
51 probe_group = h5file_out.createGroup("/","probes","probes") 53 probe_group = h5file_out.createGroup("/", "probes", "probes")
52 except IOError as e: 54 except IOError as e:
53 print '[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn 55 print('[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn)
54 sys.exit(0) 56 sys.exit(0)
55 57
56 if args.write_svals != "": 58 if args.write_svals != "":
57 sval_f = open(args.write_svals,'w') 59 sval_f = open(args.write_svals, 'w')
58 60
59 if args.plot_scree != "": 61 if args.plot_scree != "":
60 try: 62 try:
61 import matplotlib 63 import matplotlib
62 matplotlib.use('Agg') 64 matplotlib.use('Agg')
63 import matplotlib.pyplot as plt 65 import matplotlib.pyplot as plt
64 import pylab as P 66 import pylab as P
65 from matplotlib.lines import Line2D 67 from matplotlib.lines import Line2D
66 from matplotlib.patches import Rectangle 68 from matplotlib.patches import Rectangle
67 except: 69 except:
68 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?" 70 print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?")
69 sys.exit(0) 71 sys.exit(0)
70 72
71 plt.gcf().clear() 73 plt.gcf().clear()
72 fig = plt.figure(figsize=(10,5)) 74 fig = plt.figure(figsize=(10, 5))
73 ax = fig.add_subplot(111) 75 ax = fig.add_subplot(111)
74 76
75 rpkm_dir = str(args.rpkm_dir[0]) 77 rpkm_dir = str(args.rpkm_dir[0])
76 rpkm_files = glob.glob(rpkm_dir + "/*") 78 rpkm_files = glob.glob(rpkm_dir + "/*")
77 if len(rpkm_files) == 0: 79 if len(rpkm_files) == 0:
78 print '[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir 80 print('[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir)
79 sys.exit(0) 81 sys.exit(0)
80 elif len(rpkm_files) == 1: 82 elif len(rpkm_files) == 1:
81 print '[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.' 83 print('[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.')
82 sys.exit(0) 84 sys.exit(0)
83 elif len(rpkm_files) < 8: 85 elif len(rpkm_files) < 8:
84 print '[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len(rpkm_files) 86 print('[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len(
85 elif len(rpkm_files) <= int(args.svd): 87 rpkm_files))
86 print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (len(rpkm_files), int(args.svd)) 88 elif len(rpkm_files) <= int(args.svd):
87 sys.exit(0) 89 print('[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (
88 else: 90 len(rpkm_files), int(args.svd)))
89 print '[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir) 91 sys.exit(0)
90 92 else:
91 # read in samples names and generate file list 93 print('[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir))
92 samples = {} 94
93 for f in rpkm_files: 95 # read in samples names and generate file list
94 s = '.'.join(f.split('/')[-1].split('.')[0:-1]) 96 samples = {}
95 print "[INIT] Mapping file to sampleID: %s --> %s" % (f, s) 97 for f in rpkm_files:
96 samples[s] = f 98 s = '.'.join(f.split('/')[-1].split('.')[0:-1])
97 99 print("[INIT] Mapping file to sampleID: %s --> %s" % (f, s))
98 #check uniqueness and total # of samples 100 samples[s] = f
99 if len(set(samples)) != len(set(rpkm_files)): 101
100 print '[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!' 102 # check uniqueness and total # of samples
101 sys.exit(0) 103 if len(set(samples)) != len(set(rpkm_files)):
102 104 print('[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!')
103 # LOAD RPKM DATA 105 sys.exit(0)
104 RPKM_data = np.zeros([num_probes,len(samples)], dtype=np.float) 106
105 failed_samples = 0 107 # LOAD RPKM DATA
106 108 RPKM_data = np.zeros([num_probes, len(samples)], dtype=np.float)
107 for i,s in enumerate(samples.keys()): 109 failed_samples = 0
108 t = np.loadtxt(samples[s], dtype=np.float, delimiter="\t", skiprows=0, usecols=[2]) 110
109 if len(t) != num_probes: 111 for i, s in enumerate(samples.keys()):
110 print "[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % (s, samples[s], probe_fn) 112 t = np.loadtxt(samples[s], dtype=np.float, delimiter="\t", skiprows=0, usecols=[2])
111 _ = samples.pop(s) 113 if len(t) != num_probes:
112 failed_samples += 1 114 print("[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % (
113 else: 115 s, samples[s], probe_fn))
114 RPKM_data[:,i] = t 116 _ = samples.pop(s)
115 print "[INIT] Successfully read RPKM data for sampleID: %s" % s 117 failed_samples += 1
116 118 else:
117 RPKM_data = RPKM_data[:,0:len(samples)] 119 RPKM_data[:, i] = t
118 print "[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % (len(samples), failed_samples) 120 print("[INIT] Successfully read RPKM data for sampleID: %s" % s)
119 121
120 if len(samples) <= int(args.svd): 122 RPKM_data = RPKM_data[:, 0:len(samples)]
121 print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (int(args.svd), len(samples)) 123 print("[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % (
122 sys.exit(0) 124 len(samples), failed_samples))
123 125
124 # BEGIN 126 if len(samples) <= int(args.svd):
125 chrs_to_process = set(map(operator.itemgetter("chr"),probes)) 127 print('[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (
126 chrs_to_process_str = ', '.join([cf.chrInt2Str(c) for c in chrs_to_process]) 128 int(args.svd), len(samples)))
127 print '[INIT] Attempting to process chromosomes: ', chrs_to_process_str 129 sys.exit(0)
128 130
129 131 # BEGIN
130 132 chrs_to_process = set(map(operator.itemgetter("chr"), probes))
131 for chr in chrs_to_process: 133 chrs_to_process_str = ', '.join([cf.chrInt2Str(c) for c in chrs_to_process])
132 print "[RUNNING: chr%d] Now on: %s" %(chr, cf.chrInt2Str(chr)) 134 print('[INIT] Attempting to process chromosomes: ', chrs_to_process_str)
133 chr_group_name = "chr%d" % chr 135
134 chr_group = h5file_out.createGroup("/",chr_group_name,chr_group_name) 136 for chr in chrs_to_process:
135 137 print("[RUNNING: chr%d] Now on: %s" % (chr, cf.chrInt2Str(chr)))
136 chr_probes = filter(lambda i: i["chr"] == chr, probes) 138 chr_group_name = "chr%d" % chr
137 num_chr_probes = len(chr_probes) 139 chr_group = h5file_out.createGroup("/", chr_group_name, chr_group_name)
138 start_probeID = chr_probes[0]['probeID'] 140
139 stop_probeID = chr_probes[-1]['probeID'] 141 chr_probes = [i for i in probes if i["chr"] == chr]
140 print "[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID-1, stop_probeID) # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term 142 num_chr_probes = len(chr_probes)
141 143 start_probeID = chr_probes[0]['probeID']
142 rpkm = RPKM_data[start_probeID:stop_probeID,:] 144 stop_probeID = chr_probes[-1]['probeID']
143 145 print("[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID - 1,
144 print "[RUNNING: chr%d] Calculating median RPKM" % chr 146 stop_probeID)) # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term
145 median = np.median(rpkm,1) 147
146 sd = np.std(rpkm,1) 148 rpkm = RPKM_data[start_probeID:stop_probeID, :]
147 probe_mask = median >= float(args.min_rpkm) 149
148 print "[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % (chr, np.sum(probe_mask==False), float(args.min_rpkm)) 150 print("[RUNNING: chr%d] Calculating median RPKM" % chr)
149 151 median = np.median(rpkm, 1)
150 rpkm = rpkm[probe_mask, :] 152 sd = np.std(rpkm, 1)
151 num_chr_probes = np.sum(probe_mask) 153 probe_mask = median >= float(args.min_rpkm)
152 154 print("[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % (
153 if num_chr_probes <= len(samples): 155 chr, np.sum(probe_mask == False), float(args.min_rpkm)))
154 print "[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file" 156
155 sys.exit(0) 157 rpkm = rpkm[probe_mask, :]
156 158 num_chr_probes = np.sum(probe_mask)
157 probeIDs = np.array(map(operator.itemgetter("probeID"),chr_probes))[probe_mask] 159
158 probe_starts = np.array(map(operator.itemgetter("start"),chr_probes))[probe_mask] 160 if num_chr_probes <= len(samples):
159 probe_stops = np.array(map(operator.itemgetter("stop"),chr_probes))[probe_mask] 161 print("[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file")
160 gene_names = np.array(map(operator.itemgetter("name"),chr_probes))[probe_mask] 162 sys.exit(0)
161 163
162 dt = np.dtype([('probeID',np.uint32),('start',np.uint32),('stop',np.uint32), ('name', np.str_, 20)]) 164 probeIDs = np.array(list(map(operator.itemgetter("probeID"), chr_probes)))[probe_mask]
163 165 probe_starts = np.array(list(map(operator.itemgetter("start"), chr_probes)))[probe_mask]
164 out_probes = np.empty(num_chr_probes,dtype=dt) 166 probe_stops = np.array(list(map(operator.itemgetter("stop"), chr_probes)))[probe_mask]
165 out_probes['probeID'] = probeIDs 167 gene_names = np.array(list(map(operator.itemgetter("name"), chr_probes)))[probe_mask]
166 out_probes['start'] = probe_starts 168
167 out_probes['stop'] = probe_stops 169 dt = np.dtype([('probeID', np.uint32), ('start', np.uint32), ('stop', np.uint32), ('name', np.str_, 20)])
168 out_probes['name'] = gene_names 170
169 probe_table = h5file_out.createTable(probe_group,"probes_chr%d" % chr,cf.probe,"chr%d" % chr) 171 out_probes = np.empty(num_chr_probes, dtype=dt)
170 probe_table.append(out_probes) 172 out_probes['probeID'] = probeIDs
171 173 out_probes['start'] = probe_starts
172 print "[RUNNING: chr%d] Calculating ZRPKM scores..." % chr 174 out_probes['stop'] = probe_stops
173 rpkm = np.apply_along_axis(cf.zrpkm, 0, rpkm, median[probe_mask], sd[probe_mask]) 175 out_probes['name'] = gene_names
174 176 probe_table = h5file_out.createTable(probe_group, "probes_chr%d" % chr, cf.probe, "chr%d" % chr)
175 # svd transform 177 probe_table.append(out_probes)
176 print "[RUNNING: chr%d] SVD decomposition..." % chr 178
177 components_removed = int(args.svd) 179 print("[RUNNING: chr%d] Calculating ZRPKM scores..." % chr)
178 180 rpkm = np.apply_along_axis(cf.zrpkm, 0, rpkm, median[probe_mask], sd[probe_mask])
179 U, S, Vt = np.linalg.svd(rpkm,full_matrices=False) 181
180 new_S = np.diag(np.hstack([np.zeros([components_removed]),S[components_removed:]])) 182 # svd transform
181 183 print("[RUNNING: chr%d] SVD decomposition..." % chr)
182 if args.write_svals != "": 184 components_removed = int(args.svd)
183 sval_f.write('chr' + str(chr) + '\t' + '\t'.join([str(_i) for _i in S]) + "\n") 185
184 186 U, S, Vt = np.linalg.svd(rpkm, full_matrices=False)
185 if args.plot_scree != "": 187 new_S = np.diag(np.hstack([np.zeros([components_removed]), S[components_removed:]]))
186 ax.plot(S, label='chr' + str(chr),lw=0.5) 188
187 189 if args.write_svals != "":
188 # reconstruct data matrix 190 sval_f.write('chr' + str(chr) + '\t' + '\t'.join([str(_i) for _i in S]) + "\n")
189 rpkm = np.dot(U, np.dot(new_S, Vt)) 191
190 192 if args.plot_scree != "":
191 193 ax.plot(S, label='chr' + str(chr), lw=0.5)
192 # save to HDF5 file 194
193 print "[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr 195 # reconstruct data matrix
194 196 rpkm = np.dot(U, np.dot(new_S, Vt))
195 for i,s in enumerate(samples): 197
196 out_data = np.empty(num_chr_probes,dtype='u4,f8') 198 # save to HDF5 file
197 out_data['f0'] = probeIDs 199 print("[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr)
198 out_data['f1'] = rpkm[:,i] 200
199 sample_tbl = h5file_out.createTable(chr_group,"sample_" + str(s),cf.rpkm_value,"%s" % str(s)) 201 for i, s in enumerate(samples):
200 sample_tbl.append(out_data) 202 out_data = np.empty(num_chr_probes, dtype='u4,f8')
201 203 out_data['f0'] = probeIDs
202 204 out_data['f1'] = rpkm[:, i]
203 print "[RUNNING] Saving sampleIDs to file..." 205 sample_tbl = h5file_out.createTable(chr_group, "sample_" + str(s), cf.rpkm_value, "%s" % str(s))
204 sample_group = h5file_out.createGroup("/","samples","samples") 206 sample_tbl.append(out_data)
205 sample_table = h5file_out.createTable(sample_group,"samples",cf.sample,"samples") 207
206 dt = np.dtype([('sampleID',np.str_,100)]) 208 print("[RUNNING] Saving sampleIDs to file...")
207 out_samples = np.empty(len(samples.keys()),dtype=dt) 209 sample_group = h5file_out.createGroup("/", "samples", "samples")
208 out_samples['sampleID'] = np.array(samples.keys()) 210 sample_table = h5file_out.createTable(sample_group, "samples", cf.sample, "samples")
209 sample_table.append(out_samples) 211 dt = np.dtype([('sampleID', np.str_, 100)])
210 212 out_samples = np.empty(len(list(samples.keys())), dtype=dt)
211 213 out_samples['sampleID'] = np.array(list(samples.keys()))
212 if args.write_sd != "": 214 sample_table.append(out_samples)
213 print "[RUNNING] Calculating standard deviations for all samples (this can take a while)..." 215
214 216 if args.write_sd != "":
215 sd_file = open(args.write_sd,'w') 217 print("[RUNNING] Calculating standard deviations for all samples (this can take a while)...")
216 218
217 for i,s in enumerate(samples): 219 sd_file = open(args.write_sd, 'w')
218 # collect all SVD-ZRPKM values 220
219 count = 1 221 for i, s in enumerate(samples):
220 for chr in chrs_to_process: 222 # collect all SVD-ZRPKM values
221 if count == 1: 223 count = 1
222 sd_out = h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten() 224 for chr in chrs_to_process:
223 else: 225 if count == 1:
224 sd_out = np.hstack([sd_out,out.h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten()]) 226 sd_out = h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(
225 227 field="rpkm").flatten()
226 sd = np.std(sd_out) 228 else:
227 sd_file.write("%s\t%f\n" % (s,sd)) 229 sd_out = np.hstack([sd_out, out.h5file_out.root._f_getChild("chr%d" % chr)._f_getChild(
228 230 "sample_%s" % s).read(field="rpkm").flatten()])
229 sd_file.close() 231
230 232 sd = np.std(sd_out)
231 if args.plot_scree != "": 233 sd_file.write("%s\t%f\n" % (s, sd))
232 plt.title("Scree plot") 234
233 if len(samples) < 50: 235 sd_file.close()
234 plt.xlim([0,len(samples)]) 236
235 plt.xlabel("S values") 237 if args.plot_scree != "":
236 else: 238 plt.title("Scree plot")
237 plt.xlim([0,50]) 239 if len(samples) < 50:
238 plt.xlabel("S values (only first 50 plotted)") 240 plt.xlim([0, len(samples)])
239 plt.ylabel("Relative contributed variance") 241 plt.xlabel("S values")
240 plt.savefig(args.plot_scree) 242 else:
241 243 plt.xlim([0, 50])
242 print "[FINISHED]" 244 plt.xlabel("S values (only first 50 plotted)")
243 h5file_out.close() 245 plt.ylabel("Relative contributed variance")
244 sys.exit(0) 246 plt.savefig(args.plot_scree)
247
248 print("[FINISHED]")
249 h5file_out.close()
250 sys.exit(0)
251
245 252
246 def CF_export(args): 253 def CF_export(args):
247 try: 254 try:
248 h5file_in_fn = str(args.input) 255 h5file_in_fn = str(args.input)
249 h5file_in = openFile(h5file_in_fn, mode='r') 256 h5file_in = OpenFile(h5file_in_fn, mode='r')
250 except IOError as e: 257 except IOError as e:
251 print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn 258 print('[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn)
252 sys.exit(0) 259 sys.exit(0)
253 260
254 # read probes 261 # read probes
255 probes = {} 262 probes = {}
256 for probes_chr in h5file_in.root.probes: 263 for probes_chr in h5file_in.root.probes:
257 probes[probes_chr.title] = probes_chr.read() 264 probes[probes_chr.title] = probes_chr.read()
258 265
259 if args.sample =='all': 266 if args.sample == 'all':
260 all_samples = list(h5file_in.root.samples.samples.read(field="sampleID")) 267 all_samples = list(h5file_in.root.samples.samples.read(field="sampleID"))
261 268
262 out_path = os.path.abspath(args.output) 269 out_path = os.path.abspath(args.output)
263 270
264 print "[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path) 271 print("[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path))
265 for sample in all_samples: 272 for sample in all_samples:
266 try: 273 try:
267 outfile_fn = out_path + "/" + sample + ".bed" 274 outfile_fn = out_path + "/" + sample + ".bed"
268 outfile_f = open(outfile_fn,'w') 275 outfile_f = open(outfile_fn, 'w')
269 except IOError as e: 276 except IOError as e:
270 print '[ERROR] Cannot open output file for writing: ', outfile_fn 277 print('[ERROR] Cannot open output file for writing: ', outfile_fn)
271 sys.exit(0) 278 sys.exit(0)
272 print "[RUNNING] Exporting %s" % sample 279 print("[RUNNING] Exporting %s" % sample)
273 280
274 cf.export_sample(h5file_in,sample,probes,outfile_f) 281 cf.export_sample(h5file_in, sample, probes, outfile_f)
275 outfile_f.close() 282 outfile_f.close()
276 283
277 elif len(args.sample) == 1: 284 elif len(args.sample) == 1:
278 out_path = os.path.abspath(args.output) 285 out_path = os.path.abspath(args.output)
279 sample = args.sample[0] 286 sample = args.sample[0]
280 print "[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path) 287 print("[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path))
281 try: 288 try:
282 if os.path.isdir(out_path): 289 if os.path.isdir(out_path):
283 outfile_fn = out_path + "/" + sample + ".bed" 290 outfile_fn = out_path + "/" + sample + ".bed"
284 else: 291 else:
285 outfile_fn = out_path 292 outfile_fn = out_path
286 outfile_f = open(outfile_fn,'w') 293 outfile_f = open(outfile_fn, 'w')
287 except IOError as e: 294 except IOError as e:
288 print '[ERROR] Cannot open output file for writing: ', outfile_fn 295 print('[ERROR] Cannot open output file for writing: ', outfile_fn)
289 sys.exit(0) 296 sys.exit(0)
290 print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn) 297 print("[RUNNING] Exporting %s to %s" % (sample, outfile_fn))
291 298
292 cf.export_sample(h5file_in,sample,probes,outfile_f) 299 cf.export_sample(h5file_in, sample, probes, outfile_f)
293 outfile_f.close() 300 outfile_f.close()
294 301
295 else: 302 else:
296 out_path = os.path.abspath(args.output) 303 out_path = os.path.abspath(args.output)
297 print "[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path) 304 print("[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path))
298 for sample in args.sample: 305 for sample in args.sample:
299 try: 306 try:
300 if os.path.isdir(out_path): 307 if os.path.isdir(out_path):
301 outfile_fn = out_path + "/" + sample + ".bed" 308 outfile_fn = out_path + "/" + sample + ".bed"
302 else: 309 else:
303 outfile_fn = out_path 310 outfile_fn = out_path
304 outfile_f = open(outfile_fn,'w') 311 outfile_f = open(outfile_fn, 'w')
305 except IOError as e: 312 except IOError as e:
306 print '[ERROR] Cannot open output file for writing: ', outfile_fn 313 print('[ERROR] Cannot open output file for writing: ', outfile_fn)
307 sys.exit(0) 314 sys.exit(0)
308 print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn) 315 print("[RUNNING] Exporting %s to %s" % (sample, outfile_fn))
309 316
310 cf.export_sample(h5file_in,sample,probes,outfile_f) 317 cf.export_sample(h5file_in, sample, probes, outfile_f)
311 outfile_f.close() 318 outfile_f.close()
312 sys.exit(0) 319 sys.exit(0)
320
313 321
314 def CF_call(args): 322 def CF_call(args):
315 try: 323 try:
316 h5file_in_fn = str(args.input) 324 h5file_in_fn = str(args.input)
317 h5file_in = openFile(h5file_in_fn, mode='r') 325 h5file_in = OpenFile(h5file_in_fn, mode='r')
318 except IOError as e: 326 except IOError as e:
319 print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn 327 print('[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn)
320 sys.exit(0) 328 sys.exit(0)
321 329
322 try: 330 try:
323 callfile_fn = str(args.output) 331 callfile_fn = str(args.output)
324 callfile_f = open(callfile_fn, mode='w') 332 callfile_f = open(callfile_fn, mode='w')
325 except IOError as e: 333 except IOError as e:
326 print '[ERROR] Cannot open output file for writing: ', callfile_fn 334 print('[ERROR] Cannot open output file for writing: ', callfile_fn)
327 sys.exit(0) 335 sys.exit(0)
328 336
329 chrs_to_process = [] 337 chrs_to_process = []
330 for chr in h5file_in.root: 338 for chr in h5file_in.root:
331 if chr._v_title not in ('probes','samples'): 339 if chr._v_title not in ('probes', 'samples'):
332 chrs_to_process.append(chr._v_title.replace("chr","")) 340 chrs_to_process.append(chr._v_title.replace("chr", ""))
333 341
334 h5file_in.close() 342 h5file_in.close()
335 343
336 print '[INIT] Initializing caller at threshold = %f' % (args.threshold) 344 print('[INIT] Initializing caller at threshold = %f' % (args.threshold))
337 345
338 r = cf.rpkm_reader(h5file_in_fn) 346 r = cf.rpkm_reader(h5file_in_fn)
339 347
340 all_calls = [] 348 all_calls = []
341 349
342 for chr in chrs_to_process: 350 for chr in chrs_to_process:
343 print '[RUNNING] Now processing chr%s' % chr 351 print('[RUNNING] Now processing chr%s' % chr)
344 data = r.getExonValuesByRegion(chr) 352 data = r.getExonValuesByRegion(chr)
345 353
346 #raw_data = copy.copy(data) 354 # raw_data = copy.copy(data)
347 _ = data.smooth() 355 _ = data.smooth()
348 356
349 mean= np.mean(data.rpkm,axis=1) 357 mean = np.mean(data.rpkm, axis=1)
350 sd = np.std(data.rpkm,axis=1) 358 sd = np.std(data.rpkm, axis=1)
351 359
352 for sample in r.getSampleList(): 360 for sample in r.getSampleList():
353 sample_data = data.getSample([sample]).flatten() 361 sample_data = data.getSample([sample]).flatten()
354 #sample_raw_data = raw_data.getSample([sample]).flatten() 362 # sample_raw_data = raw_data.getSample([sample]).flatten()
355 363
356 dup_mask = sample_data >= args.threshold 364 dup_mask = sample_data >= args.threshold
357 del_mask = sample_data <= -1*args.threshold 365 del_mask = sample_data <= -1 * args.threshold
358 366
359 dup_bkpoints = cf.getbkpoints(dup_mask) #returns exon coordinates for this chromosome (numpy array coords) 367 dup_bkpoints = cf.getbkpoints(dup_mask) # returns exon coordinates for this chromosome (numpy array coords)
360 del_bkpoints = cf.getbkpoints(del_mask) 368 del_bkpoints = cf.getbkpoints(del_mask)
361 369
362 370 dups = []
363 dups = [] 371 for start, stop in dup_bkpoints:
364 for start,stop in dup_bkpoints: 372 try:
365 try: new_start = np.max(np.where(sample_data[:start] < (mean[:start] + 3*sd[:start]))) 373 new_start = np.max(np.where(sample_data[:start] < (mean[:start] + 3 * sd[:start])))
366 except ValueError: new_start = 0 374 except ValueError:
367 try: new_stop = stop + np.min(np.where(sample_data[stop:] < (mean[stop:] + 3*sd[stop:]))) 375 new_start = 0
368 except ValueError: new_stop = data.shape[1]-1 376 try:
369 dups.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "dup"}) 377 new_stop = stop + np.min(np.where(sample_data[stop:] < (mean[stop:] + 3 * sd[stop:])))
370 378 except ValueError:
371 dels = [] 379 new_stop = data.shape[1] - 1
372 for start,stop in del_bkpoints: 380 dups.append(
373 try: new_start = np.max(np.where(sample_data[:start] > (-1*mean[:start] - 3*sd[:start]))) 381 {"sampleID": sample, "chromosome": cf.chrInt2Str(chr), "start": data.exons[new_start]["start"],
374 except ValueError: new_start = 0 382 "stop": data.exons[new_stop]["stop"], "state": "dup"})
375 try: new_stop = stop + np.min(np.where(sample_data[stop:] > (-1*mean[stop:] - 3*sd[stop:]))) 383
376 except ValueError: new_stop = data.shape[1]-1 384 dels = []
377 dels.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "del"}) 385 for start, stop in del_bkpoints:
378 386 try:
379 dels = cf.mergeCalls(dels) #merges overlapping calls 387 new_start = np.max(np.where(sample_data[:start] > (-1 * mean[:start] - 3 * sd[:start])))
380 dups = cf.mergeCalls(dups) 388 except ValueError:
381 389 new_start = 0
382 #print sampleID, len(dels), len(dups) 390 try:
383 391 new_stop = stop + np.min(np.where(sample_data[stop:] > (-1 * mean[stop:] - 3 * sd[stop:])))
384 all_calls.extend(list(dels)) 392 except ValueError:
385 all_calls.extend(list(dups)) 393 new_stop = data.shape[1] - 1
386 394 dels.append(
387 # print calls to file 395 {"sampleID": sample, "chromosome": cf.chrInt2Str(chr), "start": data.exons[new_start]["start"],
388 header = ['sampleID','chromosome','start','stop','state'] 396 "stop": data.exons[new_stop]["stop"], "state": "del"})
389 397
390 callfile_f.write('\t'.join(header) + "\n") 398 dels = cf.mergeCalls(dels) # merges overlapping calls
391 for call in all_calls: 399 dups = cf.mergeCalls(dups)
392 print "%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]) 400
393 callfile_f.write("%s\t%s\t%d\t%d\t%s\n" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])) 401 # print sampleID, len(dels), len(dups)
394 402
395 sys.exit(0) 403 all_calls.extend(list(dels))
404 all_calls.extend(list(dups))
405
406 # print calls to file
407 header = ['sampleID', 'chromosome', 'start', 'stop', 'state']
408
409 callfile_f.write('\t'.join(header) + "\n")
410 for call in all_calls:
411 print("%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]))
412 callfile_f.write(
413 "%s\t%s\t%d\t%d\t%s\n" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]))
414
415 sys.exit(0)
416
396 417
397 def CF_plot(args): 418 def CF_plot(args):
398 try: 419 try:
399 import locale 420 import locale
400 import matplotlib 421 import matplotlib
401 matplotlib.use('Agg') 422 matplotlib.use('Agg')
402 import matplotlib.pyplot as plt 423 import matplotlib.pyplot as plt
403 import pylab as P 424 import pylab as P
404 from matplotlib.lines import Line2D 425 from matplotlib.lines import Line2D
405 from matplotlib.patches import Rectangle 426 from matplotlib.patches import Rectangle
406 _ = locale.setlocale(locale.LC_ALL, '') 427 _ = locale.setlocale(locale.LC_ALL, '')
407 except: 428 except:
408 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?" 429 print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?")
409 sys.exit(0) 430 sys.exit(0)
410 431
411 432 chr, start, stop = cf.parseLocString(args.region)
412 chr, start, stop = cf.parseLocString(args.region) 433
413 434 r = cf.rpkm_reader(args.input)
414 r = cf.rpkm_reader(args.input) 435
415 436 data = r.getExonValuesByRegion(chr, start, stop)
416 data = r.getExonValuesByRegion(chr,start,stop) 437 _ = data.smooth()
417 _ = data.smooth() 438
418 439 plt.gcf().clear()
419 plt.gcf().clear() 440 fig = plt.figure(figsize=(10, 5))
420 fig = plt.figure(figsize=(10,5)) 441 ax = fig.add_subplot(111)
421 ax = fig.add_subplot(111) 442
422 443 ax.plot(data.rpkm, linewidth=0.3, c='k')
423 444
424 ax.plot(data.rpkm, linewidth = 0.3, c='k') 445 if args.sample != 'none':
425 446 cnt = 1
426 447 coloriter = iter(['r', 'b', 'g', 'y'])
427 if args.sample != 'none': 448 for sample in args.sample:
428 cnt = 1 449 try:
429 coloriter = iter(['r','b','g','y']) 450 color, sampleID = sample.split(":")
430 for sample in args.sample: 451 except:
431 try: 452 color = next(coloriter)
432 color, sampleID = sample.split(":") 453 sampleID = sample
433 except: 454
434 color =coloriter.next() 455 ax.plot(data.getSample([sampleID]), linewidth=1, c=color, label=sampleID)
435 sampleID = sample 456
436 457 if cnt == 1:
437 ax.plot(data.getSample([sampleID]), linewidth = 1, c=color, label = sampleID) 458 cf.plotRawData(ax,
438 459 r.getExonValuesByRegion(chr, start, stop, sampleList=[sampleID]).getSample([sampleID]),
439 if cnt == 1: 460 color=color)
440 cf.plotRawData(ax, r.getExonValuesByRegion(chr,start,stop,sampleList=[sampleID]).getSample([sampleID]),color=color) 461 cnt += 1
441 cnt +=1 462 plt.legend(prop={'size': 10}, frameon=False)
442 plt.legend(prop={'size':10},frameon=False) 463
443 464 cf.plotGenes(ax, data)
444 cf.plotGenes(ax, data) 465 cf.plotGenomicCoords(plt, data)
445 cf.plotGenomicCoords(plt,data) 466 plt.xlim(0, data.shape[1])
446 plt.xlim(0,data.shape[1]) 467 plt.ylim(-3, 3)
447 plt.ylim(-3,3) 468
448 469 plt.title("%s: %s - %s" % (
449 plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True))) 470 cf.chrInt2Str(chr), locale.format("%d", start, grouping=True), locale.format("%d", stop, grouping=True)))
450 plt.xlabel("Position") 471 plt.xlabel("Position")
451 plt.ylabel("SVD-ZRPKM Values") 472 plt.ylabel("SVD-ZRPKM Values")
452 473
453 plt.savefig(args.output) 474 plt.savefig(args.output)
454 475
455 sys.exit(0) 476 sys.exit(0)
477
456 478
457 def CF_plotcalls(args): 479 def CF_plotcalls(args):
458 try: 480 try:
459 import matplotlib 481 import matplotlib
460 matplotlib.use('Agg') 482 matplotlib.use('Agg')
461 import matplotlib.pyplot as plt 483 import matplotlib.pyplot as plt
462 import pylab as P 484 import pylab as P
463 from matplotlib.lines import Line2D 485 from matplotlib.lines import Line2D
464 from matplotlib.patches import Rectangle 486 from matplotlib.patches import Rectangle
465 except: 487 except:
466 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?" 488 print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?")
467 sys.exit(0) 489 sys.exit(0)
468 490
469 import locale 491 import locale
470 try: 492 try:
471 _ = locale.setlocale(locale.LC_ALL, 'en_US') 493 _ = locale.setlocale(locale.LC_ALL, 'en_US')
472 except: 494 except:
473 _ = locale.setlocale(locale.LC_ALL, '') 495 _ = locale.setlocale(locale.LC_ALL, '')
474 496
475 try: 497 try:
476 callfile_fn = str(args.calls) 498 callfile_fn = str(args.calls)
477 callfile_f = open(callfile_fn, mode='r') 499 callfile_f = open(callfile_fn, mode='r')
478 except IOError as e: 500 except IOError as e:
479 print '[ERROR] Cannot open call file for reading: ', callfile_fn 501 print('[ERROR] Cannot open call file for reading: ', callfile_fn)
480 sys.exit(0) 502 sys.exit(0)
481 503
482 all_calls = [] 504 all_calls = []
483 header = callfile_f.readline() 505 header = callfile_f.readline()
484 506
485 for line in callfile_f: 507 for line in callfile_f:
486 sampleID, chr, start, stop, state = line.strip().split() 508 sampleID, chr, start, stop, state = line.strip().split()
487 chr = cf.chrStr2Int(chr) 509 chr = cf.chrStr2Int(chr)
488 all_calls.append({"chromosome":int(chr), "start":int(start), "stop":int(stop), "sampleID":sampleID}) 510 all_calls.append({"chromosome": int(chr), "start": int(start), "stop": int(stop), "sampleID": sampleID})
489 511
490 r = cf.rpkm_reader(args.input) 512 r = cf.rpkm_reader(args.input)
491 513
492 for call in all_calls: 514 for call in all_calls:
493 chr = call["chromosome"] 515 chr = call["chromosome"]
494 start = call["start"] 516 start = call["start"]
495 stop = call["stop"] 517 stop = call["stop"]
496 sampleID = call["sampleID"] 518 sampleID = call["sampleID"]
497 519
498 exons = r.getExonIDs(chr,int(start),int(stop)) 520 exons = r.getExonIDs(chr, int(start), int(stop))
499 521
500 522 window_start = max(exons[0] - args.window, 0)
501 window_start = max(exons[0]-args.window,0) 523 window_stop = exons[-1] + args.window
502 window_stop = exons[-1]+args.window 524
503 525 data = r.getExonValuesByExons(chr, window_start, window_stop)
504 data = r.getExonValuesByExons(chr,window_start, window_stop) 526 _ = data.smooth()
505 _ = data.smooth() 527
506 528 plt.gcf().clear()
507 plt.gcf().clear() 529 fig = plt.figure(figsize=(10, 5))
508 fig = plt.figure(figsize=(10,5)) 530 ax = fig.add_subplot(111)
509 ax = fig.add_subplot(111) 531
510 532 ax.plot(data.rpkm, linewidth=0.3, c='k')
511 533
512 ax.plot(data.rpkm, linewidth = 0.3, c='k') 534 ax.plot(data.getSample([sampleID]), linewidth=1, c='r', label=sampleID)
513 535 cf.plotRawData(ax, r.getExonValuesByExons(chr, window_start, window_stop, sampleList=[sampleID]).getSample(
514 536 [sampleID]), color='r')
515 ax.plot(data.getSample([sampleID]), linewidth = 1, c='r', label = sampleID) 537
516 cf.plotRawData(ax, r.getExonValuesByExons(chr,window_start, window_stop,sampleList=[sampleID]).getSample([sampleID]),color='r') 538 plt.legend(prop={'size': 10}, frameon=False)
517 539
518 plt.legend(prop={'size':10},frameon=False) 540 cf.plotGenes(ax, data)
519 541 cf.plotGenomicCoords(plt, data)
520 cf.plotGenes(ax, data) 542
521 cf.plotGenomicCoords(plt,data) 543 exon_start = np.where(data.exons["start"] == start)[0][0]
522 544 exon_stop = np.where(data.exons["stop"] == stop)[0][0]
523 exon_start = np.where(data.exons["start"] == start)[0][0] 545 _ = ax.add_line(
524 exon_stop = np.where(data.exons["stop"] == stop)[0][0] 546 matplotlib.lines.Line2D([exon_start, exon_stop], [2, 2], color='k', lw=6, linestyle='-', alpha=1,
525 _ = ax.add_line(matplotlib.lines.Line2D([exon_start,exon_stop],[2,2],color='k',lw=6,linestyle='-',alpha=1,solid_capstyle='butt')) 547 solid_capstyle='butt'))
526 548
527 _ = plt.xlim(0,data.shape[1]) 549 _ = plt.xlim(0, data.shape[1])
528 _ = plt.ylim(-3,3) 550 _ = plt.ylim(-3, 3)
529 551
530 plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True))) 552 plt.title("%s: %s - %s" % (
531 plt.xlabel("Position") 553 cf.chrInt2Str(chr), locale.format("%d", start, grouping=True), locale.format("%d", stop, grouping=True)))
532 plt.ylabel("SVD-ZRPKM Values") 554 plt.xlabel("Position")
533 outfile = "%s_%d_%d_%s.png" % (cf.chrInt2Str(chr), start, stop, sampleID) 555 plt.ylabel("SVD-ZRPKM Values")
534 plt.savefig(args.outputdir + "/" + outfile) 556 outfile = "%s_%d_%d_%s.png" % (cf.chrInt2Str(chr), start, stop, sampleID)
557 plt.savefig(args.outputdir + "/" + outfile)
558
535 559
536 def CF_bam2RPKM(args): 560 def CF_bam2RPKM(args):
537 try: 561 try:
538 import pysam 562 import pysam
539 except: 563 except:
540 print '[ERROR] Cannot load pysam module! Make sure it is insalled' 564 print('[ERROR] Cannot load pysam module! Make sure it is insalled')
541 sys.exit(0) 565 sys.exit(0)
542 try: 566 try:
543 # read probes table 567 # read probes table
544 probe_fn = str(args.probes[0]) 568 probe_fn = str(args.probes[0])
545 probes = cf.loadProbeList(probe_fn) 569 probes = cf.loadProbeList(probe_fn)
546 num_probes = len(probes) 570 num_probes = len(probes)
547 print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn) 571 print('[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn))
548 except IOError as e: 572 except IOError as e:
549 print '[ERROR] Cannot read probes file: ', probe_fn 573 print('[ERROR] Cannot read probes file: ', probe_fn)
550 sys.exit(0) 574 sys.exit(0)
551 575
552 try: 576 try:
553 rpkm_f = open(args.output[0],'w') 577 rpkm_f = open(args.output[0], 'wb')
554 except IOError as e: 578 except IOError as e:
555 print '[ERROR] Cannot open rpkm file for writing: ', args.output 579 print('[ERROR] Cannot open rpkm file for writing: ', args.output)
556 sys.exit(0) 580 sys.exit(0)
557 581
558 print "[RUNNING] Counting total number of reads in bam file..." 582 print("[RUNNING] Counting total number of reads in bam file...")
559 total_reads = float(pysam.view("-c", args.input[0])[0].strip("\n")) 583 total_reads = float(pysam.view("-c", args.input[0])[0].strip("\n"))
560 print "[RUNNING] Found %d reads" % total_reads 584 print("[RUNNING] Found %d reads" % total_reads)
561 585
562 f = pysam.Samfile(args.input[0], "rb" ) 586 f = pysam.Samfile(args.input[0], "rb")
563 587
564 if not f._hasIndex(): 588 # if not f._hasIndex():
565 print "[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % args.input[0] 589 if not f.has_index():
566 sys.exit(0) 590 print("[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % \
567 591 args.input[0])
568 # will be storing values in these arrays 592 sys.exit(0)
569 readcount = np.zeros(num_probes) 593
570 exon_bp = np.zeros(num_probes) 594 # will be storing values in these arrays
571 probeIDs = np.zeros(num_probes) 595 readcount = np.zeros(num_probes)
572 counter = 0 596 exon_bp = np.zeros(num_probes)
573 597 probeIDs = np.zeros(num_probes)
574 # detect contig naming scheme here # TODO, add an optional "contigs.txt" file or automatically handle contig naming 598 counter = 0
575 bam_contigs = f.references 599
576 probes_contigs = [str(p) for p in set(map(operator.itemgetter("chr"),probes))] 600 # detect contig naming scheme here # TODO, add an optional "contigs.txt" file or automatically handle contig naming
577 601 bam_contigs = f.references
578 probes2contigmap = {} 602 probes_contigs = [str(p) for p in set(map(operator.itemgetter("chr"), probes))]
579 603
580 for probes_contig in probes_contigs: 604 probes2contigmap = {}
581 if probes_contig in bam_contigs: 605
582 probes2contigmap[probes_contig] = probes_contig 606 for probes_contig in probes_contigs:
583 elif cf.chrInt2Str(probes_contig) in bam_contigs: 607 if probes_contig in bam_contigs:
584 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig) 608 probes2contigmap[probes_contig] = probes_contig
585 elif cf.chrInt2Str(probes_contig).replace("chr","") in bam_contigs: 609 elif cf.chrInt2Str(probes_contig) in bam_contigs:
586 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig).replace("chr","") 610 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig)
587 else: 611 elif cf.chrInt2Str(probes_contig).replace("chr", "") in bam_contigs:
588 print "[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % (probes_contig, probe_fn) 612 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig).replace("chr", "")
589 sys.exit(0) 613 else:
590 614 print("[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % (
591 print "[RUNNING] Calculating RPKM values..." 615 probes_contig, probe_fn))
592 616 sys.exit(0)
593 # loop through each probe 617
594 for p in probes: 618 print("[RUNNING] Calculating RPKM values...")
595 619
596 # f.fetch is a pysam method and returns an iterator for reads overlapping interval 620 # loop through each probe
597 621 for p in probes:
598 p_chr = probes2contigmap[str(p["chr"])] 622
599 623 # f.fetch is a pysam method and returns an iterator for reads overlapping interval
600 p_start = p["start"] 624
601 p_stop = p["stop"] 625 p_chr = probes2contigmap[str(p["chr"])]
602 try: 626
603 iter = f.fetch(p_chr,p_start,p_stop) 627 p_start = p["start"]
604 except: 628 p_stop = p["stop"]
605 print "[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % (p_chr,p_start,p_stop) 629 try:
606 sys.exit(0) 630 iter = f.fetch(p_chr, p_start, p_stop)
607 631 except:
608 for i in iter: 632 print("[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % (
609 if i.pos+1 >= p_start: #this checks to make sure a read actually starts in an interval 633 p_chr, p_start, p_stop))
610 readcount[counter] += 1 634 sys.exit(0)
611 635
612 exon_bp[counter] = p_stop-p_start 636 for i in iter:
613 probeIDs[counter] = counter +1 #probeIDs are 1-based 637 if i.pos + 1 >= p_start: # this checks to make sure a read actually starts in an interval
614 counter +=1 638 readcount[counter] += 1
615 639
616 #calcualte RPKM values for all probes 640 exon_bp[counter] = p_stop - p_start
617 rpkm = (10**9*(readcount)/(exon_bp))/(total_reads) 641 probeIDs[counter] = counter + 1 # probeIDs are 1-based
618 642 counter += 1
619 out = np.vstack([probeIDs,readcount,rpkm]) 643
620 644 # calcualte RPKM values for all probes
621 np.savetxt(rpkm_f,out.transpose(),delimiter='\t',fmt=['%d','%d','%f']) 645 rpkm = (10 ** 9 * (readcount) / (exon_bp)) / (total_reads)
622 646
623 rpkm_f.close() 647 out = np.vstack([probeIDs, readcount, rpkm])
624 648
625 649 np.savetxt(rpkm_f, out.transpose(), delimiter='\t', fmt=['%d', '%d', '%f'])
650
651 rpkm_f.close()
652
626 653
627 VERSION = "0.2.2" 654 VERSION = "0.2.2"
628 parser = argparse.ArgumentParser(prog="CoNIFER", description="This is CoNIFER %s (Copy Number Inference From Exome Reads), designed to detect and genotype CNVs and CNPs from exome sequence read-depth data. See Krumm et al., Genome Research (2012) doi:10.1101/gr.138115.112 \nNiklas Krumm, 2012\n nkrumm@uw.edu" % VERSION) 655 parser = argparse.ArgumentParser(prog="CoNIFER",
656 description="This is CoNIFER %s (Copy Number Inference From Exome Reads), designed to detect and genotype CNVs and CNPs from exome sequence read-depth data. See Krumm et al., Genome Research (2012) doi:10.1101/gr.138115.112 \nNiklas Krumm, 2012\n nkrumm@uw.edu" % VERSION)
629 parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION) 657 parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION)
630 subparsers = parser.add_subparsers(help='Command to be run.') 658 subparsers = parser.add_subparsers(help='Command to be run.')
631 659
632 # rpkm command 660 # rpkm command
633 rpkm_parser= subparsers.add_parser('rpkm', help='Create an RPKM file from a BAM file') 661 rpkm_parser = subparsers.add_parser('rpkm', help='Create an RPKM file from a BAM file')
634 rpkm_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,help="Probe definition file") 662 rpkm_parser.add_argument('--probes', action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,
635 rpkm_parser.add_argument('--input',action='store', required=True, metavar='sample.bam',nargs=1, help="Aligned BAM file") 663 help="Probe definition file")
636 rpkm_parser.add_argument('--output',action='store', required=True, metavar='sample.rpkm.txt',nargs=1, help="RPKM file to write") 664 rpkm_parser.add_argument('--input', action='store', required=True, metavar='sample.bam', nargs=1,
665 help="Aligned BAM file")
666 rpkm_parser.add_argument('--output', action='store', required=True, metavar='sample.rpkm.txt', nargs=1,
667 help="RPKM file to write")
637 rpkm_parser.set_defaults(func=CF_bam2RPKM) 668 rpkm_parser.set_defaults(func=CF_bam2RPKM)
638 669
639 # analyze command 670 # analyze command
640 analyze_parser= subparsers.add_parser('analyze', help='Basic CoNIFER analysis. Reads a directory of RPKM files and a probe list and outputs a HDF5 file containing SVD-ZRPKM values.') 671 analyze_parser = subparsers.add_parser('analyze',
641 analyze_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,help="Probe definition file") 672 help='Basic CoNIFER analysis. Reads a directory of RPKM files and a probe list and outputs a HDF5 file containing SVD-ZRPKM values.')
642 analyze_parser.add_argument('--rpkm_dir',action='store', required=True, metavar='/path/to/folder/containing/rpkm_files/',nargs=1, help="Location of folder containing RPKM files. Folder should contain ONLY contain RPKM files, and all readable RPKM files will used in analysis.") 673 analyze_parser.add_argument('--probes', action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,
643 analyze_parser.add_argument('--output','-o', required=True, metavar='/path/to/output_file.hdf5', type=str, help="Output location of HDF5 file to contain SVD-ZRPKM values") 674 help="Probe definition file")
644 analyze_parser.add_argument('--svd', metavar='12', type=int, nargs='?', default = 12,help="Number of components to remove") 675 analyze_parser.add_argument('--rpkm_dir', action='store', required=True,
645 analyze_parser.add_argument('--min_rpkm', metavar='1.00', type=float, nargs="?", default = 1.00,help="Minimum population median RPKM per probe.") 676 metavar='/path/to/folder/containing/rpkm_files/', nargs=1,
646 analyze_parser.add_argument('--write_svals', metavar='SingularValues.txt', type=str, default= "", help="Optional file to write singular values (S-values). Used for Scree Plot.") 677 help="Location of folder containing RPKM files. Folder should contain ONLY contain RPKM files, and all readable RPKM files will used in analysis.")
647 analyze_parser.add_argument('--plot_scree', metavar='ScreePlot.png', type=str, default= "", help="Optional graphical scree plot. Requires matplotlib.") 678 analyze_parser.add_argument('--output', '-o', required=True, metavar='/path/to/output_file.hdf5', type=str,
648 analyze_parser.add_argument('--write_sd', metavar='StandardDeviations.txt', type=str, default= "", help="Optional file with sample SVD-ZRPKM standard deviations. Used to filter noisy samples.") 679 help="Output location of HDF5 file to contain SVD-ZRPKM values")
680 analyze_parser.add_argument('--svd', metavar='12', type=int, nargs='?', default=12,
681 help="Number of components to remove")
682 analyze_parser.add_argument('--min_rpkm', metavar='1.00', type=float, nargs="?", default=1.00,
683 help="Minimum population median RPKM per probe.")
684 analyze_parser.add_argument('--write_svals', metavar='SingularValues.txt', type=str, default="",
685 help="Optional file to write singular values (S-values). Used for Scree Plot.")
686 analyze_parser.add_argument('--plot_scree', metavar='ScreePlot.png', type=str, default="",
687 help="Optional graphical scree plot. Requires matplotlib.")
688 analyze_parser.add_argument('--write_sd', metavar='StandardDeviations.txt', type=str, default="",
689 help="Optional file with sample SVD-ZRPKM standard deviations. Used to filter noisy samples.")
649 analyze_parser.set_defaults(func=CF_analyze) 690 analyze_parser.set_defaults(func=CF_analyze)
650 691
651 # export command 692 # export command
652 export_parser= subparsers.add_parser('export', help='Export SVD-ZRPKM values from a HDF5 file to bed or vcf format.') 693 export_parser = subparsers.add_parser('export', help='Export SVD-ZRPKM values from a HDF5 file to bed or vcf format.')
653 export_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step") 694 export_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5',
654 export_parser.add_argument('--output','-o',action='store', required=False, default='.', metavar='output.bed',help="Location of output file[s]. When exporting multiple samples, top-level directory of this path will be used.") 695 help="HDF5 file from CoNIFER 'analyze' step")
655 export_parser.add_argument('--sample','-s',action='store', required=False, metavar='sampleID', default='all', nargs="+",help="SampleID or comma-separated list of sampleIDs to export. Default: export all samples") 696 export_parser.add_argument('--output', '-o', action='store', required=False, default='.', metavar='output.bed',
697 help="Location of output file[s]. When exporting multiple samples, top-level directory of this path will be used.")
698 export_parser.add_argument('--sample', '-s', action='store', required=False, metavar='sampleID', default='all',
699 nargs="+",
700 help="SampleID or comma-separated list of sampleIDs to export. Default: export all samples")
656 export_parser.set_defaults(func=CF_export) 701 export_parser.set_defaults(func=CF_export)
657 702
658 # plot command 703 # plot command
659 plot_parser= subparsers.add_parser('plot', help='Plot SVD-ZRPKM values using matplotlib') 704 plot_parser = subparsers.add_parser('plot', help='Plot SVD-ZRPKM values using matplotlib')
660 plot_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step") 705 plot_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5',
661 plot_parser.add_argument('--region',action='store', required=True, metavar='chr#:start-stop',help="Region to plot") 706 help="HDF5 file from CoNIFER 'analyze' step")
662 plot_parser.add_argument('--output',action='store', required=True, metavar='image.png',help="Output path and filetype. PDF, PNG, PS, EPS, and SVG are supported.") 707 plot_parser.add_argument('--region', action='store', required=True, metavar='chr#:start-stop', help="Region to plot")
663 plot_parser.add_argument('--sample',action='store', required=False, metavar='sampleID',nargs="+",default='none',help="Sample[s] to highlight. The following optional color spec can be used: <color>:<sampleID>. Available colors are r,b,g,y,c,m,y,k. The unsmoothed SVD-ZRPKM values for the first sample in this list will be drawn. Default: No samples highlighted.") 708 plot_parser.add_argument('--output', action='store', required=True, metavar='image.png',
709 help="Output path and filetype. PDF, PNG, PS, EPS, and SVG are supported.")
710 plot_parser.add_argument('--sample', action='store', required=False, metavar='sampleID', nargs="+", default='none',
711 help="Sample[s] to highlight. The following optional color spec can be used: <color>:<sampleID>. Available colors are r,b,g,y,c,m,y,k. The unsmoothed SVD-ZRPKM values for the first sample in this list will be drawn. Default: No samples highlighted.")
664 plot_parser.set_defaults(func=CF_plot) 712 plot_parser.set_defaults(func=CF_plot)
665 713
666 # make calls command 714 # make calls command
667 call_parser= subparsers.add_parser('call', help='Very rudimentary caller for CNVs using SVD-ZRPKM thresholding.') 715 call_parser = subparsers.add_parser('call', help='Very rudimentary caller for CNVs using SVD-ZRPKM thresholding.')
668 call_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step") 716 call_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5',
669 call_parser.add_argument('--output',action='store', required=True, metavar='calls.txt',help="Output file for calls") 717 help="HDF5 file from CoNIFER 'analyze' step")
670 call_parser.add_argument('--threshold', metavar='1.50', type=float, nargs='?', required=False, default = 1.50,help="+/- Threshold for calling (minimum SVD-ZRPKM)") 718 call_parser.add_argument('--output', action='store', required=True, metavar='calls.txt', help="Output file for calls")
719 call_parser.add_argument('--threshold', metavar='1.50', type=float, nargs='?', required=False, default=1.50,
720 help="+/- Threshold for calling (minimum SVD-ZRPKM)")
671 call_parser.set_defaults(func=CF_call) 721 call_parser.set_defaults(func=CF_call)
672 722
673 # plotcalls command 723 # plotcalls command
674 plotcalls_parser= subparsers.add_parser('plotcalls', help='Make basic plots from call file from "call" command.') 724 plotcalls_parser = subparsers.add_parser('plotcalls', help='Make basic plots from call file from "call" command.')
675 plotcalls_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step") 725 plotcalls_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5',
676 plotcalls_parser.add_argument('--calls',action='store', required=True, metavar='calls.txt',help="File with calls from 'call' command.") 726 help="HDF5 file from CoNIFER 'analyze' step")
677 plotcalls_parser.add_argument('--outputdir',action='store', required=True, metavar='/path/to/directory',help="Output directory for plots") 727 plotcalls_parser.add_argument('--calls', action='store', required=True, metavar='calls.txt',
678 plotcalls_parser.add_argument('--window',action='store', required=False, metavar='50',default=50,help="In exons, the amount of padding to plot around each call") 728 help="File with calls from 'call' command.")
729 plotcalls_parser.add_argument('--outputdir', action='store', required=True, metavar='/path/to/directory',
730 help="Output directory for plots")
731 plotcalls_parser.add_argument('--window', action='store', required=False, metavar='50', default=50,
732 help="In exons, the amount of padding to plot around each call")
679 plotcalls_parser.set_defaults(func=CF_plotcalls) 733 plotcalls_parser.set_defaults(func=CF_plotcalls)
680 734
681 args = parser.parse_args() 735 args = parser.parse_args()
682 args.func(args) 736 args.func(args)