Mercurial > repos > bzonnedda > conifer2
comparison conifer.py @ 25:abdc05fae16e draft
Uploaded
author | bzonnedda |
---|---|
date | Thu, 06 Jul 2017 06:28:33 -0400 |
parents | 9196f299dc5e |
children |
comparison
equal
deleted
inserted
replaced
24:9c5a978ce25d | 25:abdc05fae16e |
---|---|
28 import os, sys, copy | 28 import os, sys, copy |
29 import glob | 29 import glob |
30 import csv | 30 import csv |
31 import conifer_functions as cf | 31 import conifer_functions as cf |
32 import operator | 32 import operator |
33 #from tables import * | 33 from tables import * |
34 import pysam | |
34 import numpy as np | 35 import numpy as np |
35 | 36 |
37 | |
36 def CF_analyze(args): | 38 def CF_analyze(args): |
37 # do path/file checks: | 39 # do path/file checks: |
38 try: | 40 try: |
39 # read probes table | 41 # read probes table |
40 probe_fn = str(args.probes[0]) | 42 probe_fn = str(args.probes[0]) |
41 probes = cf.loadProbeList(probe_fn) | 43 probes = cf.loadProbeList(probe_fn) |
42 num_probes = len(probes) | 44 num_probes = len(probes) |
43 print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn) | 45 print('[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)) |
44 except IOError as e: | 46 except IOError as e: |
45 print '[ERROR] Cannot read probes file: ', probe_fn | 47 print('[ERROR] Cannot read probes file: ', probe_fn) |
46 sys.exit(0) | 48 sys.exit(0) |
47 | 49 |
48 try: | 50 try: |
49 svd_outfile_fn = str(args.output) | 51 svd_outfile_fn = str(args.output) |
50 h5file_out = openFile(svd_outfile_fn, mode='w') | 52 h5file_out = OpenFile(svd_outfile_fn, mode='w') |
51 probe_group = h5file_out.createGroup("/","probes","probes") | 53 probe_group = h5file_out.createGroup("/", "probes", "probes") |
52 except IOError as e: | 54 except IOError as e: |
53 print '[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn | 55 print('[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn) |
54 sys.exit(0) | 56 sys.exit(0) |
55 | 57 |
56 if args.write_svals != "": | 58 if args.write_svals != "": |
57 sval_f = open(args.write_svals,'w') | 59 sval_f = open(args.write_svals, 'w') |
58 | 60 |
59 if args.plot_scree != "": | 61 if args.plot_scree != "": |
60 try: | 62 try: |
61 import matplotlib | 63 import matplotlib |
62 matplotlib.use('Agg') | 64 matplotlib.use('Agg') |
63 import matplotlib.pyplot as plt | 65 import matplotlib.pyplot as plt |
64 import pylab as P | 66 import pylab as P |
65 from matplotlib.lines import Line2D | 67 from matplotlib.lines import Line2D |
66 from matplotlib.patches import Rectangle | 68 from matplotlib.patches import Rectangle |
67 except: | 69 except: |
68 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?" | 70 print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?") |
69 sys.exit(0) | 71 sys.exit(0) |
70 | 72 |
71 plt.gcf().clear() | 73 plt.gcf().clear() |
72 fig = plt.figure(figsize=(10,5)) | 74 fig = plt.figure(figsize=(10, 5)) |
73 ax = fig.add_subplot(111) | 75 ax = fig.add_subplot(111) |
74 | 76 |
75 rpkm_dir = str(args.rpkm_dir[0]) | 77 rpkm_dir = str(args.rpkm_dir[0]) |
76 rpkm_files = glob.glob(rpkm_dir + "/*") | 78 rpkm_files = glob.glob(rpkm_dir + "/*") |
77 if len(rpkm_files) == 0: | 79 if len(rpkm_files) == 0: |
78 print '[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir | 80 print('[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir) |
79 sys.exit(0) | 81 sys.exit(0) |
80 elif len(rpkm_files) == 1: | 82 elif len(rpkm_files) == 1: |
81 print '[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.' | 83 print('[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.') |
82 sys.exit(0) | 84 sys.exit(0) |
83 elif len(rpkm_files) < 8: | 85 elif len(rpkm_files) < 8: |
84 print '[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len(rpkm_files) | 86 print('[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len( |
85 elif len(rpkm_files) <= int(args.svd): | 87 rpkm_files)) |
86 print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (len(rpkm_files), int(args.svd)) | 88 elif len(rpkm_files) <= int(args.svd): |
87 sys.exit(0) | 89 print('[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % ( |
88 else: | 90 len(rpkm_files), int(args.svd))) |
89 print '[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir) | 91 sys.exit(0) |
90 | 92 else: |
91 # read in samples names and generate file list | 93 print('[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir)) |
92 samples = {} | 94 |
93 for f in rpkm_files: | 95 # read in samples names and generate file list |
94 s = '.'.join(f.split('/')[-1].split('.')[0:-1]) | 96 samples = {} |
95 print "[INIT] Mapping file to sampleID: %s --> %s" % (f, s) | 97 for f in rpkm_files: |
96 samples[s] = f | 98 s = '.'.join(f.split('/')[-1].split('.')[0:-1]) |
97 | 99 print("[INIT] Mapping file to sampleID: %s --> %s" % (f, s)) |
98 #check uniqueness and total # of samples | 100 samples[s] = f |
99 if len(set(samples)) != len(set(rpkm_files)): | 101 |
100 print '[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!' | 102 # check uniqueness and total # of samples |
101 sys.exit(0) | 103 if len(set(samples)) != len(set(rpkm_files)): |
102 | 104 print('[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!') |
103 # LOAD RPKM DATA | 105 sys.exit(0) |
104 RPKM_data = np.zeros([num_probes,len(samples)], dtype=np.float) | 106 |
105 failed_samples = 0 | 107 # LOAD RPKM DATA |
106 | 108 RPKM_data = np.zeros([num_probes, len(samples)], dtype=np.float) |
107 for i,s in enumerate(samples.keys()): | 109 failed_samples = 0 |
108 t = np.loadtxt(samples[s], dtype=np.float, delimiter="\t", skiprows=0, usecols=[2]) | 110 |
109 if len(t) != num_probes: | 111 for i, s in enumerate(samples.keys()): |
110 print "[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % (s, samples[s], probe_fn) | 112 t = np.loadtxt(samples[s], dtype=np.float, delimiter="\t", skiprows=0, usecols=[2]) |
111 _ = samples.pop(s) | 113 if len(t) != num_probes: |
112 failed_samples += 1 | 114 print("[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % ( |
113 else: | 115 s, samples[s], probe_fn)) |
114 RPKM_data[:,i] = t | 116 _ = samples.pop(s) |
115 print "[INIT] Successfully read RPKM data for sampleID: %s" % s | 117 failed_samples += 1 |
116 | 118 else: |
117 RPKM_data = RPKM_data[:,0:len(samples)] | 119 RPKM_data[:, i] = t |
118 print "[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % (len(samples), failed_samples) | 120 print("[INIT] Successfully read RPKM data for sampleID: %s" % s) |
119 | 121 |
120 if len(samples) <= int(args.svd): | 122 RPKM_data = RPKM_data[:, 0:len(samples)] |
121 print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (int(args.svd), len(samples)) | 123 print("[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % ( |
122 sys.exit(0) | 124 len(samples), failed_samples)) |
123 | 125 |
124 # BEGIN | 126 if len(samples) <= int(args.svd): |
125 chrs_to_process = set(map(operator.itemgetter("chr"),probes)) | 127 print('[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % ( |
126 chrs_to_process_str = ', '.join([cf.chrInt2Str(c) for c in chrs_to_process]) | 128 int(args.svd), len(samples))) |
127 print '[INIT] Attempting to process chromosomes: ', chrs_to_process_str | 129 sys.exit(0) |
128 | 130 |
129 | 131 # BEGIN |
130 | 132 chrs_to_process = set(map(operator.itemgetter("chr"), probes)) |
131 for chr in chrs_to_process: | 133 chrs_to_process_str = ', '.join([cf.chrInt2Str(c) for c in chrs_to_process]) |
132 print "[RUNNING: chr%d] Now on: %s" %(chr, cf.chrInt2Str(chr)) | 134 print('[INIT] Attempting to process chromosomes: ', chrs_to_process_str) |
133 chr_group_name = "chr%d" % chr | 135 |
134 chr_group = h5file_out.createGroup("/",chr_group_name,chr_group_name) | 136 for chr in chrs_to_process: |
135 | 137 print("[RUNNING: chr%d] Now on: %s" % (chr, cf.chrInt2Str(chr))) |
136 chr_probes = filter(lambda i: i["chr"] == chr, probes) | 138 chr_group_name = "chr%d" % chr |
137 num_chr_probes = len(chr_probes) | 139 chr_group = h5file_out.createGroup("/", chr_group_name, chr_group_name) |
138 start_probeID = chr_probes[0]['probeID'] | 140 |
139 stop_probeID = chr_probes[-1]['probeID'] | 141 chr_probes = [i for i in probes if i["chr"] == chr] |
140 print "[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID-1, stop_probeID) # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term | 142 num_chr_probes = len(chr_probes) |
141 | 143 start_probeID = chr_probes[0]['probeID'] |
142 rpkm = RPKM_data[start_probeID:stop_probeID,:] | 144 stop_probeID = chr_probes[-1]['probeID'] |
143 | 145 print("[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID - 1, |
144 print "[RUNNING: chr%d] Calculating median RPKM" % chr | 146 stop_probeID)) # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term |
145 median = np.median(rpkm,1) | 147 |
146 sd = np.std(rpkm,1) | 148 rpkm = RPKM_data[start_probeID:stop_probeID, :] |
147 probe_mask = median >= float(args.min_rpkm) | 149 |
148 print "[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % (chr, np.sum(probe_mask==False), float(args.min_rpkm)) | 150 print("[RUNNING: chr%d] Calculating median RPKM" % chr) |
149 | 151 median = np.median(rpkm, 1) |
150 rpkm = rpkm[probe_mask, :] | 152 sd = np.std(rpkm, 1) |
151 num_chr_probes = np.sum(probe_mask) | 153 probe_mask = median >= float(args.min_rpkm) |
152 | 154 print("[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % ( |
153 if num_chr_probes <= len(samples): | 155 chr, np.sum(probe_mask == False), float(args.min_rpkm))) |
154 print "[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file" | 156 |
155 sys.exit(0) | 157 rpkm = rpkm[probe_mask, :] |
156 | 158 num_chr_probes = np.sum(probe_mask) |
157 probeIDs = np.array(map(operator.itemgetter("probeID"),chr_probes))[probe_mask] | 159 |
158 probe_starts = np.array(map(operator.itemgetter("start"),chr_probes))[probe_mask] | 160 if num_chr_probes <= len(samples): |
159 probe_stops = np.array(map(operator.itemgetter("stop"),chr_probes))[probe_mask] | 161 print("[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file") |
160 gene_names = np.array(map(operator.itemgetter("name"),chr_probes))[probe_mask] | 162 sys.exit(0) |
161 | 163 |
162 dt = np.dtype([('probeID',np.uint32),('start',np.uint32),('stop',np.uint32), ('name', np.str_, 20)]) | 164 probeIDs = np.array(list(map(operator.itemgetter("probeID"), chr_probes)))[probe_mask] |
163 | 165 probe_starts = np.array(list(map(operator.itemgetter("start"), chr_probes)))[probe_mask] |
164 out_probes = np.empty(num_chr_probes,dtype=dt) | 166 probe_stops = np.array(list(map(operator.itemgetter("stop"), chr_probes)))[probe_mask] |
165 out_probes['probeID'] = probeIDs | 167 gene_names = np.array(list(map(operator.itemgetter("name"), chr_probes)))[probe_mask] |
166 out_probes['start'] = probe_starts | 168 |
167 out_probes['stop'] = probe_stops | 169 dt = np.dtype([('probeID', np.uint32), ('start', np.uint32), ('stop', np.uint32), ('name', np.str_, 20)]) |
168 out_probes['name'] = gene_names | 170 |
169 probe_table = h5file_out.createTable(probe_group,"probes_chr%d" % chr,cf.probe,"chr%d" % chr) | 171 out_probes = np.empty(num_chr_probes, dtype=dt) |
170 probe_table.append(out_probes) | 172 out_probes['probeID'] = probeIDs |
171 | 173 out_probes['start'] = probe_starts |
172 print "[RUNNING: chr%d] Calculating ZRPKM scores..." % chr | 174 out_probes['stop'] = probe_stops |
173 rpkm = np.apply_along_axis(cf.zrpkm, 0, rpkm, median[probe_mask], sd[probe_mask]) | 175 out_probes['name'] = gene_names |
174 | 176 probe_table = h5file_out.createTable(probe_group, "probes_chr%d" % chr, cf.probe, "chr%d" % chr) |
175 # svd transform | 177 probe_table.append(out_probes) |
176 print "[RUNNING: chr%d] SVD decomposition..." % chr | 178 |
177 components_removed = int(args.svd) | 179 print("[RUNNING: chr%d] Calculating ZRPKM scores..." % chr) |
178 | 180 rpkm = np.apply_along_axis(cf.zrpkm, 0, rpkm, median[probe_mask], sd[probe_mask]) |
179 U, S, Vt = np.linalg.svd(rpkm,full_matrices=False) | 181 |
180 new_S = np.diag(np.hstack([np.zeros([components_removed]),S[components_removed:]])) | 182 # svd transform |
181 | 183 print("[RUNNING: chr%d] SVD decomposition..." % chr) |
182 if args.write_svals != "": | 184 components_removed = int(args.svd) |
183 sval_f.write('chr' + str(chr) + '\t' + '\t'.join([str(_i) for _i in S]) + "\n") | 185 |
184 | 186 U, S, Vt = np.linalg.svd(rpkm, full_matrices=False) |
185 if args.plot_scree != "": | 187 new_S = np.diag(np.hstack([np.zeros([components_removed]), S[components_removed:]])) |
186 ax.plot(S, label='chr' + str(chr),lw=0.5) | 188 |
187 | 189 if args.write_svals != "": |
188 # reconstruct data matrix | 190 sval_f.write('chr' + str(chr) + '\t' + '\t'.join([str(_i) for _i in S]) + "\n") |
189 rpkm = np.dot(U, np.dot(new_S, Vt)) | 191 |
190 | 192 if args.plot_scree != "": |
191 | 193 ax.plot(S, label='chr' + str(chr), lw=0.5) |
192 # save to HDF5 file | 194 |
193 print "[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr | 195 # reconstruct data matrix |
194 | 196 rpkm = np.dot(U, np.dot(new_S, Vt)) |
195 for i,s in enumerate(samples): | 197 |
196 out_data = np.empty(num_chr_probes,dtype='u4,f8') | 198 # save to HDF5 file |
197 out_data['f0'] = probeIDs | 199 print("[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr) |
198 out_data['f1'] = rpkm[:,i] | 200 |
199 sample_tbl = h5file_out.createTable(chr_group,"sample_" + str(s),cf.rpkm_value,"%s" % str(s)) | 201 for i, s in enumerate(samples): |
200 sample_tbl.append(out_data) | 202 out_data = np.empty(num_chr_probes, dtype='u4,f8') |
201 | 203 out_data['f0'] = probeIDs |
202 | 204 out_data['f1'] = rpkm[:, i] |
203 print "[RUNNING] Saving sampleIDs to file..." | 205 sample_tbl = h5file_out.createTable(chr_group, "sample_" + str(s), cf.rpkm_value, "%s" % str(s)) |
204 sample_group = h5file_out.createGroup("/","samples","samples") | 206 sample_tbl.append(out_data) |
205 sample_table = h5file_out.createTable(sample_group,"samples",cf.sample,"samples") | 207 |
206 dt = np.dtype([('sampleID',np.str_,100)]) | 208 print("[RUNNING] Saving sampleIDs to file...") |
207 out_samples = np.empty(len(samples.keys()),dtype=dt) | 209 sample_group = h5file_out.createGroup("/", "samples", "samples") |
208 out_samples['sampleID'] = np.array(samples.keys()) | 210 sample_table = h5file_out.createTable(sample_group, "samples", cf.sample, "samples") |
209 sample_table.append(out_samples) | 211 dt = np.dtype([('sampleID', np.str_, 100)]) |
210 | 212 out_samples = np.empty(len(list(samples.keys())), dtype=dt) |
211 | 213 out_samples['sampleID'] = np.array(list(samples.keys())) |
212 if args.write_sd != "": | 214 sample_table.append(out_samples) |
213 print "[RUNNING] Calculating standard deviations for all samples (this can take a while)..." | 215 |
214 | 216 if args.write_sd != "": |
215 sd_file = open(args.write_sd,'w') | 217 print("[RUNNING] Calculating standard deviations for all samples (this can take a while)...") |
216 | 218 |
217 for i,s in enumerate(samples): | 219 sd_file = open(args.write_sd, 'w') |
218 # collect all SVD-ZRPKM values | 220 |
219 count = 1 | 221 for i, s in enumerate(samples): |
220 for chr in chrs_to_process: | 222 # collect all SVD-ZRPKM values |
221 if count == 1: | 223 count = 1 |
222 sd_out = h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten() | 224 for chr in chrs_to_process: |
223 else: | 225 if count == 1: |
224 sd_out = np.hstack([sd_out,out.h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten()]) | 226 sd_out = h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read( |
225 | 227 field="rpkm").flatten() |
226 sd = np.std(sd_out) | 228 else: |
227 sd_file.write("%s\t%f\n" % (s,sd)) | 229 sd_out = np.hstack([sd_out, out.h5file_out.root._f_getChild("chr%d" % chr)._f_getChild( |
228 | 230 "sample_%s" % s).read(field="rpkm").flatten()]) |
229 sd_file.close() | 231 |
230 | 232 sd = np.std(sd_out) |
231 if args.plot_scree != "": | 233 sd_file.write("%s\t%f\n" % (s, sd)) |
232 plt.title("Scree plot") | 234 |
233 if len(samples) < 50: | 235 sd_file.close() |
234 plt.xlim([0,len(samples)]) | 236 |
235 plt.xlabel("S values") | 237 if args.plot_scree != "": |
236 else: | 238 plt.title("Scree plot") |
237 plt.xlim([0,50]) | 239 if len(samples) < 50: |
238 plt.xlabel("S values (only first 50 plotted)") | 240 plt.xlim([0, len(samples)]) |
239 plt.ylabel("Relative contributed variance") | 241 plt.xlabel("S values") |
240 plt.savefig(args.plot_scree) | 242 else: |
241 | 243 plt.xlim([0, 50]) |
242 print "[FINISHED]" | 244 plt.xlabel("S values (only first 50 plotted)") |
243 h5file_out.close() | 245 plt.ylabel("Relative contributed variance") |
244 sys.exit(0) | 246 plt.savefig(args.plot_scree) |
247 | |
248 print("[FINISHED]") | |
249 h5file_out.close() | |
250 sys.exit(0) | |
251 | |
245 | 252 |
246 def CF_export(args): | 253 def CF_export(args): |
247 try: | 254 try: |
248 h5file_in_fn = str(args.input) | 255 h5file_in_fn = str(args.input) |
249 h5file_in = openFile(h5file_in_fn, mode='r') | 256 h5file_in = OpenFile(h5file_in_fn, mode='r') |
250 except IOError as e: | 257 except IOError as e: |
251 print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn | 258 print('[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn) |
252 sys.exit(0) | 259 sys.exit(0) |
253 | 260 |
254 # read probes | 261 # read probes |
255 probes = {} | 262 probes = {} |
256 for probes_chr in h5file_in.root.probes: | 263 for probes_chr in h5file_in.root.probes: |
257 probes[probes_chr.title] = probes_chr.read() | 264 probes[probes_chr.title] = probes_chr.read() |
258 | 265 |
259 if args.sample =='all': | 266 if args.sample == 'all': |
260 all_samples = list(h5file_in.root.samples.samples.read(field="sampleID")) | 267 all_samples = list(h5file_in.root.samples.samples.read(field="sampleID")) |
261 | 268 |
262 out_path = os.path.abspath(args.output) | 269 out_path = os.path.abspath(args.output) |
263 | 270 |
264 print "[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path) | 271 print("[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path)) |
265 for sample in all_samples: | 272 for sample in all_samples: |
266 try: | 273 try: |
267 outfile_fn = out_path + "/" + sample + ".bed" | 274 outfile_fn = out_path + "/" + sample + ".bed" |
268 outfile_f = open(outfile_fn,'w') | 275 outfile_f = open(outfile_fn, 'w') |
269 except IOError as e: | 276 except IOError as e: |
270 print '[ERROR] Cannot open output file for writing: ', outfile_fn | 277 print('[ERROR] Cannot open output file for writing: ', outfile_fn) |
271 sys.exit(0) | 278 sys.exit(0) |
272 print "[RUNNING] Exporting %s" % sample | 279 print("[RUNNING] Exporting %s" % sample) |
273 | 280 |
274 cf.export_sample(h5file_in,sample,probes,outfile_f) | 281 cf.export_sample(h5file_in, sample, probes, outfile_f) |
275 outfile_f.close() | 282 outfile_f.close() |
276 | 283 |
277 elif len(args.sample) == 1: | 284 elif len(args.sample) == 1: |
278 out_path = os.path.abspath(args.output) | 285 out_path = os.path.abspath(args.output) |
279 sample = args.sample[0] | 286 sample = args.sample[0] |
280 print "[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path) | 287 print("[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path)) |
281 try: | 288 try: |
282 if os.path.isdir(out_path): | 289 if os.path.isdir(out_path): |
283 outfile_fn = out_path + "/" + sample + ".bed" | 290 outfile_fn = out_path + "/" + sample + ".bed" |
284 else: | 291 else: |
285 outfile_fn = out_path | 292 outfile_fn = out_path |
286 outfile_f = open(outfile_fn,'w') | 293 outfile_f = open(outfile_fn, 'w') |
287 except IOError as e: | 294 except IOError as e: |
288 print '[ERROR] Cannot open output file for writing: ', outfile_fn | 295 print('[ERROR] Cannot open output file for writing: ', outfile_fn) |
289 sys.exit(0) | 296 sys.exit(0) |
290 print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn) | 297 print("[RUNNING] Exporting %s to %s" % (sample, outfile_fn)) |
291 | 298 |
292 cf.export_sample(h5file_in,sample,probes,outfile_f) | 299 cf.export_sample(h5file_in, sample, probes, outfile_f) |
293 outfile_f.close() | 300 outfile_f.close() |
294 | 301 |
295 else: | 302 else: |
296 out_path = os.path.abspath(args.output) | 303 out_path = os.path.abspath(args.output) |
297 print "[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path) | 304 print("[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path)) |
298 for sample in args.sample: | 305 for sample in args.sample: |
299 try: | 306 try: |
300 if os.path.isdir(out_path): | 307 if os.path.isdir(out_path): |
301 outfile_fn = out_path + "/" + sample + ".bed" | 308 outfile_fn = out_path + "/" + sample + ".bed" |
302 else: | 309 else: |
303 outfile_fn = out_path | 310 outfile_fn = out_path |
304 outfile_f = open(outfile_fn,'w') | 311 outfile_f = open(outfile_fn, 'w') |
305 except IOError as e: | 312 except IOError as e: |
306 print '[ERROR] Cannot open output file for writing: ', outfile_fn | 313 print('[ERROR] Cannot open output file for writing: ', outfile_fn) |
307 sys.exit(0) | 314 sys.exit(0) |
308 print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn) | 315 print("[RUNNING] Exporting %s to %s" % (sample, outfile_fn)) |
309 | 316 |
310 cf.export_sample(h5file_in,sample,probes,outfile_f) | 317 cf.export_sample(h5file_in, sample, probes, outfile_f) |
311 outfile_f.close() | 318 outfile_f.close() |
312 sys.exit(0) | 319 sys.exit(0) |
320 | |
313 | 321 |
314 def CF_call(args): | 322 def CF_call(args): |
315 try: | 323 try: |
316 h5file_in_fn = str(args.input) | 324 h5file_in_fn = str(args.input) |
317 h5file_in = openFile(h5file_in_fn, mode='r') | 325 h5file_in = OpenFile(h5file_in_fn, mode='r') |
318 except IOError as e: | 326 except IOError as e: |
319 print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn | 327 print('[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn) |
320 sys.exit(0) | 328 sys.exit(0) |
321 | 329 |
322 try: | 330 try: |
323 callfile_fn = str(args.output) | 331 callfile_fn = str(args.output) |
324 callfile_f = open(callfile_fn, mode='w') | 332 callfile_f = open(callfile_fn, mode='w') |
325 except IOError as e: | 333 except IOError as e: |
326 print '[ERROR] Cannot open output file for writing: ', callfile_fn | 334 print('[ERROR] Cannot open output file for writing: ', callfile_fn) |
327 sys.exit(0) | 335 sys.exit(0) |
328 | 336 |
329 chrs_to_process = [] | 337 chrs_to_process = [] |
330 for chr in h5file_in.root: | 338 for chr in h5file_in.root: |
331 if chr._v_title not in ('probes','samples'): | 339 if chr._v_title not in ('probes', 'samples'): |
332 chrs_to_process.append(chr._v_title.replace("chr","")) | 340 chrs_to_process.append(chr._v_title.replace("chr", "")) |
333 | 341 |
334 h5file_in.close() | 342 h5file_in.close() |
335 | 343 |
336 print '[INIT] Initializing caller at threshold = %f' % (args.threshold) | 344 print('[INIT] Initializing caller at threshold = %f' % (args.threshold)) |
337 | 345 |
338 r = cf.rpkm_reader(h5file_in_fn) | 346 r = cf.rpkm_reader(h5file_in_fn) |
339 | 347 |
340 all_calls = [] | 348 all_calls = [] |
341 | 349 |
342 for chr in chrs_to_process: | 350 for chr in chrs_to_process: |
343 print '[RUNNING] Now processing chr%s' % chr | 351 print('[RUNNING] Now processing chr%s' % chr) |
344 data = r.getExonValuesByRegion(chr) | 352 data = r.getExonValuesByRegion(chr) |
345 | 353 |
346 #raw_data = copy.copy(data) | 354 # raw_data = copy.copy(data) |
347 _ = data.smooth() | 355 _ = data.smooth() |
348 | 356 |
349 mean= np.mean(data.rpkm,axis=1) | 357 mean = np.mean(data.rpkm, axis=1) |
350 sd = np.std(data.rpkm,axis=1) | 358 sd = np.std(data.rpkm, axis=1) |
351 | 359 |
352 for sample in r.getSampleList(): | 360 for sample in r.getSampleList(): |
353 sample_data = data.getSample([sample]).flatten() | 361 sample_data = data.getSample([sample]).flatten() |
354 #sample_raw_data = raw_data.getSample([sample]).flatten() | 362 # sample_raw_data = raw_data.getSample([sample]).flatten() |
355 | 363 |
356 dup_mask = sample_data >= args.threshold | 364 dup_mask = sample_data >= args.threshold |
357 del_mask = sample_data <= -1*args.threshold | 365 del_mask = sample_data <= -1 * args.threshold |
358 | 366 |
359 dup_bkpoints = cf.getbkpoints(dup_mask) #returns exon coordinates for this chromosome (numpy array coords) | 367 dup_bkpoints = cf.getbkpoints(dup_mask) # returns exon coordinates for this chromosome (numpy array coords) |
360 del_bkpoints = cf.getbkpoints(del_mask) | 368 del_bkpoints = cf.getbkpoints(del_mask) |
361 | 369 |
362 | 370 dups = [] |
363 dups = [] | 371 for start, stop in dup_bkpoints: |
364 for start,stop in dup_bkpoints: | 372 try: |
365 try: new_start = np.max(np.where(sample_data[:start] < (mean[:start] + 3*sd[:start]))) | 373 new_start = np.max(np.where(sample_data[:start] < (mean[:start] + 3 * sd[:start]))) |
366 except ValueError: new_start = 0 | 374 except ValueError: |
367 try: new_stop = stop + np.min(np.where(sample_data[stop:] < (mean[stop:] + 3*sd[stop:]))) | 375 new_start = 0 |
368 except ValueError: new_stop = data.shape[1]-1 | 376 try: |
369 dups.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "dup"}) | 377 new_stop = stop + np.min(np.where(sample_data[stop:] < (mean[stop:] + 3 * sd[stop:]))) |
370 | 378 except ValueError: |
371 dels = [] | 379 new_stop = data.shape[1] - 1 |
372 for start,stop in del_bkpoints: | 380 dups.append( |
373 try: new_start = np.max(np.where(sample_data[:start] > (-1*mean[:start] - 3*sd[:start]))) | 381 {"sampleID": sample, "chromosome": cf.chrInt2Str(chr), "start": data.exons[new_start]["start"], |
374 except ValueError: new_start = 0 | 382 "stop": data.exons[new_stop]["stop"], "state": "dup"}) |
375 try: new_stop = stop + np.min(np.where(sample_data[stop:] > (-1*mean[stop:] - 3*sd[stop:]))) | 383 |
376 except ValueError: new_stop = data.shape[1]-1 | 384 dels = [] |
377 dels.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "del"}) | 385 for start, stop in del_bkpoints: |
378 | 386 try: |
379 dels = cf.mergeCalls(dels) #merges overlapping calls | 387 new_start = np.max(np.where(sample_data[:start] > (-1 * mean[:start] - 3 * sd[:start]))) |
380 dups = cf.mergeCalls(dups) | 388 except ValueError: |
381 | 389 new_start = 0 |
382 #print sampleID, len(dels), len(dups) | 390 try: |
383 | 391 new_stop = stop + np.min(np.where(sample_data[stop:] > (-1 * mean[stop:] - 3 * sd[stop:]))) |
384 all_calls.extend(list(dels)) | 392 except ValueError: |
385 all_calls.extend(list(dups)) | 393 new_stop = data.shape[1] - 1 |
386 | 394 dels.append( |
387 # print calls to file | 395 {"sampleID": sample, "chromosome": cf.chrInt2Str(chr), "start": data.exons[new_start]["start"], |
388 header = ['sampleID','chromosome','start','stop','state'] | 396 "stop": data.exons[new_stop]["stop"], "state": "del"}) |
389 | 397 |
390 callfile_f.write('\t'.join(header) + "\n") | 398 dels = cf.mergeCalls(dels) # merges overlapping calls |
391 for call in all_calls: | 399 dups = cf.mergeCalls(dups) |
392 print "%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]) | 400 |
393 callfile_f.write("%s\t%s\t%d\t%d\t%s\n" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])) | 401 # print sampleID, len(dels), len(dups) |
394 | 402 |
395 sys.exit(0) | 403 all_calls.extend(list(dels)) |
404 all_calls.extend(list(dups)) | |
405 | |
406 # print calls to file | |
407 header = ['sampleID', 'chromosome', 'start', 'stop', 'state'] | |
408 | |
409 callfile_f.write('\t'.join(header) + "\n") | |
410 for call in all_calls: | |
411 print("%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])) | |
412 callfile_f.write( | |
413 "%s\t%s\t%d\t%d\t%s\n" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])) | |
414 | |
415 sys.exit(0) | |
416 | |
396 | 417 |
397 def CF_plot(args): | 418 def CF_plot(args): |
398 try: | 419 try: |
399 import locale | 420 import locale |
400 import matplotlib | 421 import matplotlib |
401 matplotlib.use('Agg') | 422 matplotlib.use('Agg') |
402 import matplotlib.pyplot as plt | 423 import matplotlib.pyplot as plt |
403 import pylab as P | 424 import pylab as P |
404 from matplotlib.lines import Line2D | 425 from matplotlib.lines import Line2D |
405 from matplotlib.patches import Rectangle | 426 from matplotlib.patches import Rectangle |
406 _ = locale.setlocale(locale.LC_ALL, '') | 427 _ = locale.setlocale(locale.LC_ALL, '') |
407 except: | 428 except: |
408 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?" | 429 print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?") |
409 sys.exit(0) | 430 sys.exit(0) |
410 | 431 |
411 | 432 chr, start, stop = cf.parseLocString(args.region) |
412 chr, start, stop = cf.parseLocString(args.region) | 433 |
413 | 434 r = cf.rpkm_reader(args.input) |
414 r = cf.rpkm_reader(args.input) | 435 |
415 | 436 data = r.getExonValuesByRegion(chr, start, stop) |
416 data = r.getExonValuesByRegion(chr,start,stop) | 437 _ = data.smooth() |
417 _ = data.smooth() | 438 |
418 | 439 plt.gcf().clear() |
419 plt.gcf().clear() | 440 fig = plt.figure(figsize=(10, 5)) |
420 fig = plt.figure(figsize=(10,5)) | 441 ax = fig.add_subplot(111) |
421 ax = fig.add_subplot(111) | 442 |
422 | 443 ax.plot(data.rpkm, linewidth=0.3, c='k') |
423 | 444 |
424 ax.plot(data.rpkm, linewidth = 0.3, c='k') | 445 if args.sample != 'none': |
425 | 446 cnt = 1 |
426 | 447 coloriter = iter(['r', 'b', 'g', 'y']) |
427 if args.sample != 'none': | 448 for sample in args.sample: |
428 cnt = 1 | 449 try: |
429 coloriter = iter(['r','b','g','y']) | 450 color, sampleID = sample.split(":") |
430 for sample in args.sample: | 451 except: |
431 try: | 452 color = next(coloriter) |
432 color, sampleID = sample.split(":") | 453 sampleID = sample |
433 except: | 454 |
434 color =coloriter.next() | 455 ax.plot(data.getSample([sampleID]), linewidth=1, c=color, label=sampleID) |
435 sampleID = sample | 456 |
436 | 457 if cnt == 1: |
437 ax.plot(data.getSample([sampleID]), linewidth = 1, c=color, label = sampleID) | 458 cf.plotRawData(ax, |
438 | 459 r.getExonValuesByRegion(chr, start, stop, sampleList=[sampleID]).getSample([sampleID]), |
439 if cnt == 1: | 460 color=color) |
440 cf.plotRawData(ax, r.getExonValuesByRegion(chr,start,stop,sampleList=[sampleID]).getSample([sampleID]),color=color) | 461 cnt += 1 |
441 cnt +=1 | 462 plt.legend(prop={'size': 10}, frameon=False) |
442 plt.legend(prop={'size':10},frameon=False) | 463 |
443 | 464 cf.plotGenes(ax, data) |
444 cf.plotGenes(ax, data) | 465 cf.plotGenomicCoords(plt, data) |
445 cf.plotGenomicCoords(plt,data) | 466 plt.xlim(0, data.shape[1]) |
446 plt.xlim(0,data.shape[1]) | 467 plt.ylim(-3, 3) |
447 plt.ylim(-3,3) | 468 |
448 | 469 plt.title("%s: %s - %s" % ( |
449 plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True))) | 470 cf.chrInt2Str(chr), locale.format("%d", start, grouping=True), locale.format("%d", stop, grouping=True))) |
450 plt.xlabel("Position") | 471 plt.xlabel("Position") |
451 plt.ylabel("SVD-ZRPKM Values") | 472 plt.ylabel("SVD-ZRPKM Values") |
452 | 473 |
453 plt.savefig(args.output) | 474 plt.savefig(args.output) |
454 | 475 |
455 sys.exit(0) | 476 sys.exit(0) |
477 | |
456 | 478 |
457 def CF_plotcalls(args): | 479 def CF_plotcalls(args): |
458 try: | 480 try: |
459 import matplotlib | 481 import matplotlib |
460 matplotlib.use('Agg') | 482 matplotlib.use('Agg') |
461 import matplotlib.pyplot as plt | 483 import matplotlib.pyplot as plt |
462 import pylab as P | 484 import pylab as P |
463 from matplotlib.lines import Line2D | 485 from matplotlib.lines import Line2D |
464 from matplotlib.patches import Rectangle | 486 from matplotlib.patches import Rectangle |
465 except: | 487 except: |
466 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?" | 488 print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?") |
467 sys.exit(0) | 489 sys.exit(0) |
468 | 490 |
469 import locale | 491 import locale |
470 try: | 492 try: |
471 _ = locale.setlocale(locale.LC_ALL, 'en_US') | 493 _ = locale.setlocale(locale.LC_ALL, 'en_US') |
472 except: | 494 except: |
473 _ = locale.setlocale(locale.LC_ALL, '') | 495 _ = locale.setlocale(locale.LC_ALL, '') |
474 | 496 |
475 try: | 497 try: |
476 callfile_fn = str(args.calls) | 498 callfile_fn = str(args.calls) |
477 callfile_f = open(callfile_fn, mode='r') | 499 callfile_f = open(callfile_fn, mode='r') |
478 except IOError as e: | 500 except IOError as e: |
479 print '[ERROR] Cannot open call file for reading: ', callfile_fn | 501 print('[ERROR] Cannot open call file for reading: ', callfile_fn) |
480 sys.exit(0) | 502 sys.exit(0) |
481 | 503 |
482 all_calls = [] | 504 all_calls = [] |
483 header = callfile_f.readline() | 505 header = callfile_f.readline() |
484 | 506 |
485 for line in callfile_f: | 507 for line in callfile_f: |
486 sampleID, chr, start, stop, state = line.strip().split() | 508 sampleID, chr, start, stop, state = line.strip().split() |
487 chr = cf.chrStr2Int(chr) | 509 chr = cf.chrStr2Int(chr) |
488 all_calls.append({"chromosome":int(chr), "start":int(start), "stop":int(stop), "sampleID":sampleID}) | 510 all_calls.append({"chromosome": int(chr), "start": int(start), "stop": int(stop), "sampleID": sampleID}) |
489 | 511 |
490 r = cf.rpkm_reader(args.input) | 512 r = cf.rpkm_reader(args.input) |
491 | 513 |
492 for call in all_calls: | 514 for call in all_calls: |
493 chr = call["chromosome"] | 515 chr = call["chromosome"] |
494 start = call["start"] | 516 start = call["start"] |
495 stop = call["stop"] | 517 stop = call["stop"] |
496 sampleID = call["sampleID"] | 518 sampleID = call["sampleID"] |
497 | 519 |
498 exons = r.getExonIDs(chr,int(start),int(stop)) | 520 exons = r.getExonIDs(chr, int(start), int(stop)) |
499 | 521 |
500 | 522 window_start = max(exons[0] - args.window, 0) |
501 window_start = max(exons[0]-args.window,0) | 523 window_stop = exons[-1] + args.window |
502 window_stop = exons[-1]+args.window | 524 |
503 | 525 data = r.getExonValuesByExons(chr, window_start, window_stop) |
504 data = r.getExonValuesByExons(chr,window_start, window_stop) | 526 _ = data.smooth() |
505 _ = data.smooth() | 527 |
506 | 528 plt.gcf().clear() |
507 plt.gcf().clear() | 529 fig = plt.figure(figsize=(10, 5)) |
508 fig = plt.figure(figsize=(10,5)) | 530 ax = fig.add_subplot(111) |
509 ax = fig.add_subplot(111) | 531 |
510 | 532 ax.plot(data.rpkm, linewidth=0.3, c='k') |
511 | 533 |
512 ax.plot(data.rpkm, linewidth = 0.3, c='k') | 534 ax.plot(data.getSample([sampleID]), linewidth=1, c='r', label=sampleID) |
513 | 535 cf.plotRawData(ax, r.getExonValuesByExons(chr, window_start, window_stop, sampleList=[sampleID]).getSample( |
514 | 536 [sampleID]), color='r') |
515 ax.plot(data.getSample([sampleID]), linewidth = 1, c='r', label = sampleID) | 537 |
516 cf.plotRawData(ax, r.getExonValuesByExons(chr,window_start, window_stop,sampleList=[sampleID]).getSample([sampleID]),color='r') | 538 plt.legend(prop={'size': 10}, frameon=False) |
517 | 539 |
518 plt.legend(prop={'size':10},frameon=False) | 540 cf.plotGenes(ax, data) |
519 | 541 cf.plotGenomicCoords(plt, data) |
520 cf.plotGenes(ax, data) | 542 |
521 cf.plotGenomicCoords(plt,data) | 543 exon_start = np.where(data.exons["start"] == start)[0][0] |
522 | 544 exon_stop = np.where(data.exons["stop"] == stop)[0][0] |
523 exon_start = np.where(data.exons["start"] == start)[0][0] | 545 _ = ax.add_line( |
524 exon_stop = np.where(data.exons["stop"] == stop)[0][0] | 546 matplotlib.lines.Line2D([exon_start, exon_stop], [2, 2], color='k', lw=6, linestyle='-', alpha=1, |
525 _ = ax.add_line(matplotlib.lines.Line2D([exon_start,exon_stop],[2,2],color='k',lw=6,linestyle='-',alpha=1,solid_capstyle='butt')) | 547 solid_capstyle='butt')) |
526 | 548 |
527 _ = plt.xlim(0,data.shape[1]) | 549 _ = plt.xlim(0, data.shape[1]) |
528 _ = plt.ylim(-3,3) | 550 _ = plt.ylim(-3, 3) |
529 | 551 |
530 plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True))) | 552 plt.title("%s: %s - %s" % ( |
531 plt.xlabel("Position") | 553 cf.chrInt2Str(chr), locale.format("%d", start, grouping=True), locale.format("%d", stop, grouping=True))) |
532 plt.ylabel("SVD-ZRPKM Values") | 554 plt.xlabel("Position") |
533 outfile = "%s_%d_%d_%s.png" % (cf.chrInt2Str(chr), start, stop, sampleID) | 555 plt.ylabel("SVD-ZRPKM Values") |
534 plt.savefig(args.outputdir + "/" + outfile) | 556 outfile = "%s_%d_%d_%s.png" % (cf.chrInt2Str(chr), start, stop, sampleID) |
557 plt.savefig(args.outputdir + "/" + outfile) | |
558 | |
535 | 559 |
536 def CF_bam2RPKM(args): | 560 def CF_bam2RPKM(args): |
537 try: | 561 try: |
538 import pysam | 562 import pysam |
539 except: | 563 except: |
540 print '[ERROR] Cannot load pysam module! Make sure it is insalled' | 564 print('[ERROR] Cannot load pysam module! Make sure it is insalled') |
541 sys.exit(0) | 565 sys.exit(0) |
542 try: | 566 try: |
543 # read probes table | 567 # read probes table |
544 probe_fn = str(args.probes[0]) | 568 probe_fn = str(args.probes[0]) |
545 probes = cf.loadProbeList(probe_fn) | 569 probes = cf.loadProbeList(probe_fn) |
546 num_probes = len(probes) | 570 num_probes = len(probes) |
547 print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn) | 571 print('[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)) |
548 except IOError as e: | 572 except IOError as e: |
549 print '[ERROR] Cannot read probes file: ', probe_fn | 573 print('[ERROR] Cannot read probes file: ', probe_fn) |
550 sys.exit(0) | 574 sys.exit(0) |
551 | 575 |
552 try: | 576 try: |
553 rpkm_f = open(args.output[0],'w') | 577 rpkm_f = open(args.output[0], 'wb') |
554 except IOError as e: | 578 except IOError as e: |
555 print '[ERROR] Cannot open rpkm file for writing: ', args.output | 579 print('[ERROR] Cannot open rpkm file for writing: ', args.output) |
556 sys.exit(0) | 580 sys.exit(0) |
557 | 581 |
558 print "[RUNNING] Counting total number of reads in bam file..." | 582 print("[RUNNING] Counting total number of reads in bam file...") |
559 total_reads = float(pysam.view("-c", args.input[0])[0].strip("\n")) | 583 total_reads = float(pysam.view("-c", args.input[0])[0].strip("\n")) |
560 print "[RUNNING] Found %d reads" % total_reads | 584 print("[RUNNING] Found %d reads" % total_reads) |
561 | 585 |
562 f = pysam.Samfile(args.input[0], "rb" ) | 586 f = pysam.Samfile(args.input[0], "rb") |
563 | 587 |
564 if not f._hasIndex(): | 588 # if not f._hasIndex(): |
565 print "[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % args.input[0] | 589 if not f.has_index(): |
566 sys.exit(0) | 590 print("[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % \ |
567 | 591 args.input[0]) |
568 # will be storing values in these arrays | 592 sys.exit(0) |
569 readcount = np.zeros(num_probes) | 593 |
570 exon_bp = np.zeros(num_probes) | 594 # will be storing values in these arrays |
571 probeIDs = np.zeros(num_probes) | 595 readcount = np.zeros(num_probes) |
572 counter = 0 | 596 exon_bp = np.zeros(num_probes) |
573 | 597 probeIDs = np.zeros(num_probes) |
574 # detect contig naming scheme here # TODO, add an optional "contigs.txt" file or automatically handle contig naming | 598 counter = 0 |
575 bam_contigs = f.references | 599 |
576 probes_contigs = [str(p) for p in set(map(operator.itemgetter("chr"),probes))] | 600 # detect contig naming scheme here # TODO, add an optional "contigs.txt" file or automatically handle contig naming |
577 | 601 bam_contigs = f.references |
578 probes2contigmap = {} | 602 probes_contigs = [str(p) for p in set(map(operator.itemgetter("chr"), probes))] |
579 | 603 |
580 for probes_contig in probes_contigs: | 604 probes2contigmap = {} |
581 if probes_contig in bam_contigs: | 605 |
582 probes2contigmap[probes_contig] = probes_contig | 606 for probes_contig in probes_contigs: |
583 elif cf.chrInt2Str(probes_contig) in bam_contigs: | 607 if probes_contig in bam_contigs: |
584 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig) | 608 probes2contigmap[probes_contig] = probes_contig |
585 elif cf.chrInt2Str(probes_contig).replace("chr","") in bam_contigs: | 609 elif cf.chrInt2Str(probes_contig) in bam_contigs: |
586 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig).replace("chr","") | 610 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig) |
587 else: | 611 elif cf.chrInt2Str(probes_contig).replace("chr", "") in bam_contigs: |
588 print "[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % (probes_contig, probe_fn) | 612 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig).replace("chr", "") |
589 sys.exit(0) | 613 else: |
590 | 614 print("[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % ( |
591 print "[RUNNING] Calculating RPKM values..." | 615 probes_contig, probe_fn)) |
592 | 616 sys.exit(0) |
593 # loop through each probe | 617 |
594 for p in probes: | 618 print("[RUNNING] Calculating RPKM values...") |
595 | 619 |
596 # f.fetch is a pysam method and returns an iterator for reads overlapping interval | 620 # loop through each probe |
597 | 621 for p in probes: |
598 p_chr = probes2contigmap[str(p["chr"])] | 622 |
599 | 623 # f.fetch is a pysam method and returns an iterator for reads overlapping interval |
600 p_start = p["start"] | 624 |
601 p_stop = p["stop"] | 625 p_chr = probes2contigmap[str(p["chr"])] |
602 try: | 626 |
603 iter = f.fetch(p_chr,p_start,p_stop) | 627 p_start = p["start"] |
604 except: | 628 p_stop = p["stop"] |
605 print "[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % (p_chr,p_start,p_stop) | 629 try: |
606 sys.exit(0) | 630 iter = f.fetch(p_chr, p_start, p_stop) |
607 | 631 except: |
608 for i in iter: | 632 print("[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % ( |
609 if i.pos+1 >= p_start: #this checks to make sure a read actually starts in an interval | 633 p_chr, p_start, p_stop)) |
610 readcount[counter] += 1 | 634 sys.exit(0) |
611 | 635 |
612 exon_bp[counter] = p_stop-p_start | 636 for i in iter: |
613 probeIDs[counter] = counter +1 #probeIDs are 1-based | 637 if i.pos + 1 >= p_start: # this checks to make sure a read actually starts in an interval |
614 counter +=1 | 638 readcount[counter] += 1 |
615 | 639 |
616 #calcualte RPKM values for all probes | 640 exon_bp[counter] = p_stop - p_start |
617 rpkm = (10**9*(readcount)/(exon_bp))/(total_reads) | 641 probeIDs[counter] = counter + 1 # probeIDs are 1-based |
618 | 642 counter += 1 |
619 out = np.vstack([probeIDs,readcount,rpkm]) | 643 |
620 | 644 # calcualte RPKM values for all probes |
621 np.savetxt(rpkm_f,out.transpose(),delimiter='\t',fmt=['%d','%d','%f']) | 645 rpkm = (10 ** 9 * (readcount) / (exon_bp)) / (total_reads) |
622 | 646 |
623 rpkm_f.close() | 647 out = np.vstack([probeIDs, readcount, rpkm]) |
624 | 648 |
625 | 649 np.savetxt(rpkm_f, out.transpose(), delimiter='\t', fmt=['%d', '%d', '%f']) |
650 | |
651 rpkm_f.close() | |
652 | |
626 | 653 |
627 VERSION = "0.2.2" | 654 VERSION = "0.2.2" |
628 parser = argparse.ArgumentParser(prog="CoNIFER", description="This is CoNIFER %s (Copy Number Inference From Exome Reads), designed to detect and genotype CNVs and CNPs from exome sequence read-depth data. See Krumm et al., Genome Research (2012) doi:10.1101/gr.138115.112 \nNiklas Krumm, 2012\n nkrumm@uw.edu" % VERSION) | 655 parser = argparse.ArgumentParser(prog="CoNIFER", |
656 description="This is CoNIFER %s (Copy Number Inference From Exome Reads), designed to detect and genotype CNVs and CNPs from exome sequence read-depth data. See Krumm et al., Genome Research (2012) doi:10.1101/gr.138115.112 \nNiklas Krumm, 2012\n nkrumm@uw.edu" % VERSION) | |
629 parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION) | 657 parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION) |
630 subparsers = parser.add_subparsers(help='Command to be run.') | 658 subparsers = parser.add_subparsers(help='Command to be run.') |
631 | 659 |
632 # rpkm command | 660 # rpkm command |
633 rpkm_parser= subparsers.add_parser('rpkm', help='Create an RPKM file from a BAM file') | 661 rpkm_parser = subparsers.add_parser('rpkm', help='Create an RPKM file from a BAM file') |
634 rpkm_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,help="Probe definition file") | 662 rpkm_parser.add_argument('--probes', action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1, |
635 rpkm_parser.add_argument('--input',action='store', required=True, metavar='sample.bam',nargs=1, help="Aligned BAM file") | 663 help="Probe definition file") |
636 rpkm_parser.add_argument('--output',action='store', required=True, metavar='sample.rpkm.txt',nargs=1, help="RPKM file to write") | 664 rpkm_parser.add_argument('--input', action='store', required=True, metavar='sample.bam', nargs=1, |
665 help="Aligned BAM file") | |
666 rpkm_parser.add_argument('--output', action='store', required=True, metavar='sample.rpkm.txt', nargs=1, | |
667 help="RPKM file to write") | |
637 rpkm_parser.set_defaults(func=CF_bam2RPKM) | 668 rpkm_parser.set_defaults(func=CF_bam2RPKM) |
638 | 669 |
639 # analyze command | 670 # analyze command |
640 analyze_parser= subparsers.add_parser('analyze', help='Basic CoNIFER analysis. Reads a directory of RPKM files and a probe list and outputs a HDF5 file containing SVD-ZRPKM values.') | 671 analyze_parser = subparsers.add_parser('analyze', |
641 analyze_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,help="Probe definition file") | 672 help='Basic CoNIFER analysis. Reads a directory of RPKM files and a probe list and outputs a HDF5 file containing SVD-ZRPKM values.') |
642 analyze_parser.add_argument('--rpkm_dir',action='store', required=True, metavar='/path/to/folder/containing/rpkm_files/',nargs=1, help="Location of folder containing RPKM files. Folder should contain ONLY contain RPKM files, and all readable RPKM files will used in analysis.") | 673 analyze_parser.add_argument('--probes', action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1, |
643 analyze_parser.add_argument('--output','-o', required=True, metavar='/path/to/output_file.hdf5', type=str, help="Output location of HDF5 file to contain SVD-ZRPKM values") | 674 help="Probe definition file") |
644 analyze_parser.add_argument('--svd', metavar='12', type=int, nargs='?', default = 12,help="Number of components to remove") | 675 analyze_parser.add_argument('--rpkm_dir', action='store', required=True, |
645 analyze_parser.add_argument('--min_rpkm', metavar='1.00', type=float, nargs="?", default = 1.00,help="Minimum population median RPKM per probe.") | 676 metavar='/path/to/folder/containing/rpkm_files/', nargs=1, |
646 analyze_parser.add_argument('--write_svals', metavar='SingularValues.txt', type=str, default= "", help="Optional file to write singular values (S-values). Used for Scree Plot.") | 677 help="Location of folder containing RPKM files. Folder should contain ONLY contain RPKM files, and all readable RPKM files will used in analysis.") |
647 analyze_parser.add_argument('--plot_scree', metavar='ScreePlot.png', type=str, default= "", help="Optional graphical scree plot. Requires matplotlib.") | 678 analyze_parser.add_argument('--output', '-o', required=True, metavar='/path/to/output_file.hdf5', type=str, |
648 analyze_parser.add_argument('--write_sd', metavar='StandardDeviations.txt', type=str, default= "", help="Optional file with sample SVD-ZRPKM standard deviations. Used to filter noisy samples.") | 679 help="Output location of HDF5 file to contain SVD-ZRPKM values") |
680 analyze_parser.add_argument('--svd', metavar='12', type=int, nargs='?', default=12, | |
681 help="Number of components to remove") | |
682 analyze_parser.add_argument('--min_rpkm', metavar='1.00', type=float, nargs="?", default=1.00, | |
683 help="Minimum population median RPKM per probe.") | |
684 analyze_parser.add_argument('--write_svals', metavar='SingularValues.txt', type=str, default="", | |
685 help="Optional file to write singular values (S-values). Used for Scree Plot.") | |
686 analyze_parser.add_argument('--plot_scree', metavar='ScreePlot.png', type=str, default="", | |
687 help="Optional graphical scree plot. Requires matplotlib.") | |
688 analyze_parser.add_argument('--write_sd', metavar='StandardDeviations.txt', type=str, default="", | |
689 help="Optional file with sample SVD-ZRPKM standard deviations. Used to filter noisy samples.") | |
649 analyze_parser.set_defaults(func=CF_analyze) | 690 analyze_parser.set_defaults(func=CF_analyze) |
650 | 691 |
651 # export command | 692 # export command |
652 export_parser= subparsers.add_parser('export', help='Export SVD-ZRPKM values from a HDF5 file to bed or vcf format.') | 693 export_parser = subparsers.add_parser('export', help='Export SVD-ZRPKM values from a HDF5 file to bed or vcf format.') |
653 export_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step") | 694 export_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5', |
654 export_parser.add_argument('--output','-o',action='store', required=False, default='.', metavar='output.bed',help="Location of output file[s]. When exporting multiple samples, top-level directory of this path will be used.") | 695 help="HDF5 file from CoNIFER 'analyze' step") |
655 export_parser.add_argument('--sample','-s',action='store', required=False, metavar='sampleID', default='all', nargs="+",help="SampleID or comma-separated list of sampleIDs to export. Default: export all samples") | 696 export_parser.add_argument('--output', '-o', action='store', required=False, default='.', metavar='output.bed', |
697 help="Location of output file[s]. When exporting multiple samples, top-level directory of this path will be used.") | |
698 export_parser.add_argument('--sample', '-s', action='store', required=False, metavar='sampleID', default='all', | |
699 nargs="+", | |
700 help="SampleID or comma-separated list of sampleIDs to export. Default: export all samples") | |
656 export_parser.set_defaults(func=CF_export) | 701 export_parser.set_defaults(func=CF_export) |
657 | 702 |
658 # plot command | 703 # plot command |
659 plot_parser= subparsers.add_parser('plot', help='Plot SVD-ZRPKM values using matplotlib') | 704 plot_parser = subparsers.add_parser('plot', help='Plot SVD-ZRPKM values using matplotlib') |
660 plot_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step") | 705 plot_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5', |
661 plot_parser.add_argument('--region',action='store', required=True, metavar='chr#:start-stop',help="Region to plot") | 706 help="HDF5 file from CoNIFER 'analyze' step") |
662 plot_parser.add_argument('--output',action='store', required=True, metavar='image.png',help="Output path and filetype. PDF, PNG, PS, EPS, and SVG are supported.") | 707 plot_parser.add_argument('--region', action='store', required=True, metavar='chr#:start-stop', help="Region to plot") |
663 plot_parser.add_argument('--sample',action='store', required=False, metavar='sampleID',nargs="+",default='none',help="Sample[s] to highlight. The following optional color spec can be used: <color>:<sampleID>. Available colors are r,b,g,y,c,m,y,k. The unsmoothed SVD-ZRPKM values for the first sample in this list will be drawn. Default: No samples highlighted.") | 708 plot_parser.add_argument('--output', action='store', required=True, metavar='image.png', |
709 help="Output path and filetype. PDF, PNG, PS, EPS, and SVG are supported.") | |
710 plot_parser.add_argument('--sample', action='store', required=False, metavar='sampleID', nargs="+", default='none', | |
711 help="Sample[s] to highlight. The following optional color spec can be used: <color>:<sampleID>. Available colors are r,b,g,y,c,m,y,k. The unsmoothed SVD-ZRPKM values for the first sample in this list will be drawn. Default: No samples highlighted.") | |
664 plot_parser.set_defaults(func=CF_plot) | 712 plot_parser.set_defaults(func=CF_plot) |
665 | 713 |
666 # make calls command | 714 # make calls command |
667 call_parser= subparsers.add_parser('call', help='Very rudimentary caller for CNVs using SVD-ZRPKM thresholding.') | 715 call_parser = subparsers.add_parser('call', help='Very rudimentary caller for CNVs using SVD-ZRPKM thresholding.') |
668 call_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step") | 716 call_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5', |
669 call_parser.add_argument('--output',action='store', required=True, metavar='calls.txt',help="Output file for calls") | 717 help="HDF5 file from CoNIFER 'analyze' step") |
670 call_parser.add_argument('--threshold', metavar='1.50', type=float, nargs='?', required=False, default = 1.50,help="+/- Threshold for calling (minimum SVD-ZRPKM)") | 718 call_parser.add_argument('--output', action='store', required=True, metavar='calls.txt', help="Output file for calls") |
719 call_parser.add_argument('--threshold', metavar='1.50', type=float, nargs='?', required=False, default=1.50, | |
720 help="+/- Threshold for calling (minimum SVD-ZRPKM)") | |
671 call_parser.set_defaults(func=CF_call) | 721 call_parser.set_defaults(func=CF_call) |
672 | 722 |
673 # plotcalls command | 723 # plotcalls command |
674 plotcalls_parser= subparsers.add_parser('plotcalls', help='Make basic plots from call file from "call" command.') | 724 plotcalls_parser = subparsers.add_parser('plotcalls', help='Make basic plots from call file from "call" command.') |
675 plotcalls_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step") | 725 plotcalls_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5', |
676 plotcalls_parser.add_argument('--calls',action='store', required=True, metavar='calls.txt',help="File with calls from 'call' command.") | 726 help="HDF5 file from CoNIFER 'analyze' step") |
677 plotcalls_parser.add_argument('--outputdir',action='store', required=True, metavar='/path/to/directory',help="Output directory for plots") | 727 plotcalls_parser.add_argument('--calls', action='store', required=True, metavar='calls.txt', |
678 plotcalls_parser.add_argument('--window',action='store', required=False, metavar='50',default=50,help="In exons, the amount of padding to plot around each call") | 728 help="File with calls from 'call' command.") |
729 plotcalls_parser.add_argument('--outputdir', action='store', required=True, metavar='/path/to/directory', | |
730 help="Output directory for plots") | |
731 plotcalls_parser.add_argument('--window', action='store', required=False, metavar='50', default=50, | |
732 help="In exons, the amount of padding to plot around each call") | |
679 plotcalls_parser.set_defaults(func=CF_plotcalls) | 733 plotcalls_parser.set_defaults(func=CF_plotcalls) |
680 | 734 |
681 args = parser.parse_args() | 735 args = parser.parse_args() |
682 args.func(args) | 736 args.func(args) |