| 10 | 1 ####################################################################### | 
|  | 2 ####################################################################### | 
|  | 3 # CoNIFER: Copy Number Inference From Exome Reads | 
|  | 4 # Developed by Niklas Krumm (C) 2012 | 
|  | 5 # nkrumm@gmail.com | 
|  | 6 # | 
|  | 7 # homepage: http://conifer.sf.net | 
|  | 8 # This program is described in: | 
|  | 9 # Krumm et al. 2012. Genome Research. doi:10.1101/gr.138115.112 | 
|  | 10 # | 
|  | 11 # This file is part of CoNIFER. | 
|  | 12 # CoNIFER is free software:  you can redistribute it and/or modify | 
|  | 13 # it under the terms of the GNU General Public License as published by | 
|  | 14 # the Free Software Foundation, either version 3 of the License, or | 
|  | 15 # (at your option) any later version. | 
|  | 16 # | 
|  | 17 # This program is distributed in the hope that it will be useful, | 
|  | 18 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
|  | 20 # GNU General Public License for more details. | 
|  | 21 # | 
|  | 22 # You should have received a copy of the GNU General Public License | 
|  | 23 # along with this program.  If not, see <http://www.gnu.org/licenses/>. | 
|  | 24 ####################################################################### | 
|  | 25 ####################################################################### | 
|  | 26 | 
|  | 27 import argparse | 
|  | 28 import os, sys, copy | 
|  | 29 import glob | 
|  | 30 import csv | 
|  | 31 import conifer_functions as cf | 
|  | 32 import operator | 
| 23 | 33 #from tables import * | 
| 10 | 34 import numpy  as np | 
|  | 35 | 
|  | 36 def CF_analyze(args): | 
|  | 37 	# do path/file checks: | 
|  | 38 	try: | 
|  | 39 		# read probes table | 
|  | 40 		probe_fn = str(args.probes[0]) | 
|  | 41 		probes = cf.loadProbeList(probe_fn) | 
|  | 42 		num_probes = len(probes) | 
|  | 43 		print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn) | 
|  | 44 	except IOError as e: | 
|  | 45 		print '[ERROR] Cannot read probes file: ', probe_fn | 
|  | 46 		sys.exit(0) | 
|  | 47 | 
|  | 48 	try: | 
|  | 49 		svd_outfile_fn = str(args.output) | 
|  | 50 		h5file_out = openFile(svd_outfile_fn, mode='w') | 
|  | 51 		probe_group = h5file_out.createGroup("/","probes","probes") | 
|  | 52 	except IOError as e: | 
|  | 53 		print '[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn | 
|  | 54 		sys.exit(0) | 
|  | 55 | 
|  | 56 	if args.write_svals != "": | 
|  | 57 		sval_f = open(args.write_svals,'w') | 
|  | 58 | 
|  | 59 	if args.plot_scree != "": | 
|  | 60 		try: | 
|  | 61 			import matplotlib | 
|  | 62 			matplotlib.use('Agg') | 
|  | 63 			import matplotlib.pyplot as plt | 
|  | 64 			import pylab as P | 
|  | 65 			from matplotlib.lines import Line2D | 
|  | 66 			from matplotlib.patches import Rectangle | 
|  | 67 		except: | 
|  | 68 			print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?" | 
|  | 69 			sys.exit(0) | 
|  | 70 | 
|  | 71 		plt.gcf().clear() | 
|  | 72 		fig = plt.figure(figsize=(10,5)) | 
|  | 73 		ax = fig.add_subplot(111) | 
|  | 74 | 
|  | 75 	rpkm_dir = str(args.rpkm_dir[0]) | 
|  | 76 	rpkm_files = glob.glob(rpkm_dir + "/*") | 
|  | 77 	if len(rpkm_files) == 0: | 
|  | 78 		print '[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir | 
|  | 79 		sys.exit(0) | 
|  | 80 	elif len(rpkm_files) == 1: | 
|  | 81 		print '[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.' | 
|  | 82 		sys.exit(0) | 
|  | 83 	elif len(rpkm_files) < 8: | 
|  | 84 		print '[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len(rpkm_files) | 
|  | 85 	elif len(rpkm_files) <= int(args.svd): | 
|  | 86 		print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (len(rpkm_files), int(args.svd)) | 
|  | 87 		sys.exit(0) | 
|  | 88 	else: | 
|  | 89 		print '[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir) | 
|  | 90 | 
|  | 91 	# read in samples names and generate file list | 
|  | 92 	samples = {} | 
|  | 93 	for f in rpkm_files: | 
|  | 94 		s = '.'.join(f.split('/')[-1].split('.')[0:-1]) | 
|  | 95 		print "[INIT] Mapping file to sampleID: %s --> %s" % (f, s) | 
|  | 96 		samples[s] = f | 
|  | 97 | 
|  | 98 	#check uniqueness and total # of samples | 
|  | 99 	if len(set(samples)) != len(set(rpkm_files)): | 
|  | 100 		print '[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!' | 
|  | 101 		sys.exit(0) | 
|  | 102 | 
|  | 103 	# LOAD RPKM DATA | 
|  | 104 	RPKM_data = np.zeros([num_probes,len(samples)], dtype=np.float) | 
|  | 105 	failed_samples = 0 | 
|  | 106 | 
|  | 107 	for i,s in enumerate(samples.keys()): | 
|  | 108 		t = np.loadtxt(samples[s], dtype=np.float, delimiter="\t", skiprows=0, usecols=[2]) | 
|  | 109 		if len(t) != num_probes: | 
|  | 110 			print "[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % (s, samples[s], probe_fn) | 
|  | 111 			_ = samples.pop(s) | 
|  | 112 			failed_samples += 1 | 
|  | 113 		else: | 
|  | 114 			RPKM_data[:,i] = t | 
|  | 115 			print "[INIT] Successfully read RPKM data for sampleID: %s" % s | 
|  | 116 | 
|  | 117 	RPKM_data = RPKM_data[:,0:len(samples)] | 
|  | 118 	print "[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % (len(samples), failed_samples) | 
|  | 119 | 
|  | 120 	if len(samples) <= int(args.svd): | 
|  | 121 		print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (int(args.svd), len(samples)) | 
|  | 122 		sys.exit(0) | 
|  | 123 | 
|  | 124 	# BEGIN | 
|  | 125 	chrs_to_process = set(map(operator.itemgetter("chr"),probes)) | 
|  | 126 	chrs_to_process_str = ', '.join([cf.chrInt2Str(c) for c in chrs_to_process]) | 
|  | 127 	print '[INIT] Attempting to process chromosomes: ', chrs_to_process_str | 
|  | 128 | 
|  | 129 | 
|  | 130 | 
|  | 131 	for chr in chrs_to_process: | 
|  | 132 		print "[RUNNING: chr%d] Now on: %s" %(chr, cf.chrInt2Str(chr)) | 
|  | 133 		chr_group_name = "chr%d" % chr | 
|  | 134 		chr_group = h5file_out.createGroup("/",chr_group_name,chr_group_name) | 
|  | 135 | 
|  | 136 		chr_probes = filter(lambda i: i["chr"] == chr, probes) | 
|  | 137 		num_chr_probes = len(chr_probes) | 
|  | 138 		start_probeID = chr_probes[0]['probeID'] | 
|  | 139 		stop_probeID = chr_probes[-1]['probeID'] | 
|  | 140 		print "[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID-1, stop_probeID) # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term | 
|  | 141 | 
|  | 142 		rpkm = RPKM_data[start_probeID:stop_probeID,:] | 
|  | 143 | 
|  | 144 		print "[RUNNING: chr%d] Calculating median RPKM" % chr | 
|  | 145 		median = np.median(rpkm,1) | 
|  | 146 		sd = np.std(rpkm,1) | 
|  | 147 		probe_mask = median >= float(args.min_rpkm) | 
|  | 148 		print "[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % (chr, np.sum(probe_mask==False), float(args.min_rpkm)) | 
|  | 149 | 
|  | 150 		rpkm = rpkm[probe_mask, :] | 
|  | 151 		num_chr_probes = np.sum(probe_mask) | 
|  | 152 | 
|  | 153 		if num_chr_probes <= len(samples): | 
|  | 154 			print "[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file" | 
|  | 155 			sys.exit(0) | 
|  | 156 | 
|  | 157 		probeIDs = np.array(map(operator.itemgetter("probeID"),chr_probes))[probe_mask] | 
|  | 158 		probe_starts = np.array(map(operator.itemgetter("start"),chr_probes))[probe_mask] | 
|  | 159 		probe_stops = np.array(map(operator.itemgetter("stop"),chr_probes))[probe_mask] | 
|  | 160 		gene_names =  np.array(map(operator.itemgetter("name"),chr_probes))[probe_mask] | 
|  | 161 | 
|  | 162 		dt = np.dtype([('probeID',np.uint32),('start',np.uint32),('stop',np.uint32), ('name', np.str_, 20)]) | 
|  | 163 | 
|  | 164 		out_probes = np.empty(num_chr_probes,dtype=dt) | 
|  | 165 		out_probes['probeID'] = probeIDs | 
|  | 166 		out_probes['start'] = probe_starts | 
|  | 167 		out_probes['stop'] = probe_stops | 
|  | 168 		out_probes['name'] = gene_names | 
|  | 169 		probe_table = h5file_out.createTable(probe_group,"probes_chr%d" % chr,cf.probe,"chr%d" % chr) | 
|  | 170 		probe_table.append(out_probes) | 
|  | 171 | 
|  | 172 		print "[RUNNING: chr%d] Calculating ZRPKM scores..." % chr | 
|  | 173 		rpkm = np.apply_along_axis(cf.zrpkm, 0, rpkm, median[probe_mask], sd[probe_mask]) | 
|  | 174 | 
|  | 175 		# svd transform | 
|  | 176 		print "[RUNNING: chr%d] SVD decomposition..." % chr | 
|  | 177 		components_removed = int(args.svd) | 
|  | 178 | 
|  | 179 		U, S, Vt = np.linalg.svd(rpkm,full_matrices=False) | 
|  | 180 		new_S = np.diag(np.hstack([np.zeros([components_removed]),S[components_removed:]])) | 
|  | 181 | 
|  | 182 		if args.write_svals != "": | 
|  | 183 			sval_f.write('chr' + str(chr) + '\t' + '\t'.join([str(_i) for _i in S]) + "\n") | 
|  | 184 | 
|  | 185 		if args.plot_scree != "": | 
|  | 186 			ax.plot(S, label='chr' + str(chr),lw=0.5) | 
|  | 187 | 
|  | 188 		# reconstruct data matrix | 
|  | 189 		rpkm = np.dot(U, np.dot(new_S, Vt)) | 
|  | 190 | 
|  | 191 | 
|  | 192 		# save to HDF5 file | 
|  | 193 		print "[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr | 
|  | 194 | 
|  | 195 		for i,s in enumerate(samples): | 
|  | 196 			out_data = np.empty(num_chr_probes,dtype='u4,f8') | 
|  | 197 			out_data['f0'] = probeIDs | 
|  | 198 			out_data['f1'] = rpkm[:,i] | 
|  | 199 			sample_tbl = h5file_out.createTable(chr_group,"sample_" + str(s),cf.rpkm_value,"%s" % str(s)) | 
|  | 200 			sample_tbl.append(out_data) | 
|  | 201 | 
|  | 202 | 
|  | 203 	print "[RUNNING] Saving sampleIDs to file..." | 
|  | 204 	sample_group = h5file_out.createGroup("/","samples","samples") | 
|  | 205 	sample_table = h5file_out.createTable(sample_group,"samples",cf.sample,"samples") | 
|  | 206 	dt = np.dtype([('sampleID',np.str_,100)]) | 
|  | 207 	out_samples = np.empty(len(samples.keys()),dtype=dt) | 
|  | 208 	out_samples['sampleID'] = np.array(samples.keys()) | 
|  | 209 	sample_table.append(out_samples) | 
|  | 210 | 
|  | 211 | 
|  | 212 	if args.write_sd != "": | 
|  | 213 		print "[RUNNING] Calculating standard deviations for all samples (this can take a while)..." | 
|  | 214 | 
|  | 215 		sd_file = open(args.write_sd,'w') | 
|  | 216 | 
|  | 217 		for i,s in enumerate(samples): | 
|  | 218 			# collect all SVD-ZRPKM values | 
|  | 219 			count = 1 | 
|  | 220 			for chr in chrs_to_process: | 
|  | 221 				if count == 1: | 
|  | 222 					sd_out = h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten() | 
|  | 223 				else: | 
|  | 224 					sd_out = np.hstack([sd_out,out.h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten()]) | 
|  | 225 | 
|  | 226 				sd = np.std(sd_out) | 
|  | 227 			sd_file.write("%s\t%f\n" % (s,sd)) | 
|  | 228 | 
|  | 229 		sd_file.close() | 
|  | 230 | 
|  | 231 	if args.plot_scree != "": | 
|  | 232 		plt.title("Scree plot") | 
|  | 233 		if len(samples) < 50: | 
|  | 234 			plt.xlim([0,len(samples)]) | 
|  | 235 			plt.xlabel("S values") | 
|  | 236 		else: | 
|  | 237 			plt.xlim([0,50]) | 
|  | 238 			plt.xlabel("S values (only first 50 plotted)") | 
|  | 239 		plt.ylabel("Relative contributed variance") | 
|  | 240 		plt.savefig(args.plot_scree) | 
|  | 241 | 
|  | 242 	print "[FINISHED]" | 
|  | 243 	h5file_out.close() | 
|  | 244 	sys.exit(0) | 
|  | 245 | 
|  | 246 def CF_export(args): | 
|  | 247 	try: | 
|  | 248 		h5file_in_fn = str(args.input) | 
|  | 249 		h5file_in = openFile(h5file_in_fn, mode='r') | 
|  | 250 	except IOError as e: | 
|  | 251 		print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn | 
|  | 252 		sys.exit(0) | 
|  | 253 | 
|  | 254 	# read probes | 
|  | 255 	probes = {} | 
|  | 256 	for probes_chr in h5file_in.root.probes: | 
|  | 257 		probes[probes_chr.title] = probes_chr.read() | 
|  | 258 | 
|  | 259 	if args.sample =='all': | 
|  | 260 		all_samples = list(h5file_in.root.samples.samples.read(field="sampleID")) | 
|  | 261 | 
|  | 262 		out_path = os.path.abspath(args.output) | 
|  | 263 | 
|  | 264 		print "[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path) | 
|  | 265 		for sample in all_samples: | 
|  | 266 			try: | 
|  | 267 				outfile_fn = out_path + "/" + sample + ".bed" | 
|  | 268 				outfile_f = open(outfile_fn,'w') | 
|  | 269 			except IOError as e: | 
|  | 270 				print '[ERROR] Cannot open output file for writing: ', outfile_fn | 
|  | 271 				sys.exit(0) | 
|  | 272 			print "[RUNNING] Exporting %s" % sample | 
|  | 273 | 
|  | 274 			cf.export_sample(h5file_in,sample,probes,outfile_f) | 
|  | 275 			outfile_f.close() | 
|  | 276 | 
|  | 277 	elif len(args.sample) == 1: | 
|  | 278 		out_path = os.path.abspath(args.output) | 
|  | 279 		sample = args.sample[0] | 
|  | 280 		print "[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path) | 
|  | 281 		try: | 
|  | 282 			if os.path.isdir(out_path): | 
|  | 283 				outfile_fn = out_path + "/" + sample + ".bed" | 
|  | 284 			else: | 
|  | 285 				outfile_fn = out_path | 
|  | 286 			outfile_f = open(outfile_fn,'w') | 
|  | 287 		except IOError as e: | 
|  | 288 			print '[ERROR] Cannot open output file for writing: ', outfile_fn | 
|  | 289 			sys.exit(0) | 
|  | 290 		print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn) | 
|  | 291 | 
|  | 292 		cf.export_sample(h5file_in,sample,probes,outfile_f) | 
|  | 293 		outfile_f.close() | 
|  | 294 | 
|  | 295 	else: | 
|  | 296 		out_path = os.path.abspath(args.output) | 
|  | 297 		print "[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path) | 
|  | 298 		for sample in args.sample: | 
|  | 299 			try: | 
|  | 300 				if os.path.isdir(out_path): | 
|  | 301 					outfile_fn = out_path + "/" + sample + ".bed" | 
|  | 302 				else: | 
|  | 303 					outfile_fn = out_path | 
|  | 304 				outfile_f = open(outfile_fn,'w') | 
|  | 305 			except IOError as e: | 
|  | 306 				print '[ERROR] Cannot open output file for writing: ', outfile_fn | 
|  | 307 				sys.exit(0) | 
|  | 308 			print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn) | 
|  | 309 | 
|  | 310 			cf.export_sample(h5file_in,sample,probes,outfile_f) | 
|  | 311 			outfile_f.close() | 
|  | 312 	sys.exit(0) | 
|  | 313 | 
|  | 314 def CF_call(args): | 
|  | 315 	try: | 
|  | 316 		h5file_in_fn = str(args.input) | 
|  | 317 		h5file_in = openFile(h5file_in_fn, mode='r') | 
|  | 318 	except IOError as e: | 
|  | 319 		print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn | 
|  | 320 		sys.exit(0) | 
|  | 321 | 
|  | 322 	try: | 
|  | 323 		callfile_fn = str(args.output) | 
|  | 324 		callfile_f = open(callfile_fn, mode='w') | 
|  | 325 	except IOError as e: | 
|  | 326 		print '[ERROR] Cannot open output file for writing: ', callfile_fn | 
|  | 327 		sys.exit(0) | 
|  | 328 | 
|  | 329 	chrs_to_process = [] | 
|  | 330 	for chr in h5file_in.root: | 
|  | 331 		if chr._v_title not in ('probes','samples'): | 
|  | 332 			chrs_to_process.append(chr._v_title.replace("chr","")) | 
|  | 333 | 
|  | 334 	h5file_in.close() | 
|  | 335 | 
|  | 336 	print '[INIT] Initializing caller at threshold = %f' % (args.threshold) | 
|  | 337 | 
|  | 338 	r = cf.rpkm_reader(h5file_in_fn) | 
|  | 339 | 
|  | 340 	all_calls = [] | 
|  | 341 | 
|  | 342 	for chr in chrs_to_process: | 
|  | 343 		print '[RUNNING] Now processing chr%s' % chr | 
|  | 344 		data = r.getExonValuesByRegion(chr) | 
|  | 345 | 
|  | 346 		#raw_data = copy.copy(data) | 
|  | 347 		_ = data.smooth() | 
|  | 348 | 
|  | 349 		mean= np.mean(data.rpkm,axis=1) | 
|  | 350 		sd =  np.std(data.rpkm,axis=1) | 
|  | 351 | 
|  | 352 		for sample in r.getSampleList(): | 
|  | 353 			sample_data = data.getSample([sample]).flatten() | 
|  | 354 			#sample_raw_data = raw_data.getSample([sample]).flatten() | 
|  | 355 | 
|  | 356 			dup_mask = sample_data >= args.threshold | 
|  | 357 			del_mask = sample_data <= -1*args.threshold | 
|  | 358 | 
|  | 359 			dup_bkpoints = cf.getbkpoints(dup_mask) #returns exon coordinates for this chromosome (numpy array coords) | 
|  | 360 			del_bkpoints = cf.getbkpoints(del_mask) | 
|  | 361 | 
|  | 362 | 
|  | 363 			dups = [] | 
|  | 364 			for start,stop in dup_bkpoints: | 
|  | 365 				try: new_start =  np.max(np.where(sample_data[:start] < (mean[:start] + 3*sd[:start]))) | 
|  | 366 				except ValueError: new_start = 0 | 
|  | 367 				try: new_stop = stop + np.min(np.where(sample_data[stop:] < (mean[stop:] + 3*sd[stop:]))) | 
|  | 368 				except ValueError: new_stop = data.shape[1]-1 | 
|  | 369 				dups.append({"sampleID":sample,"chromosome":  cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "dup"}) | 
|  | 370 | 
|  | 371 			dels = [] | 
|  | 372 			for start,stop in del_bkpoints: | 
|  | 373 				try: new_start =  np.max(np.where(sample_data[:start] > (-1*mean[:start] - 3*sd[:start]))) | 
|  | 374 				except ValueError: new_start = 0 | 
|  | 375 				try: new_stop = stop + np.min(np.where(sample_data[stop:] > (-1*mean[stop:] - 3*sd[stop:]))) | 
|  | 376 				except ValueError: new_stop = data.shape[1]-1 | 
|  | 377 				dels.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "del"}) | 
|  | 378 | 
|  | 379 			dels = cf.mergeCalls(dels) #merges overlapping calls | 
|  | 380 			dups = cf.mergeCalls(dups) | 
|  | 381 | 
|  | 382 			#print sampleID, len(dels), len(dups) | 
|  | 383 | 
|  | 384 			all_calls.extend(list(dels)) | 
|  | 385 			all_calls.extend(list(dups)) | 
|  | 386 | 
|  | 387 	# print calls to file | 
|  | 388 	header = ['sampleID','chromosome','start','stop','state'] | 
|  | 389 | 
|  | 390 	callfile_f.write('\t'.join(header) + "\n") | 
|  | 391 	for call in all_calls: | 
|  | 392 		print "%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]) | 
|  | 393 		callfile_f.write("%s\t%s\t%d\t%d\t%s\n" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])) | 
|  | 394 | 
|  | 395 	sys.exit(0) | 
|  | 396 | 
|  | 397 def CF_plot(args): | 
|  | 398 	try: | 
|  | 399 		import locale | 
|  | 400 		import matplotlib | 
|  | 401 		matplotlib.use('Agg') | 
|  | 402 		import matplotlib.pyplot as plt | 
|  | 403 		import pylab as P | 
|  | 404 		from matplotlib.lines import Line2D | 
|  | 405 		from matplotlib.patches import Rectangle | 
|  | 406 		_ = locale.setlocale(locale.LC_ALL, '') | 
|  | 407 	except: | 
|  | 408 		print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?" | 
|  | 409 		sys.exit(0) | 
|  | 410 | 
|  | 411 | 
|  | 412 	chr, start, stop = cf.parseLocString(args.region) | 
|  | 413 | 
|  | 414 	r = cf.rpkm_reader(args.input) | 
|  | 415 | 
|  | 416 	data = r.getExonValuesByRegion(chr,start,stop) | 
|  | 417 	_ = data.smooth() | 
|  | 418 | 
|  | 419 	plt.gcf().clear() | 
|  | 420 	fig = plt.figure(figsize=(10,5)) | 
|  | 421 	ax = fig.add_subplot(111) | 
|  | 422 | 
|  | 423 | 
|  | 424 	ax.plot(data.rpkm, linewidth = 0.3, c='k') | 
|  | 425 | 
|  | 426 | 
|  | 427 	if args.sample != 'none': | 
|  | 428 		cnt = 1 | 
|  | 429 		coloriter = iter(['r','b','g','y']) | 
|  | 430 		for sample in args.sample: | 
|  | 431 			try: | 
|  | 432 				color, sampleID = sample.split(":") | 
|  | 433 			except: | 
|  | 434 				color =coloriter.next() | 
|  | 435 				sampleID = sample | 
|  | 436 | 
|  | 437 			ax.plot(data.getSample([sampleID]), linewidth = 1, c=color, label = sampleID) | 
|  | 438 | 
|  | 439 			if cnt == 1: | 
|  | 440 				cf.plotRawData(ax, r.getExonValuesByRegion(chr,start,stop,sampleList=[sampleID]).getSample([sampleID]),color=color) | 
|  | 441 			cnt +=1 | 
|  | 442 		plt.legend(prop={'size':10},frameon=False) | 
|  | 443 | 
|  | 444 	cf.plotGenes(ax, data) | 
|  | 445 	cf.plotGenomicCoords(plt,data) | 
|  | 446 	plt.xlim(0,data.shape[1]) | 
|  | 447 	plt.ylim(-3,3) | 
|  | 448 | 
|  | 449 	plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True))) | 
|  | 450 	plt.xlabel("Position") | 
|  | 451 	plt.ylabel("SVD-ZRPKM Values") | 
|  | 452 | 
|  | 453 	plt.savefig(args.output) | 
|  | 454 | 
|  | 455 	sys.exit(0) | 
|  | 456 | 
|  | 457 def CF_plotcalls(args): | 
|  | 458 	try: | 
|  | 459 		import matplotlib | 
|  | 460 		matplotlib.use('Agg') | 
|  | 461 		import matplotlib.pyplot as plt | 
|  | 462 		import pylab as P | 
|  | 463 		from matplotlib.lines import Line2D | 
|  | 464 		from matplotlib.patches import Rectangle | 
|  | 465 	except: | 
|  | 466 		print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?" | 
|  | 467 		sys.exit(0) | 
|  | 468 | 
|  | 469 	import locale | 
|  | 470 	try: | 
|  | 471 		_ = locale.setlocale(locale.LC_ALL, 'en_US') | 
|  | 472 	except: | 
|  | 473 		_ = locale.setlocale(locale.LC_ALL, '') | 
|  | 474 | 
|  | 475 	try: | 
|  | 476 		callfile_fn = str(args.calls) | 
|  | 477 		callfile_f = open(callfile_fn, mode='r') | 
|  | 478 	except IOError as e: | 
|  | 479 		print '[ERROR] Cannot open call file for reading: ', callfile_fn | 
|  | 480 		sys.exit(0) | 
|  | 481 | 
|  | 482 	all_calls = [] | 
|  | 483 	header = callfile_f.readline() | 
|  | 484 | 
|  | 485 	for line in callfile_f: | 
|  | 486 		sampleID, chr, start, stop, state = line.strip().split() | 
|  | 487 		chr = cf.chrStr2Int(chr) | 
|  | 488 		all_calls.append({"chromosome":int(chr), "start":int(start), "stop":int(stop), "sampleID":sampleID}) | 
|  | 489 | 
|  | 490 	r = cf.rpkm_reader(args.input) | 
|  | 491 | 
|  | 492 	for call in all_calls: | 
|  | 493 		chr = call["chromosome"] | 
|  | 494 		start = call["start"] | 
|  | 495 		stop = call["stop"] | 
|  | 496 		sampleID = call["sampleID"] | 
|  | 497 | 
|  | 498 		exons = r.getExonIDs(chr,int(start),int(stop)) | 
|  | 499 | 
|  | 500 | 
|  | 501 		window_start = max(exons[0]-args.window,0) | 
|  | 502 		window_stop = exons[-1]+args.window | 
|  | 503 | 
|  | 504 		data = r.getExonValuesByExons(chr,window_start, window_stop) | 
|  | 505 		_ = data.smooth() | 
|  | 506 | 
|  | 507 		plt.gcf().clear() | 
|  | 508 		fig = plt.figure(figsize=(10,5)) | 
|  | 509 		ax = fig.add_subplot(111) | 
|  | 510 | 
|  | 511 | 
|  | 512 		ax.plot(data.rpkm, linewidth = 0.3, c='k') | 
|  | 513 | 
|  | 514 | 
|  | 515 		ax.plot(data.getSample([sampleID]), linewidth = 1, c='r', label = sampleID) | 
|  | 516 		cf.plotRawData(ax, r.getExonValuesByExons(chr,window_start, window_stop,sampleList=[sampleID]).getSample([sampleID]),color='r') | 
|  | 517 | 
|  | 518 		plt.legend(prop={'size':10},frameon=False) | 
|  | 519 | 
|  | 520 		cf.plotGenes(ax, data) | 
|  | 521 		cf.plotGenomicCoords(plt,data) | 
|  | 522 | 
|  | 523 		exon_start = np.where(data.exons["start"] == start)[0][0] | 
|  | 524 		exon_stop = np.where(data.exons["stop"] == stop)[0][0] | 
|  | 525 		_ = ax.add_line(matplotlib.lines.Line2D([exon_start,exon_stop],[2,2],color='k',lw=6,linestyle='-',alpha=1,solid_capstyle='butt')) | 
|  | 526 | 
|  | 527 		_ = plt.xlim(0,data.shape[1]) | 
|  | 528 		_ = plt.ylim(-3,3) | 
|  | 529 | 
|  | 530 		plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True))) | 
|  | 531 		plt.xlabel("Position") | 
|  | 532 		plt.ylabel("SVD-ZRPKM Values") | 
|  | 533 		outfile = "%s_%d_%d_%s.png" % (cf.chrInt2Str(chr), start, stop, sampleID) | 
|  | 534 		plt.savefig(args.outputdir + "/" + outfile) | 
|  | 535 | 
|  | 536 def CF_bam2RPKM(args): | 
|  | 537 	try: | 
|  | 538 		import pysam | 
|  | 539 	except: | 
|  | 540 		print '[ERROR] Cannot load pysam module! Make sure it is insalled' | 
|  | 541 		sys.exit(0) | 
|  | 542 	try: | 
|  | 543 		# read probes table | 
|  | 544 		probe_fn = str(args.probes[0]) | 
|  | 545 		probes = cf.loadProbeList(probe_fn) | 
|  | 546 		num_probes = len(probes) | 
|  | 547 		print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn) | 
|  | 548 	except IOError as e: | 
|  | 549 		print '[ERROR] Cannot read probes file: ', probe_fn | 
|  | 550 		sys.exit(0) | 
|  | 551 | 
|  | 552 	try: | 
|  | 553 		rpkm_f = open(args.output[0],'w') | 
|  | 554 	except IOError as e: | 
|  | 555 		print '[ERROR] Cannot open rpkm file for writing: ', args.output | 
|  | 556 		sys.exit(0) | 
|  | 557 | 
|  | 558 	print "[RUNNING] Counting total number of reads in bam file..." | 
|  | 559 	total_reads = float(pysam.view("-c", args.input[0])[0].strip("\n")) | 
|  | 560 	print "[RUNNING] Found %d reads" % total_reads | 
|  | 561 | 
|  | 562 	f = pysam.Samfile(args.input[0], "rb" ) | 
|  | 563 | 
|  | 564 	if not f._hasIndex(): | 
|  | 565 		print "[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % args.input[0] | 
|  | 566 		sys.exit(0) | 
|  | 567 | 
|  | 568 	# will be storing values in these arrays | 
|  | 569 	readcount = np.zeros(num_probes) | 
|  | 570 	exon_bp = np.zeros(num_probes) | 
|  | 571 	probeIDs = np.zeros(num_probes) | 
|  | 572 	counter = 0 | 
|  | 573 | 
|  | 574 	# detect contig naming scheme here # TODO, add an optional "contigs.txt" file or automatically handle contig naming | 
|  | 575 	bam_contigs = f.references | 
|  | 576 	probes_contigs = [str(p) for p in set(map(operator.itemgetter("chr"),probes))] | 
|  | 577 | 
|  | 578 	probes2contigmap = {} | 
|  | 579 | 
|  | 580 	for probes_contig in probes_contigs: | 
|  | 581 		if probes_contig in bam_contigs: | 
|  | 582 			probes2contigmap[probes_contig] = probes_contig | 
|  | 583 		elif cf.chrInt2Str(probes_contig) in bam_contigs: | 
|  | 584 			probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig) | 
|  | 585 		elif cf.chrInt2Str(probes_contig).replace("chr","") in bam_contigs: | 
|  | 586 			probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig).replace("chr","") | 
|  | 587 		else: | 
|  | 588 			print "[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % (probes_contig, probe_fn) | 
|  | 589 			sys.exit(0) | 
|  | 590 | 
|  | 591 	print "[RUNNING] Calculating RPKM values..." | 
|  | 592 | 
|  | 593 	# loop through each probe | 
|  | 594 	for p in probes: | 
|  | 595 | 
|  | 596 		# f.fetch is a pysam method and returns an iterator for reads overlapping interval | 
|  | 597 | 
|  | 598 		p_chr = probes2contigmap[str(p["chr"])] | 
|  | 599 | 
|  | 600 		p_start = p["start"] | 
|  | 601 		p_stop = p["stop"] | 
|  | 602 		try: | 
|  | 603 			iter = f.fetch(p_chr,p_start,p_stop) | 
|  | 604 		except: | 
|  | 605 			print "[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % (p_chr,p_start,p_stop) | 
|  | 606 			sys.exit(0) | 
|  | 607 | 
|  | 608 		for i in iter: | 
|  | 609 			if i.pos+1 >= p_start: #this checks to make sure a read actually starts in an interval | 
|  | 610 				readcount[counter] += 1 | 
|  | 611 | 
|  | 612 		exon_bp[counter] = p_stop-p_start | 
|  | 613 		probeIDs[counter] = counter +1 #probeIDs are 1-based | 
|  | 614 		counter +=1 | 
|  | 615 | 
|  | 616 	#calcualte RPKM values for all probes | 
|  | 617 	rpkm = (10**9*(readcount)/(exon_bp))/(total_reads) | 
|  | 618 | 
|  | 619 	out = np.vstack([probeIDs,readcount,rpkm]) | 
|  | 620 | 
|  | 621 	np.savetxt(rpkm_f,out.transpose(),delimiter='\t',fmt=['%d','%d','%f']) | 
|  | 622 | 
|  | 623 	rpkm_f.close() | 
|  | 624 | 
|  | 625 | 
|  | 626 | 
|  | 627 VERSION = "0.2.2" | 
|  | 628 parser = argparse.ArgumentParser(prog="CoNIFER", description="This is CoNIFER %s (Copy Number Inference From Exome Reads), designed to detect and genotype CNVs and CNPs from exome sequence read-depth data. See Krumm et al., Genome Research (2012) doi:10.1101/gr.138115.112 \nNiklas Krumm, 2012\n nkrumm@uw.edu" % VERSION) | 
|  | 629 parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION) | 
|  | 630 subparsers = parser.add_subparsers(help='Command to be run.') | 
|  | 631 | 
|  | 632 # rpkm command | 
|  | 633 rpkm_parser= subparsers.add_parser('rpkm', help='Create an RPKM file from a BAM file') | 
|  | 634 rpkm_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt',  nargs=1,help="Probe definition file") | 
|  | 635 rpkm_parser.add_argument('--input',action='store', required=True, metavar='sample.bam',nargs=1,  help="Aligned BAM file") | 
|  | 636 rpkm_parser.add_argument('--output',action='store', required=True, metavar='sample.rpkm.txt',nargs=1,  help="RPKM file to write") | 
|  | 637 rpkm_parser.set_defaults(func=CF_bam2RPKM) | 
|  | 638 | 
|  | 639 # analyze command | 
|  | 640 analyze_parser= subparsers.add_parser('analyze', help='Basic CoNIFER analysis. Reads a directory of RPKM files and a probe list and outputs a HDF5 file containing SVD-ZRPKM values.') | 
|  | 641 analyze_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt',  nargs=1,help="Probe definition file") | 
|  | 642 analyze_parser.add_argument('--rpkm_dir',action='store', required=True, metavar='/path/to/folder/containing/rpkm_files/',nargs=1,  help="Location of folder containing RPKM files. Folder should contain ONLY contain RPKM files, and all readable RPKM files will used in analysis.") | 
|  | 643 analyze_parser.add_argument('--output','-o', required=True, metavar='/path/to/output_file.hdf5', type=str, help="Output location of HDF5 file to contain SVD-ZRPKM values") | 
|  | 644 analyze_parser.add_argument('--svd', metavar='12', type=int, nargs='?', default = 12,help="Number of components to remove") | 
|  | 645 analyze_parser.add_argument('--min_rpkm', metavar='1.00', type=float, nargs="?", default = 1.00,help="Minimum population median RPKM per probe.") | 
|  | 646 analyze_parser.add_argument('--write_svals', metavar='SingularValues.txt', type=str, default= "", help="Optional file to write singular values (S-values). Used for Scree Plot.") | 
|  | 647 analyze_parser.add_argument('--plot_scree', metavar='ScreePlot.png', type=str, default= "", help="Optional graphical scree plot. Requires matplotlib.") | 
|  | 648 analyze_parser.add_argument('--write_sd', metavar='StandardDeviations.txt', type=str, default= "", help="Optional file with sample SVD-ZRPKM standard deviations. Used to filter noisy samples.") | 
|  | 649 analyze_parser.set_defaults(func=CF_analyze) | 
|  | 650 | 
|  | 651 # export command | 
|  | 652 export_parser= subparsers.add_parser('export', help='Export SVD-ZRPKM values from a HDF5 file to bed or vcf format.') | 
|  | 653 export_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step") | 
|  | 654 export_parser.add_argument('--output','-o',action='store', required=False, default='.', metavar='output.bed',help="Location of output file[s]. When exporting multiple samples, top-level directory of this path will be used.") | 
|  | 655 export_parser.add_argument('--sample','-s',action='store', required=False, metavar='sampleID', default='all', nargs="+",help="SampleID or comma-separated list of sampleIDs to export. Default: export all samples") | 
|  | 656 export_parser.set_defaults(func=CF_export) | 
|  | 657 | 
|  | 658 # plot command | 
|  | 659 plot_parser= subparsers.add_parser('plot', help='Plot SVD-ZRPKM values using matplotlib') | 
|  | 660 plot_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step") | 
|  | 661 plot_parser.add_argument('--region',action='store', required=True, metavar='chr#:start-stop',help="Region to plot") | 
|  | 662 plot_parser.add_argument('--output',action='store', required=True, metavar='image.png',help="Output path and filetype. PDF, PNG, PS, EPS, and SVG are supported.") | 
|  | 663 plot_parser.add_argument('--sample',action='store', required=False, metavar='sampleID',nargs="+",default='none',help="Sample[s] to highlight. The following optional color spec can be used: <color>:<sampleID>. Available colors are r,b,g,y,c,m,y,k. The unsmoothed SVD-ZRPKM values for the first sample in this list will be drawn. Default: No samples highlighted.") | 
|  | 664 plot_parser.set_defaults(func=CF_plot) | 
|  | 665 | 
|  | 666 # make calls command | 
|  | 667 call_parser= subparsers.add_parser('call', help='Very rudimentary caller for CNVs using SVD-ZRPKM thresholding.') | 
|  | 668 call_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step") | 
|  | 669 call_parser.add_argument('--output',action='store', required=True, metavar='calls.txt',help="Output file for calls") | 
|  | 670 call_parser.add_argument('--threshold', metavar='1.50', type=float, nargs='?', required=False, default = 1.50,help="+/- Threshold for calling (minimum SVD-ZRPKM)") | 
|  | 671 call_parser.set_defaults(func=CF_call) | 
|  | 672 | 
|  | 673 # plotcalls command | 
|  | 674 plotcalls_parser= subparsers.add_parser('plotcalls', help='Make basic plots from call file from "call" command.') | 
|  | 675 plotcalls_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step") | 
|  | 676 plotcalls_parser.add_argument('--calls',action='store', required=True, metavar='calls.txt',help="File with calls from 'call' command.") | 
|  | 677 plotcalls_parser.add_argument('--outputdir',action='store', required=True, metavar='/path/to/directory',help="Output directory for plots") | 
|  | 678 plotcalls_parser.add_argument('--window',action='store', required=False, metavar='50',default=50,help="In exons, the amount of padding to plot around each call") | 
|  | 679 plotcalls_parser.set_defaults(func=CF_plotcalls) | 
|  | 680 | 
|  | 681 args = parser.parse_args() | 
|  | 682 args.func(args) |