| 9 | 1 ####################################################################### | 
|  | 2 ####################################################################### | 
|  | 3 # CoNIFER: Copy Number Inference From Exome Reads | 
|  | 4 # Developed by Niklas Krumm (C) 2012 | 
|  | 5 # nkrumm@gmail.com | 
|  | 6 # | 
|  | 7 # homepage: http://conifer.sf.net | 
|  | 8 # This program is described in: | 
|  | 9 # Krumm et al. 2012. Genome Research. doi:10.1101/gr.138115.112 | 
|  | 10 # | 
|  | 11 # This file is part of CoNIFER. | 
|  | 12 # CoNIFER is free software: you can redistribute it and/or modify | 
|  | 13 # it under the terms of the GNU General Public License as published by | 
|  | 14 # the Free Software Foundation, either version 3 of the License, or | 
|  | 15 # (at your option) any later version. | 
|  | 16 # | 
|  | 17 # This program is distributed in the hope that it will be useful, | 
|  | 18 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
|  | 20 # GNU General Public License for more details. | 
|  | 21 # | 
|  | 22 # You should have received a copy of the GNU General Public License | 
|  | 23 # along with this program.  If not, see <http://www.gnu.org/licenses/>. | 
|  | 24 ####################################################################### | 
|  | 25 ####################################################################### | 
|  | 26 | 
|  | 27 import csv | 
|  | 28 from tables import * | 
|  | 29 import numpy as np | 
|  | 30 import operator | 
|  | 31 | 
|  | 32 class rpkm_value(IsDescription): | 
|  | 33 	probeID = UInt32Col(pos=0) | 
|  | 34 	rpkm = FloatCol(pos=1) | 
|  | 35 | 
|  | 36 class probe(IsDescription): | 
|  | 37 	probeID = 	UInt32Col(pos=0) | 
|  | 38 	start = 	UInt32Col(pos=1) # start of probe | 
|  | 39 	stop = 		UInt32Col(pos=2) # stop of probe | 
|  | 40 	name = 		StringCol(20,pos=3)   # 20-character String | 
|  | 41 | 
|  | 42 def chrInt2Str(chromosome_int): | 
|  | 43 	if int(chromosome_int) == 23: | 
|  | 44 		return 'chrX' | 
|  | 45 	elif int(chromosome_int) == 24: | 
|  | 46 		return 'chrY' | 
|  | 47 	else: | 
|  | 48 		return 'chr' + str(chromosome_int) | 
|  | 49 | 
|  | 50 | 
|  | 51 def chrStr2Int(chromosome_str): | 
|  | 52 	chr = chromosome_str.replace('chr','') | 
|  | 53 	if chr == 'X': | 
|  | 54 		return 23 | 
|  | 55 	elif chr == 'Y': | 
|  | 56 		return 24 | 
|  | 57 	else: | 
|  | 58 		return int(chr) | 
|  | 59 | 
|  | 60 def parseLocString(locstr): | 
|  | 61 	try: | 
|  | 62 		chr,locstr = locstr.split(":") | 
|  | 63 		start, stop = locstr.split("-") | 
|  | 64 	except: | 
|  | 65 		chr, start, stop = locstr.split("\t") | 
|  | 66 | 
|  | 67 	chr = chrStr2Int(chr) | 
|  | 68 	start = int(start) | 
|  | 69 	stop = int(stop) | 
|  | 70 	return (chr,start,stop) | 
|  | 71 | 
|  | 72 def zrpkm(rpkm,median,sd): | 
|  | 73 	return (rpkm - median) / sd | 
|  | 74 | 
|  | 75 | 
|  | 76 | 
|  | 77 class sample(IsDescription): | 
|  | 78 	sampleID = 	StringCol(100,pos=0) # 20-char string (sampleID) | 
|  | 79 | 
|  | 80 def loadProbeList(CF_probe_filename): | 
|  | 81 	# Load data files | 
|  | 82 	probefile = open(CF_probe_filename, 'rb') | 
|  | 83 	s = csv.Sniffer() | 
|  | 84 	header = s.has_header(probefile.read(1024)) | 
|  | 85 	probefile.seek(0) | 
|  | 86 	dialect = s.sniff(probefile.read(1024)) | 
|  | 87 	probefile.seek(0) | 
|  | 88 	if header: | 
|  | 89 		r = csv.DictReader(probefile, dialect=dialect) | 
|  | 90 	else: | 
|  | 91 		r = csv.DictReader(probefile, dialect=dialect, fieldnames=['chr','start','stop','name']) | 
|  | 92 | 
|  | 93 	probes = [] | 
|  | 94 	probeID = 1 | 
|  | 95 	for row in r: | 
|  | 96 		probes.append({'probeID': probeID, 'chr':chrStr2Int(row['chr']),'start':int(row['start']),'stop':int(row['stop']), 'name':row['name']}) | 
|  | 97 		probeID +=1 | 
|  | 98 | 
|  | 99 	if len(probes) == 0: | 
|  | 100 		raise Exception("No probes in probe file") | 
|  | 101 | 
|  | 102 	return probes | 
|  | 103 | 
|  | 104 | 
|  | 105 def export_sample(h5file_in,sample,probes,outfile_f): | 
|  | 106 	dt = np.dtype([('chr','|S10'),('start', '<u4'), ('stop', '<u4'), ('name', '|S20'),('SVDZRPKM',np.float)]) | 
|  | 107 	for chr in h5file_in.root: | 
|  | 108 		if chr._v_title in ('probes','samples'): | 
|  | 109 			continue | 
|  | 110 | 
|  | 111 		out_data = np.empty(len(probes[chr._v_title]),dtype=dt) | 
|  | 112 		out_data["SVDZRPKM"] = chr._f_getChild("sample_" + sample).read(field='rpkm') | 
|  | 113 		out_data["chr"] = np.repeat(chr._v_title,len(out_data)) | 
|  | 114 		out_data["start"] = probes[chr._v_title]["start"] | 
|  | 115 		out_data["stop"] = probes[chr._v_title]["stop"] | 
|  | 116 		out_data["name"] = probes[chr._v_title]["name"] | 
|  | 117 		np.savetxt(outfile_f, out_data,fmt=["%s","%d","%d","%s","%f"], delimiter="\t") | 
|  | 118 | 
|  | 119 | 
|  | 120 | 
|  | 121 def plotGenes(axis, rpkm_data, levels=5,x_pos=-2,text_pos='right',line_spacing=0.1,text_offset=0.25,data_range=None): | 
|  | 122 	from matplotlib.lines import Line2D | 
|  | 123 	counter = 0 | 
|  | 124 	prev_gene = "" | 
|  | 125 	if data_range is not None: | 
|  | 126 		exon_set = rpkm_data.exons[data_range] | 
|  | 127 	else: | 
|  | 128 		exon_set = rpkm_data.exons | 
|  | 129 	for gene in exon_set["name"]: | 
|  | 130 		if gene == prev_gene: | 
|  | 131 			continue | 
|  | 132 		elif gene == 'None': | 
|  | 133 			continue | 
|  | 134 		start = np.min(np.where(exon_set["name"] == gene)) | 
|  | 135 		stop = np.max(np.where(exon_set["name"] == gene)) + 1 | 
|  | 136 		_ = axis.add_line(Line2D([start-0.5,stop-0.5],[x_pos - (counter * line_spacing),x_pos - (counter * line_spacing)],color=(102/255.,33/255.,168/255.,0.6),linewidth=5,linestyle='-',alpha=0.5,solid_capstyle='butt')) | 
|  | 137 		_ = axis.text(stop+text_offset, x_pos - (counter * line_spacing), gene, ha='left',va='center',fontsize=6) | 
|  | 138 		counter +=1 | 
|  | 139 		prev_gene = gene | 
|  | 140 		if counter > 5: | 
|  | 141 			counter = 0 | 
|  | 142 | 
|  | 143 | 
|  | 144 def plotGenomicCoords(plt, rpkm_data,fontsize=10,rotation=0): | 
|  | 145 	import operator | 
|  | 146 	import locale | 
|  | 147 	exon_set = rpkm_data.exons | 
|  | 148 	genomic_coords = np.array(map(operator.itemgetter("start"),exon_set)) | 
|  | 149 | 
|  | 150 	ticks = range(0,len(exon_set),len(exon_set)/5) | 
|  | 151 | 
|  | 152 	ticks[-1] -= 1 # the last tick is going to be off the chart, so we estimate it as the second to last genomic coord. | 
|  | 153 	labels = [locale.format("%d", genomic_coords[i], grouping=True) for i in ticks if i < len(genomic_coords)] | 
|  | 154 	if rotation != 0: | 
|  | 155 		ha = "right" | 
|  | 156 	else: | 
|  | 157 		ha = "center" | 
|  | 158 	_ = plt.xticks(ticks,labels,fontsize=fontsize,rotation=rotation,ha=ha) | 
|  | 159 | 
|  | 160 | 
|  | 161 def plotRawData(axis, rpkm_data, color='r',linewidth=0.7): | 
|  | 162 	zero_stack = np.zeros(len(rpkm_data)) | 
|  | 163 	positions = np.repeat(np.arange(0,len(rpkm_data)),3) | 
|  | 164 	logr = np.vstack([zero_stack,rpkm_data.flatten(),zero_stack]).transpose().ravel() | 
|  | 165 	axis.plot(positions,logr,color=color,marker=None,linewidth=1) | 
|  | 166 | 
|  | 167 def getbkpoints(mask): | 
|  | 168 	bkpoints = np.nonzero(np.logical_xor(mask[0:-1],mask[1:]))[0]+1 | 
|  | 169 	if mask[0] == 1: | 
|  | 170 		bkpoints = np.hstack([0,bkpoints]) | 
|  | 171 	if mask[-1] == 1: | 
|  | 172 		bkpoints = np.hstack([bkpoints,len(mask)]) | 
|  | 173 	return bkpoints.reshape(len(bkpoints)/2,2) | 
|  | 174 | 
|  | 175 def mergeCalls(calls): | 
|  | 176 	if len(calls) == 0: | 
|  | 177 		return [] | 
|  | 178 | 
|  | 179 	out_calls = [] | 
|  | 180 	calls=np.array(calls)[np.argsort(np.array(map(operator.itemgetter("start"),calls),dtype=np.int))] | 
|  | 181 	pstart = calls[0]["start"] | 
|  | 182 	pstop = calls[0]["stop"] | 
|  | 183 	for d in calls: | 
|  | 184 		if d["start"] <= pstop: | 
|  | 185 			pstop = max(d["stop"],pstop) | 
|  | 186 		else: | 
|  | 187 			out_calls.append({"sampleID": d["sampleID"], "chromosome": d["chromosome"], "start":pstart, "stop":pstop, "state": d["state"]}) | 
|  | 188 			pstart = d["start"] | 
|  | 189 			pstop = d["stop"] | 
|  | 190 | 
|  | 191 	out_calls.append({"sampleID": d["sampleID"], "chromosome": d["chromosome"], "start":pstart, "stop":pstop, "state": d["state"]}) | 
|  | 192 	return out_calls | 
|  | 193 | 
|  | 194 class rpkm_data: | 
|  | 195 	def __init__(self): | 
|  | 196 		self.rpkm = None | 
|  | 197 		self.samples = None | 
|  | 198 		self.exons = None | 
|  | 199 		self.isGenotype = False | 
|  | 200 		self.calls = [] | 
|  | 201 		self.refined_calls = [] | 
|  | 202 | 
|  | 203 	def smooth(self, window = 15, padded = False): #todo, fix the padding here | 
|  | 204 		if self.isGenotype: | 
|  | 205 			print "Warning: the data in this rpkm_data container are single genotype values. Smoothing will have no effect!" | 
|  | 206 			return self.rpkm | 
|  | 207 | 
|  | 208 		if window > 0: | 
|  | 209 			weightings=np.blackman(window) | 
|  | 210 			weightings = weightings/weightings.sum() | 
|  | 211 			smoothed_data = np.array([]) | 
|  | 212 			for row in self.rpkm.transpose(): | 
|  | 213 				smoothed = np.convolve(row, weightings)[(window-1)/2:-((window-1)/2)] | 
|  | 214 				if len(smoothed_data) == 0: | 
|  | 215 					smoothed_data  = smoothed | 
|  | 216 				else: | 
|  | 217 					smoothed_data  = np.vstack([smoothed_data,smoothed]) | 
|  | 218 | 
|  | 219 			self.rpkm = smoothed_data.transpose() | 
|  | 220 			return self.rpkm | 
|  | 221 		else: | 
|  | 222 			return self.rpkm | 
|  | 223 | 
|  | 224 	def getSample(self, sampleIDs): | 
|  | 225 		sample_array = np.array(self.samples) | 
|  | 226 		if isinstance(sampleIDs,list): | 
|  | 227 			mask = np.zeros(len(sample_array),dtype=np.bool) | 
|  | 228 			for sampleID in sampleIDs: | 
|  | 229 				mask = np.logical_or(mask, sample_array == str(sampleID)) | 
|  | 230 | 
|  | 231 			return self.rpkm[:,mask] | 
|  | 232 		else: | 
|  | 233 			mask = sample_array == str(sampleID) | 
|  | 234 			return self.rpkm[:,mask] | 
|  | 235 | 
|  | 236 	def getSamples(self, sampleIDs): | 
|  | 237 		return self.getSample(sampleIDs) | 
|  | 238 | 
|  | 239 	@property | 
|  | 240 	def shape(self): | 
|  | 241 		if self.isGenotype: | 
|  | 242 			return [len(self.samples), 1] | 
|  | 243 		else: | 
|  | 244 			return [len(self.samples), len(self.exons)] | 
|  | 245 | 
|  | 246 | 
|  | 247 class rpkm_reader: | 
|  | 248 	def __init__(self, rpkm_fn=None): | 
|  | 249 		"""Initialize an rpkm_reader instance. Specify the location of the data file""" | 
|  | 250 | 
|  | 251 		if rpkm_fn == None: | 
|  | 252 			print "Must specify RPKM HDF5 file!" | 
|  | 253 			return 0 | 
|  | 254 		# set up file access | 
|  | 255 		self.h5file = openFile(rpkm_fn, mode='r') | 
|  | 256 		self.sample_table = self.h5file.root.samples.samples | 
|  | 257 | 
|  | 258 	def __del__(self): | 
|  | 259 		self.h5file.close() | 
|  | 260 | 
|  | 261 	def getExonValuesByExons(self, chromosome, start_exon, stop_exon, sampleList=None,genotype=False): | 
|  | 262 | 
|  | 263 		probe_tbl = self.h5file.root.probes._f_getChild("probes_chr" + str(chromosome)) | 
|  | 264 		#table_rows = probe_tbl.getWhereList('(start >= %d) & (stop <= %d)' % (start,stop)) | 
|  | 265 		start_exon = max(start_exon,0) | 
|  | 266 		stop_exon = min(stop_exon, probe_tbl.nrows) | 
|  | 267 		#print start_exon, stop_exon | 
|  | 268 		table_rows = np.arange(start_exon,stop_exon,1) | 
|  | 269 		data_tbl  = self.h5file.root._f_getChild("chr" + str(chromosome)) | 
|  | 270 | 
|  | 271 		if sampleList == None: | 
|  | 272 			num_samples = data_tbl._v_nchildren | 
|  | 273 			samples = data_tbl | 
|  | 274 		else: | 
|  | 275 			num_samples = len(sampleList) | 
|  | 276 			samples = [data_tbl._f_getChild("sample_" + s) for s in sampleList] | 
|  | 277 | 
|  | 278 		data = np.empty([num_samples,len(table_rows)],dtype=np.float) | 
|  | 279 | 
|  | 280 		out_sample_list = [] | 
|  | 281 		cnt = 0 | 
|  | 282 		for sample_tbl in samples: | 
|  | 283 			d = sample_tbl.readCoordinates(table_rows,field="rpkm") | 
|  | 284 			data[cnt,:] = d | 
|  | 285 			cnt +=1 | 
|  | 286 			out_sample_list.append(sample_tbl.title) | 
|  | 287 | 
|  | 288 		d = rpkm_data() | 
|  | 289 		if genotype: # return average #todo-- implement median and SD? | 
|  | 290 			d.rpkm = data.transpose().mean(axis=0) | 
|  | 291 			d.isGenotype = True | 
|  | 292 		else: #return all data points | 
|  | 293 			d.rpkm = data.transpose() | 
|  | 294 		d.samples = out_sample_list | 
|  | 295 		d.exons = probe_tbl.readCoordinates(table_rows) | 
|  | 296 | 
|  | 297 		return d | 
|  | 298 | 
|  | 299 	def getExonValuesByRegion(self, chromosome, start=None, stop=None, sampleList=None,genotype=False): | 
|  | 300 		probe_tbl = self.h5file.root.probes._f_getChild("probes_chr" + str(chromosome)) | 
|  | 301 		if (start is not None) and (stop is not None): | 
|  | 302 			table_rows = probe_tbl.getWhereList('(start >= %d) & (stop <= %d)' % (start,stop)) | 
|  | 303 		else: | 
|  | 304 			table_rows = probe_tbl.getWhereList('(start >= 0) & (stop <= 1000000000)') | 
|  | 305 | 
|  | 306 		data_tbl  = self.h5file.root._f_getChild("chr" + str(chromosome)) | 
|  | 307 | 
|  | 308 		if sampleList == None: | 
|  | 309 			num_samples = data_tbl._v_nchildren | 
|  | 310 			samples = data_tbl | 
|  | 311 		else: | 
|  | 312 			num_samples = len(sampleList) | 
|  | 313 			samples = [data_tbl._f_getChild("sample_" + s) for s in sampleList] | 
|  | 314 | 
|  | 315 		data = np.empty([num_samples,len(table_rows)],dtype=np.float) | 
|  | 316 | 
|  | 317 		out_sample_list = [] | 
|  | 318 		cnt = 0 | 
|  | 319 		for sample_tbl in samples: | 
|  | 320 			d = sample_tbl.readCoordinates(table_rows,field="rpkm") | 
|  | 321 			data[cnt,:] = d | 
|  | 322 			cnt +=1 | 
|  | 323 			out_sample_list.append(sample_tbl.title) | 
|  | 324 | 
|  | 325 		d = rpkm_data() | 
|  | 326 		if genotype: # return average #todo-- implement median and SD? | 
|  | 327 			d.rpkm = data.transpose().mean(axis=0) | 
|  | 328 			d.isGenotype = True | 
|  | 329 		else: #return all data points | 
|  | 330 			d.rpkm = data.transpose() | 
|  | 331 		d.samples = out_sample_list | 
|  | 332 		d.exons = probe_tbl.readCoordinates(table_rows) | 
|  | 333 | 
|  | 334 		return d | 
|  | 335 | 
|  | 336 	def getSampleList(self,cohort=None,sex=None,ethnicity=None,custom=None): | 
|  | 337 		"""Return a list of available samples in the current data file. Specifying no arguments will return all available samples""" | 
|  | 338 | 
|  | 339 		readWhereStr = "" | 
|  | 340 		if custom != None: | 
|  | 341 			readWhereStr = custom | 
|  | 342 		else: | 
|  | 343 			if cohort != None: | 
|  | 344 				if isinstance(cohort,list): | 
|  | 345 					for c in cohort: | 
|  | 346 						readWhereStr += "(cohort=='%s') | " % c | 
|  | 347 					readWhereStr = readWhereStr.strip(" |") | 
|  | 348 					readWhereStr += " & " | 
|  | 349 				else: | 
|  | 350 					readWhereStr += "(cohort=='%s') " % cohort | 
|  | 351 			if sex != None: | 
|  | 352 				if sex not in ['M','F']: | 
|  | 353 					sex = sex.upper()[0] | 
|  | 354 				readWhereStr += " (sex=='%s') &" % sex | 
|  | 355 			if ethnicity != None: | 
|  | 356 				readWhereStr += " (ethnicity=='%s') &" % ethnicity | 
|  | 357 | 
|  | 358 			readWhereStr = readWhereStr.strip(" &") # remove leading or trailing characters | 
|  | 359 		if readWhereStr != "": | 
|  | 360 			#print readWhereStr | 
|  | 361 			sampleIDs = self.sample_table.readWhere(readWhereStr,field='sampleID') | 
|  | 362 		else: | 
|  | 363 			sampleIDs = self.sample_table.read(field='sampleID') | 
|  | 364 | 
|  | 365 		return sampleIDs | 
|  | 366 | 
|  | 367 	def getExonIDs(self, chromosome, start, stop): | 
|  | 368 		probe_tbl = self.h5file.root.probes._f_getChild("probes_chr" + str(chromosome)) | 
|  | 369 		exons = probe_tbl.getWhereList('(start >= %d) & (stop <= %d)' % (start,stop)) | 
|  | 370 		return exons |