# HG changeset patch
# User bzonnedda
# Date 1486396538 18000
# Node ID bc0e3b10d3395fdcb6484c9fa3449dc8c9674db6
# Parent 54973c4a1125d3f8422c70da7ef9d6973ffc979a
Uploaded
diff -r 54973c4a1125 -r bc0e3b10d339 conifer_functions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/conifer_functions.py Mon Feb 06 10:55:38 2017 -0500
@@ -0,0 +1,371 @@
+#######################################################################
+#######################################################################
+# CoNIFER: Copy Number Inference From Exome Reads
+# Developed by Niklas Krumm (C) 2012
+# nkrumm@gmail.com
+#
+# homepage: http://conifer.sf.net
+# This program is described in:
+# Krumm et al. 2012. Genome Research. doi:10.1101/gr.138115.112
+#
+# This file is part of CoNIFER.
+# CoNIFER is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+#######################################################################
+#######################################################################
+
+import csv
+from tables import *
+import numpy as np
+import operator
+
+class rpkm_value(IsDescription):
+ probeID = UInt32Col(pos=0)
+ rpkm = FloatCol(pos=1)
+
+class probe(IsDescription):
+ probeID = UInt32Col(pos=0)
+ start = UInt32Col(pos=1) # start of probe
+ stop = UInt32Col(pos=2) # stop of probe
+ name = StringCol(20,pos=3) # 20-character String
+
+def chrInt2Str(chromosome_int):
+ if int(chromosome_int) == 23:
+ return 'chrX'
+ elif int(chromosome_int) == 24:
+ return 'chrY'
+ else:
+ return 'chr' + str(chromosome_int)
+
+
+def chrStr2Int(chromosome_str):
+ chr = chromosome_str.replace('chr','')
+ if chr == 'X':
+ return 23
+ elif chr == 'Y':
+ return 24
+ else:
+ return int(chr)
+
+def parseLocString(locstr):
+ try:
+ chr,locstr = locstr.split(":")
+ start, stop = locstr.split("-")
+ except:
+ chr, start, stop = locstr.split("\t")
+
+ chr = chrStr2Int(chr)
+ start = int(start)
+ stop = int(stop)
+ return (chr,start,stop)
+
+def zrpkm(rpkm,median,sd):
+ return (rpkm - median) / sd
+
+
+
+class sample(IsDescription):
+ sampleID = StringCol(100,pos=0) # 20-char string (sampleID)
+
+def loadProbeList(CF_probe_filename):
+ # Load data files
+ probefile = open(CF_probe_filename, 'rb')
+ s = csv.Sniffer()
+ header = s.has_header(probefile.read(1024))
+ probefile.seek(0)
+ dialect = s.sniff(probefile.read(1024))
+ probefile.seek(0)
+ if header:
+ r = csv.DictReader(probefile, dialect=dialect)
+ else:
+ r = csv.DictReader(probefile, dialect=dialect, fieldnames=['chr','start','stop','name'])
+
+ probes = []
+ probeID = 1
+ for row in r:
+ probes.append({'probeID': probeID, 'chr':chrStr2Int(row['chr']),'start':int(row['start']),'stop':int(row['stop']), 'name':row['name']})
+ probeID +=1
+
+ if len(probes) == 0:
+ raise Exception("No probes in probe file")
+
+ return probes
+
+
+def export_sample(h5file_in,sample,probes,outfile_f):
+ dt = np.dtype([('chr','|S10'),('start', ' 5:
+ counter = 0
+
+
+def plotGenomicCoords(plt, rpkm_data,fontsize=10,rotation=0):
+ import operator
+ import locale
+ exon_set = rpkm_data.exons
+ genomic_coords = np.array(map(operator.itemgetter("start"),exon_set))
+
+ ticks = range(0,len(exon_set),len(exon_set)/5)
+
+ ticks[-1] -= 1 # the last tick is going to be off the chart, so we estimate it as the second to last genomic coord.
+ labels = [locale.format("%d", genomic_coords[i], grouping=True) for i in ticks if i < len(genomic_coords)]
+ if rotation != 0:
+ ha = "right"
+ else:
+ ha = "center"
+ _ = plt.xticks(ticks,labels,fontsize=fontsize,rotation=rotation,ha=ha)
+
+
+def plotRawData(axis, rpkm_data, color='r',linewidth=0.7):
+ zero_stack = np.zeros(len(rpkm_data))
+ positions = np.repeat(np.arange(0,len(rpkm_data)),3)
+ logr = np.vstack([zero_stack,rpkm_data.flatten(),zero_stack]).transpose().ravel()
+ axis.plot(positions,logr,color=color,marker=None,linewidth=1)
+
+def getbkpoints(mask):
+ bkpoints = np.nonzero(np.logical_xor(mask[0:-1],mask[1:]))[0]+1
+ if mask[0] == 1:
+ bkpoints = np.hstack([0,bkpoints])
+ if mask[-1] == 1:
+ bkpoints = np.hstack([bkpoints,len(mask)])
+ return bkpoints.reshape(len(bkpoints)/2,2)
+
+def mergeCalls(calls):
+ if len(calls) == 0:
+ return []
+
+ out_calls = []
+ calls=np.array(calls)[np.argsort(np.array(map(operator.itemgetter("start"),calls),dtype=np.int))]
+ pstart = calls[0]["start"]
+ pstop = calls[0]["stop"]
+ for d in calls:
+ if d["start"] <= pstop:
+ pstop = max(d["stop"],pstop)
+ else:
+ out_calls.append({"sampleID": d["sampleID"], "chromosome": d["chromosome"], "start":pstart, "stop":pstop, "state": d["state"]})
+ pstart = d["start"]
+ pstop = d["stop"]
+
+ out_calls.append({"sampleID": d["sampleID"], "chromosome": d["chromosome"], "start":pstart, "stop":pstop, "state": d["state"]})
+ return out_calls
+
+class rpkm_data:
+ def __init__(self):
+ self.rpkm = None
+ self.samples = None
+ self.exons = None
+ self.isGenotype = False
+ self.calls = []
+ self.refined_calls = []
+
+ def smooth(self, window = 15, padded = False): #todo, fix the padding here
+ if self.isGenotype:
+ print "Warning: the data in this rpkm_data container are single genotype values. Smoothing will have no effect!"
+ return self.rpkm
+
+ if window > 0:
+ weightings=np.blackman(window)
+ weightings = weightings/weightings.sum()
+ smoothed_data = np.array([])
+ for row in self.rpkm.transpose():
+ smoothed = np.convolve(row, weightings)[(window-1)/2:-((window-1)/2)]
+ if len(smoothed_data) == 0:
+ smoothed_data = smoothed
+ else:
+ smoothed_data = np.vstack([smoothed_data,smoothed])
+
+ self.rpkm = smoothed_data.transpose()
+ return self.rpkm
+ else:
+ return self.rpkm
+
+ def getSample(self, sampleIDs):
+ sample_array = np.array(self.samples)
+ if isinstance(sampleIDs,list):
+ mask = np.zeros(len(sample_array),dtype=np.bool)
+ for sampleID in sampleIDs:
+ mask = np.logical_or(mask, sample_array == str(sampleID))
+
+ return self.rpkm[:,mask]
+ else:
+ mask = sample_array == str(sampleID)
+ return self.rpkm[:,mask]
+
+ def getSamples(self, sampleIDs):
+ return self.getSample(sampleIDs)
+
+ @property
+ def shape(self):
+ if self.isGenotype:
+ return [len(self.samples), 1]
+ else:
+ return [len(self.samples), len(self.exons)]
+
+
+class rpkm_reader:
+ def __init__(self, rpkm_fn=None):
+ """Initialize an rpkm_reader instance. Specify the location of the data file"""
+
+ if rpkm_fn == None:
+ print "Must specify RPKM HDF5 file!"
+ return 0
+ # set up file access
+ # self.h5file = openFile(rpkm_fn, mode='r')
+ self.h5file = open_file(rpkm_fn, mode='r')
+ self.sample_table = self.h5file.root.samples.samples
+
+ def __del__(self):
+ self.h5file.close()
+
+ def getExonValuesByExons(self, chromosome, start_exon, stop_exon, sampleList=None,genotype=False):
+
+ probe_tbl = self.h5file.root.probes._f_getChild("probes_chr" + str(chromosome))
+ #table_rows = probe_tbl.getWhereList('(start >= %d) & (stop <= %d)' % (start,stop))
+ start_exon = max(start_exon,0)
+ stop_exon = min(stop_exon, probe_tbl.nrows)
+ #print start_exon, stop_exon
+ table_rows = np.arange(start_exon,stop_exon,1)
+ data_tbl = self.h5file.root._f_getChild("chr" + str(chromosome))
+
+ if sampleList == None:
+ num_samples = data_tbl._v_nchildren
+ samples = data_tbl
+ else:
+ num_samples = len(sampleList)
+ samples = [data_tbl._f_getChild("sample_" + s) for s in sampleList]
+
+ data = np.empty([num_samples,len(table_rows)],dtype=np.float)
+
+ out_sample_list = []
+ cnt = 0
+ for sample_tbl in samples:
+ d = sample_tbl.readCoordinates(table_rows,field="rpkm")
+ data[cnt,:] = d
+ cnt +=1
+ out_sample_list.append(sample_tbl.title)
+
+ d = rpkm_data()
+ if genotype: # return average #todo-- implement median and SD?
+ d.rpkm = data.transpose().mean(axis=0)
+ d.isGenotype = True
+ else: #return all data points
+ d.rpkm = data.transpose()
+ d.samples = out_sample_list
+ d.exons = probe_tbl.readCoordinates(table_rows)
+
+ return d
+
+ def getExonValuesByRegion(self, chromosome, start=None, stop=None, sampleList=None,genotype=False):
+ probe_tbl = self.h5file.root.probes._f_getChild("probes_chr" + str(chromosome))
+ if (start is not None) and (stop is not None):
+ table_rows = probe_tbl.getWhereList('(start >= %d) & (stop <= %d)' % (start,stop))
+ else:
+ table_rows = probe_tbl.getWhereList('(start >= 0) & (stop <= 1000000000)')
+
+ data_tbl = self.h5file.root._f_getChild("chr" + str(chromosome))
+
+ if sampleList == None:
+ num_samples = data_tbl._v_nchildren
+ samples = data_tbl
+ else:
+ num_samples = len(sampleList)
+ samples = [data_tbl._f_getChild("sample_" + s) for s in sampleList]
+
+ data = np.empty([num_samples,len(table_rows)],dtype=np.float)
+
+ out_sample_list = []
+ cnt = 0
+ for sample_tbl in samples:
+ d = sample_tbl.readCoordinates(table_rows,field="rpkm")
+ data[cnt,:] = d
+ cnt +=1
+ out_sample_list.append(sample_tbl.title)
+
+ d = rpkm_data()
+ if genotype: # return average #todo-- implement median and SD?
+ d.rpkm = data.transpose().mean(axis=0)
+ d.isGenotype = True
+ else: #return all data points
+ d.rpkm = data.transpose()
+ d.samples = out_sample_list
+ d.exons = probe_tbl.readCoordinates(table_rows)
+
+ return d
+
+ def getSampleList(self,cohort=None,sex=None,ethnicity=None,custom=None):
+ """Return a list of available samples in the current data file. Specifying no arguments will return all available samples"""
+
+ readWhereStr = ""
+ if custom != None:
+ readWhereStr = custom
+ else:
+ if cohort != None:
+ if isinstance(cohort,list):
+ for c in cohort:
+ readWhereStr += "(cohort=='%s') | " % c
+ readWhereStr = readWhereStr.strip(" |")
+ readWhereStr += " & "
+ else:
+ readWhereStr += "(cohort=='%s') " % cohort
+ if sex != None:
+ if sex not in ['M','F']:
+ sex = sex.upper()[0]
+ readWhereStr += " (sex=='%s') &" % sex
+ if ethnicity != None:
+ readWhereStr += " (ethnicity=='%s') &" % ethnicity
+
+ readWhereStr = readWhereStr.strip(" &") # remove leading or trailing characters
+ if readWhereStr != "":
+ #print readWhereStr
+ sampleIDs = self.sample_table.readWhere(readWhereStr,field='sampleID')
+ else:
+ sampleIDs = self.sample_table.read(field='sampleID')
+
+ return sampleIDs
+
+ def getExonIDs(self, chromosome, start, stop):
+ probe_tbl = self.h5file.root.probes._f_getChild("probes_chr" + str(chromosome))
+ exons = probe_tbl.getWhereList('(start >= %d) & (stop <= %d)' % (start,stop))
+ return exons