# HG changeset patch # User bzonnedda # Date 1486396538 18000 # Node ID bc0e3b10d3395fdcb6484c9fa3449dc8c9674db6 # Parent 54973c4a1125d3f8422c70da7ef9d6973ffc979a Uploaded diff -r 54973c4a1125 -r bc0e3b10d339 conifer_functions.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/conifer_functions.py Mon Feb 06 10:55:38 2017 -0500 @@ -0,0 +1,371 @@ +####################################################################### +####################################################################### +# CoNIFER: Copy Number Inference From Exome Reads +# Developed by Niklas Krumm (C) 2012 +# nkrumm@gmail.com +# +# homepage: http://conifer.sf.net +# This program is described in: +# Krumm et al. 2012. Genome Research. doi:10.1101/gr.138115.112 +# +# This file is part of CoNIFER. +# CoNIFER is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +####################################################################### +####################################################################### + +import csv +from tables import * +import numpy as np +import operator + +class rpkm_value(IsDescription): + probeID = UInt32Col(pos=0) + rpkm = FloatCol(pos=1) + +class probe(IsDescription): + probeID = UInt32Col(pos=0) + start = UInt32Col(pos=1) # start of probe + stop = UInt32Col(pos=2) # stop of probe + name = StringCol(20,pos=3) # 20-character String + +def chrInt2Str(chromosome_int): + if int(chromosome_int) == 23: + return 'chrX' + elif int(chromosome_int) == 24: + return 'chrY' + else: + return 'chr' + str(chromosome_int) + + +def chrStr2Int(chromosome_str): + chr = chromosome_str.replace('chr','') + if chr == 'X': + return 23 + elif chr == 'Y': + return 24 + else: + return int(chr) + +def parseLocString(locstr): + try: + chr,locstr = locstr.split(":") + start, stop = locstr.split("-") + except: + chr, start, stop = locstr.split("\t") + + chr = chrStr2Int(chr) + start = int(start) + stop = int(stop) + return (chr,start,stop) + +def zrpkm(rpkm,median,sd): + return (rpkm - median) / sd + + + +class sample(IsDescription): + sampleID = StringCol(100,pos=0) # 20-char string (sampleID) + +def loadProbeList(CF_probe_filename): + # Load data files + probefile = open(CF_probe_filename, 'rb') + s = csv.Sniffer() + header = s.has_header(probefile.read(1024)) + probefile.seek(0) + dialect = s.sniff(probefile.read(1024)) + probefile.seek(0) + if header: + r = csv.DictReader(probefile, dialect=dialect) + else: + r = csv.DictReader(probefile, dialect=dialect, fieldnames=['chr','start','stop','name']) + + probes = [] + probeID = 1 + for row in r: + probes.append({'probeID': probeID, 'chr':chrStr2Int(row['chr']),'start':int(row['start']),'stop':int(row['stop']), 'name':row['name']}) + probeID +=1 + + if len(probes) == 0: + raise Exception("No probes in probe file") + + return probes + + +def export_sample(h5file_in,sample,probes,outfile_f): + dt = np.dtype([('chr','|S10'),('start', ' 5: + counter = 0 + + +def plotGenomicCoords(plt, rpkm_data,fontsize=10,rotation=0): + import operator + import locale + exon_set = rpkm_data.exons + genomic_coords = np.array(map(operator.itemgetter("start"),exon_set)) + + ticks = range(0,len(exon_set),len(exon_set)/5) + + ticks[-1] -= 1 # the last tick is going to be off the chart, so we estimate it as the second to last genomic coord. + labels = [locale.format("%d", genomic_coords[i], grouping=True) for i in ticks if i < len(genomic_coords)] + if rotation != 0: + ha = "right" + else: + ha = "center" + _ = plt.xticks(ticks,labels,fontsize=fontsize,rotation=rotation,ha=ha) + + +def plotRawData(axis, rpkm_data, color='r',linewidth=0.7): + zero_stack = np.zeros(len(rpkm_data)) + positions = np.repeat(np.arange(0,len(rpkm_data)),3) + logr = np.vstack([zero_stack,rpkm_data.flatten(),zero_stack]).transpose().ravel() + axis.plot(positions,logr,color=color,marker=None,linewidth=1) + +def getbkpoints(mask): + bkpoints = np.nonzero(np.logical_xor(mask[0:-1],mask[1:]))[0]+1 + if mask[0] == 1: + bkpoints = np.hstack([0,bkpoints]) + if mask[-1] == 1: + bkpoints = np.hstack([bkpoints,len(mask)]) + return bkpoints.reshape(len(bkpoints)/2,2) + +def mergeCalls(calls): + if len(calls) == 0: + return [] + + out_calls = [] + calls=np.array(calls)[np.argsort(np.array(map(operator.itemgetter("start"),calls),dtype=np.int))] + pstart = calls[0]["start"] + pstop = calls[0]["stop"] + for d in calls: + if d["start"] <= pstop: + pstop = max(d["stop"],pstop) + else: + out_calls.append({"sampleID": d["sampleID"], "chromosome": d["chromosome"], "start":pstart, "stop":pstop, "state": d["state"]}) + pstart = d["start"] + pstop = d["stop"] + + out_calls.append({"sampleID": d["sampleID"], "chromosome": d["chromosome"], "start":pstart, "stop":pstop, "state": d["state"]}) + return out_calls + +class rpkm_data: + def __init__(self): + self.rpkm = None + self.samples = None + self.exons = None + self.isGenotype = False + self.calls = [] + self.refined_calls = [] + + def smooth(self, window = 15, padded = False): #todo, fix the padding here + if self.isGenotype: + print "Warning: the data in this rpkm_data container are single genotype values. Smoothing will have no effect!" + return self.rpkm + + if window > 0: + weightings=np.blackman(window) + weightings = weightings/weightings.sum() + smoothed_data = np.array([]) + for row in self.rpkm.transpose(): + smoothed = np.convolve(row, weightings)[(window-1)/2:-((window-1)/2)] + if len(smoothed_data) == 0: + smoothed_data = smoothed + else: + smoothed_data = np.vstack([smoothed_data,smoothed]) + + self.rpkm = smoothed_data.transpose() + return self.rpkm + else: + return self.rpkm + + def getSample(self, sampleIDs): + sample_array = np.array(self.samples) + if isinstance(sampleIDs,list): + mask = np.zeros(len(sample_array),dtype=np.bool) + for sampleID in sampleIDs: + mask = np.logical_or(mask, sample_array == str(sampleID)) + + return self.rpkm[:,mask] + else: + mask = sample_array == str(sampleID) + return self.rpkm[:,mask] + + def getSamples(self, sampleIDs): + return self.getSample(sampleIDs) + + @property + def shape(self): + if self.isGenotype: + return [len(self.samples), 1] + else: + return [len(self.samples), len(self.exons)] + + +class rpkm_reader: + def __init__(self, rpkm_fn=None): + """Initialize an rpkm_reader instance. Specify the location of the data file""" + + if rpkm_fn == None: + print "Must specify RPKM HDF5 file!" + return 0 + # set up file access + # self.h5file = openFile(rpkm_fn, mode='r') + self.h5file = open_file(rpkm_fn, mode='r') + self.sample_table = self.h5file.root.samples.samples + + def __del__(self): + self.h5file.close() + + def getExonValuesByExons(self, chromosome, start_exon, stop_exon, sampleList=None,genotype=False): + + probe_tbl = self.h5file.root.probes._f_getChild("probes_chr" + str(chromosome)) + #table_rows = probe_tbl.getWhereList('(start >= %d) & (stop <= %d)' % (start,stop)) + start_exon = max(start_exon,0) + stop_exon = min(stop_exon, probe_tbl.nrows) + #print start_exon, stop_exon + table_rows = np.arange(start_exon,stop_exon,1) + data_tbl = self.h5file.root._f_getChild("chr" + str(chromosome)) + + if sampleList == None: + num_samples = data_tbl._v_nchildren + samples = data_tbl + else: + num_samples = len(sampleList) + samples = [data_tbl._f_getChild("sample_" + s) for s in sampleList] + + data = np.empty([num_samples,len(table_rows)],dtype=np.float) + + out_sample_list = [] + cnt = 0 + for sample_tbl in samples: + d = sample_tbl.readCoordinates(table_rows,field="rpkm") + data[cnt,:] = d + cnt +=1 + out_sample_list.append(sample_tbl.title) + + d = rpkm_data() + if genotype: # return average #todo-- implement median and SD? + d.rpkm = data.transpose().mean(axis=0) + d.isGenotype = True + else: #return all data points + d.rpkm = data.transpose() + d.samples = out_sample_list + d.exons = probe_tbl.readCoordinates(table_rows) + + return d + + def getExonValuesByRegion(self, chromosome, start=None, stop=None, sampleList=None,genotype=False): + probe_tbl = self.h5file.root.probes._f_getChild("probes_chr" + str(chromosome)) + if (start is not None) and (stop is not None): + table_rows = probe_tbl.getWhereList('(start >= %d) & (stop <= %d)' % (start,stop)) + else: + table_rows = probe_tbl.getWhereList('(start >= 0) & (stop <= 1000000000)') + + data_tbl = self.h5file.root._f_getChild("chr" + str(chromosome)) + + if sampleList == None: + num_samples = data_tbl._v_nchildren + samples = data_tbl + else: + num_samples = len(sampleList) + samples = [data_tbl._f_getChild("sample_" + s) for s in sampleList] + + data = np.empty([num_samples,len(table_rows)],dtype=np.float) + + out_sample_list = [] + cnt = 0 + for sample_tbl in samples: + d = sample_tbl.readCoordinates(table_rows,field="rpkm") + data[cnt,:] = d + cnt +=1 + out_sample_list.append(sample_tbl.title) + + d = rpkm_data() + if genotype: # return average #todo-- implement median and SD? + d.rpkm = data.transpose().mean(axis=0) + d.isGenotype = True + else: #return all data points + d.rpkm = data.transpose() + d.samples = out_sample_list + d.exons = probe_tbl.readCoordinates(table_rows) + + return d + + def getSampleList(self,cohort=None,sex=None,ethnicity=None,custom=None): + """Return a list of available samples in the current data file. Specifying no arguments will return all available samples""" + + readWhereStr = "" + if custom != None: + readWhereStr = custom + else: + if cohort != None: + if isinstance(cohort,list): + for c in cohort: + readWhereStr += "(cohort=='%s') | " % c + readWhereStr = readWhereStr.strip(" |") + readWhereStr += " & " + else: + readWhereStr += "(cohort=='%s') " % cohort + if sex != None: + if sex not in ['M','F']: + sex = sex.upper()[0] + readWhereStr += " (sex=='%s') &" % sex + if ethnicity != None: + readWhereStr += " (ethnicity=='%s') &" % ethnicity + + readWhereStr = readWhereStr.strip(" &") # remove leading or trailing characters + if readWhereStr != "": + #print readWhereStr + sampleIDs = self.sample_table.readWhere(readWhereStr,field='sampleID') + else: + sampleIDs = self.sample_table.read(field='sampleID') + + return sampleIDs + + def getExonIDs(self, chromosome, start, stop): + probe_tbl = self.h5file.root.probes._f_getChild("probes_chr" + str(chromosome)) + exons = probe_tbl.getWhereList('(start >= %d) & (stop <= %d)' % (start,stop)) + return exons