Mercurial > repos > bzonnedda > conifer

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/conifer/c_calls.xml	Tue Oct 18 09:26:52 2016 -0400
@@ -0,0 +1,48 @@
+<tool id="conifer_call" name="CoNIFER CNVs caller" version="1.0">
+  <description></description>
+  <command>
+       python conifer.py call
+	--input $input
+	--output temporary
+    --threshold $threshold
+	2&gt;&amp;1
+    &amp;&amp; awk '{print \$2,\$3,\$4,\$5,\$1}' OFS="\t" temporary &gt; $outputFile
+  </command>
+  <inputs>
+    <param format="hdf5" name="input" type="data" label="CoNIFER output analysis file (HDF5)" />
+    <param name="threshold" type="float" value="1.5" label="Threshold" />
+  </inputs>
+  <outputs>
+    <data format="bed" name="outputFile" label="${tool.name} on ${on_string}" />
+  </outputs>
+ <help>
+**What it does**
+
+This tool is a CNVs caller using SVD-ZRPKM thresholding.
+
+**License and citation**
+
+This Galaxy tool is Copyright © 2014 `CRS4 Srl.`_ and is released under the `MIT license`_.
+
+.. _CRS4 Srl.: http://www.crs4.it/
+.. _MIT license: http://opensource.org/licenses/MIT
+
+You can use this tool only if you agree to the license terms of: `CoNIFER`_.
+
+.. _CoNIFER: http://conifer.sourceforge.net/
+
+If you use this tool, please cite:
+
+- |Cuccuru2014|_
+- |Krumm2012|_.
+
+.. |Cuccuru2014| replace:: Cuccuru, G., Orsini, M., Pinna, A., Sbardellati, A., Soranzo, N., Travaglione, A., Uva, P., Zanetti, G., Fotia, G. (2014) Orione, a web-based framework for NGS analysis in microbiology. *Bioinformatics* 30(13), 1928-1929
+.. _Cuccuru2014: http://bioinformatics.oxfordjournals.org/content/30/13/1928
+.. |Krumm2012| replace::  Krumm, N., Sudmant, P. H., Ko, A., O'Roak, B. J., Malig, M., Coe, B. P., NHLBI Exome Sequencing Project, Quinlan, A. R., Nickerson, D. A., Eichler, E. E. (2012) Copy number variation detection and genotyping from exome sequence data. *Genome Res.* 22(8), 1525-1532
+.. _Krumm2012: http://genome.cshlp.org/content/22/8/1525
+  </help>
+  <citations>
+    <citation type="doi">10.1093/bioinformatics/btu135</citation>
+    <citation type="doi">10.1101/gr.138115.112</citation>
+  </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/conifer/c_plotcalls.xml	Tue Oct 18 09:26:52 2016 -0400
@@ -0,0 +1,57 @@
+<tool id="conifer_plotcalls" name="CoNIFER plot caller" version="1.0">
+	<description>cnv caller</description>
+	<command interpreter="perl">
+    conifer_wrapper.pl --input $input --regions $plot_option.regions --html_file $html_file --html_folder $html_file.files_path
+    #if str($plot_option.plot_option_select) == "single"
+        --sample "$plot_option.sample"
+    #else
+        --window $plot_option.window
+        --multiple
+    #end if
+    2&gt;&amp;1
+	</command>
+	<inputs>
+		<param format="hdf5" name="input" type="data" label="CoNIFER output analysis file (hdf5)"/>
+		<conditional name="plot_option">
+			<param name="plot_option_select" type="select" label="Single or multiple regions?">
+				<option value="single" selected="true">Single region</option>
+				<option value="multiple">Multiple regions</option>
+			</param>
+			<when value="single">
+				<param name="regions" type="text" size="50" optional="false" label="Region" help="Region for the plot (i.e. chr1:1000-2000)"/>
+				<param name="sample" type="text" size="50" optional="true" value="" label="Sample" help="Sample names to be highlighted in the plot (optional)"/>
+			</when>
+			<when value="multiple">
+				<param format="tabular" name="regions" type="data" label="CoNIFER output calls file (tabular)"/>
+                <param name="window" size="10" type="integer" value="50" label="Window"/>
+			</when>
+		</conditional>
+    </inputs>
+    <outputs>
+        <data format="html" name="html_file" label="${tool.name} on ${on_string}" />
+    </outputs>
+<help>
+**What it does**
+
+Draw a HTML page with CNV plots. It accepts as input either a single region or a file with CoNIFER calls.
+
+**License and citation**
+
+This Galaxy tool is Copyright © 2013 `CRS4 Srl.`_ and is released under the `MIT license`_.
+
+.. _CRS4 Srl.: http://www.crs4.it/
+.. _MIT license: http://opensource.org/licenses/MIT
+
+If you use this tool in Galaxy, please cite |Cuccuru2014|_.
+
+.. |Cuccuru2014| replace:: Cuccuru, G., Orsini, M., Pinna, A., Sbardellati, A., Soranzo, N., Travaglione, A., Uva, P., Zanetti, G., Fotia, G. (2014) Orione, a web-based framework for NGS analysis in microbiology. *Bioinformatics*, accepted
+.. _Cuccuru2014: http://bioinformatics.oxfordjournals.org/content/early/2014/03/10/bioinformatics.btu135
+
+This tool uses CoNIFER, which is licensed separately. Please cite:
+
+- |Krumm2012|_.
+
+.. |Krumm2012| replace::  (Krumm et al., 2012) Copy number variation detection and genotyping from exome sequence data. Genome research.
+.. _Krumm2012: http://genome.cshlp.org/content/22/8/1525.full?sid=4a7a300a-b960-4544-8611-effc3315411c
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/conifer/c_rpkm.xml	Tue Oct 18 09:26:52 2016 -0400
@@ -0,0 +1,82 @@
+<tool id="conifer_rpkm" name="CoNIFER RPKM and Analyze" version="1.0">
+  <description></description>
+  <command>
+     mkdir rpkm_dir;
+#for $input_bam in $inputs:
+	#if str($input_bam.label.value) != "":
+	     ln -s ${input_bam.inputFile} ${$input_bam.label.value}.bam;
+	     ln -s ${input_bam.inputFile.metadata.bam_index} ${$input_bam.label.value}.bam.bai;
+	     python conifer.py rpkm
+	     --probes $probesFile
+	     --input ${$input_bam.label.value}.bam
+	     --output rpkm_dir/${$input_bam.label.value}.txt;
+	#else
+	     ln -s ${input_bam.inputFile} ${input_bam.inputFile.dataset.name}.bam;
+	     ln -s ${input_bam.inputFile.metadata.bam_index} ${input_bam.inputFile.dataset.name}.bam.bai;
+	     python conifer.py rpkm
+	     --probes $probesFile
+	     --input ${input_bam.inputFile.dataset.name}.bam
+	     --output rpkm_dir/${input_bam.inputFile.dataset.name}.txt;
+	#end if
+#end for
+     python conifer.py analyze
+     --probes $probesFile
+     --rpkm_dir rpkm_dir
+     --output $outputFile
+     --svd $svd
+     2&gt;&amp;1;
+  </command>
+  <inputs>
+    <param format="bed" name="probesFile" type="data" label="Target regions (BED)" />
+    <repeat name="inputs" title="BAM" min="2" help="Need to add more files? Use controls below.">
+      <param format="bam" name="inputFile" type="data" label="BAM file" />
+      <param name="label" type="text" size="30" value="" label="Label" help="Label to use in the output. If not given, the dataset name will be used instead">
+        <validator type="regex" message="Spaces are not allowed">^\S*$</validator>
+      </param>
+    </repeat>
+    <param name="svd" type="integer" value="2" label="SVD" />
+  </inputs>
+  <outputs>
+    <data format="hdf5" name="outputFile" label="${tool.name} on ${on_string}: hdf5" />
+  </outputs>
+  <help>
+
+.. class:: warningmark
+
+**Warning about SVD**
+
+The SVD value must be less than the number of samples.
+
+E.g.: if the number of samples is 3, the value of SVD must be 2 or less.
+
+**What it does**
+
+This tool calculates RPKM for each BAM file and analyzes them, afterward it creates a unique HDF5 file containing all SVD-ZRPKM values,
+probes and samples for downstream analysis with 'CoNIFER CNV caller' and 'CoNIFER plot'.
+
+**License and citation**
+
+This Galaxy tool is Copyright © 2015 `CRS4 Srl.`_ and is released under the `MIT license`_.
+
+.. _CRS4 Srl.: http://www.crs4.it/
+.. _MIT license: http://opensource.org/licenses/MIT
+
+You can use this tool only if you agree to the license terms of: `CoNIFER`_.
+
+.. _CoNIFER: http://conifer.sourceforge.net/
+
+If you use this tool, please cite:
+
+- |Cuccuru2014|_
+- |Krumm2012|_.
+
+.. |Cuccuru2014| replace:: Cuccuru, G., Orsini, M., Pinna, A., Sbardellati, A., Soranzo, N., Travaglione, A., Uva, P., Zanetti, G., Fotia, G. (2014) Orione, a web-based framework for NGS analysis in microbiology. *Bioinformatics* 30(13), 1928-1929
+.. _Cuccuru2014: http://bioinformatics.oxfordjournals.org/content/30/13/1928
+.. |Krumm2012| replace::  Krumm, N., Sudmant, P. H., Ko, A., O'Roak, B. J., Malig, M., Coe, B. P., NHLBI Exome Sequencing Project, Quinlan, A. R., Nickerson, D. A., Eichler, E. E. (2012) Copy number variation detection and genotyping from exome sequence data. *Genome Res.* 22(8), 1525-1532
+.. _Krumm2012: http://genome.cshlp.org/content/22/8/1525
+  </help>
+  <citations>
+    <citation type="doi">10.1093/bioinformatics/btu135</citation>
+    <citation type="doi">10.1101/gr.138115.112</citation>
+  </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/conifer/conifer.py	Tue Oct 18 09:26:52 2016 -0400
@@ -0,0 +1,682 @@
+#######################################################################
+#######################################################################
+# CoNIFER: Copy Number Inference From Exome Reads
+# Developed by Niklas Krumm (C) 2012
+# nkrumm@gmail.com
+#
+# homepage: http://conifer.sf.net
+# This program is described in:
+# Krumm et al. 2012. Genome Research. doi:10.1101/gr.138115.112
+#
+# This file is part of CoNIFER.
+# CoNIFER is free software:  you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#######################################################################
+#######################################################################
+
+import argparse
+import os, sys, copy
+import glob
+import csv
+import conifer_functions as cf
+import operator
+from tables import *
+import numpy  as np
+
+def CF_analyze(args):
+	# do path/file checks:
+	try:
+		# read probes table
+		probe_fn = str(args.probes[0])
+		probes = cf.loadProbeList(probe_fn)
+		num_probes = len(probes)
+		print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)
+	except IOError as e:
+		print '[ERROR] Cannot read probes file: ', probe_fn
+		sys.exit(0)
+
+	try:
+		svd_outfile_fn = str(args.output)
+		h5file_out = openFile(svd_outfile_fn, mode='w')
+		probe_group = h5file_out.createGroup("/","probes","probes")
+	except IOError as e:
+		print '[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn
+		sys.exit(0)
+
+	if args.write_svals != "":
+		sval_f = open(args.write_svals,'w')
+
+	if args.plot_scree != "":
+		try:
+			import matplotlib
+			matplotlib.use('Agg')
+			import matplotlib.pyplot as plt
+			import pylab as P
+			from matplotlib.lines import Line2D
+			from matplotlib.patches import Rectangle
+		except:
+			print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
+			sys.exit(0)
+
+		plt.gcf().clear()
+		fig = plt.figure(figsize=(10,5))
+		ax = fig.add_subplot(111)
+
+	rpkm_dir = str(args.rpkm_dir[0])
+	rpkm_files = glob.glob(rpkm_dir + "/*")
+	if len(rpkm_files) == 0:
+		print '[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir
+		sys.exit(0)
+	elif len(rpkm_files) == 1:
+		print '[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.'
+		sys.exit(0)
+	elif len(rpkm_files) < 8:
+		print '[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len(rpkm_files)
+	elif len(rpkm_files) <= int(args.svd):
+		print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (len(rpkm_files), int(args.svd))
+		sys.exit(0)
+	else:
+		print '[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir)
+
+	# read in samples names and generate file list
+	samples = {}
+	for f in rpkm_files:
+		s = '.'.join(f.split('/')[-1].split('.')[0:-1])
+		print "[INIT] Mapping file to sampleID: %s --> %s" % (f, s)
+		samples[s] = f
+
+	#check uniqueness and total # of samples
+	if len(set(samples)) != len(set(rpkm_files)):
+		print '[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!'
+		sys.exit(0)
+
+	# LOAD RPKM DATA
+	RPKM_data = np.zeros([num_probes,len(samples)], dtype=np.float)
+	failed_samples = 0
+
+	for i,s in enumerate(samples.keys()):
+		t = np.loadtxt(samples[s], dtype=np.float, delimiter="\t", skiprows=0, usecols=[2])
+		if len(t) != num_probes:
+			print "[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % (s, samples[s], probe_fn)
+			_ = samples.pop(s)
+			failed_samples += 1
+		else:
+			RPKM_data[:,i] = t
+			print "[INIT] Successfully read RPKM data for sampleID: %s" % s
+
+	RPKM_data = RPKM_data[:,0:len(samples)]
+	print "[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % (len(samples), failed_samples)
+
+	if len(samples) <= int(args.svd):
+		print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (int(args.svd), len(samples))
+		sys.exit(0)
+
+	# BEGIN
+	chrs_to_process = set(map(operator.itemgetter("chr"),probes))
+	chrs_to_process_str = ', '.join([cf.chrInt2Str(c) for c in chrs_to_process])
+	print '[INIT] Attempting to process chromosomes: ', chrs_to_process_str
+
+
+
+	for chr in chrs_to_process:
+		print "[RUNNING: chr%d] Now on: %s" %(chr, cf.chrInt2Str(chr))
+		chr_group_name = "chr%d" % chr
+		chr_group = h5file_out.createGroup("/",chr_group_name,chr_group_name)
+
+		chr_probes = filter(lambda i: i["chr"] == chr, probes)
+		num_chr_probes = len(chr_probes)
+		start_probeID = chr_probes[0]['probeID']
+		stop_probeID = chr_probes[-1]['probeID']
+		print "[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID-1, stop_probeID) # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term
+
+		rpkm = RPKM_data[start_probeID:stop_probeID,:]
+
+		print "[RUNNING: chr%d] Calculating median RPKM" % chr
+		median = np.median(rpkm,1)
+		sd = np.std(rpkm,1)
+		probe_mask = median >= float(args.min_rpkm)
+		print "[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % (chr, np.sum(probe_mask==False), float(args.min_rpkm))
+
+		rpkm = rpkm[probe_mask, :]
+		num_chr_probes = np.sum(probe_mask)
+
+		if num_chr_probes <= len(samples):
+			print "[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file"
+			sys.exit(0)
+
+		probeIDs = np.array(map(operator.itemgetter("probeID"),chr_probes))[probe_mask]
+		probe_starts = np.array(map(operator.itemgetter("start"),chr_probes))[probe_mask]
+		probe_stops = np.array(map(operator.itemgetter("stop"),chr_probes))[probe_mask]
+		gene_names =  np.array(map(operator.itemgetter("name"),chr_probes))[probe_mask]
+
+		dt = np.dtype([('probeID',np.uint32),('start',np.uint32),('stop',np.uint32), ('name', np.str_, 20)])
+
+		out_probes = np.empty(num_chr_probes,dtype=dt)
+		out_probes['probeID'] = probeIDs
+		out_probes['start'] = probe_starts
+		out_probes['stop'] = probe_stops
+		out_probes['name'] = gene_names
+		probe_table = h5file_out.createTable(probe_group,"probes_chr%d" % chr,cf.probe,"chr%d" % chr)
+		probe_table.append(out_probes)
+
+		print "[RUNNING: chr%d] Calculating ZRPKM scores..." % chr
+		rpkm = np.apply_along_axis(cf.zrpkm, 0, rpkm, median[probe_mask], sd[probe_mask])
+
+		# svd transform
+		print "[RUNNING: chr%d] SVD decomposition..." % chr
+		components_removed = int(args.svd)
+
+		U, S, Vt = np.linalg.svd(rpkm,full_matrices=False)
+		new_S = np.diag(np.hstack([np.zeros([components_removed]),S[components_removed:]]))
+
+		if args.write_svals != "":
+			sval_f.write('chr' + str(chr) + '\t' + '\t'.join([str(_i) for _i in S]) + "\n")
+
+		if args.plot_scree != "":
+			ax.plot(S, label='chr' + str(chr),lw=0.5)
+
+		# reconstruct data matrix
+		rpkm = np.dot(U, np.dot(new_S, Vt))
+
+
+		# save to HDF5 file
+		print "[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr
+
+		for i,s in enumerate(samples):
+			out_data = np.empty(num_chr_probes,dtype='u4,f8')
+			out_data['f0'] = probeIDs
+			out_data['f1'] = rpkm[:,i]
+			sample_tbl = h5file_out.createTable(chr_group,"sample_" + str(s),cf.rpkm_value,"%s" % str(s))
+			sample_tbl.append(out_data)
+
+
+	print "[RUNNING] Saving sampleIDs to file..."
+	sample_group = h5file_out.createGroup("/","samples","samples")
+	sample_table = h5file_out.createTable(sample_group,"samples",cf.sample,"samples")
+	dt = np.dtype([('sampleID',np.str_,100)])
+	out_samples = np.empty(len(samples.keys()),dtype=dt)
+	out_samples['sampleID'] = np.array(samples.keys())
+	sample_table.append(out_samples)
+
+
+	if args.write_sd != "":
+		print "[RUNNING] Calculating standard deviations for all samples (this can take a while)..."
+
+		sd_file = open(args.write_sd,'w')
+
+		for i,s in enumerate(samples):
+			# collect all SVD-ZRPKM values
+			count = 1
+			for chr in chrs_to_process:
+				if count == 1:
+					sd_out = h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten()
+				else:
+					sd_out = np.hstack([sd_out,out.h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten()])
+
+				sd = np.std(sd_out)
+			sd_file.write("%s\t%f\n" % (s,sd))
+
+		sd_file.close()
+
+	if args.plot_scree != "":
+		plt.title("Scree plot")
+		if len(samples) < 50:
+			plt.xlim([0,len(samples)])
+			plt.xlabel("S values")
+		else:
+			plt.xlim([0,50])
+			plt.xlabel("S values (only first 50 plotted)")
+		plt.ylabel("Relative contributed variance")
+		plt.savefig(args.plot_scree)
+
+	print "[FINISHED]"
+	h5file_out.close()
+	sys.exit(0)
+
+def CF_export(args):
+	try:
+		h5file_in_fn = str(args.input)
+		h5file_in = openFile(h5file_in_fn, mode='r')
+	except IOError as e:
+		print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn
+		sys.exit(0)
+
+	# read probes
+	probes = {}
+	for probes_chr in h5file_in.root.probes:
+		probes[probes_chr.title] = probes_chr.read()
+
+	if args.sample =='all':
+		all_samples = list(h5file_in.root.samples.samples.read(field="sampleID"))
+
+		out_path = os.path.abspath(args.output)
+
+		print "[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path)
+		for sample in all_samples:
+			try:
+				outfile_fn = out_path + "/" + sample + ".bed"
+				outfile_f = open(outfile_fn,'w')
+			except IOError as e:
+				print '[ERROR] Cannot open output file for writing: ', outfile_fn
+				sys.exit(0)
+			print "[RUNNING] Exporting %s" % sample
+
+			cf.export_sample(h5file_in,sample,probes,outfile_f)
+			outfile_f.close()
+
+	elif len(args.sample) == 1:
+		out_path = os.path.abspath(args.output)
+		sample = args.sample[0]
+		print "[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path)
+		try:
+			if os.path.isdir(out_path):
+				outfile_fn = out_path + "/" + sample + ".bed"
+			else:
+				outfile_fn = out_path
+			outfile_f = open(outfile_fn,'w')
+		except IOError as e:
+			print '[ERROR] Cannot open output file for writing: ', outfile_fn
+			sys.exit(0)
+		print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn)
+
+		cf.export_sample(h5file_in,sample,probes,outfile_f)
+		outfile_f.close()
+
+	else:
+		out_path = os.path.abspath(args.output)
+		print "[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path)
+		for sample in args.sample:
+			try:
+				if os.path.isdir(out_path):
+					outfile_fn = out_path + "/" + sample + ".bed"
+				else:
+					outfile_fn = out_path
+				outfile_f = open(outfile_fn,'w')
+			except IOError as e:
+				print '[ERROR] Cannot open output file for writing: ', outfile_fn
+				sys.exit(0)
+			print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn)
+
+			cf.export_sample(h5file_in,sample,probes,outfile_f)
+			outfile_f.close()
+	sys.exit(0)
+
+def CF_call(args):
+	try:
+		h5file_in_fn = str(args.input)
+		h5file_in = openFile(h5file_in_fn, mode='r')
+	except IOError as e:
+		print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn
+		sys.exit(0)
+
+	try:
+		callfile_fn = str(args.output)
+		callfile_f = open(callfile_fn, mode='w')
+	except IOError as e:
+		print '[ERROR] Cannot open output file for writing: ', callfile_fn
+		sys.exit(0)
+
+	chrs_to_process = []
+	for chr in h5file_in.root:
+		if chr._v_title not in ('probes','samples'):
+			chrs_to_process.append(chr._v_title.replace("chr",""))
+
+	h5file_in.close()
+
+	print '[INIT] Initializing caller at threshold = %f' % (args.threshold)
+
+	r = cf.rpkm_reader(h5file_in_fn)
+
+	all_calls = []
+
+	for chr in chrs_to_process:
+		print '[RUNNING] Now processing chr%s' % chr
+		data = r.getExonValuesByRegion(chr)
+
+		#raw_data = copy.copy(data)
+		_ = data.smooth()
+
+		mean= np.mean(data.rpkm,axis=1)
+		sd =  np.std(data.rpkm,axis=1)
+
+		for sample in r.getSampleList():
+			sample_data = data.getSample([sample]).flatten()
+			#sample_raw_data = raw_data.getSample([sample]).flatten()
+
+			dup_mask = sample_data >= args.threshold
+			del_mask = sample_data <= -1*args.threshold
+
+			dup_bkpoints = cf.getbkpoints(dup_mask) #returns exon coordinates for this chromosome (numpy array coords)
+			del_bkpoints = cf.getbkpoints(del_mask)
+
+
+			dups = []
+			for start,stop in dup_bkpoints:
+				try: new_start =  np.max(np.where(sample_data[:start] < (mean[:start] + 3*sd[:start])))
+				except ValueError: new_start = 0
+				try: new_stop = stop + np.min(np.where(sample_data[stop:] < (mean[stop:] + 3*sd[stop:])))
+				except ValueError: new_stop = data.shape[1]-1
+				dups.append({"sampleID":sample,"chromosome":  cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "dup"})
+
+			dels = []
+			for start,stop in del_bkpoints:
+				try: new_start =  np.max(np.where(sample_data[:start] > (-1*mean[:start] - 3*sd[:start])))
+				except ValueError: new_start = 0
+				try: new_stop = stop + np.min(np.where(sample_data[stop:] > (-1*mean[stop:] - 3*sd[stop:])))
+				except ValueError: new_stop = data.shape[1]-1
+				dels.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "del"})
+
+			dels = cf.mergeCalls(dels) #merges overlapping calls
+			dups = cf.mergeCalls(dups)
+
+			#print sampleID, len(dels), len(dups)
+
+			all_calls.extend(list(dels))
+			all_calls.extend(list(dups))
+
+	# print calls to file
+	header = ['sampleID','chromosome','start','stop','state']
+
+	callfile_f.write('\t'.join(header) + "\n")
+	for call in all_calls:
+		print "%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])
+		callfile_f.write("%s\t%s\t%d\t%d\t%s\n" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]))
+
+	sys.exit(0)
+
+def CF_plot(args):
+	try:
+		import locale
+		import matplotlib
+		matplotlib.use('Agg')
+		import matplotlib.pyplot as plt
+		import pylab as P
+		from matplotlib.lines import Line2D
+		from matplotlib.patches import Rectangle
+		_ = locale.setlocale(locale.LC_ALL, '')
+	except:
+		print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
+		sys.exit(0)
+
+
+	chr, start, stop = cf.parseLocString(args.region)
+
+	r = cf.rpkm_reader(args.input)
+
+	data = r.getExonValuesByRegion(chr,start,stop)
+	_ = data.smooth()
+
+	plt.gcf().clear()
+	fig = plt.figure(figsize=(10,5))
+	ax = fig.add_subplot(111)
+
+
+	ax.plot(data.rpkm, linewidth = 0.3, c='k')
+
+
+	if args.sample != 'none':
+		cnt = 1
+		coloriter = iter(['r','b','g','y'])
+		for sample in args.sample:
+			try:
+				color, sampleID = sample.split(":")
+			except:
+				color =coloriter.next()
+				sampleID = sample
+
+			ax.plot(data.getSample([sampleID]), linewidth = 1, c=color, label = sampleID)
+
+			if cnt == 1:
+				cf.plotRawData(ax, r.getExonValuesByRegion(chr,start,stop,sampleList=[sampleID]).getSample([sampleID]),color=color)
+			cnt +=1
+		plt.legend(prop={'size':10},frameon=False)
+
+	cf.plotGenes(ax, data)
+	cf.plotGenomicCoords(plt,data)
+	plt.xlim(0,data.shape[1])
+	plt.ylim(-3,3)
+
+	plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True)))
+	plt.xlabel("Position")
+	plt.ylabel("SVD-ZRPKM Values")
+
+	plt.savefig(args.output)
+
+	sys.exit(0)
+
+def CF_plotcalls(args):
+	try:
+		import matplotlib
+		matplotlib.use('Agg')
+		import matplotlib.pyplot as plt
+		import pylab as P
+		from matplotlib.lines import Line2D
+		from matplotlib.patches import Rectangle
+	except:
+		print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
+		sys.exit(0)
+
+	import locale
+	try:
+		_ = locale.setlocale(locale.LC_ALL, 'en_US')
+	except:
+		_ = locale.setlocale(locale.LC_ALL, '')
+
+	try:
+		callfile_fn = str(args.calls)
+		callfile_f = open(callfile_fn, mode='r')
+	except IOError as e:
+		print '[ERROR] Cannot open call file for reading: ', callfile_fn
+		sys.exit(0)
+
+	all_calls = []
+	header = callfile_f.readline()
+
+	for line in callfile_f:
+		sampleID, chr, start, stop, state = line.strip().split()
+		chr = cf.chrStr2Int(chr)
+		all_calls.append({"chromosome":int(chr), "start":int(start), "stop":int(stop), "sampleID":sampleID})
+
+	r = cf.rpkm_reader(args.input)
+
+	for call in all_calls:
+		chr = call["chromosome"]
+		start = call["start"]
+		stop = call["stop"]
+		sampleID = call["sampleID"]
+
+		exons = r.getExonIDs(chr,int(start),int(stop))
+
+
+		window_start = max(exons[0]-args.window,0)
+		window_stop = exons[-1]+args.window
+
+		data = r.getExonValuesByExons(chr,window_start, window_stop)
+		_ = data.smooth()
+
+		plt.gcf().clear()
+		fig = plt.figure(figsize=(10,5))
+		ax = fig.add_subplot(111)
+
+
+		ax.plot(data.rpkm, linewidth = 0.3, c='k')
+
+
+		ax.plot(data.getSample([sampleID]), linewidth = 1, c='r', label = sampleID)
+		cf.plotRawData(ax, r.getExonValuesByExons(chr,window_start, window_stop,sampleList=[sampleID]).getSample([sampleID]),color='r')
+
+		plt.legend(prop={'size':10},frameon=False)
+
+		cf.plotGenes(ax, data)
+		cf.plotGenomicCoords(plt,data)
+
+		exon_start = np.where(data.exons["start"] == start)[0][0]
+		exon_stop = np.where(data.exons["stop"] == stop)[0][0]
+		_ = ax.add_line(matplotlib.lines.Line2D([exon_start,exon_stop],[2,2],color='k',lw=6,linestyle='-',alpha=1,solid_capstyle='butt'))
+
+		_ = plt.xlim(0,data.shape[1])
+		_ = plt.ylim(-3,3)
+
+		plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True)))
+		plt.xlabel("Position")
+		plt.ylabel("SVD-ZRPKM Values")
+		outfile = "%s_%d_%d_%s.png" % (cf.chrInt2Str(chr), start, stop, sampleID)
+		plt.savefig(args.outputdir + "/" + outfile)
+
+def CF_bam2RPKM(args):
+	try:
+		import pysam
+	except:
+		print '[ERROR] Cannot load pysam module! Make sure it is insalled'
+		sys.exit(0)
+	try:
+		# read probes table
+		probe_fn = str(args.probes[0])
+		probes = cf.loadProbeList(probe_fn)
+		num_probes = len(probes)
+		print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)
+	except IOError as e:
+		print '[ERROR] Cannot read probes file: ', probe_fn
+		sys.exit(0)
+
+	try:
+		rpkm_f = open(args.output[0],'w')
+	except IOError as e:
+		print '[ERROR] Cannot open rpkm file for writing: ', args.output
+		sys.exit(0)
+
+	print "[RUNNING] Counting total number of reads in bam file..."
+	total_reads = float(pysam.view("-c", args.input[0])[0].strip("\n"))
+	print "[RUNNING] Found %d reads" % total_reads
+
+	f = pysam.Samfile(args.input[0], "rb" )
+
+	if not f._hasIndex():
+		print "[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % args.input[0]
+		sys.exit(0)
+
+	# will be storing values in these arrays
+	readcount = np.zeros(num_probes)
+	exon_bp = np.zeros(num_probes)
+	probeIDs = np.zeros(num_probes)
+	counter = 0
+
+	# detect contig naming scheme here # TODO, add an optional "contigs.txt" file or automatically handle contig naming
+	bam_contigs = f.references
+	probes_contigs = [str(p) for p in set(map(operator.itemgetter("chr"),probes))]
+
+	probes2contigmap = {}
+
+	for probes_contig in probes_contigs:
+		if probes_contig in bam_contigs:
+			probes2contigmap[probes_contig] = probes_contig
+		elif cf.chrInt2Str(probes_contig) in bam_contigs:
+			probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig)
+		elif cf.chrInt2Str(probes_contig).replace("chr","") in bam_contigs:
+			probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig).replace("chr","")
+		else:
+			print "[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % (probes_contig, probe_fn)
+			sys.exit(0)
+
+	print "[RUNNING] Calculating RPKM values..."
+
+	# loop through each probe
+	for p in probes:
+
+		# f.fetch is a pysam method and returns an iterator for reads overlapping interval
+
+		p_chr = probes2contigmap[str(p["chr"])]
+
+		p_start = p["start"]
+		p_stop = p["stop"]
+		try:
+			iter = f.fetch(p_chr,p_start,p_stop)
+		except:
+			print "[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % (p_chr,p_start,p_stop)
+			sys.exit(0)
+
+		for i in iter:
+			if i.pos+1 >= p_start: #this checks to make sure a read actually starts in an interval
+				readcount[counter] += 1
+
+		exon_bp[counter] = p_stop-p_start
+		probeIDs[counter] = counter +1 #probeIDs are 1-based
+		counter +=1
+
+	#calcualte RPKM values for all probes
+	rpkm = (10**9*(readcount)/(exon_bp))/(total_reads)
+
+	out = np.vstack([probeIDs,readcount,rpkm])
+
+	np.savetxt(rpkm_f,out.transpose(),delimiter='\t',fmt=['%d','%d','%f'])
+
+	rpkm_f.close()
+
+
+
+VERSION = "0.2.2"
+parser = argparse.ArgumentParser(prog="CoNIFER", description="This is CoNIFER %s (Copy Number Inference From Exome Reads), designed to detect and genotype CNVs and CNPs from exome sequence read-depth data. See Krumm et al., Genome Research (2012) doi:10.1101/gr.138115.112 \nNiklas Krumm, 2012\n nkrumm@uw.edu" % VERSION)
+parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION)
+subparsers = parser.add_subparsers(help='Command to be run.')
+
+# rpkm command
+rpkm_parser= subparsers.add_parser('rpkm', help='Create an RPKM file from a BAM file')
+rpkm_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt',  nargs=1,help="Probe definition file")
+rpkm_parser.add_argument('--input',action='store', required=True, metavar='sample.bam',nargs=1,  help="Aligned BAM file")
+rpkm_parser.add_argument('--output',action='store', required=True, metavar='sample.rpkm.txt',nargs=1,  help="RPKM file to write")
+rpkm_parser.set_defaults(func=CF_bam2RPKM)
+
+# analyze command
+analyze_parser= subparsers.add_parser('analyze', help='Basic CoNIFER analysis. Reads a directory of RPKM files and a probe list and outputs a HDF5 file containing SVD-ZRPKM values.')
+analyze_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt',  nargs=1,help="Probe definition file")
+analyze_parser.add_argument('--rpkm_dir',action='store', required=True, metavar='/path/to/folder/containing/rpkm_files/',nargs=1,  help="Location of folder containing RPKM files. Folder should contain ONLY contain RPKM files, and all readable RPKM files will used in analysis.")
+analyze_parser.add_argument('--output','-o', required=True, metavar='/path/to/output_file.hdf5', type=str, help="Output location of HDF5 file to contain SVD-ZRPKM values")
+analyze_parser.add_argument('--svd', metavar='12', type=int, nargs='?', default = 12,help="Number of components to remove")
+analyze_parser.add_argument('--min_rpkm', metavar='1.00', type=float, nargs="?", default = 1.00,help="Minimum population median RPKM per probe.")
+analyze_parser.add_argument('--write_svals', metavar='SingularValues.txt', type=str, default= "", help="Optional file to write singular values (S-values). Used for Scree Plot.")
+analyze_parser.add_argument('--plot_scree', metavar='ScreePlot.png', type=str, default= "", help="Optional graphical scree plot. Requires matplotlib.")
+analyze_parser.add_argument('--write_sd', metavar='StandardDeviations.txt', type=str, default= "", help="Optional file with sample SVD-ZRPKM standard deviations. Used to filter noisy samples.")
+analyze_parser.set_defaults(func=CF_analyze)
+
+# export command
+export_parser= subparsers.add_parser('export', help='Export SVD-ZRPKM values from a HDF5 file to bed or vcf format.')
+export_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
+export_parser.add_argument('--output','-o',action='store', required=False, default='.', metavar='output.bed',help="Location of output file[s]. When exporting multiple samples, top-level directory of this path will be used.")
+export_parser.add_argument('--sample','-s',action='store', required=False, metavar='sampleID', default='all', nargs="+",help="SampleID or comma-separated list of sampleIDs to export. Default: export all samples")
+export_parser.set_defaults(func=CF_export)
+
+# plot command
+plot_parser= subparsers.add_parser('plot', help='Plot SVD-ZRPKM values using matplotlib')
+plot_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
+plot_parser.add_argument('--region',action='store', required=True, metavar='chr#:start-stop',help="Region to plot")
+plot_parser.add_argument('--output',action='store', required=True, metavar='image.png',help="Output path and filetype. PDF, PNG, PS, EPS, and SVG are supported.")
+plot_parser.add_argument('--sample',action='store', required=False, metavar='sampleID',nargs="+",default='none',help="Sample[s] to highlight. The following optional color spec can be used: <color>:<sampleID>. Available colors are r,b,g,y,c,m,y,k. The unsmoothed SVD-ZRPKM values for the first sample in this list will be drawn. Default: No samples highlighted.")
+plot_parser.set_defaults(func=CF_plot)
+
+# make calls command
+call_parser= subparsers.add_parser('call', help='Very rudimentary caller for CNVs using SVD-ZRPKM thresholding.')
+call_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
+call_parser.add_argument('--output',action='store', required=True, metavar='calls.txt',help="Output file for calls")
+call_parser.add_argument('--threshold', metavar='1.50', type=float, nargs='?', required=False, default = 1.50,help="+/- Threshold for calling (minimum SVD-ZRPKM)")
+call_parser.set_defaults(func=CF_call)
+
+# plotcalls command
+plotcalls_parser= subparsers.add_parser('plotcalls', help='Make basic plots from call file from "call" command.')
+plotcalls_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
+plotcalls_parser.add_argument('--calls',action='store', required=True, metavar='calls.txt',help="File with calls from 'call' command.")
+plotcalls_parser.add_argument('--outputdir',action='store', required=True, metavar='/path/to/directory',help="Output directory for plots")
+plotcalls_parser.add_argument('--window',action='store', required=False, metavar='50',default=50,help="In exons, the amount of padding to plot around each call")
+plotcalls_parser.set_defaults(func=CF_plotcalls)
+
+args = parser.parse_args()
+args.func(args)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/conifer/conifer_functions.py	Tue Oct 18 09:26:52 2016 -0400
@@ -0,0 +1,370 @@
+#######################################################################
+#######################################################################
+# CoNIFER: Copy Number Inference From Exome Reads
+# Developed by Niklas Krumm (C) 2012
+# nkrumm@gmail.com
+#
+# homepage: http://conifer.sf.net
+# This program is described in:
+# Krumm et al. 2012. Genome Research. doi:10.1101/gr.138115.112
+#
+# This file is part of CoNIFER.
+# CoNIFER is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#######################################################################
+#######################################################################
+
+import csv
+from tables import *
+import numpy as np
+import operator
+
+class rpkm_value(IsDescription):
+	probeID = UInt32Col(pos=0)
+	rpkm = FloatCol(pos=1)
+
+class probe(IsDescription):
+	probeID = 	UInt32Col(pos=0)
+	start = 	UInt32Col(pos=1) # start of probe
+	stop = 		UInt32Col(pos=2) # stop of probe
+	name = 		StringCol(20,pos=3)   # 20-character String
+
+def chrInt2Str(chromosome_int):
+	if int(chromosome_int) == 23:
+		return 'chrX'
+	elif int(chromosome_int) == 24:
+		return 'chrY'
+	else:
+		return 'chr' + str(chromosome_int)
+
+
+def chrStr2Int(chromosome_str):
+	chr = chromosome_str.replace('chr','')
+	if chr == 'X':
+		return 23
+	elif chr == 'Y':
+		return 24
+	else:
+		return int(chr)
+
+def parseLocString(locstr):
+	try:
+		chr,locstr = locstr.split(":")
+		start, stop = locstr.split("-")
+	except:
+		chr, start, stop = locstr.split("\t")
+
+	chr = chrStr2Int(chr)
+	start = int(start)
+	stop = int(stop)
+	return (chr,start,stop)
+
+def zrpkm(rpkm,median,sd):
+	return (rpkm - median) / sd
+
+
+
+class sample(IsDescription):
+	sampleID = 	StringCol(100,pos=0) # 20-char string (sampleID)
+
+def loadProbeList(CF_probe_filename):
+	# Load data files
+	probefile = open(CF_probe_filename, 'rb')
+	s = csv.Sniffer()
+	header = s.has_header(probefile.read(1024))
+	probefile.seek(0)
+	dialect = s.sniff(probefile.read(1024))
+	probefile.seek(0)
+	if header:
+		r = csv.DictReader(probefile, dialect=dialect)
+	else:
+		r = csv.DictReader(probefile, dialect=dialect, fieldnames=['chr','start','stop','name'])
+
+	probes = []
+	probeID = 1
+	for row in r:
+		probes.append({'probeID': probeID, 'chr':chrStr2Int(row['chr']),'start':int(row['start']),'stop':int(row['stop']), 'name':row['name']})
+		probeID +=1
+
+	if len(probes) == 0:
+		raise Exception("No probes in probe file")
+
+	return probes
+
+
+def export_sample(h5file_in,sample,probes,outfile_f):
+	dt = np.dtype([('chr','|S10'),('start', '<u4'), ('stop', '<u4'), ('name', '|S20'),('SVDZRPKM',np.float)])
+	for chr in h5file_in.root:
+		if chr._v_title in ('probes','samples'):
+			continue
+
+		out_data = np.empty(len(probes[chr._v_title]),dtype=dt)
+		out_data["SVDZRPKM"] = chr._f_getChild("sample_" + sample).read(field='rpkm')
+		out_data["chr"] = np.repeat(chr._v_title,len(out_data))
+		out_data["start"] = probes[chr._v_title]["start"]
+		out_data["stop"] = probes[chr._v_title]["stop"]
+		out_data["name"] = probes[chr._v_title]["name"]
+		np.savetxt(outfile_f, out_data,fmt=["%s","%d","%d","%s","%f"], delimiter="\t")
+
+
+
+def plotGenes(axis, rpkm_data, levels=5,x_pos=-2,text_pos='right',line_spacing=0.1,text_offset=0.25,data_range=None):
+	from matplotlib.lines import Line2D
+	counter = 0
+	prev_gene = ""
+	if data_range is not None:
+		exon_set = rpkm_data.exons[data_range]
+	else:
+		exon_set = rpkm_data.exons
+	for gene in exon_set["name"]:
+		if gene == prev_gene:
+			continue
+		elif gene == 'None':
+			continue
+		start = np.min(np.where(exon_set["name"] == gene))
+		stop = np.max(np.where(exon_set["name"] == gene)) + 1
+		_ = axis.add_line(Line2D([start-0.5,stop-0.5],[x_pos - (counter * line_spacing),x_pos - (counter * line_spacing)],color=(102/255.,33/255.,168/255.,0.6),linewidth=5,linestyle='-',alpha=0.5,solid_capstyle='butt'))
+		_ = axis.text(stop+text_offset, x_pos - (counter * line_spacing), gene, ha='left',va='center',fontsize=6)
+		counter +=1
+		prev_gene = gene
+		if counter > 5:
+			counter = 0
+
+
+def plotGenomicCoords(plt, rpkm_data,fontsize=10,rotation=0):
+	import operator
+	import locale
+	exon_set = rpkm_data.exons
+	genomic_coords = np.array(map(operator.itemgetter("start"),exon_set))
+
+	ticks = range(0,len(exon_set),len(exon_set)/5)
+
+	ticks[-1] -= 1 # the last tick is going to be off the chart, so we estimate it as the second to last genomic coord.
+	labels = [locale.format("%d", genomic_coords[i], grouping=True) for i in ticks if i < len(genomic_coords)]
+	if rotation != 0:
+		ha = "right"
+	else:
+		ha = "center"
+	_ = plt.xticks(ticks,labels,fontsize=fontsize,rotation=rotation,ha=ha)
+
+
+def plotRawData(axis, rpkm_data, color='r',linewidth=0.7):
+	zero_stack = np.zeros(len(rpkm_data))
+	positions = np.repeat(np.arange(0,len(rpkm_data)),3)
+	logr = np.vstack([zero_stack,rpkm_data.flatten(),zero_stack]).transpose().ravel()
+	axis.plot(positions,logr,color=color,marker=None,linewidth=1)
+
+def getbkpoints(mask):
+	bkpoints = np.nonzero(np.logical_xor(mask[0:-1],mask[1:]))[0]+1
+	if mask[0] == 1:
+		bkpoints = np.hstack([0,bkpoints])
+	if mask[-1] == 1:
+		bkpoints = np.hstack([bkpoints,len(mask)])
+	return bkpoints.reshape(len(bkpoints)/2,2)
+
+def mergeCalls(calls):
+	if len(calls) == 0:
+		return []
+
+	out_calls = []
+	calls=np.array(calls)[np.argsort(np.array(map(operator.itemgetter("start"),calls),dtype=np.int))]
+	pstart = calls[0]["start"]
+	pstop = calls[0]["stop"]
+	for d in calls:
+		if d["start"] <= pstop:
+			pstop = max(d["stop"],pstop)
+		else:
+			out_calls.append({"sampleID": d["sampleID"], "chromosome": d["chromosome"], "start":pstart, "stop":pstop, "state": d["state"]})
+			pstart = d["start"]
+			pstop = d["stop"]
+
+	out_calls.append({"sampleID": d["sampleID"], "chromosome": d["chromosome"], "start":pstart, "stop":pstop, "state": d["state"]})
+	return out_calls
+
+class rpkm_data:
+	def __init__(self):
+		self.rpkm = None
+		self.samples = None
+		self.exons = None
+		self.isGenotype = False
+		self.calls = []
+		self.refined_calls = []
+
+	def smooth(self, window = 15, padded = False): #todo, fix the padding here
+		if self.isGenotype:
+			print "Warning: the data in this rpkm_data container are single genotype values. Smoothing will have no effect!"
+			return self.rpkm
+
+		if window > 0:
+			weightings=np.blackman(window)
+			weightings = weightings/weightings.sum()
+			smoothed_data = np.array([])
+			for row in self.rpkm.transpose():
+				smoothed = np.convolve(row, weightings)[(window-1)/2:-((window-1)/2)]
+				if len(smoothed_data) == 0:
+					smoothed_data  = smoothed
+				else:
+					smoothed_data  = np.vstack([smoothed_data,smoothed])
+
+			self.rpkm = smoothed_data.transpose()
+			return self.rpkm
+		else:
+			return self.rpkm
+
+	def getSample(self, sampleIDs):
+		sample_array = np.array(self.samples)
+		if isinstance(sampleIDs,list):
+			mask = np.zeros(len(sample_array),dtype=np.bool)
+			for sampleID in sampleIDs:
+				mask = np.logical_or(mask, sample_array == str(sampleID))
+
+			return self.rpkm[:,mask]
+		else:
+			mask = sample_array == str(sampleID)
+			return self.rpkm[:,mask]
+
+	def getSamples(self, sampleIDs):
+		return self.getSample(sampleIDs)
+
+	@property
+	def shape(self):
+		if self.isGenotype:
+			return [len(self.samples), 1]
+		else:
+			return [len(self.samples), len(self.exons)]
+
+
+class rpkm_reader:
+	def __init__(self, rpkm_fn=None):
+		"""Initialize an rpkm_reader instance. Specify the location of the data file"""
+
+		if rpkm_fn == None:
+			print "Must specify RPKM HDF5 file!"
+			return 0
+		# set up file access
+		self.h5file = openFile(rpkm_fn, mode='r')
+		self.sample_table = self.h5file.root.samples.samples
+
+	def __del__(self):
+		self.h5file.close()
+
+	def getExonValuesByExons(self, chromosome, start_exon, stop_exon, sampleList=None,genotype=False):
+
+		probe_tbl = self.h5file.root.probes._f_getChild("probes_chr" + str(chromosome))
+		#table_rows = probe_tbl.getWhereList('(start >= %d) & (stop <= %d)' % (start,stop))
+		start_exon = max(start_exon,0)
+		stop_exon = min(stop_exon, probe_tbl.nrows)
+		#print start_exon, stop_exon
+		table_rows = np.arange(start_exon,stop_exon,1)
+		data_tbl  = self.h5file.root._f_getChild("chr" + str(chromosome))
+
+		if sampleList == None:
+			num_samples = data_tbl._v_nchildren
+			samples = data_tbl
+		else:
+			num_samples = len(sampleList)
+			samples = [data_tbl._f_getChild("sample_" + s) for s in sampleList]
+
+		data = np.empty([num_samples,len(table_rows)],dtype=np.float)
+
+		out_sample_list = []
+		cnt = 0
+		for sample_tbl in samples:
+			d = sample_tbl.readCoordinates(table_rows,field="rpkm")
+			data[cnt,:] = d
+			cnt +=1
+			out_sample_list.append(sample_tbl.title)
+
+		d = rpkm_data()
+		if genotype: # return average #todo-- implement median and SD?
+			d.rpkm = data.transpose().mean(axis=0)
+			d.isGenotype = True
+		else: #return all data points
+			d.rpkm = data.transpose()
+		d.samples = out_sample_list
+		d.exons = probe_tbl.readCoordinates(table_rows)
+
+		return d
+
+	def getExonValuesByRegion(self, chromosome, start=None, stop=None, sampleList=None,genotype=False):
+		probe_tbl = self.h5file.root.probes._f_getChild("probes_chr" + str(chromosome))
+		if (start is not None) and (stop is not None):
+			table_rows = probe_tbl.getWhereList('(start >= %d) & (stop <= %d)' % (start,stop))
+		else:
+			table_rows = probe_tbl.getWhereList('(start >= 0) & (stop <= 1000000000)')
+
+		data_tbl  = self.h5file.root._f_getChild("chr" + str(chromosome))
+
+		if sampleList == None:
+			num_samples = data_tbl._v_nchildren
+			samples = data_tbl
+		else:
+			num_samples = len(sampleList)
+			samples = [data_tbl._f_getChild("sample_" + s) for s in sampleList]
+
+		data = np.empty([num_samples,len(table_rows)],dtype=np.float)
+
+		out_sample_list = []
+		cnt = 0
+		for sample_tbl in samples:
+			d = sample_tbl.readCoordinates(table_rows,field="rpkm")
+			data[cnt,:] = d
+			cnt +=1
+			out_sample_list.append(sample_tbl.title)
+
+		d = rpkm_data()
+		if genotype: # return average #todo-- implement median and SD?
+			d.rpkm = data.transpose().mean(axis=0)
+			d.isGenotype = True
+		else: #return all data points
+			d.rpkm = data.transpose()
+		d.samples = out_sample_list
+		d.exons = probe_tbl.readCoordinates(table_rows)
+
+		return d
+
+	def getSampleList(self,cohort=None,sex=None,ethnicity=None,custom=None):
+		"""Return a list of available samples in the current data file. Specifying no arguments will return all available samples"""
+
+		readWhereStr = ""
+		if custom != None:
+			readWhereStr = custom
+		else:
+			if cohort != None:
+				if isinstance(cohort,list):
+					for c in cohort:
+						readWhereStr += "(cohort=='%s') | " % c
+					readWhereStr = readWhereStr.strip(" |")
+					readWhereStr += " & "
+				else:
+					readWhereStr += "(cohort=='%s') " % cohort
+			if sex != None:
+				if sex not in ['M','F']:
+					sex = sex.upper()[0]
+				readWhereStr += " (sex=='%s') &" % sex
+			if ethnicity != None:
+				readWhereStr += " (ethnicity=='%s') &" % ethnicity
+
+			readWhereStr = readWhereStr.strip(" &") # remove leading or trailing characters
+		if readWhereStr != "":
+			#print readWhereStr
+			sampleIDs = self.sample_table.readWhere(readWhereStr,field='sampleID')
+		else:
+			sampleIDs = self.sample_table.read(field='sampleID')
+
+		return sampleIDs
+
+	def getExonIDs(self, chromosome, start, stop):
+		probe_tbl = self.h5file.root.probes._f_getChild("probes_chr" + str(chromosome))
+		exons = probe_tbl.getWhereList('(start >= %d) & (stop <= %d)' % (start,stop))
+		return exons
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/conifer/conifer_wrapper.pl	Tue Oct 18 09:26:52 2016 -0400
@@ -0,0 +1,51 @@
+#!/usr/bin/env perl
+
+# Execute CoNIFER plotcalls and
+# returns a HTML page with links to PNG plots
+
+use strict;
+use warnings;
+use Getopt::Long;
+use File::Basename;
+use File::Path qw(make_path remove_tree);
+
+my $command;
+my $dir=$ENV{'CONIFER_PATH'};
+
+our ($multiple, $input, $regions, $sample, $window, $html_file, $html_folder, $verbose);
+
+GetOptions('multiple'=>\$multiple, 'input=s'=>\$input, 'regions=s'=>\$regions,
+    'sample:s'=>\$sample, 'window:i'=>\$window, 'verbose'=>\$verbose,
+    'html_file=s'=>\$html_file, 'html_folder=s'=>\$html_folder);
+
+make_path($html_folder);
+
+# Build command
+if ($multiple){
+    # Reformat file with regions as required by CoNIFER plotcalls
+    system("awk '{print \$5,\$1,\$2,\$3,\$4}' OFS=\"\t\" $regions > regions_sorted");
+
+    $command = "python ".$dir."/conifer.py plotcalls --input $input --calls regions_sorted --window $window --outputdir $html_folder 2>&1";
+}else{
+    my $sample_command = ($sample eq "") ? "" : "--sample $sample";
+    my $plot_name = $regions;
+    $plot_name =~ s/[:-]/_/g;
+    $command = "python ".$dir."/conifer.py plot --input $input --region $regions $sample_command --output $html_folder/$plot_name.png 2>&1";
+}
+
+# Run CoNIFER
+system($command);
+$verbose and print $command,"\n";
+
+# Write HTML file
+open(HTML, ">$html_file");
+print HTML "<html><head><title>CoNIFER: Copy Number Analysis for Targeted Resequencing</title></head><body><h3>CoNIFER Output Files:</h3><p><ul>\n";
+opendir(DIR, $html_folder);
+
+my @FILES= grep { /png$/ }  readdir(DIR);
+closedir(DIR);
+foreach my $file (@FILES) {
+    print HTML "<li><a href=$file>$file</a><img src=\"$file\" height=\"50\" width=\"100\"></li>\n";
+}
+print HTML "</ul></p></body></html>\n";
+close(HTML);