annotate conifer/conifer.py @ 0:ca5354286bee draft

Uploaded
author bzonnedda
date Tue, 18 Oct 2016 09:26:52 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
1 #######################################################################
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
2 #######################################################################
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
3 # CoNIFER: Copy Number Inference From Exome Reads
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
4 # Developed by Niklas Krumm (C) 2012
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
5 # nkrumm@gmail.com
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
6 #
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
7 # homepage: http://conifer.sf.net
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
8 # This program is described in:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
9 # Krumm et al. 2012. Genome Research. doi:10.1101/gr.138115.112
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
10 #
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
11 # This file is part of CoNIFER.
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
12 # CoNIFER is free software: you can redistribute it and/or modify
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
13 # it under the terms of the GNU General Public License as published by
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
14 # the Free Software Foundation, either version 3 of the License, or
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
15 # (at your option) any later version.
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
16 #
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
17 # This program is distributed in the hope that it will be useful,
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
20 # GNU General Public License for more details.
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
21 #
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
22 # You should have received a copy of the GNU General Public License
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
23 # along with this program. If not, see <http://www.gnu.org/licenses/>.
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
24 #######################################################################
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
25 #######################################################################
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
26
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
27 import argparse
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
28 import os, sys, copy
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
29 import glob
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
30 import csv
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
31 import conifer_functions as cf
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
32 import operator
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
33 from tables import *
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
34 import numpy as np
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
35
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
36 def CF_analyze(args):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
37 # do path/file checks:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
38 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
39 # read probes table
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
40 probe_fn = str(args.probes[0])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
41 probes = cf.loadProbeList(probe_fn)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
42 num_probes = len(probes)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
43 print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
44 except IOError as e:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
45 print '[ERROR] Cannot read probes file: ', probe_fn
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
46 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
47
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
48 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
49 svd_outfile_fn = str(args.output)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
50 h5file_out = openFile(svd_outfile_fn, mode='w')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
51 probe_group = h5file_out.createGroup("/","probes","probes")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
52 except IOError as e:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
53 print '[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
54 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
55
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
56 if args.write_svals != "":
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
57 sval_f = open(args.write_svals,'w')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
58
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
59 if args.plot_scree != "":
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
60 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
61 import matplotlib
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
62 matplotlib.use('Agg')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
63 import matplotlib.pyplot as plt
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
64 import pylab as P
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
65 from matplotlib.lines import Line2D
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
66 from matplotlib.patches import Rectangle
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
67 except:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
68 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
69 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
70
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
71 plt.gcf().clear()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
72 fig = plt.figure(figsize=(10,5))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
73 ax = fig.add_subplot(111)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
74
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
75 rpkm_dir = str(args.rpkm_dir[0])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
76 rpkm_files = glob.glob(rpkm_dir + "/*")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
77 if len(rpkm_files) == 0:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
78 print '[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
79 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
80 elif len(rpkm_files) == 1:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
81 print '[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.'
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
82 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
83 elif len(rpkm_files) < 8:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
84 print '[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len(rpkm_files)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
85 elif len(rpkm_files) <= int(args.svd):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
86 print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (len(rpkm_files), int(args.svd))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
87 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
88 else:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
89 print '[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
90
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
91 # read in samples names and generate file list
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
92 samples = {}
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
93 for f in rpkm_files:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
94 s = '.'.join(f.split('/')[-1].split('.')[0:-1])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
95 print "[INIT] Mapping file to sampleID: %s --> %s" % (f, s)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
96 samples[s] = f
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
97
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
98 #check uniqueness and total # of samples
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
99 if len(set(samples)) != len(set(rpkm_files)):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
100 print '[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!'
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
101 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
102
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
103 # LOAD RPKM DATA
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
104 RPKM_data = np.zeros([num_probes,len(samples)], dtype=np.float)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
105 failed_samples = 0
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
106
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
107 for i,s in enumerate(samples.keys()):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
108 t = np.loadtxt(samples[s], dtype=np.float, delimiter="\t", skiprows=0, usecols=[2])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
109 if len(t) != num_probes:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
110 print "[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % (s, samples[s], probe_fn)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
111 _ = samples.pop(s)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
112 failed_samples += 1
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
113 else:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
114 RPKM_data[:,i] = t
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
115 print "[INIT] Successfully read RPKM data for sampleID: %s" % s
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
116
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
117 RPKM_data = RPKM_data[:,0:len(samples)]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
118 print "[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % (len(samples), failed_samples)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
119
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
120 if len(samples) <= int(args.svd):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
121 print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (int(args.svd), len(samples))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
122 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
123
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
124 # BEGIN
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
125 chrs_to_process = set(map(operator.itemgetter("chr"),probes))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
126 chrs_to_process_str = ', '.join([cf.chrInt2Str(c) for c in chrs_to_process])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
127 print '[INIT] Attempting to process chromosomes: ', chrs_to_process_str
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
128
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
129
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
130
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
131 for chr in chrs_to_process:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
132 print "[RUNNING: chr%d] Now on: %s" %(chr, cf.chrInt2Str(chr))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
133 chr_group_name = "chr%d" % chr
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
134 chr_group = h5file_out.createGroup("/",chr_group_name,chr_group_name)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
135
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
136 chr_probes = filter(lambda i: i["chr"] == chr, probes)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
137 num_chr_probes = len(chr_probes)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
138 start_probeID = chr_probes[0]['probeID']
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
139 stop_probeID = chr_probes[-1]['probeID']
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
140 print "[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID-1, stop_probeID) # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
141
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
142 rpkm = RPKM_data[start_probeID:stop_probeID,:]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
143
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
144 print "[RUNNING: chr%d] Calculating median RPKM" % chr
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
145 median = np.median(rpkm,1)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
146 sd = np.std(rpkm,1)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
147 probe_mask = median >= float(args.min_rpkm)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
148 print "[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % (chr, np.sum(probe_mask==False), float(args.min_rpkm))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
149
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
150 rpkm = rpkm[probe_mask, :]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
151 num_chr_probes = np.sum(probe_mask)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
152
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
153 if num_chr_probes <= len(samples):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
154 print "[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file"
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
155 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
156
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
157 probeIDs = np.array(map(operator.itemgetter("probeID"),chr_probes))[probe_mask]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
158 probe_starts = np.array(map(operator.itemgetter("start"),chr_probes))[probe_mask]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
159 probe_stops = np.array(map(operator.itemgetter("stop"),chr_probes))[probe_mask]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
160 gene_names = np.array(map(operator.itemgetter("name"),chr_probes))[probe_mask]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
161
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
162 dt = np.dtype([('probeID',np.uint32),('start',np.uint32),('stop',np.uint32), ('name', np.str_, 20)])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
163
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
164 out_probes = np.empty(num_chr_probes,dtype=dt)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
165 out_probes['probeID'] = probeIDs
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
166 out_probes['start'] = probe_starts
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
167 out_probes['stop'] = probe_stops
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
168 out_probes['name'] = gene_names
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
169 probe_table = h5file_out.createTable(probe_group,"probes_chr%d" % chr,cf.probe,"chr%d" % chr)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
170 probe_table.append(out_probes)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
171
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
172 print "[RUNNING: chr%d] Calculating ZRPKM scores..." % chr
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
173 rpkm = np.apply_along_axis(cf.zrpkm, 0, rpkm, median[probe_mask], sd[probe_mask])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
174
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
175 # svd transform
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
176 print "[RUNNING: chr%d] SVD decomposition..." % chr
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
177 components_removed = int(args.svd)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
178
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
179 U, S, Vt = np.linalg.svd(rpkm,full_matrices=False)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
180 new_S = np.diag(np.hstack([np.zeros([components_removed]),S[components_removed:]]))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
181
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
182 if args.write_svals != "":
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
183 sval_f.write('chr' + str(chr) + '\t' + '\t'.join([str(_i) for _i in S]) + "\n")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
184
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
185 if args.plot_scree != "":
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
186 ax.plot(S, label='chr' + str(chr),lw=0.5)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
187
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
188 # reconstruct data matrix
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
189 rpkm = np.dot(U, np.dot(new_S, Vt))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
190
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
191
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
192 # save to HDF5 file
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
193 print "[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
194
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
195 for i,s in enumerate(samples):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
196 out_data = np.empty(num_chr_probes,dtype='u4,f8')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
197 out_data['f0'] = probeIDs
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
198 out_data['f1'] = rpkm[:,i]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
199 sample_tbl = h5file_out.createTable(chr_group,"sample_" + str(s),cf.rpkm_value,"%s" % str(s))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
200 sample_tbl.append(out_data)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
201
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
202
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
203 print "[RUNNING] Saving sampleIDs to file..."
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
204 sample_group = h5file_out.createGroup("/","samples","samples")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
205 sample_table = h5file_out.createTable(sample_group,"samples",cf.sample,"samples")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
206 dt = np.dtype([('sampleID',np.str_,100)])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
207 out_samples = np.empty(len(samples.keys()),dtype=dt)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
208 out_samples['sampleID'] = np.array(samples.keys())
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
209 sample_table.append(out_samples)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
210
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
211
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
212 if args.write_sd != "":
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
213 print "[RUNNING] Calculating standard deviations for all samples (this can take a while)..."
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
214
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
215 sd_file = open(args.write_sd,'w')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
216
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
217 for i,s in enumerate(samples):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
218 # collect all SVD-ZRPKM values
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
219 count = 1
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
220 for chr in chrs_to_process:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
221 if count == 1:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
222 sd_out = h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
223 else:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
224 sd_out = np.hstack([sd_out,out.h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten()])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
225
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
226 sd = np.std(sd_out)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
227 sd_file.write("%s\t%f\n" % (s,sd))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
228
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
229 sd_file.close()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
230
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
231 if args.plot_scree != "":
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
232 plt.title("Scree plot")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
233 if len(samples) < 50:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
234 plt.xlim([0,len(samples)])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
235 plt.xlabel("S values")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
236 else:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
237 plt.xlim([0,50])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
238 plt.xlabel("S values (only first 50 plotted)")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
239 plt.ylabel("Relative contributed variance")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
240 plt.savefig(args.plot_scree)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
241
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
242 print "[FINISHED]"
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
243 h5file_out.close()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
244 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
245
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
246 def CF_export(args):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
247 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
248 h5file_in_fn = str(args.input)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
249 h5file_in = openFile(h5file_in_fn, mode='r')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
250 except IOError as e:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
251 print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
252 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
253
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
254 # read probes
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
255 probes = {}
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
256 for probes_chr in h5file_in.root.probes:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
257 probes[probes_chr.title] = probes_chr.read()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
258
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
259 if args.sample =='all':
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
260 all_samples = list(h5file_in.root.samples.samples.read(field="sampleID"))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
261
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
262 out_path = os.path.abspath(args.output)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
263
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
264 print "[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
265 for sample in all_samples:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
266 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
267 outfile_fn = out_path + "/" + sample + ".bed"
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
268 outfile_f = open(outfile_fn,'w')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
269 except IOError as e:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
270 print '[ERROR] Cannot open output file for writing: ', outfile_fn
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
271 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
272 print "[RUNNING] Exporting %s" % sample
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
273
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
274 cf.export_sample(h5file_in,sample,probes,outfile_f)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
275 outfile_f.close()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
276
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
277 elif len(args.sample) == 1:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
278 out_path = os.path.abspath(args.output)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
279 sample = args.sample[0]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
280 print "[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
281 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
282 if os.path.isdir(out_path):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
283 outfile_fn = out_path + "/" + sample + ".bed"
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
284 else:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
285 outfile_fn = out_path
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
286 outfile_f = open(outfile_fn,'w')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
287 except IOError as e:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
288 print '[ERROR] Cannot open output file for writing: ', outfile_fn
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
289 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
290 print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
291
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
292 cf.export_sample(h5file_in,sample,probes,outfile_f)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
293 outfile_f.close()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
294
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
295 else:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
296 out_path = os.path.abspath(args.output)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
297 print "[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
298 for sample in args.sample:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
299 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
300 if os.path.isdir(out_path):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
301 outfile_fn = out_path + "/" + sample + ".bed"
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
302 else:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
303 outfile_fn = out_path
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
304 outfile_f = open(outfile_fn,'w')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
305 except IOError as e:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
306 print '[ERROR] Cannot open output file for writing: ', outfile_fn
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
307 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
308 print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
309
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
310 cf.export_sample(h5file_in,sample,probes,outfile_f)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
311 outfile_f.close()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
312 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
313
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
314 def CF_call(args):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
315 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
316 h5file_in_fn = str(args.input)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
317 h5file_in = openFile(h5file_in_fn, mode='r')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
318 except IOError as e:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
319 print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
320 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
321
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
322 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
323 callfile_fn = str(args.output)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
324 callfile_f = open(callfile_fn, mode='w')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
325 except IOError as e:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
326 print '[ERROR] Cannot open output file for writing: ', callfile_fn
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
327 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
328
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
329 chrs_to_process = []
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
330 for chr in h5file_in.root:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
331 if chr._v_title not in ('probes','samples'):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
332 chrs_to_process.append(chr._v_title.replace("chr",""))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
333
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
334 h5file_in.close()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
335
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
336 print '[INIT] Initializing caller at threshold = %f' % (args.threshold)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
337
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
338 r = cf.rpkm_reader(h5file_in_fn)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
339
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
340 all_calls = []
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
341
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
342 for chr in chrs_to_process:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
343 print '[RUNNING] Now processing chr%s' % chr
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
344 data = r.getExonValuesByRegion(chr)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
345
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
346 #raw_data = copy.copy(data)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
347 _ = data.smooth()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
348
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
349 mean= np.mean(data.rpkm,axis=1)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
350 sd = np.std(data.rpkm,axis=1)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
351
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
352 for sample in r.getSampleList():
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
353 sample_data = data.getSample([sample]).flatten()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
354 #sample_raw_data = raw_data.getSample([sample]).flatten()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
355
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
356 dup_mask = sample_data >= args.threshold
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
357 del_mask = sample_data <= -1*args.threshold
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
358
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
359 dup_bkpoints = cf.getbkpoints(dup_mask) #returns exon coordinates for this chromosome (numpy array coords)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
360 del_bkpoints = cf.getbkpoints(del_mask)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
361
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
362
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
363 dups = []
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
364 for start,stop in dup_bkpoints:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
365 try: new_start = np.max(np.where(sample_data[:start] < (mean[:start] + 3*sd[:start])))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
366 except ValueError: new_start = 0
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
367 try: new_stop = stop + np.min(np.where(sample_data[stop:] < (mean[stop:] + 3*sd[stop:])))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
368 except ValueError: new_stop = data.shape[1]-1
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
369 dups.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "dup"})
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
370
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
371 dels = []
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
372 for start,stop in del_bkpoints:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
373 try: new_start = np.max(np.where(sample_data[:start] > (-1*mean[:start] - 3*sd[:start])))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
374 except ValueError: new_start = 0
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
375 try: new_stop = stop + np.min(np.where(sample_data[stop:] > (-1*mean[stop:] - 3*sd[stop:])))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
376 except ValueError: new_stop = data.shape[1]-1
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
377 dels.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "del"})
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
378
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
379 dels = cf.mergeCalls(dels) #merges overlapping calls
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
380 dups = cf.mergeCalls(dups)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
381
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
382 #print sampleID, len(dels), len(dups)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
383
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
384 all_calls.extend(list(dels))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
385 all_calls.extend(list(dups))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
386
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
387 # print calls to file
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
388 header = ['sampleID','chromosome','start','stop','state']
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
389
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
390 callfile_f.write('\t'.join(header) + "\n")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
391 for call in all_calls:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
392 print "%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
393 callfile_f.write("%s\t%s\t%d\t%d\t%s\n" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
394
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
395 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
396
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
397 def CF_plot(args):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
398 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
399 import locale
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
400 import matplotlib
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
401 matplotlib.use('Agg')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
402 import matplotlib.pyplot as plt
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
403 import pylab as P
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
404 from matplotlib.lines import Line2D
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
405 from matplotlib.patches import Rectangle
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
406 _ = locale.setlocale(locale.LC_ALL, '')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
407 except:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
408 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
409 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
410
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
411
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
412 chr, start, stop = cf.parseLocString(args.region)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
413
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
414 r = cf.rpkm_reader(args.input)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
415
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
416 data = r.getExonValuesByRegion(chr,start,stop)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
417 _ = data.smooth()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
418
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
419 plt.gcf().clear()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
420 fig = plt.figure(figsize=(10,5))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
421 ax = fig.add_subplot(111)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
422
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
423
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
424 ax.plot(data.rpkm, linewidth = 0.3, c='k')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
425
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
426
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
427 if args.sample != 'none':
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
428 cnt = 1
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
429 coloriter = iter(['r','b','g','y'])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
430 for sample in args.sample:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
431 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
432 color, sampleID = sample.split(":")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
433 except:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
434 color =coloriter.next()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
435 sampleID = sample
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
436
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
437 ax.plot(data.getSample([sampleID]), linewidth = 1, c=color, label = sampleID)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
438
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
439 if cnt == 1:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
440 cf.plotRawData(ax, r.getExonValuesByRegion(chr,start,stop,sampleList=[sampleID]).getSample([sampleID]),color=color)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
441 cnt +=1
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
442 plt.legend(prop={'size':10},frameon=False)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
443
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
444 cf.plotGenes(ax, data)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
445 cf.plotGenomicCoords(plt,data)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
446 plt.xlim(0,data.shape[1])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
447 plt.ylim(-3,3)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
448
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
449 plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True)))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
450 plt.xlabel("Position")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
451 plt.ylabel("SVD-ZRPKM Values")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
452
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
453 plt.savefig(args.output)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
454
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
455 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
456
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
457 def CF_plotcalls(args):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
458 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
459 import matplotlib
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
460 matplotlib.use('Agg')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
461 import matplotlib.pyplot as plt
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
462 import pylab as P
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
463 from matplotlib.lines import Line2D
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
464 from matplotlib.patches import Rectangle
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
465 except:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
466 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
467 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
468
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
469 import locale
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
470 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
471 _ = locale.setlocale(locale.LC_ALL, 'en_US')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
472 except:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
473 _ = locale.setlocale(locale.LC_ALL, '')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
474
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
475 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
476 callfile_fn = str(args.calls)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
477 callfile_f = open(callfile_fn, mode='r')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
478 except IOError as e:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
479 print '[ERROR] Cannot open call file for reading: ', callfile_fn
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
480 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
481
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
482 all_calls = []
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
483 header = callfile_f.readline()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
484
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
485 for line in callfile_f:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
486 sampleID, chr, start, stop, state = line.strip().split()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
487 chr = cf.chrStr2Int(chr)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
488 all_calls.append({"chromosome":int(chr), "start":int(start), "stop":int(stop), "sampleID":sampleID})
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
489
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
490 r = cf.rpkm_reader(args.input)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
491
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
492 for call in all_calls:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
493 chr = call["chromosome"]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
494 start = call["start"]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
495 stop = call["stop"]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
496 sampleID = call["sampleID"]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
497
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
498 exons = r.getExonIDs(chr,int(start),int(stop))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
499
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
500
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
501 window_start = max(exons[0]-args.window,0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
502 window_stop = exons[-1]+args.window
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
503
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
504 data = r.getExonValuesByExons(chr,window_start, window_stop)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
505 _ = data.smooth()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
506
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
507 plt.gcf().clear()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
508 fig = plt.figure(figsize=(10,5))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
509 ax = fig.add_subplot(111)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
510
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
511
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
512 ax.plot(data.rpkm, linewidth = 0.3, c='k')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
513
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
514
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
515 ax.plot(data.getSample([sampleID]), linewidth = 1, c='r', label = sampleID)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
516 cf.plotRawData(ax, r.getExonValuesByExons(chr,window_start, window_stop,sampleList=[sampleID]).getSample([sampleID]),color='r')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
517
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
518 plt.legend(prop={'size':10},frameon=False)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
519
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
520 cf.plotGenes(ax, data)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
521 cf.plotGenomicCoords(plt,data)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
522
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
523 exon_start = np.where(data.exons["start"] == start)[0][0]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
524 exon_stop = np.where(data.exons["stop"] == stop)[0][0]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
525 _ = ax.add_line(matplotlib.lines.Line2D([exon_start,exon_stop],[2,2],color='k',lw=6,linestyle='-',alpha=1,solid_capstyle='butt'))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
526
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
527 _ = plt.xlim(0,data.shape[1])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
528 _ = plt.ylim(-3,3)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
529
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
530 plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True)))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
531 plt.xlabel("Position")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
532 plt.ylabel("SVD-ZRPKM Values")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
533 outfile = "%s_%d_%d_%s.png" % (cf.chrInt2Str(chr), start, stop, sampleID)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
534 plt.savefig(args.outputdir + "/" + outfile)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
535
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
536 def CF_bam2RPKM(args):
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
537 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
538 import pysam
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
539 except:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
540 print '[ERROR] Cannot load pysam module! Make sure it is insalled'
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
541 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
542 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
543 # read probes table
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
544 probe_fn = str(args.probes[0])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
545 probes = cf.loadProbeList(probe_fn)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
546 num_probes = len(probes)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
547 print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
548 except IOError as e:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
549 print '[ERROR] Cannot read probes file: ', probe_fn
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
550 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
551
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
552 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
553 rpkm_f = open(args.output[0],'w')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
554 except IOError as e:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
555 print '[ERROR] Cannot open rpkm file for writing: ', args.output
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
556 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
557
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
558 print "[RUNNING] Counting total number of reads in bam file..."
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
559 total_reads = float(pysam.view("-c", args.input[0])[0].strip("\n"))
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
560 print "[RUNNING] Found %d reads" % total_reads
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
561
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
562 f = pysam.Samfile(args.input[0], "rb" )
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
563
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
564 if not f._hasIndex():
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
565 print "[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % args.input[0]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
566 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
567
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
568 # will be storing values in these arrays
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
569 readcount = np.zeros(num_probes)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
570 exon_bp = np.zeros(num_probes)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
571 probeIDs = np.zeros(num_probes)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
572 counter = 0
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
573
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
574 # detect contig naming scheme here # TODO, add an optional "contigs.txt" file or automatically handle contig naming
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
575 bam_contigs = f.references
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
576 probes_contigs = [str(p) for p in set(map(operator.itemgetter("chr"),probes))]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
577
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
578 probes2contigmap = {}
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
579
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
580 for probes_contig in probes_contigs:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
581 if probes_contig in bam_contigs:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
582 probes2contigmap[probes_contig] = probes_contig
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
583 elif cf.chrInt2Str(probes_contig) in bam_contigs:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
584 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
585 elif cf.chrInt2Str(probes_contig).replace("chr","") in bam_contigs:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
586 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig).replace("chr","")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
587 else:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
588 print "[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % (probes_contig, probe_fn)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
589 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
590
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
591 print "[RUNNING] Calculating RPKM values..."
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
592
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
593 # loop through each probe
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
594 for p in probes:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
595
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
596 # f.fetch is a pysam method and returns an iterator for reads overlapping interval
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
597
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
598 p_chr = probes2contigmap[str(p["chr"])]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
599
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
600 p_start = p["start"]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
601 p_stop = p["stop"]
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
602 try:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
603 iter = f.fetch(p_chr,p_start,p_stop)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
604 except:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
605 print "[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % (p_chr,p_start,p_stop)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
606 sys.exit(0)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
607
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
608 for i in iter:
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
609 if i.pos+1 >= p_start: #this checks to make sure a read actually starts in an interval
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
610 readcount[counter] += 1
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
611
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
612 exon_bp[counter] = p_stop-p_start
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
613 probeIDs[counter] = counter +1 #probeIDs are 1-based
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
614 counter +=1
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
615
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
616 #calcualte RPKM values for all probes
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
617 rpkm = (10**9*(readcount)/(exon_bp))/(total_reads)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
618
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
619 out = np.vstack([probeIDs,readcount,rpkm])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
620
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
621 np.savetxt(rpkm_f,out.transpose(),delimiter='\t',fmt=['%d','%d','%f'])
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
622
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
623 rpkm_f.close()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
624
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
625
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
626
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
627 VERSION = "0.2.2"
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
628 parser = argparse.ArgumentParser(prog="CoNIFER", description="This is CoNIFER %s (Copy Number Inference From Exome Reads), designed to detect and genotype CNVs and CNPs from exome sequence read-depth data. See Krumm et al., Genome Research (2012) doi:10.1101/gr.138115.112 \nNiklas Krumm, 2012\n nkrumm@uw.edu" % VERSION)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
629 parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
630 subparsers = parser.add_subparsers(help='Command to be run.')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
631
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
632 # rpkm command
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
633 rpkm_parser= subparsers.add_parser('rpkm', help='Create an RPKM file from a BAM file')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
634 rpkm_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,help="Probe definition file")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
635 rpkm_parser.add_argument('--input',action='store', required=True, metavar='sample.bam',nargs=1, help="Aligned BAM file")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
636 rpkm_parser.add_argument('--output',action='store', required=True, metavar='sample.rpkm.txt',nargs=1, help="RPKM file to write")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
637 rpkm_parser.set_defaults(func=CF_bam2RPKM)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
638
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
639 # analyze command
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
640 analyze_parser= subparsers.add_parser('analyze', help='Basic CoNIFER analysis. Reads a directory of RPKM files and a probe list and outputs a HDF5 file containing SVD-ZRPKM values.')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
641 analyze_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,help="Probe definition file")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
642 analyze_parser.add_argument('--rpkm_dir',action='store', required=True, metavar='/path/to/folder/containing/rpkm_files/',nargs=1, help="Location of folder containing RPKM files. Folder should contain ONLY contain RPKM files, and all readable RPKM files will used in analysis.")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
643 analyze_parser.add_argument('--output','-o', required=True, metavar='/path/to/output_file.hdf5', type=str, help="Output location of HDF5 file to contain SVD-ZRPKM values")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
644 analyze_parser.add_argument('--svd', metavar='12', type=int, nargs='?', default = 12,help="Number of components to remove")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
645 analyze_parser.add_argument('--min_rpkm', metavar='1.00', type=float, nargs="?", default = 1.00,help="Minimum population median RPKM per probe.")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
646 analyze_parser.add_argument('--write_svals', metavar='SingularValues.txt', type=str, default= "", help="Optional file to write singular values (S-values). Used for Scree Plot.")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
647 analyze_parser.add_argument('--plot_scree', metavar='ScreePlot.png', type=str, default= "", help="Optional graphical scree plot. Requires matplotlib.")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
648 analyze_parser.add_argument('--write_sd', metavar='StandardDeviations.txt', type=str, default= "", help="Optional file with sample SVD-ZRPKM standard deviations. Used to filter noisy samples.")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
649 analyze_parser.set_defaults(func=CF_analyze)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
650
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
651 # export command
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
652 export_parser= subparsers.add_parser('export', help='Export SVD-ZRPKM values from a HDF5 file to bed or vcf format.')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
653 export_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
654 export_parser.add_argument('--output','-o',action='store', required=False, default='.', metavar='output.bed',help="Location of output file[s]. When exporting multiple samples, top-level directory of this path will be used.")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
655 export_parser.add_argument('--sample','-s',action='store', required=False, metavar='sampleID', default='all', nargs="+",help="SampleID or comma-separated list of sampleIDs to export. Default: export all samples")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
656 export_parser.set_defaults(func=CF_export)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
657
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
658 # plot command
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
659 plot_parser= subparsers.add_parser('plot', help='Plot SVD-ZRPKM values using matplotlib')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
660 plot_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
661 plot_parser.add_argument('--region',action='store', required=True, metavar='chr#:start-stop',help="Region to plot")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
662 plot_parser.add_argument('--output',action='store', required=True, metavar='image.png',help="Output path and filetype. PDF, PNG, PS, EPS, and SVG are supported.")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
663 plot_parser.add_argument('--sample',action='store', required=False, metavar='sampleID',nargs="+",default='none',help="Sample[s] to highlight. The following optional color spec can be used: <color>:<sampleID>. Available colors are r,b,g,y,c,m,y,k. The unsmoothed SVD-ZRPKM values for the first sample in this list will be drawn. Default: No samples highlighted.")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
664 plot_parser.set_defaults(func=CF_plot)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
665
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
666 # make calls command
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
667 call_parser= subparsers.add_parser('call', help='Very rudimentary caller for CNVs using SVD-ZRPKM thresholding.')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
668 call_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
669 call_parser.add_argument('--output',action='store', required=True, metavar='calls.txt',help="Output file for calls")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
670 call_parser.add_argument('--threshold', metavar='1.50', type=float, nargs='?', required=False, default = 1.50,help="+/- Threshold for calling (minimum SVD-ZRPKM)")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
671 call_parser.set_defaults(func=CF_call)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
672
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
673 # plotcalls command
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
674 plotcalls_parser= subparsers.add_parser('plotcalls', help='Make basic plots from call file from "call" command.')
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
675 plotcalls_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
676 plotcalls_parser.add_argument('--calls',action='store', required=True, metavar='calls.txt',help="File with calls from 'call' command.")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
677 plotcalls_parser.add_argument('--outputdir',action='store', required=True, metavar='/path/to/directory',help="Output directory for plots")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
678 plotcalls_parser.add_argument('--window',action='store', required=False, metavar='50',default=50,help="In exons, the amount of padding to plot around each call")
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
679 plotcalls_parser.set_defaults(func=CF_plotcalls)
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
680
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
681 args = parser.parse_args()
ca5354286bee Uploaded
bzonnedda
parents:
diff changeset
682 args.func(args)