annotate conifer.py @ 1:54973c4a1125 draft

Uploaded
author bzonnedda
date Mon, 06 Feb 2017 10:55:27 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
1 #######################################################################
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
2 #######################################################################
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
3 # CoNIFER: Copy Number Inference From Exome Reads
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
4 # Developed by Niklas Krumm (C) 2012
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
5 # nkrumm@gmail.com
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
6 #
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
7 # homepage: http://conifer.sf.net
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
8 # This program is described in:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
9 # Krumm et al. 2012. Genome Research. doi:10.1101/gr.138115.112
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
10 #
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
11 # This file is part of CoNIFER.
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
12 # CoNIFER is free software: you can redistribute it and/or modify
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
13 # it under the terms of the GNU General Public License as published by
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
14 # the Free Software Foundation, either version 3 of the License, or
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
15 # (at your option) any later version.
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
16 #
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
17 # This program is distributed in the hope that it will be useful,
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
20 # GNU General Public License for more details.
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
21 #
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
22 # You should have received a copy of the GNU General Public License
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
23 # along with this program. If not, see <http://www.gnu.org/licenses/>.
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
24 #######################################################################
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
25 #######################################################################
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
26
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
27 import argparse
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
28 import os, sys, copy
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
29 import glob
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
30 import csv
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
31 import conifer_functions as cf
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
32 import operator
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
33 from tables import *
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
34 import numpy as np
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
35
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
36 def CF_analyze(args):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
37 # do path/file checks:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
38 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
39 # read probes table
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
40 probe_fn = str(args.probes[0])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
41 probes = cf.loadProbeList(probe_fn)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
42 num_probes = len(probes)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
43 print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
44 except IOError as e:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
45 print '[ERROR] Cannot read probes file: ', probe_fn
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
46 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
47
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
48 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
49 svd_outfile_fn = str(args.output)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
50 # h5file_out = openFile(svd_outfile_fn, mode='w')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
51 h5file_out = open_file(svd_outfile_fn, mode='w')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
52 # probe_group = h5file_out.createGroup("/","probes","probes")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
53 probe_group = h5file_out.create_group("/","probes","probes")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
54 except IOError as e:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
55 print '[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
56 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
57
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
58 if args.write_svals != "":
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
59 sval_f = open(args.write_svals,'w')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
60
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
61 if args.plot_scree != "":
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
62 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
63 import matplotlib
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
64 matplotlib.use('Agg')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
65 import matplotlib.pyplot as plt
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
66 import pylab as P
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
67 from matplotlib.lines import Line2D
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
68 from matplotlib.patches import Rectangle
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
69 except:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
70 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
71 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
72
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
73 plt.gcf().clear()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
74 fig = plt.figure(figsize=(10,5))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
75 ax = fig.add_subplot(111)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
76
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
77 rpkm_dir = str(args.rpkm_dir[0])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
78 rpkm_files = glob.glob(rpkm_dir + "/*")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
79 if len(rpkm_files) == 0:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
80 print '[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
81 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
82 elif len(rpkm_files) == 1:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
83 print '[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.'
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
84 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
85 elif len(rpkm_files) < 8:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
86 print '[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len(rpkm_files)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
87 elif len(rpkm_files) <= int(args.svd):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
88 print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (len(rpkm_files), int(args.svd))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
89 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
90 else:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
91 print '[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
92
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
93 # read in samples names and generate file list
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
94 samples = {}
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
95 for f in rpkm_files:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
96 s = '.'.join(f.split('/')[-1].split('.')[0:-1])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
97 print "[INIT] Mapping file to sampleID: %s --> %s" % (f, s)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
98 samples[s] = f
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
99
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
100 #check uniqueness and total # of samples
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
101 if len(set(samples)) != len(set(rpkm_files)):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
102 print '[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!'
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
103 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
104
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
105 # LOAD RPKM DATA
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
106 RPKM_data = np.zeros([num_probes,len(samples)], dtype=np.float)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
107 failed_samples = 0
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
108
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
109 for i,s in enumerate(samples.keys()):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
110 t = np.loadtxt(samples[s], dtype=np.float, delimiter="\t", skiprows=0, usecols=[2])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
111 if len(t) != num_probes:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
112 print "[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % (s, samples[s], probe_fn)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
113 _ = samples.pop(s)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
114 failed_samples += 1
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
115 else:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
116 RPKM_data[:,i] = t
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
117 print "[INIT] Successfully read RPKM data for sampleID: %s" % s
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
118
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
119 RPKM_data = RPKM_data[:,0:len(samples)]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
120 print "[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % (len(samples), failed_samples)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
121
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
122 if len(samples) <= int(args.svd):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
123 print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (int(args.svd), len(samples))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
124 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
125
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
126 # BEGIN
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
127 chrs_to_process = set(map(operator.itemgetter("chr"),probes))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
128 chrs_to_process_str = ', '.join([cf.chrInt2Str(c) for c in chrs_to_process])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
129 print '[INIT] Attempting to process chromosomes: ', chrs_to_process_str
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
130
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
131
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
132
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
133 for chr in chrs_to_process:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
134 print "[RUNNING: chr%d] Now on: %s" %(chr, cf.chrInt2Str(chr))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
135 chr_group_name = "chr%d" % chr
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
136 # chr_group = h5file_out.createGroup("/",chr_group_name,chr_group_name)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
137 chr_group = h5file_out.create_group("/",chr_group_name,chr_group_name)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
138
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
139 chr_probes = filter(lambda i: i["chr"] == chr, probes)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
140 num_chr_probes = len(chr_probes)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
141 start_probeID = chr_probes[0]['probeID']
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
142 stop_probeID = chr_probes[-1]['probeID']
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
143 print "[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID-1, stop_probeID) # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
144
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
145 rpkm = RPKM_data[start_probeID:stop_probeID,:]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
146
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
147 print "[RUNNING: chr%d] Calculating median RPKM" % chr
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
148 median = np.median(rpkm,1)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
149 sd = np.std(rpkm,1)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
150 probe_mask = median >= float(args.min_rpkm)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
151 print "[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % (chr, np.sum(probe_mask==False), float(args.min_rpkm))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
152
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
153 rpkm = rpkm[probe_mask, :]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
154 num_chr_probes = np.sum(probe_mask)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
155
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
156 if num_chr_probes <= len(samples):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
157 print "[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file"
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
158 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
159
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
160 probeIDs = np.array(map(operator.itemgetter("probeID"),chr_probes))[probe_mask]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
161 probe_starts = np.array(map(operator.itemgetter("start"),chr_probes))[probe_mask]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
162 probe_stops = np.array(map(operator.itemgetter("stop"),chr_probes))[probe_mask]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
163 gene_names = np.array(map(operator.itemgetter("name"),chr_probes))[probe_mask]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
164
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
165 dt = np.dtype([('probeID',np.uint32),('start',np.uint32),('stop',np.uint32), ('name', np.str_, 20)])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
166
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
167 out_probes = np.empty(num_chr_probes,dtype=dt)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
168 out_probes['probeID'] = probeIDs
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
169 out_probes['start'] = probe_starts
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
170 out_probes['stop'] = probe_stops
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
171 out_probes['name'] = gene_names
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
172 # probe_table = h5file_out.createTable(probe_group,"probes_chr%d" % chr,cf.probe,"chr%d" % chr)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
173 probe_table = h5file_out.create_table(probe_group,"probes_chr%d" % chr,cf.probe,"chr%d" % chr)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
174 probe_table.append(out_probes)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
175
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
176 print "[RUNNING: chr%d] Calculating ZRPKM scores..." % chr
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
177 rpkm = np.apply_along_axis(cf.zrpkm, 0, rpkm, median[probe_mask], sd[probe_mask])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
178
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
179 # svd transform
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
180 print "[RUNNING: chr%d] SVD decomposition..." % chr
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
181 components_removed = int(args.svd)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
182
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
183 U, S, Vt = np.linalg.svd(rpkm,full_matrices=False)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
184 new_S = np.diag(np.hstack([np.zeros([components_removed]),S[components_removed:]]))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
185
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
186 if args.write_svals != "":
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
187 sval_f.write('chr' + str(chr) + '\t' + '\t'.join([str(_i) for _i in S]) + "\n")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
188
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
189 if args.plot_scree != "":
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
190 ax.plot(S, label='chr' + str(chr),lw=0.5)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
191
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
192 # reconstruct data matrix
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
193 rpkm = np.dot(U, np.dot(new_S, Vt))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
194
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
195
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
196 # save to HDF5 file
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
197 print "[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
198
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
199 for i,s in enumerate(samples):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
200 out_data = np.empty(num_chr_probes,dtype='u4,f8')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
201 out_data['f0'] = probeIDs
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
202 out_data['f1'] = rpkm[:,i]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
203 # sample_tbl = h5file_out.createTable(chr_group,"sample_" + str(s),cf.rpkm_value,"%s" % str(s))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
204 sample_tbl = h5file_out.create_table(chr_group,"sample_" + str(s),cf.rpkm_value,"%s" % str(s))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
205 sample_tbl.append(out_data)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
206
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
207
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
208 print "[RUNNING] Saving sampleIDs to file..."
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
209 # sample_group = h5file_out.createGroup("/","samples","samples")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
210 # sample_table = h5file_out.createTable(sample_group,"samples",cf.sample,"samples")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
211 sample_group = h5file_out.create_group("/","samples","samples")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
212 sample_table = h5file_out.create_table(sample_group,"samples",cf.sample,"samples")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
213 dt = np.dtype([('sampleID',np.str_,100)])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
214 out_samples = np.empty(len(samples.keys()),dtype=dt)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
215 out_samples['sampleID'] = np.array(samples.keys())
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
216 sample_table.append(out_samples)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
217
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
218
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
219 if args.write_sd != "":
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
220 print "[RUNNING] Calculating standard deviations for all samples (this can take a while)..."
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
221
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
222 sd_file = open(args.write_sd,'w')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
223
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
224 for i,s in enumerate(samples):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
225 # collect all SVD-ZRPKM values
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
226 count = 1
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
227 for chr in chrs_to_process:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
228 if count == 1:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
229 sd_out = h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
230 else:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
231 sd_out = np.hstack([sd_out,out.h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten()])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
232
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
233 sd = np.std(sd_out)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
234 sd_file.write("%s\t%f\n" % (s,sd))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
235
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
236 sd_file.close()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
237
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
238 if args.plot_scree != "":
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
239 plt.title("Scree plot")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
240 if len(samples) < 50:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
241 plt.xlim([0,len(samples)])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
242 plt.xlabel("S values")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
243 else:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
244 plt.xlim([0,50])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
245 plt.xlabel("S values (only first 50 plotted)")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
246 plt.ylabel("Relative contributed variance")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
247 plt.savefig(args.plot_scree)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
248
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
249 print "[FINISHED]"
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
250 h5file_out.close()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
251 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
252
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
253 def CF_export(args):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
254 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
255 h5file_in_fn = str(args.input)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
256 # h5file_in = openFile(h5file_in_fn, mode='r')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
257 h5file_in = open_file(h5file_in_fn, mode='r')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
258 except IOError as e:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
259 print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
260 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
261
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
262 # read probes
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
263 probes = {}
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
264 for probes_chr in h5file_in.root.probes:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
265 probes[probes_chr.title] = probes_chr.read()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
266
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
267 if args.sample =='all':
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
268 all_samples = list(h5file_in.root.samples.samples.read(field="sampleID"))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
269
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
270 out_path = os.path.abspath(args.output)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
271
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
272 print "[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
273 for sample in all_samples:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
274 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
275 outfile_fn = out_path + "/" + sample + ".bed"
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
276 outfile_f = open(outfile_fn,'w')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
277 except IOError as e:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
278 print '[ERROR] Cannot open output file for writing: ', outfile_fn
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
279 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
280 print "[RUNNING] Exporting %s" % sample
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
281
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
282 cf.export_sample(h5file_in,sample,probes,outfile_f)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
283 outfile_f.close()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
284
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
285 elif len(args.sample) == 1:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
286 out_path = os.path.abspath(args.output)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
287 sample = args.sample[0]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
288 print "[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
289 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
290 if os.path.isdir(out_path):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
291 outfile_fn = out_path + "/" + sample + ".bed"
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
292 else:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
293 outfile_fn = out_path
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
294 outfile_f = open(outfile_fn,'w')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
295 except IOError as e:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
296 print '[ERROR] Cannot open output file for writing: ', outfile_fn
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
297 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
298 print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
299
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
300 cf.export_sample(h5file_in,sample,probes,outfile_f)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
301 outfile_f.close()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
302
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
303 else:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
304 out_path = os.path.abspath(args.output)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
305 print "[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
306 for sample in args.sample:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
307 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
308 if os.path.isdir(out_path):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
309 outfile_fn = out_path + "/" + sample + ".bed"
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
310 else:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
311 outfile_fn = out_path
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
312 outfile_f = open(outfile_fn,'w')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
313 except IOError as e:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
314 print '[ERROR] Cannot open output file for writing: ', outfile_fn
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
315 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
316 print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
317
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
318 cf.export_sample(h5file_in,sample,probes,outfile_f)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
319 outfile_f.close()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
320 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
321
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
322 def CF_call(args):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
323 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
324 h5file_in_fn = str(args.input)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
325 # h5file_in = openFile(h5file_in_fn, mode='r')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
326 h5file_in = open_file(h5file_in_fn, mode='r')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
327 except IOError as e:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
328 print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
329 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
330
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
331 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
332 callfile_fn = str(args.output)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
333 callfile_f = open(callfile_fn, mode='w')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
334 except IOError as e:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
335 print '[ERROR] Cannot open output file for writing: ', callfile_fn
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
336 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
337
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
338 chrs_to_process = []
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
339 for chr in h5file_in.root:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
340 if chr._v_title not in ('probes','samples'):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
341 chrs_to_process.append(chr._v_title.replace("chr",""))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
342
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
343 h5file_in.close()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
344
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
345 print '[INIT] Initializing caller at threshold = %f' % (args.threshold)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
346
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
347 r = cf.rpkm_reader(h5file_in_fn)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
348
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
349 all_calls = []
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
350
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
351 for chr in chrs_to_process:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
352 print '[RUNNING] Now processing chr%s' % chr
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
353 data = r.getExonValuesByRegion(chr)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
354
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
355 #raw_data = copy.copy(data)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
356 _ = data.smooth()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
357
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
358 mean= np.mean(data.rpkm,axis=1)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
359 sd = np.std(data.rpkm,axis=1)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
360
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
361 for sample in r.getSampleList():
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
362 sample_data = data.getSample([sample]).flatten()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
363 #sample_raw_data = raw_data.getSample([sample]).flatten()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
364
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
365 dup_mask = sample_data >= args.threshold
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
366 del_mask = sample_data <= -1*args.threshold
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
367
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
368 dup_bkpoints = cf.getbkpoints(dup_mask) #returns exon coordinates for this chromosome (numpy array coords)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
369 del_bkpoints = cf.getbkpoints(del_mask)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
370
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
371
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
372 dups = []
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
373 for start,stop in dup_bkpoints:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
374 try: new_start = np.max(np.where(sample_data[:start] < (mean[:start] + 3*sd[:start])))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
375 except ValueError: new_start = 0
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
376 try: new_stop = stop + np.min(np.where(sample_data[stop:] < (mean[stop:] + 3*sd[stop:])))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
377 except ValueError: new_stop = data.shape[1]-1
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
378 dups.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "dup"})
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
379
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
380 dels = []
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
381 for start,stop in del_bkpoints:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
382 try: new_start = np.max(np.where(sample_data[:start] > (-1*mean[:start] - 3*sd[:start])))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
383 except ValueError: new_start = 0
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
384 try: new_stop = stop + np.min(np.where(sample_data[stop:] > (-1*mean[stop:] - 3*sd[stop:])))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
385 except ValueError: new_stop = data.shape[1]-1
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
386 dels.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "del"})
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
387
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
388 dels = cf.mergeCalls(dels) #merges overlapping calls
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
389 dups = cf.mergeCalls(dups)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
390
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
391 #print sampleID, len(dels), len(dups)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
392
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
393 all_calls.extend(list(dels))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
394 all_calls.extend(list(dups))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
395
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
396 # print calls to file
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
397 header = ['sampleID','chromosome','start','stop','state']
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
398
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
399 callfile_f.write('\t'.join(header) + "\n")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
400 for call in all_calls:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
401 print "%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
402 callfile_f.write("%s\t%s\t%d\t%d\t%s\n" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
403
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
404 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
405
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
406 def CF_plot(args):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
407 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
408 import locale
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
409 import matplotlib
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
410 matplotlib.use('Agg')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
411 import matplotlib.pyplot as plt
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
412 import pylab as P
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
413 from matplotlib.lines import Line2D
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
414 from matplotlib.patches import Rectangle
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
415 _ = locale.setlocale(locale.LC_ALL, '')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
416 except:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
417 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
418 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
419
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
420
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
421 chr, start, stop = cf.parseLocString(args.region)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
422
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
423 r = cf.rpkm_reader(args.input)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
424
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
425 data = r.getExonValuesByRegion(chr,start,stop)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
426 _ = data.smooth()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
427
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
428 plt.gcf().clear()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
429 fig = plt.figure(figsize=(10,5))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
430 ax = fig.add_subplot(111)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
431
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
432
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
433 ax.plot(data.rpkm, linewidth = 0.3, c='k')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
434
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
435
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
436 if args.sample != 'none':
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
437 cnt = 1
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
438 coloriter = iter(['r','b','g','y'])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
439 for sample in args.sample:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
440 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
441 color, sampleID = sample.split(":")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
442 except:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
443 color =coloriter.next()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
444 sampleID = sample
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
445
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
446 ax.plot(data.getSample([sampleID]), linewidth = 1, c=color, label = sampleID)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
447
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
448 if cnt == 1:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
449 cf.plotRawData(ax, r.getExonValuesByRegion(chr,start,stop,sampleList=[sampleID]).getSample([sampleID]),color=color)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
450 cnt +=1
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
451 plt.legend(prop={'size':10},frameon=False)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
452
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
453 cf.plotGenes(ax, data)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
454 cf.plotGenomicCoords(plt,data)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
455 plt.xlim(0,data.shape[1])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
456 plt.ylim(-3,3)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
457
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
458 plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True)))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
459 plt.xlabel("Position")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
460 plt.ylabel("SVD-ZRPKM Values")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
461
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
462 plt.savefig(args.output)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
463
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
464 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
465
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
466 def CF_plotcalls(args):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
467 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
468 import matplotlib
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
469 matplotlib.use('Agg')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
470 import matplotlib.pyplot as plt
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
471 import pylab as P
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
472 from matplotlib.lines import Line2D
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
473 from matplotlib.patches import Rectangle
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
474 except:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
475 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
476 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
477
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
478 import locale
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
479 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
480 _ = locale.setlocale(locale.LC_ALL, 'en_US')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
481 except:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
482 _ = locale.setlocale(locale.LC_ALL, '')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
483
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
484 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
485 callfile_fn = str(args.calls)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
486 callfile_f = open(callfile_fn, mode='r')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
487 except IOError as e:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
488 print '[ERROR] Cannot open call file for reading: ', callfile_fn
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
489 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
490
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
491 all_calls = []
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
492 header = callfile_f.readline()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
493
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
494 for line in callfile_f:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
495 sampleID, chr, start, stop, state = line.strip().split()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
496 chr = cf.chrStr2Int(chr)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
497 all_calls.append({"chromosome":int(chr), "start":int(start), "stop":int(stop), "sampleID":sampleID})
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
498
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
499 r = cf.rpkm_reader(args.input)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
500
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
501 for call in all_calls:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
502 chr = call["chromosome"]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
503 start = call["start"]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
504 stop = call["stop"]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
505 sampleID = call["sampleID"]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
506
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
507 exons = r.getExonIDs(chr,int(start),int(stop))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
508
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
509
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
510 window_start = max(exons[0]-args.window,0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
511 window_stop = exons[-1]+args.window
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
512
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
513 data = r.getExonValuesByExons(chr,window_start, window_stop)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
514 _ = data.smooth()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
515
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
516 plt.gcf().clear()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
517 fig = plt.figure(figsize=(10,5))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
518 ax = fig.add_subplot(111)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
519
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
520
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
521 ax.plot(data.rpkm, linewidth = 0.3, c='k')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
522
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
523
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
524 ax.plot(data.getSample([sampleID]), linewidth = 1, c='r', label = sampleID)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
525 cf.plotRawData(ax, r.getExonValuesByExons(chr,window_start, window_stop,sampleList=[sampleID]).getSample([sampleID]),color='r')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
526
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
527 plt.legend(prop={'size':10},frameon=False)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
528
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
529 cf.plotGenes(ax, data)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
530 cf.plotGenomicCoords(plt,data)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
531
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
532 exon_start = np.where(data.exons["start"] == start)[0][0]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
533 exon_stop = np.where(data.exons["stop"] == stop)[0][0]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
534 _ = ax.add_line(matplotlib.lines.Line2D([exon_start,exon_stop],[2,2],color='k',lw=6,linestyle='-',alpha=1,solid_capstyle='butt'))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
535
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
536 _ = plt.xlim(0,data.shape[1])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
537 _ = plt.ylim(-3,3)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
538
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
539 plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True)))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
540 plt.xlabel("Position")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
541 plt.ylabel("SVD-ZRPKM Values")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
542 outfile = "%s_%d_%d_%s.png" % (cf.chrInt2Str(chr), start, stop, sampleID)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
543 plt.savefig(args.outputdir + "/" + outfile)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
544
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
545 def CF_bam2RPKM(args):
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
546 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
547 import pysam
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
548 except:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
549 print '[ERROR] Cannot load pysam module! Make sure it is insalled'
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
550 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
551 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
552 # read probes table
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
553 probe_fn = str(args.probes[0])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
554 probes = cf.loadProbeList(probe_fn)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
555 num_probes = len(probes)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
556 print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
557 except IOError as e:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
558 print '[ERROR] Cannot read probes file: ', probe_fn
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
559 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
560
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
561 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
562 rpkm_f = open(args.output[0],'w')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
563 except IOError as e:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
564 print '[ERROR] Cannot open rpkm file for writing: ', args.output
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
565 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
566
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
567 print "[RUNNING] Counting total number of reads in bam file..."
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
568 total_reads = float(pysam.view("-c", args.input[0])[0].strip("\n"))
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
569 print "[RUNNING] Found %d reads" % total_reads
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
570
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
571 f = pysam.Samfile(args.input[0], "rb" )
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
572
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
573 # if not f._hasIndex():
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
574 if not f.has_index():
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
575 print "[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % args.input[0]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
576 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
577
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
578 # will be storing values in these arrays
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
579 readcount = np.zeros(num_probes)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
580 exon_bp = np.zeros(num_probes)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
581 probeIDs = np.zeros(num_probes)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
582 counter = 0
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
583
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
584 # detect contig naming scheme here # TODO, add an optional "contigs.txt" file or automatically handle contig naming
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
585 bam_contigs = f.references
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
586 probes_contigs = [str(p) for p in set(map(operator.itemgetter("chr"),probes))]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
587
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
588 probes2contigmap = {}
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
589
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
590 for probes_contig in probes_contigs:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
591 if probes_contig in bam_contigs:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
592 probes2contigmap[probes_contig] = probes_contig
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
593 elif cf.chrInt2Str(probes_contig) in bam_contigs:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
594 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
595 elif cf.chrInt2Str(probes_contig).replace("chr","") in bam_contigs:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
596 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig).replace("chr","")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
597 else:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
598 print "[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % (probes_contig, probe_fn)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
599 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
600
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
601 print "[RUNNING] Calculating RPKM values..."
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
602
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
603 # loop through each probe
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
604 for p in probes:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
605
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
606 # f.fetch is a pysam method and returns an iterator for reads overlapping interval
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
607
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
608 p_chr = probes2contigmap[str(p["chr"])]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
609
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
610 p_start = p["start"]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
611 p_stop = p["stop"]
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
612 try:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
613 iter = f.fetch(p_chr,p_start,p_stop)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
614 except:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
615 print "[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % (p_chr,p_start,p_stop)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
616 sys.exit(0)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
617
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
618 for i in iter:
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
619 if i.pos+1 >= p_start: #this checks to make sure a read actually starts in an interval
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
620 readcount[counter] += 1
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
621
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
622 exon_bp[counter] = p_stop-p_start
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
623 probeIDs[counter] = counter +1 #probeIDs are 1-based
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
624 counter +=1
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
625
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
626 #calcualte RPKM values for all probes
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
627 rpkm = (10**9*(readcount)/(exon_bp))/(total_reads)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
628
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
629 out = np.vstack([probeIDs,readcount,rpkm])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
630
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
631 np.savetxt(rpkm_f,out.transpose(),delimiter='\t',fmt=['%d','%d','%f'])
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
632
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
633 rpkm_f.close()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
634
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
635
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
636
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
637 VERSION = "0.2.2"
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
638 parser = argparse.ArgumentParser(prog="CoNIFER", description="This is CoNIFER %s (Copy Number Inference From Exome Reads), designed to detect and genotype CNVs and CNPs from exome sequence read-depth data. See Krumm et al., Genome Research (2012) doi:10.1101/gr.138115.112 \nNiklas Krumm, 2012\n nkrumm@uw.edu" % VERSION)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
639 parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
640 subparsers = parser.add_subparsers(help='Command to be run.')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
641
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
642 # rpkm command
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
643 rpkm_parser= subparsers.add_parser('rpkm', help='Create an RPKM file from a BAM file')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
644 rpkm_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,help="Probe definition file")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
645 rpkm_parser.add_argument('--input',action='store', required=True, metavar='sample.bam',nargs=1, help="Aligned BAM file")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
646 rpkm_parser.add_argument('--output',action='store', required=True, metavar='sample.rpkm.txt',nargs=1, help="RPKM file to write")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
647 rpkm_parser.set_defaults(func=CF_bam2RPKM)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
648
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
649 # analyze command
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
650 analyze_parser= subparsers.add_parser('analyze', help='Basic CoNIFER analysis. Reads a directory of RPKM files and a probe list and outputs a HDF5 file containing SVD-ZRPKM values.')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
651 analyze_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,help="Probe definition file")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
652 analyze_parser.add_argument('--rpkm_dir',action='store', required=True, metavar='/path/to/folder/containing/rpkm_files/',nargs=1, help="Location of folder containing RPKM files. Folder should contain ONLY contain RPKM files, and all readable RPKM files will used in analysis.")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
653 analyze_parser.add_argument('--output','-o', required=True, metavar='/path/to/output_file.hdf5', type=str, help="Output location of HDF5 file to contain SVD-ZRPKM values")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
654 analyze_parser.add_argument('--svd', metavar='12', type=int, nargs='?', default = 12,help="Number of components to remove")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
655 analyze_parser.add_argument('--min_rpkm', metavar='1.00', type=float, nargs="?", default = 1.00,help="Minimum population median RPKM per probe.")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
656 analyze_parser.add_argument('--write_svals', metavar='SingularValues.txt', type=str, default= "", help="Optional file to write singular values (S-values). Used for Scree Plot.")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
657 analyze_parser.add_argument('--plot_scree', metavar='ScreePlot.png', type=str, default= "", help="Optional graphical scree plot. Requires matplotlib.")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
658 analyze_parser.add_argument('--write_sd', metavar='StandardDeviations.txt', type=str, default= "", help="Optional file with sample SVD-ZRPKM standard deviations. Used to filter noisy samples.")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
659 analyze_parser.set_defaults(func=CF_analyze)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
660
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
661 # export command
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
662 export_parser= subparsers.add_parser('export', help='Export SVD-ZRPKM values from a HDF5 file to bed or vcf format.')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
663 export_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
664 export_parser.add_argument('--output','-o',action='store', required=False, default='.', metavar='output.bed',help="Location of output file[s]. When exporting multiple samples, top-level directory of this path will be used.")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
665 export_parser.add_argument('--sample','-s',action='store', required=False, metavar='sampleID', default='all', nargs="+",help="SampleID or comma-separated list of sampleIDs to export. Default: export all samples")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
666 export_parser.set_defaults(func=CF_export)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
667
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
668 # plot command
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
669 plot_parser= subparsers.add_parser('plot', help='Plot SVD-ZRPKM values using matplotlib')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
670 plot_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
671 plot_parser.add_argument('--region',action='store', required=True, metavar='chr#:start-stop',help="Region to plot")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
672 plot_parser.add_argument('--output',action='store', required=True, metavar='image.png',help="Output path and filetype. PDF, PNG, PS, EPS, and SVG are supported.")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
673 plot_parser.add_argument('--sample',action='store', required=False, metavar='sampleID',nargs="+",default='none',help="Sample[s] to highlight. The following optional color spec can be used: <color>:<sampleID>. Available colors are r,b,g,y,c,m,y,k. The unsmoothed SVD-ZRPKM values for the first sample in this list will be drawn. Default: No samples highlighted.")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
674 plot_parser.set_defaults(func=CF_plot)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
675
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
676 # make calls command
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
677 call_parser= subparsers.add_parser('call', help='Very rudimentary caller for CNVs using SVD-ZRPKM thresholding.')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
678 call_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
679 call_parser.add_argument('--output',action='store', required=True, metavar='calls.txt',help="Output file for calls")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
680 call_parser.add_argument('--threshold', metavar='1.50', type=float, nargs='?', required=False, default = 1.50,help="+/- Threshold for calling (minimum SVD-ZRPKM)")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
681 call_parser.set_defaults(func=CF_call)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
682
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
683 # plotcalls command
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
684 plotcalls_parser= subparsers.add_parser('plotcalls', help='Make basic plots from call file from "call" command.')
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
685 plotcalls_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
686 plotcalls_parser.add_argument('--calls',action='store', required=True, metavar='calls.txt',help="File with calls from 'call' command.")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
687 plotcalls_parser.add_argument('--outputdir',action='store', required=True, metavar='/path/to/directory',help="Output directory for plots")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
688 plotcalls_parser.add_argument('--window',action='store', required=False, metavar='50',default=50,help="In exons, the amount of padding to plot around each call")
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
689 plotcalls_parser.set_defaults(func=CF_plotcalls)
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
690
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
691 args = parser.parse_args()
54973c4a1125 Uploaded
bzonnedda
parents:
diff changeset
692 args.func(args)