annotate conifer.py @ 16:558f04d13648 draft

Uploaded
author bzonnedda
date Thu, 23 Feb 2017 04:25:31 -0500
parents 9dd5a52b4e53
children 9196f299dc5e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
1 #######################################################################
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
2 #######################################################################
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
3 # CoNIFER: Copy Number Inference From Exome Reads
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
4 # Developed by Niklas Krumm (C) 2012
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
5 # nkrumm@gmail.com
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
6 #
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
7 # homepage: http://conifer.sf.net
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
8 # This program is described in:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
9 # Krumm et al. 2012. Genome Research. doi:10.1101/gr.138115.112
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
10 #
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
11 # This file is part of CoNIFER.
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
12 # CoNIFER is free software: you can redistribute it and/or modify
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
13 # it under the terms of the GNU General Public License as published by
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
14 # the Free Software Foundation, either version 3 of the License, or
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
15 # (at your option) any later version.
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
16 #
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
17 # This program is distributed in the hope that it will be useful,
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
20 # GNU General Public License for more details.
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
21 #
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
22 # You should have received a copy of the GNU General Public License
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
23 # along with this program. If not, see <http://www.gnu.org/licenses/>.
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
24 #######################################################################
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
25 #######################################################################
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
26
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
27 import argparse
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
28 import os, sys, copy
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
29 import glob
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
30 import csv
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
31 import conifer_functions as cf
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
32 import operator
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
33 from tables import *
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
34 import numpy as np
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
35
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
36 def CF_analyze(args):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
37 # do path/file checks:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
38 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
39 # read probes table
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
40 probe_fn = str(args.probes[0])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
41 probes = cf.loadProbeList(probe_fn)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
42 num_probes = len(probes)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
43 print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
44 except IOError as e:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
45 print '[ERROR] Cannot read probes file: ', probe_fn
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
46 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
47
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
48 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
49 svd_outfile_fn = str(args.output)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
50 h5file_out = openFile(svd_outfile_fn, mode='w')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
51 probe_group = h5file_out.createGroup("/","probes","probes")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
52 except IOError as e:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
53 print '[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
54 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
55
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
56 if args.write_svals != "":
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
57 sval_f = open(args.write_svals,'w')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
58
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
59 if args.plot_scree != "":
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
60 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
61 import matplotlib
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
62 matplotlib.use('Agg')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
63 import matplotlib.pyplot as plt
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
64 import pylab as P
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
65 from matplotlib.lines import Line2D
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
66 from matplotlib.patches import Rectangle
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
67 except:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
68 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
69 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
70
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
71 plt.gcf().clear()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
72 fig = plt.figure(figsize=(10,5))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
73 ax = fig.add_subplot(111)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
74
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
75 rpkm_dir = str(args.rpkm_dir[0])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
76 rpkm_files = glob.glob(rpkm_dir + "/*")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
77 if len(rpkm_files) == 0:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
78 print '[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
79 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
80 elif len(rpkm_files) == 1:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
81 print '[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.'
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
82 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
83 elif len(rpkm_files) < 8:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
84 print '[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len(rpkm_files)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
85 elif len(rpkm_files) <= int(args.svd):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
86 print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (len(rpkm_files), int(args.svd))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
87 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
88 else:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
89 print '[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
90
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
91 # read in samples names and generate file list
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
92 samples = {}
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
93 for f in rpkm_files:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
94 s = '.'.join(f.split('/')[-1].split('.')[0:-1])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
95 print "[INIT] Mapping file to sampleID: %s --> %s" % (f, s)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
96 samples[s] = f
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
97
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
98 #check uniqueness and total # of samples
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
99 if len(set(samples)) != len(set(rpkm_files)):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
100 print '[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!'
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
101 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
102
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
103 # LOAD RPKM DATA
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
104 RPKM_data = np.zeros([num_probes,len(samples)], dtype=np.float)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
105 failed_samples = 0
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
106
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
107 for i,s in enumerate(samples.keys()):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
108 t = np.loadtxt(samples[s], dtype=np.float, delimiter="\t", skiprows=0, usecols=[2])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
109 if len(t) != num_probes:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
110 print "[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % (s, samples[s], probe_fn)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
111 _ = samples.pop(s)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
112 failed_samples += 1
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
113 else:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
114 RPKM_data[:,i] = t
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
115 print "[INIT] Successfully read RPKM data for sampleID: %s" % s
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
116
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
117 RPKM_data = RPKM_data[:,0:len(samples)]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
118 print "[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % (len(samples), failed_samples)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
119
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
120 if len(samples) <= int(args.svd):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
121 print '[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (int(args.svd), len(samples))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
122 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
123
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
124 # BEGIN
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
125 chrs_to_process = set(map(operator.itemgetter("chr"),probes))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
126 chrs_to_process_str = ', '.join([cf.chrInt2Str(c) for c in chrs_to_process])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
127 print '[INIT] Attempting to process chromosomes: ', chrs_to_process_str
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
128
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
129
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
130
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
131 for chr in chrs_to_process:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
132 print "[RUNNING: chr%d] Now on: %s" %(chr, cf.chrInt2Str(chr))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
133 chr_group_name = "chr%d" % chr
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
134 chr_group = h5file_out.createGroup("/",chr_group_name,chr_group_name)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
135
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
136 chr_probes = filter(lambda i: i["chr"] == chr, probes)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
137 num_chr_probes = len(chr_probes)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
138 start_probeID = chr_probes[0]['probeID']
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
139 stop_probeID = chr_probes[-1]['probeID']
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
140 print "[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID-1, stop_probeID) # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
141
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
142 rpkm = RPKM_data[start_probeID:stop_probeID,:]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
143
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
144 print "[RUNNING: chr%d] Calculating median RPKM" % chr
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
145 median = np.median(rpkm,1)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
146 sd = np.std(rpkm,1)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
147 probe_mask = median >= float(args.min_rpkm)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
148 print "[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % (chr, np.sum(probe_mask==False), float(args.min_rpkm))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
149
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
150 rpkm = rpkm[probe_mask, :]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
151 num_chr_probes = np.sum(probe_mask)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
152
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
153 if num_chr_probes <= len(samples):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
154 print "[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file"
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
155 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
156
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
157 probeIDs = np.array(map(operator.itemgetter("probeID"),chr_probes))[probe_mask]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
158 probe_starts = np.array(map(operator.itemgetter("start"),chr_probes))[probe_mask]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
159 probe_stops = np.array(map(operator.itemgetter("stop"),chr_probes))[probe_mask]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
160 gene_names = np.array(map(operator.itemgetter("name"),chr_probes))[probe_mask]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
161
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
162 dt = np.dtype([('probeID',np.uint32),('start',np.uint32),('stop',np.uint32), ('name', np.str_, 20)])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
163
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
164 out_probes = np.empty(num_chr_probes,dtype=dt)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
165 out_probes['probeID'] = probeIDs
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
166 out_probes['start'] = probe_starts
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
167 out_probes['stop'] = probe_stops
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
168 out_probes['name'] = gene_names
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
169 probe_table = h5file_out.createTable(probe_group,"probes_chr%d" % chr,cf.probe,"chr%d" % chr)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
170 probe_table.append(out_probes)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
171
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
172 print "[RUNNING: chr%d] Calculating ZRPKM scores..." % chr
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
173 rpkm = np.apply_along_axis(cf.zrpkm, 0, rpkm, median[probe_mask], sd[probe_mask])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
174
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
175 # svd transform
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
176 print "[RUNNING: chr%d] SVD decomposition..." % chr
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
177 components_removed = int(args.svd)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
178
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
179 U, S, Vt = np.linalg.svd(rpkm,full_matrices=False)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
180 new_S = np.diag(np.hstack([np.zeros([components_removed]),S[components_removed:]]))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
181
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
182 if args.write_svals != "":
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
183 sval_f.write('chr' + str(chr) + '\t' + '\t'.join([str(_i) for _i in S]) + "\n")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
184
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
185 if args.plot_scree != "":
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
186 ax.plot(S, label='chr' + str(chr),lw=0.5)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
187
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
188 # reconstruct data matrix
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
189 rpkm = np.dot(U, np.dot(new_S, Vt))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
190
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
191
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
192 # save to HDF5 file
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
193 print "[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
194
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
195 for i,s in enumerate(samples):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
196 out_data = np.empty(num_chr_probes,dtype='u4,f8')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
197 out_data['f0'] = probeIDs
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
198 out_data['f1'] = rpkm[:,i]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
199 sample_tbl = h5file_out.createTable(chr_group,"sample_" + str(s),cf.rpkm_value,"%s" % str(s))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
200 sample_tbl.append(out_data)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
201
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
202
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
203 print "[RUNNING] Saving sampleIDs to file..."
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
204 sample_group = h5file_out.createGroup("/","samples","samples")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
205 sample_table = h5file_out.createTable(sample_group,"samples",cf.sample,"samples")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
206 dt = np.dtype([('sampleID',np.str_,100)])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
207 out_samples = np.empty(len(samples.keys()),dtype=dt)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
208 out_samples['sampleID'] = np.array(samples.keys())
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
209 sample_table.append(out_samples)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
210
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
211
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
212 if args.write_sd != "":
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
213 print "[RUNNING] Calculating standard deviations for all samples (this can take a while)..."
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
214
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
215 sd_file = open(args.write_sd,'w')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
216
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
217 for i,s in enumerate(samples):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
218 # collect all SVD-ZRPKM values
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
219 count = 1
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
220 for chr in chrs_to_process:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
221 if count == 1:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
222 sd_out = h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
223 else:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
224 sd_out = np.hstack([sd_out,out.h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(field="rpkm").flatten()])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
225
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
226 sd = np.std(sd_out)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
227 sd_file.write("%s\t%f\n" % (s,sd))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
228
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
229 sd_file.close()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
230
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
231 if args.plot_scree != "":
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
232 plt.title("Scree plot")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
233 if len(samples) < 50:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
234 plt.xlim([0,len(samples)])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
235 plt.xlabel("S values")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
236 else:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
237 plt.xlim([0,50])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
238 plt.xlabel("S values (only first 50 plotted)")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
239 plt.ylabel("Relative contributed variance")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
240 plt.savefig(args.plot_scree)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
241
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
242 print "[FINISHED]"
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
243 h5file_out.close()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
244 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
245
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
246 def CF_export(args):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
247 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
248 h5file_in_fn = str(args.input)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
249 h5file_in = openFile(h5file_in_fn, mode='r')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
250 except IOError as e:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
251 print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
252 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
253
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
254 # read probes
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
255 probes = {}
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
256 for probes_chr in h5file_in.root.probes:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
257 probes[probes_chr.title] = probes_chr.read()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
258
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
259 if args.sample =='all':
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
260 all_samples = list(h5file_in.root.samples.samples.read(field="sampleID"))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
261
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
262 out_path = os.path.abspath(args.output)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
263
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
264 print "[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
265 for sample in all_samples:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
266 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
267 outfile_fn = out_path + "/" + sample + ".bed"
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
268 outfile_f = open(outfile_fn,'w')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
269 except IOError as e:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
270 print '[ERROR] Cannot open output file for writing: ', outfile_fn
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
271 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
272 print "[RUNNING] Exporting %s" % sample
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
273
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
274 cf.export_sample(h5file_in,sample,probes,outfile_f)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
275 outfile_f.close()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
276
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
277 elif len(args.sample) == 1:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
278 out_path = os.path.abspath(args.output)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
279 sample = args.sample[0]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
280 print "[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
281 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
282 if os.path.isdir(out_path):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
283 outfile_fn = out_path + "/" + sample + ".bed"
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
284 else:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
285 outfile_fn = out_path
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
286 outfile_f = open(outfile_fn,'w')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
287 except IOError as e:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
288 print '[ERROR] Cannot open output file for writing: ', outfile_fn
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
289 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
290 print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
291
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
292 cf.export_sample(h5file_in,sample,probes,outfile_f)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
293 outfile_f.close()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
294
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
295 else:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
296 out_path = os.path.abspath(args.output)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
297 print "[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
298 for sample in args.sample:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
299 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
300 if os.path.isdir(out_path):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
301 outfile_fn = out_path + "/" + sample + ".bed"
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
302 else:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
303 outfile_fn = out_path
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
304 outfile_f = open(outfile_fn,'w')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
305 except IOError as e:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
306 print '[ERROR] Cannot open output file for writing: ', outfile_fn
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
307 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
308 print "[RUNNING] Exporting %s to %s" % (sample, outfile_fn)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
309
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
310 cf.export_sample(h5file_in,sample,probes,outfile_f)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
311 outfile_f.close()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
312 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
313
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
314 def CF_call(args):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
315 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
316 h5file_in_fn = str(args.input)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
317 h5file_in = openFile(h5file_in_fn, mode='r')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
318 except IOError as e:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
319 print '[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
320 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
321
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
322 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
323 callfile_fn = str(args.output)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
324 callfile_f = open(callfile_fn, mode='w')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
325 except IOError as e:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
326 print '[ERROR] Cannot open output file for writing: ', callfile_fn
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
327 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
328
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
329 chrs_to_process = []
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
330 for chr in h5file_in.root:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
331 if chr._v_title not in ('probes','samples'):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
332 chrs_to_process.append(chr._v_title.replace("chr",""))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
333
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
334 h5file_in.close()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
335
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
336 print '[INIT] Initializing caller at threshold = %f' % (args.threshold)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
337
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
338 r = cf.rpkm_reader(h5file_in_fn)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
339
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
340 all_calls = []
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
341
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
342 for chr in chrs_to_process:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
343 print '[RUNNING] Now processing chr%s' % chr
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
344 data = r.getExonValuesByRegion(chr)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
345
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
346 #raw_data = copy.copy(data)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
347 _ = data.smooth()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
348
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
349 mean= np.mean(data.rpkm,axis=1)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
350 sd = np.std(data.rpkm,axis=1)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
351
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
352 for sample in r.getSampleList():
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
353 sample_data = data.getSample([sample]).flatten()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
354 #sample_raw_data = raw_data.getSample([sample]).flatten()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
355
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
356 dup_mask = sample_data >= args.threshold
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
357 del_mask = sample_data <= -1*args.threshold
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
358
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
359 dup_bkpoints = cf.getbkpoints(dup_mask) #returns exon coordinates for this chromosome (numpy array coords)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
360 del_bkpoints = cf.getbkpoints(del_mask)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
361
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
362
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
363 dups = []
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
364 for start,stop in dup_bkpoints:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
365 try: new_start = np.max(np.where(sample_data[:start] < (mean[:start] + 3*sd[:start])))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
366 except ValueError: new_start = 0
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
367 try: new_stop = stop + np.min(np.where(sample_data[stop:] < (mean[stop:] + 3*sd[stop:])))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
368 except ValueError: new_stop = data.shape[1]-1
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
369 dups.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "dup"})
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
370
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
371 dels = []
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
372 for start,stop in del_bkpoints:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
373 try: new_start = np.max(np.where(sample_data[:start] > (-1*mean[:start] - 3*sd[:start])))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
374 except ValueError: new_start = 0
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
375 try: new_stop = stop + np.min(np.where(sample_data[stop:] > (-1*mean[stop:] - 3*sd[stop:])))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
376 except ValueError: new_stop = data.shape[1]-1
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
377 dels.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chr), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "del"})
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
378
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
379 dels = cf.mergeCalls(dels) #merges overlapping calls
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
380 dups = cf.mergeCalls(dups)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
381
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
382 #print sampleID, len(dels), len(dups)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
383
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
384 all_calls.extend(list(dels))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
385 all_calls.extend(list(dups))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
386
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
387 # print calls to file
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
388 header = ['sampleID','chromosome','start','stop','state']
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
389
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
390 callfile_f.write('\t'.join(header) + "\n")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
391 for call in all_calls:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
392 print "%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
393 callfile_f.write("%s\t%s\t%d\t%d\t%s\n" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
394
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
395 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
396
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
397 def CF_plot(args):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
398 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
399 import locale
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
400 import matplotlib
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
401 matplotlib.use('Agg')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
402 import matplotlib.pyplot as plt
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
403 import pylab as P
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
404 from matplotlib.lines import Line2D
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
405 from matplotlib.patches import Rectangle
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
406 _ = locale.setlocale(locale.LC_ALL, '')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
407 except:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
408 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
409 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
410
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
411
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
412 chr, start, stop = cf.parseLocString(args.region)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
413
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
414 r = cf.rpkm_reader(args.input)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
415
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
416 data = r.getExonValuesByRegion(chr,start,stop)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
417 _ = data.smooth()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
418
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
419 plt.gcf().clear()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
420 fig = plt.figure(figsize=(10,5))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
421 ax = fig.add_subplot(111)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
422
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
423
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
424 ax.plot(data.rpkm, linewidth = 0.3, c='k')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
425
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
426
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
427 if args.sample != 'none':
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
428 cnt = 1
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
429 coloriter = iter(['r','b','g','y'])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
430 for sample in args.sample:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
431 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
432 color, sampleID = sample.split(":")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
433 except:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
434 color =coloriter.next()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
435 sampleID = sample
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
436
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
437 ax.plot(data.getSample([sampleID]), linewidth = 1, c=color, label = sampleID)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
438
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
439 if cnt == 1:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
440 cf.plotRawData(ax, r.getExonValuesByRegion(chr,start,stop,sampleList=[sampleID]).getSample([sampleID]),color=color)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
441 cnt +=1
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
442 plt.legend(prop={'size':10},frameon=False)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
443
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
444 cf.plotGenes(ax, data)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
445 cf.plotGenomicCoords(plt,data)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
446 plt.xlim(0,data.shape[1])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
447 plt.ylim(-3,3)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
448
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
449 plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True)))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
450 plt.xlabel("Position")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
451 plt.ylabel("SVD-ZRPKM Values")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
452
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
453 plt.savefig(args.output)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
454
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
455 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
456
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
457 def CF_plotcalls(args):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
458 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
459 import matplotlib
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
460 matplotlib.use('Agg')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
461 import matplotlib.pyplot as plt
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
462 import pylab as P
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
463 from matplotlib.lines import Line2D
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
464 from matplotlib.patches import Rectangle
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
465 except:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
466 print "[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?"
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
467 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
468
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
469 import locale
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
470 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
471 _ = locale.setlocale(locale.LC_ALL, 'en_US')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
472 except:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
473 _ = locale.setlocale(locale.LC_ALL, '')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
474
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
475 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
476 callfile_fn = str(args.calls)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
477 callfile_f = open(callfile_fn, mode='r')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
478 except IOError as e:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
479 print '[ERROR] Cannot open call file for reading: ', callfile_fn
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
480 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
481
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
482 all_calls = []
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
483 header = callfile_f.readline()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
484
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
485 for line in callfile_f:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
486 sampleID, chr, start, stop, state = line.strip().split()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
487 chr = cf.chrStr2Int(chr)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
488 all_calls.append({"chromosome":int(chr), "start":int(start), "stop":int(stop), "sampleID":sampleID})
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
489
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
490 r = cf.rpkm_reader(args.input)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
491
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
492 for call in all_calls:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
493 chr = call["chromosome"]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
494 start = call["start"]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
495 stop = call["stop"]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
496 sampleID = call["sampleID"]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
497
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
498 exons = r.getExonIDs(chr,int(start),int(stop))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
499
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
500
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
501 window_start = max(exons[0]-args.window,0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
502 window_stop = exons[-1]+args.window
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
503
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
504 data = r.getExonValuesByExons(chr,window_start, window_stop)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
505 _ = data.smooth()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
506
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
507 plt.gcf().clear()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
508 fig = plt.figure(figsize=(10,5))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
509 ax = fig.add_subplot(111)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
510
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
511
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
512 ax.plot(data.rpkm, linewidth = 0.3, c='k')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
513
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
514
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
515 ax.plot(data.getSample([sampleID]), linewidth = 1, c='r', label = sampleID)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
516 cf.plotRawData(ax, r.getExonValuesByExons(chr,window_start, window_stop,sampleList=[sampleID]).getSample([sampleID]),color='r')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
517
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
518 plt.legend(prop={'size':10},frameon=False)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
519
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
520 cf.plotGenes(ax, data)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
521 cf.plotGenomicCoords(plt,data)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
522
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
523 exon_start = np.where(data.exons["start"] == start)[0][0]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
524 exon_stop = np.where(data.exons["stop"] == stop)[0][0]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
525 _ = ax.add_line(matplotlib.lines.Line2D([exon_start,exon_stop],[2,2],color='k',lw=6,linestyle='-',alpha=1,solid_capstyle='butt'))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
526
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
527 _ = plt.xlim(0,data.shape[1])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
528 _ = plt.ylim(-3,3)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
529
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
530 plt.title("%s: %s - %s" % (cf.chrInt2Str(chr),locale.format("%d",start, grouping=True),locale.format("%d",stop, grouping=True)))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
531 plt.xlabel("Position")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
532 plt.ylabel("SVD-ZRPKM Values")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
533 outfile = "%s_%d_%d_%s.png" % (cf.chrInt2Str(chr), start, stop, sampleID)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
534 plt.savefig(args.outputdir + "/" + outfile)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
535
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
536 def CF_bam2RPKM(args):
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
537 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
538 import pysam
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
539 except:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
540 print '[ERROR] Cannot load pysam module! Make sure it is insalled'
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
541 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
542 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
543 # read probes table
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
544 probe_fn = str(args.probes[0])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
545 probes = cf.loadProbeList(probe_fn)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
546 num_probes = len(probes)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
547 print '[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
548 except IOError as e:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
549 print '[ERROR] Cannot read probes file: ', probe_fn
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
550 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
551
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
552 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
553 rpkm_f = open(args.output[0],'w')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
554 except IOError as e:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
555 print '[ERROR] Cannot open rpkm file for writing: ', args.output
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
556 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
557
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
558 print "[RUNNING] Counting total number of reads in bam file..."
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
559 total_reads = float(pysam.view("-c", args.input[0])[0].strip("\n"))
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
560 print "[RUNNING] Found %d reads" % total_reads
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
561
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
562 f = pysam.Samfile(args.input[0], "rb" )
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
563
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
564 if not f._hasIndex():
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
565 print "[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % args.input[0]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
566 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
567
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
568 # will be storing values in these arrays
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
569 readcount = np.zeros(num_probes)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
570 exon_bp = np.zeros(num_probes)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
571 probeIDs = np.zeros(num_probes)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
572 counter = 0
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
573
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
574 # detect contig naming scheme here # TODO, add an optional "contigs.txt" file or automatically handle contig naming
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
575 bam_contigs = f.references
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
576 probes_contigs = [str(p) for p in set(map(operator.itemgetter("chr"),probes))]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
577
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
578 probes2contigmap = {}
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
579
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
580 for probes_contig in probes_contigs:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
581 if probes_contig in bam_contigs:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
582 probes2contigmap[probes_contig] = probes_contig
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
583 elif cf.chrInt2Str(probes_contig) in bam_contigs:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
584 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
585 elif cf.chrInt2Str(probes_contig).replace("chr","") in bam_contigs:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
586 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig).replace("chr","")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
587 else:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
588 print "[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % (probes_contig, probe_fn)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
589 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
590
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
591 print "[RUNNING] Calculating RPKM values..."
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
592
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
593 # loop through each probe
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
594 for p in probes:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
595
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
596 # f.fetch is a pysam method and returns an iterator for reads overlapping interval
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
597
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
598 p_chr = probes2contigmap[str(p["chr"])]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
599
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
600 p_start = p["start"]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
601 p_stop = p["stop"]
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
602 try:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
603 iter = f.fetch(p_chr,p_start,p_stop)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
604 except:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
605 print "[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % (p_chr,p_start,p_stop)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
606 sys.exit(0)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
607
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
608 for i in iter:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
609 if i.pos+1 >= p_start: #this checks to make sure a read actually starts in an interval
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
610 readcount[counter] += 1
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
611
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
612 exon_bp[counter] = p_stop-p_start
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
613 probeIDs[counter] = counter +1 #probeIDs are 1-based
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
614 counter +=1
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
615
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
616 #calcualte RPKM values for all probes
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
617 rpkm = (10**9*(readcount)/(exon_bp))/(total_reads)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
618
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
619 out = np.vstack([probeIDs,readcount,rpkm])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
620
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
621 np.savetxt(rpkm_f,out.transpose(),delimiter='\t',fmt=['%d','%d','%f'])
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
622
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
623 rpkm_f.close()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
624
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
625
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
626
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
627 VERSION = "0.2.2"
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
628 parser = argparse.ArgumentParser(prog="CoNIFER", description="This is CoNIFER %s (Copy Number Inference From Exome Reads), designed to detect and genotype CNVs and CNPs from exome sequence read-depth data. See Krumm et al., Genome Research (2012) doi:10.1101/gr.138115.112 \nNiklas Krumm, 2012\n nkrumm@uw.edu" % VERSION)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
629 parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
630 subparsers = parser.add_subparsers(help='Command to be run.')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
631
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
632 # rpkm command
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
633 rpkm_parser= subparsers.add_parser('rpkm', help='Create an RPKM file from a BAM file')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
634 rpkm_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,help="Probe definition file")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
635 rpkm_parser.add_argument('--input',action='store', required=True, metavar='sample.bam',nargs=1, help="Aligned BAM file")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
636 rpkm_parser.add_argument('--output',action='store', required=True, metavar='sample.rpkm.txt',nargs=1, help="RPKM file to write")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
637 rpkm_parser.set_defaults(func=CF_bam2RPKM)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
638
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
639 # analyze command
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
640 analyze_parser= subparsers.add_parser('analyze', help='Basic CoNIFER analysis. Reads a directory of RPKM files and a probe list and outputs a HDF5 file containing SVD-ZRPKM values.')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
641 analyze_parser.add_argument('--probes',action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,help="Probe definition file")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
642 analyze_parser.add_argument('--rpkm_dir',action='store', required=True, metavar='/path/to/folder/containing/rpkm_files/',nargs=1, help="Location of folder containing RPKM files. Folder should contain ONLY contain RPKM files, and all readable RPKM files will used in analysis.")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
643 analyze_parser.add_argument('--output','-o', required=True, metavar='/path/to/output_file.hdf5', type=str, help="Output location of HDF5 file to contain SVD-ZRPKM values")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
644 analyze_parser.add_argument('--svd', metavar='12', type=int, nargs='?', default = 12,help="Number of components to remove")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
645 analyze_parser.add_argument('--min_rpkm', metavar='1.00', type=float, nargs="?", default = 1.00,help="Minimum population median RPKM per probe.")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
646 analyze_parser.add_argument('--write_svals', metavar='SingularValues.txt', type=str, default= "", help="Optional file to write singular values (S-values). Used for Scree Plot.")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
647 analyze_parser.add_argument('--plot_scree', metavar='ScreePlot.png', type=str, default= "", help="Optional graphical scree plot. Requires matplotlib.")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
648 analyze_parser.add_argument('--write_sd', metavar='StandardDeviations.txt', type=str, default= "", help="Optional file with sample SVD-ZRPKM standard deviations. Used to filter noisy samples.")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
649 analyze_parser.set_defaults(func=CF_analyze)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
650
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
651 # export command
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
652 export_parser= subparsers.add_parser('export', help='Export SVD-ZRPKM values from a HDF5 file to bed or vcf format.')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
653 export_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
654 export_parser.add_argument('--output','-o',action='store', required=False, default='.', metavar='output.bed',help="Location of output file[s]. When exporting multiple samples, top-level directory of this path will be used.")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
655 export_parser.add_argument('--sample','-s',action='store', required=False, metavar='sampleID', default='all', nargs="+",help="SampleID or comma-separated list of sampleIDs to export. Default: export all samples")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
656 export_parser.set_defaults(func=CF_export)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
657
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
658 # plot command
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
659 plot_parser= subparsers.add_parser('plot', help='Plot SVD-ZRPKM values using matplotlib')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
660 plot_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
661 plot_parser.add_argument('--region',action='store', required=True, metavar='chr#:start-stop',help="Region to plot")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
662 plot_parser.add_argument('--output',action='store', required=True, metavar='image.png',help="Output path and filetype. PDF, PNG, PS, EPS, and SVG are supported.")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
663 plot_parser.add_argument('--sample',action='store', required=False, metavar='sampleID',nargs="+",default='none',help="Sample[s] to highlight. The following optional color spec can be used: <color>:<sampleID>. Available colors are r,b,g,y,c,m,y,k. The unsmoothed SVD-ZRPKM values for the first sample in this list will be drawn. Default: No samples highlighted.")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
664 plot_parser.set_defaults(func=CF_plot)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
665
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
666 # make calls command
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
667 call_parser= subparsers.add_parser('call', help='Very rudimentary caller for CNVs using SVD-ZRPKM thresholding.')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
668 call_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
669 call_parser.add_argument('--output',action='store', required=True, metavar='calls.txt',help="Output file for calls")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
670 call_parser.add_argument('--threshold', metavar='1.50', type=float, nargs='?', required=False, default = 1.50,help="+/- Threshold for calling (minimum SVD-ZRPKM)")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
671 call_parser.set_defaults(func=CF_call)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
672
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
673 # plotcalls command
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
674 plotcalls_parser= subparsers.add_parser('plotcalls', help='Make basic plots from call file from "call" command.')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
675 plotcalls_parser.add_argument('--input','-i',action='store', required=True, metavar='CoNIFER_SVD.hdf5',help="HDF5 file from CoNIFER 'analyze' step")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
676 plotcalls_parser.add_argument('--calls',action='store', required=True, metavar='calls.txt',help="File with calls from 'call' command.")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
677 plotcalls_parser.add_argument('--outputdir',action='store', required=True, metavar='/path/to/directory',help="Output directory for plots")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
678 plotcalls_parser.add_argument('--window',action='store', required=False, metavar='50',default=50,help="In exons, the amount of padding to plot around each call")
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
679 plotcalls_parser.set_defaults(func=CF_plotcalls)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
680
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
681 args = parser.parse_args()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
682 args.func(args)