annotate conifer.py @ 29:2e8281109e79 draft

Uploaded
author bzonnedda
date Fri, 07 Jul 2017 04:14:15 -0400
parents abdc05fae16e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
1 #######################################################################
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
2 #######################################################################
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
3 # CoNIFER: Copy Number Inference From Exome Reads
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
4 # Developed by Niklas Krumm (C) 2012
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
5 # nkrumm@gmail.com
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
6 #
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
7 # homepage: http://conifer.sf.net
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
8 # This program is described in:
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
9 # Krumm et al. 2012. Genome Research. doi:10.1101/gr.138115.112
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
10 #
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
11 # This file is part of CoNIFER.
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
12 # CoNIFER is free software: you can redistribute it and/or modify
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
13 # it under the terms of the GNU General Public License as published by
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
14 # the Free Software Foundation, either version 3 of the License, or
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
15 # (at your option) any later version.
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
16 #
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
17 # This program is distributed in the hope that it will be useful,
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
20 # GNU General Public License for more details.
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
21 #
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
22 # You should have received a copy of the GNU General Public License
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
23 # along with this program. If not, see <http://www.gnu.org/licenses/>.
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
24 #######################################################################
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
25 #######################################################################
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
26
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
27 import argparse
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
28 import os, sys, copy
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
29 import glob
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
30 import csv
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
31 import conifer_functions as cf
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
32 import operator
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
33 from tables import *
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
34 import pysam
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
35 import numpy as np
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
36
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
37
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
38 def CF_analyze(args):
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
39 # do path/file checks:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
40 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
41 # read probes table
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
42 probe_fn = str(args.probes[0])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
43 probes = cf.loadProbeList(probe_fn)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
44 num_probes = len(probes)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
45 print('[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
46 except IOError as e:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
47 print('[ERROR] Cannot read probes file: ', probe_fn)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
48 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
49
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
50 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
51 svd_outfile_fn = str(args.output)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
52 h5file_out = OpenFile(svd_outfile_fn, mode='w')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
53 probe_group = h5file_out.createGroup("/", "probes", "probes")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
54 except IOError as e:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
55 print('[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
56 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
57
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
58 if args.write_svals != "":
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
59 sval_f = open(args.write_svals, 'w')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
60
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
61 if args.plot_scree != "":
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
62 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
63 import matplotlib
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
64 matplotlib.use('Agg')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
65 import matplotlib.pyplot as plt
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
66 import pylab as P
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
67 from matplotlib.lines import Line2D
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
68 from matplotlib.patches import Rectangle
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
69 except:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
70 print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
71 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
72
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
73 plt.gcf().clear()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
74 fig = plt.figure(figsize=(10, 5))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
75 ax = fig.add_subplot(111)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
76
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
77 rpkm_dir = str(args.rpkm_dir[0])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
78 rpkm_files = glob.glob(rpkm_dir + "/*")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
79 if len(rpkm_files) == 0:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
80 print('[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
81 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
82 elif len(rpkm_files) == 1:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
83 print('[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
84 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
85 elif len(rpkm_files) < 8:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
86 print('[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len(
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
87 rpkm_files))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
88 elif len(rpkm_files) <= int(args.svd):
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
89 print('[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
90 len(rpkm_files), int(args.svd)))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
91 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
92 else:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
93 print('[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
94
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
95 # read in samples names and generate file list
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
96 samples = {}
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
97 for f in rpkm_files:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
98 s = '.'.join(f.split('/')[-1].split('.')[0:-1])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
99 print("[INIT] Mapping file to sampleID: %s --> %s" % (f, s))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
100 samples[s] = f
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
101
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
102 # check uniqueness and total # of samples
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
103 if len(set(samples)) != len(set(rpkm_files)):
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
104 print('[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
105 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
106
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
107 # LOAD RPKM DATA
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
108 RPKM_data = np.zeros([num_probes, len(samples)], dtype=np.float)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
109 failed_samples = 0
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
110
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
111 for i, s in enumerate(samples.keys()):
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
112 t = np.loadtxt(samples[s], dtype=np.float, delimiter="\t", skiprows=0, usecols=[2])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
113 if len(t) != num_probes:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
114 print("[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % (
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
115 s, samples[s], probe_fn))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
116 _ = samples.pop(s)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
117 failed_samples += 1
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
118 else:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
119 RPKM_data[:, i] = t
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
120 print("[INIT] Successfully read RPKM data for sampleID: %s" % s)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
121
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
122 RPKM_data = RPKM_data[:, 0:len(samples)]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
123 print("[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % (
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
124 len(samples), failed_samples))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
125
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
126 if len(samples) <= int(args.svd):
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
127 print('[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % (
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
128 int(args.svd), len(samples)))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
129 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
130
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
131 # BEGIN
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
132 chrs_to_process = set(map(operator.itemgetter("chr"), probes))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
133 chrs_to_process_str = ', '.join([cf.chrInt2Str(c) for c in chrs_to_process])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
134 print('[INIT] Attempting to process chromosomes: ', chrs_to_process_str)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
135
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
136 for chr in chrs_to_process:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
137 print("[RUNNING: chr%d] Now on: %s" % (chr, cf.chrInt2Str(chr)))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
138 chr_group_name = "chr%d" % chr
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
139 chr_group = h5file_out.createGroup("/", chr_group_name, chr_group_name)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
140
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
141 chr_probes = [i for i in probes if i["chr"] == chr]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
142 num_chr_probes = len(chr_probes)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
143 start_probeID = chr_probes[0]['probeID']
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
144 stop_probeID = chr_probes[-1]['probeID']
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
145 print("[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID - 1,
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
146 stop_probeID)) # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
147
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
148 rpkm = RPKM_data[start_probeID:stop_probeID, :]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
149
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
150 print("[RUNNING: chr%d] Calculating median RPKM" % chr)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
151 median = np.median(rpkm, 1)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
152 sd = np.std(rpkm, 1)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
153 probe_mask = median >= float(args.min_rpkm)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
154 print("[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % (
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
155 chr, np.sum(probe_mask == False), float(args.min_rpkm)))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
156
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
157 rpkm = rpkm[probe_mask, :]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
158 num_chr_probes = np.sum(probe_mask)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
159
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
160 if num_chr_probes <= len(samples):
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
161 print("[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
162 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
163
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
164 probeIDs = np.array(list(map(operator.itemgetter("probeID"), chr_probes)))[probe_mask]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
165 probe_starts = np.array(list(map(operator.itemgetter("start"), chr_probes)))[probe_mask]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
166 probe_stops = np.array(list(map(operator.itemgetter("stop"), chr_probes)))[probe_mask]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
167 gene_names = np.array(list(map(operator.itemgetter("name"), chr_probes)))[probe_mask]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
168
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
169 dt = np.dtype([('probeID', np.uint32), ('start', np.uint32), ('stop', np.uint32), ('name', np.str_, 20)])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
170
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
171 out_probes = np.empty(num_chr_probes, dtype=dt)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
172 out_probes['probeID'] = probeIDs
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
173 out_probes['start'] = probe_starts
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
174 out_probes['stop'] = probe_stops
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
175 out_probes['name'] = gene_names
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
176 probe_table = h5file_out.createTable(probe_group, "probes_chr%d" % chr, cf.probe, "chr%d" % chr)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
177 probe_table.append(out_probes)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
178
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
179 print("[RUNNING: chr%d] Calculating ZRPKM scores..." % chr)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
180 rpkm = np.apply_along_axis(cf.zrpkm, 0, rpkm, median[probe_mask], sd[probe_mask])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
181
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
182 # svd transform
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
183 print("[RUNNING: chr%d] SVD decomposition..." % chr)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
184 components_removed = int(args.svd)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
185
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
186 U, S, Vt = np.linalg.svd(rpkm, full_matrices=False)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
187 new_S = np.diag(np.hstack([np.zeros([components_removed]), S[components_removed:]]))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
188
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
189 if args.write_svals != "":
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
190 sval_f.write('chr' + str(chr) + '\t' + '\t'.join([str(_i) for _i in S]) + "\n")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
191
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
192 if args.plot_scree != "":
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
193 ax.plot(S, label='chr' + str(chr), lw=0.5)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
194
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
195 # reconstruct data matrix
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
196 rpkm = np.dot(U, np.dot(new_S, Vt))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
197
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
198 # save to HDF5 file
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
199 print("[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
200
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
201 for i, s in enumerate(samples):
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
202 out_data = np.empty(num_chr_probes, dtype='u4,f8')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
203 out_data['f0'] = probeIDs
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
204 out_data['f1'] = rpkm[:, i]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
205 sample_tbl = h5file_out.createTable(chr_group, "sample_" + str(s), cf.rpkm_value, "%s" % str(s))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
206 sample_tbl.append(out_data)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
207
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
208 print("[RUNNING] Saving sampleIDs to file...")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
209 sample_group = h5file_out.createGroup("/", "samples", "samples")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
210 sample_table = h5file_out.createTable(sample_group, "samples", cf.sample, "samples")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
211 dt = np.dtype([('sampleID', np.str_, 100)])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
212 out_samples = np.empty(len(list(samples.keys())), dtype=dt)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
213 out_samples['sampleID'] = np.array(list(samples.keys()))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
214 sample_table.append(out_samples)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
215
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
216 if args.write_sd != "":
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
217 print("[RUNNING] Calculating standard deviations for all samples (this can take a while)...")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
218
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
219 sd_file = open(args.write_sd, 'w')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
220
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
221 for i, s in enumerate(samples):
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
222 # collect all SVD-ZRPKM values
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
223 count = 1
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
224 for chr in chrs_to_process:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
225 if count == 1:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
226 sd_out = h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read(
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
227 field="rpkm").flatten()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
228 else:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
229 sd_out = np.hstack([sd_out, out.h5file_out.root._f_getChild("chr%d" % chr)._f_getChild(
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
230 "sample_%s" % s).read(field="rpkm").flatten()])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
231
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
232 sd = np.std(sd_out)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
233 sd_file.write("%s\t%f\n" % (s, sd))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
234
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
235 sd_file.close()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
236
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
237 if args.plot_scree != "":
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
238 plt.title("Scree plot")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
239 if len(samples) < 50:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
240 plt.xlim([0, len(samples)])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
241 plt.xlabel("S values")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
242 else:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
243 plt.xlim([0, 50])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
244 plt.xlabel("S values (only first 50 plotted)")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
245 plt.ylabel("Relative contributed variance")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
246 plt.savefig(args.plot_scree)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
247
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
248 print("[FINISHED]")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
249 h5file_out.close()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
250 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
251
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
252
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
253 def CF_export(args):
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
254 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
255 h5file_in_fn = str(args.input)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
256 h5file_in = OpenFile(h5file_in_fn, mode='r')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
257 except IOError as e:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
258 print('[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
259 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
260
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
261 # read probes
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
262 probes = {}
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
263 for probes_chr in h5file_in.root.probes:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
264 probes[probes_chr.title] = probes_chr.read()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
265
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
266 if args.sample == 'all':
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
267 all_samples = list(h5file_in.root.samples.samples.read(field="sampleID"))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
268
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
269 out_path = os.path.abspath(args.output)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
270
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
271 print("[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
272 for sample in all_samples:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
273 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
274 outfile_fn = out_path + "/" + sample + ".bed"
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
275 outfile_f = open(outfile_fn, 'w')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
276 except IOError as e:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
277 print('[ERROR] Cannot open output file for writing: ', outfile_fn)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
278 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
279 print("[RUNNING] Exporting %s" % sample)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
280
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
281 cf.export_sample(h5file_in, sample, probes, outfile_f)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
282 outfile_f.close()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
283
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
284 elif len(args.sample) == 1:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
285 out_path = os.path.abspath(args.output)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
286 sample = args.sample[0]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
287 print("[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
288 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
289 if os.path.isdir(out_path):
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
290 outfile_fn = out_path + "/" + sample + ".bed"
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
291 else:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
292 outfile_fn = out_path
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
293 outfile_f = open(outfile_fn, 'w')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
294 except IOError as e:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
295 print('[ERROR] Cannot open output file for writing: ', outfile_fn)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
296 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
297 print("[RUNNING] Exporting %s to %s" % (sample, outfile_fn))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
298
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
299 cf.export_sample(h5file_in, sample, probes, outfile_f)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
300 outfile_f.close()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
301
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
302 else:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
303 out_path = os.path.abspath(args.output)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
304 print("[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
305 for sample in args.sample:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
306 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
307 if os.path.isdir(out_path):
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
308 outfile_fn = out_path + "/" + sample + ".bed"
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
309 else:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
310 outfile_fn = out_path
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
311 outfile_f = open(outfile_fn, 'w')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
312 except IOError as e:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
313 print('[ERROR] Cannot open output file for writing: ', outfile_fn)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
314 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
315 print("[RUNNING] Exporting %s to %s" % (sample, outfile_fn))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
316
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
317 cf.export_sample(h5file_in, sample, probes, outfile_f)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
318 outfile_f.close()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
319 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
320
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
321
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
322 def CF_call(args):
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
323 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
324 h5file_in_fn = str(args.input)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
325 h5file_in = OpenFile(h5file_in_fn, mode='r')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
326 except IOError as e:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
327 print('[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
328 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
329
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
330 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
331 callfile_fn = str(args.output)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
332 callfile_f = open(callfile_fn, mode='w')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
333 except IOError as e:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
334 print('[ERROR] Cannot open output file for writing: ', callfile_fn)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
335 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
336
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
337 chrs_to_process = []
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
338 for chr in h5file_in.root:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
339 if chr._v_title not in ('probes', 'samples'):
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
340 chrs_to_process.append(chr._v_title.replace("chr", ""))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
341
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
342 h5file_in.close()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
343
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
344 print('[INIT] Initializing caller at threshold = %f' % (args.threshold))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
345
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
346 r = cf.rpkm_reader(h5file_in_fn)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
347
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
348 all_calls = []
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
349
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
350 for chr in chrs_to_process:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
351 print('[RUNNING] Now processing chr%s' % chr)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
352 data = r.getExonValuesByRegion(chr)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
353
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
354 # raw_data = copy.copy(data)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
355 _ = data.smooth()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
356
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
357 mean = np.mean(data.rpkm, axis=1)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
358 sd = np.std(data.rpkm, axis=1)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
359
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
360 for sample in r.getSampleList():
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
361 sample_data = data.getSample([sample]).flatten()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
362 # sample_raw_data = raw_data.getSample([sample]).flatten()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
363
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
364 dup_mask = sample_data >= args.threshold
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
365 del_mask = sample_data <= -1 * args.threshold
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
366
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
367 dup_bkpoints = cf.getbkpoints(dup_mask) # returns exon coordinates for this chromosome (numpy array coords)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
368 del_bkpoints = cf.getbkpoints(del_mask)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
369
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
370 dups = []
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
371 for start, stop in dup_bkpoints:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
372 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
373 new_start = np.max(np.where(sample_data[:start] < (mean[:start] + 3 * sd[:start])))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
374 except ValueError:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
375 new_start = 0
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
376 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
377 new_stop = stop + np.min(np.where(sample_data[stop:] < (mean[stop:] + 3 * sd[stop:])))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
378 except ValueError:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
379 new_stop = data.shape[1] - 1
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
380 dups.append(
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
381 {"sampleID": sample, "chromosome": cf.chrInt2Str(chr), "start": data.exons[new_start]["start"],
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
382 "stop": data.exons[new_stop]["stop"], "state": "dup"})
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
383
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
384 dels = []
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
385 for start, stop in del_bkpoints:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
386 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
387 new_start = np.max(np.where(sample_data[:start] > (-1 * mean[:start] - 3 * sd[:start])))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
388 except ValueError:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
389 new_start = 0
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
390 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
391 new_stop = stop + np.min(np.where(sample_data[stop:] > (-1 * mean[stop:] - 3 * sd[stop:])))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
392 except ValueError:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
393 new_stop = data.shape[1] - 1
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
394 dels.append(
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
395 {"sampleID": sample, "chromosome": cf.chrInt2Str(chr), "start": data.exons[new_start]["start"],
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
396 "stop": data.exons[new_stop]["stop"], "state": "del"})
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
397
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
398 dels = cf.mergeCalls(dels) # merges overlapping calls
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
399 dups = cf.mergeCalls(dups)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
400
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
401 # print sampleID, len(dels), len(dups)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
402
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
403 all_calls.extend(list(dels))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
404 all_calls.extend(list(dups))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
405
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
406 # print calls to file
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
407 header = ['sampleID', 'chromosome', 'start', 'stop', 'state']
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
408
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
409 callfile_f.write('\t'.join(header) + "\n")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
410 for call in all_calls:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
411 print("%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
412 callfile_f.write(
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
413 "%s\t%s\t%d\t%d\t%s\n" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"]))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
414
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
415 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
416
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
417
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
418 def CF_plot(args):
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
419 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
420 import locale
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
421 import matplotlib
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
422 matplotlib.use('Agg')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
423 import matplotlib.pyplot as plt
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
424 import pylab as P
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
425 from matplotlib.lines import Line2D
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
426 from matplotlib.patches import Rectangle
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
427 _ = locale.setlocale(locale.LC_ALL, '')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
428 except:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
429 print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
430 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
431
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
432 chr, start, stop = cf.parseLocString(args.region)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
433
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
434 r = cf.rpkm_reader(args.input)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
435
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
436 data = r.getExonValuesByRegion(chr, start, stop)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
437 _ = data.smooth()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
438
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
439 plt.gcf().clear()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
440 fig = plt.figure(figsize=(10, 5))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
441 ax = fig.add_subplot(111)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
442
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
443 ax.plot(data.rpkm, linewidth=0.3, c='k')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
444
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
445 if args.sample != 'none':
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
446 cnt = 1
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
447 coloriter = iter(['r', 'b', 'g', 'y'])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
448 for sample in args.sample:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
449 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
450 color, sampleID = sample.split(":")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
451 except:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
452 color = next(coloriter)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
453 sampleID = sample
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
454
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
455 ax.plot(data.getSample([sampleID]), linewidth=1, c=color, label=sampleID)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
456
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
457 if cnt == 1:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
458 cf.plotRawData(ax,
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
459 r.getExonValuesByRegion(chr, start, stop, sampleList=[sampleID]).getSample([sampleID]),
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
460 color=color)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
461 cnt += 1
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
462 plt.legend(prop={'size': 10}, frameon=False)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
463
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
464 cf.plotGenes(ax, data)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
465 cf.plotGenomicCoords(plt, data)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
466 plt.xlim(0, data.shape[1])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
467 plt.ylim(-3, 3)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
468
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
469 plt.title("%s: %s - %s" % (
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
470 cf.chrInt2Str(chr), locale.format("%d", start, grouping=True), locale.format("%d", stop, grouping=True)))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
471 plt.xlabel("Position")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
472 plt.ylabel("SVD-ZRPKM Values")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
473
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
474 plt.savefig(args.output)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
475
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
476 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
477
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
478
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
479 def CF_plotcalls(args):
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
480 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
481 import matplotlib
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
482 matplotlib.use('Agg')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
483 import matplotlib.pyplot as plt
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
484 import pylab as P
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
485 from matplotlib.lines import Line2D
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
486 from matplotlib.patches import Rectangle
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
487 except:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
488 print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
489 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
490
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
491 import locale
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
492 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
493 _ = locale.setlocale(locale.LC_ALL, 'en_US')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
494 except:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
495 _ = locale.setlocale(locale.LC_ALL, '')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
496
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
497 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
498 callfile_fn = str(args.calls)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
499 callfile_f = open(callfile_fn, mode='r')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
500 except IOError as e:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
501 print('[ERROR] Cannot open call file for reading: ', callfile_fn)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
502 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
503
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
504 all_calls = []
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
505 header = callfile_f.readline()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
506
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
507 for line in callfile_f:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
508 sampleID, chr, start, stop, state = line.strip().split()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
509 chr = cf.chrStr2Int(chr)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
510 all_calls.append({"chromosome": int(chr), "start": int(start), "stop": int(stop), "sampleID": sampleID})
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
511
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
512 r = cf.rpkm_reader(args.input)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
513
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
514 for call in all_calls:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
515 chr = call["chromosome"]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
516 start = call["start"]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
517 stop = call["stop"]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
518 sampleID = call["sampleID"]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
519
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
520 exons = r.getExonIDs(chr, int(start), int(stop))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
521
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
522 window_start = max(exons[0] - args.window, 0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
523 window_stop = exons[-1] + args.window
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
524
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
525 data = r.getExonValuesByExons(chr, window_start, window_stop)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
526 _ = data.smooth()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
527
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
528 plt.gcf().clear()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
529 fig = plt.figure(figsize=(10, 5))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
530 ax = fig.add_subplot(111)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
531
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
532 ax.plot(data.rpkm, linewidth=0.3, c='k')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
533
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
534 ax.plot(data.getSample([sampleID]), linewidth=1, c='r', label=sampleID)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
535 cf.plotRawData(ax, r.getExonValuesByExons(chr, window_start, window_stop, sampleList=[sampleID]).getSample(
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
536 [sampleID]), color='r')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
537
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
538 plt.legend(prop={'size': 10}, frameon=False)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
539
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
540 cf.plotGenes(ax, data)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
541 cf.plotGenomicCoords(plt, data)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
542
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
543 exon_start = np.where(data.exons["start"] == start)[0][0]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
544 exon_stop = np.where(data.exons["stop"] == stop)[0][0]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
545 _ = ax.add_line(
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
546 matplotlib.lines.Line2D([exon_start, exon_stop], [2, 2], color='k', lw=6, linestyle='-', alpha=1,
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
547 solid_capstyle='butt'))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
548
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
549 _ = plt.xlim(0, data.shape[1])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
550 _ = plt.ylim(-3, 3)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
551
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
552 plt.title("%s: %s - %s" % (
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
553 cf.chrInt2Str(chr), locale.format("%d", start, grouping=True), locale.format("%d", stop, grouping=True)))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
554 plt.xlabel("Position")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
555 plt.ylabel("SVD-ZRPKM Values")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
556 outfile = "%s_%d_%d_%s.png" % (cf.chrInt2Str(chr), start, stop, sampleID)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
557 plt.savefig(args.outputdir + "/" + outfile)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
558
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
559
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
560 def CF_bam2RPKM(args):
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
561 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
562 import pysam
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
563 except:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
564 print('[ERROR] Cannot load pysam module! Make sure it is insalled')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
565 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
566 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
567 # read probes table
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
568 probe_fn = str(args.probes[0])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
569 probes = cf.loadProbeList(probe_fn)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
570 num_probes = len(probes)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
571 print('[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
572 except IOError as e:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
573 print('[ERROR] Cannot read probes file: ', probe_fn)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
574 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
575
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
576 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
577 rpkm_f = open(args.output[0], 'wb')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
578 except IOError as e:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
579 print('[ERROR] Cannot open rpkm file for writing: ', args.output)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
580 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
581
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
582 print("[RUNNING] Counting total number of reads in bam file...")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
583 total_reads = float(pysam.view("-c", args.input[0])[0].strip("\n"))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
584 print("[RUNNING] Found %d reads" % total_reads)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
585
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
586 f = pysam.Samfile(args.input[0], "rb")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
587
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
588 # if not f._hasIndex():
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
589 if not f.has_index():
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
590 print("[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % \
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
591 args.input[0])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
592 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
593
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
594 # will be storing values in these arrays
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
595 readcount = np.zeros(num_probes)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
596 exon_bp = np.zeros(num_probes)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
597 probeIDs = np.zeros(num_probes)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
598 counter = 0
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
599
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
600 # detect contig naming scheme here # TODO, add an optional "contigs.txt" file or automatically handle contig naming
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
601 bam_contigs = f.references
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
602 probes_contigs = [str(p) for p in set(map(operator.itemgetter("chr"), probes))]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
603
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
604 probes2contigmap = {}
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
605
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
606 for probes_contig in probes_contigs:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
607 if probes_contig in bam_contigs:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
608 probes2contigmap[probes_contig] = probes_contig
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
609 elif cf.chrInt2Str(probes_contig) in bam_contigs:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
610 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
611 elif cf.chrInt2Str(probes_contig).replace("chr", "") in bam_contigs:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
612 probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig).replace("chr", "")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
613 else:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
614 print("[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % (
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
615 probes_contig, probe_fn))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
616 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
617
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
618 print("[RUNNING] Calculating RPKM values...")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
619
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
620 # loop through each probe
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
621 for p in probes:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
622
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
623 # f.fetch is a pysam method and returns an iterator for reads overlapping interval
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
624
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
625 p_chr = probes2contigmap[str(p["chr"])]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
626
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
627 p_start = p["start"]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
628 p_stop = p["stop"]
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
629 try:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
630 iter = f.fetch(p_chr, p_start, p_stop)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
631 except:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
632 print("[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % (
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
633 p_chr, p_start, p_stop))
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
634 sys.exit(0)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
635
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
636 for i in iter:
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
637 if i.pos + 1 >= p_start: # this checks to make sure a read actually starts in an interval
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
638 readcount[counter] += 1
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
639
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
640 exon_bp[counter] = p_stop - p_start
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
641 probeIDs[counter] = counter + 1 # probeIDs are 1-based
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
642 counter += 1
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
643
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
644 # calcualte RPKM values for all probes
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
645 rpkm = (10 ** 9 * (readcount) / (exon_bp)) / (total_reads)
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
646
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
647 out = np.vstack([probeIDs, readcount, rpkm])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
648
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
649 np.savetxt(rpkm_f, out.transpose(), delimiter='\t', fmt=['%d', '%d', '%f'])
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
650
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
651 rpkm_f.close()
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
652
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
653
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
654 VERSION = "0.2.2"
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
655 parser = argparse.ArgumentParser(prog="CoNIFER",
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
656 description="This is CoNIFER %s (Copy Number Inference From Exome Reads), designed to detect and genotype CNVs and CNPs from exome sequence read-depth data. See Krumm et al., Genome Research (2012) doi:10.1101/gr.138115.112 \nNiklas Krumm, 2012\n nkrumm@uw.edu" % VERSION)
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
657 parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
658 subparsers = parser.add_subparsers(help='Command to be run.')
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
659
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
660 # rpkm command
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
661 rpkm_parser = subparsers.add_parser('rpkm', help='Create an RPKM file from a BAM file')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
662 rpkm_parser.add_argument('--probes', action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
663 help="Probe definition file")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
664 rpkm_parser.add_argument('--input', action='store', required=True, metavar='sample.bam', nargs=1,
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
665 help="Aligned BAM file")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
666 rpkm_parser.add_argument('--output', action='store', required=True, metavar='sample.rpkm.txt', nargs=1,
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
667 help="RPKM file to write")
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
668 rpkm_parser.set_defaults(func=CF_bam2RPKM)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
669
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
670 # analyze command
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
671 analyze_parser = subparsers.add_parser('analyze',
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
672 help='Basic CoNIFER analysis. Reads a directory of RPKM files and a probe list and outputs a HDF5 file containing SVD-ZRPKM values.')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
673 analyze_parser.add_argument('--probes', action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1,
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
674 help="Probe definition file")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
675 analyze_parser.add_argument('--rpkm_dir', action='store', required=True,
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
676 metavar='/path/to/folder/containing/rpkm_files/', nargs=1,
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
677 help="Location of folder containing RPKM files. Folder should contain ONLY contain RPKM files, and all readable RPKM files will used in analysis.")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
678 analyze_parser.add_argument('--output', '-o', required=True, metavar='/path/to/output_file.hdf5', type=str,
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
679 help="Output location of HDF5 file to contain SVD-ZRPKM values")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
680 analyze_parser.add_argument('--svd', metavar='12', type=int, nargs='?', default=12,
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
681 help="Number of components to remove")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
682 analyze_parser.add_argument('--min_rpkm', metavar='1.00', type=float, nargs="?", default=1.00,
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
683 help="Minimum population median RPKM per probe.")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
684 analyze_parser.add_argument('--write_svals', metavar='SingularValues.txt', type=str, default="",
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
685 help="Optional file to write singular values (S-values). Used for Scree Plot.")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
686 analyze_parser.add_argument('--plot_scree', metavar='ScreePlot.png', type=str, default="",
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
687 help="Optional graphical scree plot. Requires matplotlib.")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
688 analyze_parser.add_argument('--write_sd', metavar='StandardDeviations.txt', type=str, default="",
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
689 help="Optional file with sample SVD-ZRPKM standard deviations. Used to filter noisy samples.")
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
690 analyze_parser.set_defaults(func=CF_analyze)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
691
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
692 # export command
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
693 export_parser = subparsers.add_parser('export', help='Export SVD-ZRPKM values from a HDF5 file to bed or vcf format.')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
694 export_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5',
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
695 help="HDF5 file from CoNIFER 'analyze' step")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
696 export_parser.add_argument('--output', '-o', action='store', required=False, default='.', metavar='output.bed',
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
697 help="Location of output file[s]. When exporting multiple samples, top-level directory of this path will be used.")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
698 export_parser.add_argument('--sample', '-s', action='store', required=False, metavar='sampleID', default='all',
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
699 nargs="+",
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
700 help="SampleID or comma-separated list of sampleIDs to export. Default: export all samples")
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
701 export_parser.set_defaults(func=CF_export)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
702
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
703 # plot command
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
704 plot_parser = subparsers.add_parser('plot', help='Plot SVD-ZRPKM values using matplotlib')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
705 plot_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5',
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
706 help="HDF5 file from CoNIFER 'analyze' step")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
707 plot_parser.add_argument('--region', action='store', required=True, metavar='chr#:start-stop', help="Region to plot")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
708 plot_parser.add_argument('--output', action='store', required=True, metavar='image.png',
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
709 help="Output path and filetype. PDF, PNG, PS, EPS, and SVG are supported.")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
710 plot_parser.add_argument('--sample', action='store', required=False, metavar='sampleID', nargs="+", default='none',
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
711 help="Sample[s] to highlight. The following optional color spec can be used: <color>:<sampleID>. Available colors are r,b,g,y,c,m,y,k. The unsmoothed SVD-ZRPKM values for the first sample in this list will be drawn. Default: No samples highlighted.")
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
712 plot_parser.set_defaults(func=CF_plot)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
713
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
714 # make calls command
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
715 call_parser = subparsers.add_parser('call', help='Very rudimentary caller for CNVs using SVD-ZRPKM thresholding.')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
716 call_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5',
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
717 help="HDF5 file from CoNIFER 'analyze' step")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
718 call_parser.add_argument('--output', action='store', required=True, metavar='calls.txt', help="Output file for calls")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
719 call_parser.add_argument('--threshold', metavar='1.50', type=float, nargs='?', required=False, default=1.50,
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
720 help="+/- Threshold for calling (minimum SVD-ZRPKM)")
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
721 call_parser.set_defaults(func=CF_call)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
722
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
723 # plotcalls command
25
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
724 plotcalls_parser = subparsers.add_parser('plotcalls', help='Make basic plots from call file from "call" command.')
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
725 plotcalls_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5',
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
726 help="HDF5 file from CoNIFER 'analyze' step")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
727 plotcalls_parser.add_argument('--calls', action='store', required=True, metavar='calls.txt',
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
728 help="File with calls from 'call' command.")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
729 plotcalls_parser.add_argument('--outputdir', action='store', required=True, metavar='/path/to/directory',
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
730 help="Output directory for plots")
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
731 plotcalls_parser.add_argument('--window', action='store', required=False, metavar='50', default=50,
abdc05fae16e Uploaded
bzonnedda
parents: 23
diff changeset
732 help="In exons, the amount of padding to plot around each call")
10
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
733 plotcalls_parser.set_defaults(func=CF_plotcalls)
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
734
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
735 args = parser.parse_args()
9dd5a52b4e53 Uploaded
bzonnedda
parents:
diff changeset
736 args.func(args)