| 10 | 1 ####################################################################### | 
|  | 2 ####################################################################### | 
|  | 3 # CoNIFER: Copy Number Inference From Exome Reads | 
|  | 4 # Developed by Niklas Krumm (C) 2012 | 
|  | 5 # nkrumm@gmail.com | 
|  | 6 # | 
|  | 7 # homepage: http://conifer.sf.net | 
|  | 8 # This program is described in: | 
|  | 9 # Krumm et al. 2012. Genome Research. doi:10.1101/gr.138115.112 | 
|  | 10 # | 
|  | 11 # This file is part of CoNIFER. | 
|  | 12 # CoNIFER is free software:  you can redistribute it and/or modify | 
|  | 13 # it under the terms of the GNU General Public License as published by | 
|  | 14 # the Free Software Foundation, either version 3 of the License, or | 
|  | 15 # (at your option) any later version. | 
|  | 16 # | 
|  | 17 # This program is distributed in the hope that it will be useful, | 
|  | 18 # but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|  | 19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
|  | 20 # GNU General Public License for more details. | 
|  | 21 # | 
|  | 22 # You should have received a copy of the GNU General Public License | 
|  | 23 # along with this program.  If not, see <http://www.gnu.org/licenses/>. | 
|  | 24 ####################################################################### | 
|  | 25 ####################################################################### | 
|  | 26 | 
|  | 27 import argparse | 
|  | 28 import os, sys, copy | 
|  | 29 import glob | 
|  | 30 import csv | 
|  | 31 import conifer_functions as cf | 
|  | 32 import operator | 
| 25 | 33 from tables import * | 
|  | 34 import pysam | 
| 10 | 35 import numpy  as np | 
|  | 36 | 
| 25 | 37 | 
| 10 | 38 def CF_analyze(args): | 
| 25 | 39     # do path/file checks: | 
|  | 40     try: | 
|  | 41         # read probes table | 
|  | 42         probe_fn = str(args.probes[0]) | 
|  | 43         probes = cf.loadProbeList(probe_fn) | 
|  | 44         num_probes = len(probes) | 
|  | 45         print('[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)) | 
|  | 46     except IOError as e: | 
|  | 47         print('[ERROR] Cannot read probes file: ', probe_fn) | 
|  | 48         sys.exit(0) | 
|  | 49 | 
|  | 50     try: | 
|  | 51         svd_outfile_fn = str(args.output) | 
|  | 52         h5file_out = OpenFile(svd_outfile_fn, mode='w') | 
|  | 53         probe_group = h5file_out.createGroup("/", "probes", "probes") | 
|  | 54     except IOError as e: | 
|  | 55         print('[ERROR] Cannot open SVD output file for writing: ', svd_outfile_fn) | 
|  | 56         sys.exit(0) | 
|  | 57 | 
|  | 58     if args.write_svals != "": | 
|  | 59         sval_f = open(args.write_svals, 'w') | 
|  | 60 | 
|  | 61     if args.plot_scree != "": | 
|  | 62         try: | 
|  | 63             import matplotlib | 
|  | 64             matplotlib.use('Agg') | 
|  | 65             import matplotlib.pyplot as plt | 
|  | 66             import pylab as P | 
|  | 67             from matplotlib.lines import Line2D | 
|  | 68             from matplotlib.patches import Rectangle | 
|  | 69         except: | 
|  | 70             print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?") | 
|  | 71             sys.exit(0) | 
|  | 72 | 
|  | 73         plt.gcf().clear() | 
|  | 74         fig = plt.figure(figsize=(10, 5)) | 
|  | 75         ax = fig.add_subplot(111) | 
|  | 76 | 
|  | 77     rpkm_dir = str(args.rpkm_dir[0]) | 
|  | 78     rpkm_files = glob.glob(rpkm_dir + "/*") | 
|  | 79     if len(rpkm_files) == 0: | 
|  | 80         print('[ERROR] Cannot find any files in RPKM directory (or directory path is incorrect): ', rpkm_dir) | 
|  | 81         sys.exit(0) | 
|  | 82     elif len(rpkm_files) == 1: | 
|  | 83         print('[ERROR] Found only 1 RPKM file (sample). CoNIFER requires multiple samples (8 or more) to run. Exiting.') | 
|  | 84         sys.exit(0) | 
|  | 85     elif len(rpkm_files) < 8: | 
|  | 86         print('[WARNING] Only found %d samples... this is less than the recommended minimum, and CoNIFER may not analyze this dataset correctly!' % len( | 
|  | 87             rpkm_files)) | 
|  | 88     elif len(rpkm_files) <= int(args.svd): | 
|  | 89         print('[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % ( | 
|  | 90         len(rpkm_files), int(args.svd))) | 
|  | 91         sys.exit(0) | 
|  | 92     else: | 
|  | 93         print('[INIT] Found %d RPKM files in %s' % (len(rpkm_files), rpkm_dir)) | 
|  | 94 | 
|  | 95     # read in samples names and generate file list | 
|  | 96     samples = {} | 
|  | 97     for f in rpkm_files: | 
|  | 98         s = '.'.join(f.split('/')[-1].split('.')[0:-1]) | 
|  | 99         print("[INIT] Mapping file to sampleID: %s --> %s" % (f, s)) | 
|  | 100         samples[s] = f | 
|  | 101 | 
|  | 102     # check uniqueness and total # of samples | 
|  | 103     if len(set(samples)) != len(set(rpkm_files)): | 
|  | 104         print('[ERROR] Could not successfully derive sample names from RPKM filenames. There are probably non-unique sample names! Please rename files using <sampleID>.txt format!') | 
|  | 105         sys.exit(0) | 
|  | 106 | 
|  | 107     # LOAD RPKM DATA | 
|  | 108     RPKM_data = np.zeros([num_probes, len(samples)], dtype=np.float) | 
|  | 109     failed_samples = 0 | 
|  | 110 | 
|  | 111     for i, s in enumerate(samples.keys()): | 
|  | 112         t = np.loadtxt(samples[s], dtype=np.float, delimiter="\t", skiprows=0, usecols=[2]) | 
|  | 113         if len(t) != num_probes: | 
|  | 114             print("[WARNING] Number of RPKM values for %s in file %s does not match number of defined probes in %s. **This sample will be dropped from analysis**!" % ( | 
|  | 115             s, samples[s], probe_fn)) | 
|  | 116             _ = samples.pop(s) | 
|  | 117             failed_samples += 1 | 
|  | 118         else: | 
|  | 119             RPKM_data[:, i] = t | 
|  | 120             print("[INIT] Successfully read RPKM data for sampleID: %s" % s) | 
|  | 121 | 
|  | 122     RPKM_data = RPKM_data[:, 0:len(samples)] | 
|  | 123     print("[INIT] Finished reading RPKM files. Total number of samples in experiment: %d (%d failed to read properly)" % ( | 
|  | 124     len(samples), failed_samples)) | 
|  | 125 | 
|  | 126     if len(samples) <= int(args.svd): | 
|  | 127         print('[ERROR] The number of SVD values specified (%d) must be less than the number of samples (%d). Either add more samples to the analysis or reduce the --svd parameter! Exiting.' % ( | 
|  | 128         int(args.svd), len(samples))) | 
|  | 129         sys.exit(0) | 
|  | 130 | 
|  | 131     # BEGIN | 
|  | 132     chrs_to_process = set(map(operator.itemgetter("chr"), probes)) | 
|  | 133     chrs_to_process_str = ', '.join([cf.chrInt2Str(c) for c in chrs_to_process]) | 
|  | 134     print('[INIT] Attempting to process chromosomes: ', chrs_to_process_str) | 
|  | 135 | 
|  | 136     for chr in chrs_to_process: | 
|  | 137         print("[RUNNING: chr%d] Now on: %s" % (chr, cf.chrInt2Str(chr))) | 
|  | 138         chr_group_name = "chr%d" % chr | 
|  | 139         chr_group = h5file_out.createGroup("/", chr_group_name, chr_group_name) | 
|  | 140 | 
|  | 141         chr_probes = [i for i in probes if i["chr"] == chr] | 
|  | 142         num_chr_probes = len(chr_probes) | 
|  | 143         start_probeID = chr_probes[0]['probeID'] | 
|  | 144         stop_probeID = chr_probes[-1]['probeID'] | 
|  | 145         print("[RUNNING: chr%d] Found %d probes; probeID range is [%d-%d]" % (chr, len(chr_probes), start_probeID - 1, | 
|  | 146                                                                               stop_probeID))  # probeID is 1-based and slicing is 0-based, hence the start_probeID-1 term | 
|  | 147 | 
|  | 148         rpkm = RPKM_data[start_probeID:stop_probeID, :] | 
|  | 149 | 
|  | 150         print("[RUNNING: chr%d] Calculating median RPKM" % chr) | 
|  | 151         median = np.median(rpkm, 1) | 
|  | 152         sd = np.std(rpkm, 1) | 
|  | 153         probe_mask = median >= float(args.min_rpkm) | 
|  | 154         print("[RUNNING: chr%d] Masking %d probes with median RPKM < %f" % ( | 
|  | 155         chr, np.sum(probe_mask == False), float(args.min_rpkm))) | 
|  | 156 | 
|  | 157         rpkm = rpkm[probe_mask, :] | 
|  | 158         num_chr_probes = np.sum(probe_mask) | 
|  | 159 | 
|  | 160         if num_chr_probes <= len(samples): | 
|  | 161             print("[ERROR] This chromosome has fewer informative probes than there are samples in the analysis! There are probably no mappings on this chromosome. Please remove these probes from the probes.txt file") | 
|  | 162             sys.exit(0) | 
|  | 163 | 
|  | 164         probeIDs = np.array(list(map(operator.itemgetter("probeID"), chr_probes)))[probe_mask] | 
|  | 165         probe_starts = np.array(list(map(operator.itemgetter("start"), chr_probes)))[probe_mask] | 
|  | 166         probe_stops = np.array(list(map(operator.itemgetter("stop"), chr_probes)))[probe_mask] | 
|  | 167         gene_names = np.array(list(map(operator.itemgetter("name"), chr_probes)))[probe_mask] | 
|  | 168 | 
|  | 169         dt = np.dtype([('probeID', np.uint32), ('start', np.uint32), ('stop', np.uint32), ('name', np.str_, 20)]) | 
|  | 170 | 
|  | 171         out_probes = np.empty(num_chr_probes, dtype=dt) | 
|  | 172         out_probes['probeID'] = probeIDs | 
|  | 173         out_probes['start'] = probe_starts | 
|  | 174         out_probes['stop'] = probe_stops | 
|  | 175         out_probes['name'] = gene_names | 
|  | 176         probe_table = h5file_out.createTable(probe_group, "probes_chr%d" % chr, cf.probe, "chr%d" % chr) | 
|  | 177         probe_table.append(out_probes) | 
|  | 178 | 
|  | 179         print("[RUNNING: chr%d] Calculating ZRPKM scores..." % chr) | 
|  | 180         rpkm = np.apply_along_axis(cf.zrpkm, 0, rpkm, median[probe_mask], sd[probe_mask]) | 
|  | 181 | 
|  | 182         # svd transform | 
|  | 183         print("[RUNNING: chr%d] SVD decomposition..." % chr) | 
|  | 184         components_removed = int(args.svd) | 
|  | 185 | 
|  | 186         U, S, Vt = np.linalg.svd(rpkm, full_matrices=False) | 
|  | 187         new_S = np.diag(np.hstack([np.zeros([components_removed]), S[components_removed:]])) | 
|  | 188 | 
|  | 189         if args.write_svals != "": | 
|  | 190             sval_f.write('chr' + str(chr) + '\t' + '\t'.join([str(_i) for _i in S]) + "\n") | 
|  | 191 | 
|  | 192         if args.plot_scree != "": | 
|  | 193             ax.plot(S, label='chr' + str(chr), lw=0.5) | 
|  | 194 | 
|  | 195         # reconstruct data matrix | 
|  | 196         rpkm = np.dot(U, np.dot(new_S, Vt)) | 
|  | 197 | 
|  | 198         # save to HDF5 file | 
|  | 199         print("[RUNNING: chr%d] Saving SVD-ZRPKM values" % chr) | 
|  | 200 | 
|  | 201         for i, s in enumerate(samples): | 
|  | 202             out_data = np.empty(num_chr_probes, dtype='u4,f8') | 
|  | 203             out_data['f0'] = probeIDs | 
|  | 204             out_data['f1'] = rpkm[:, i] | 
|  | 205             sample_tbl = h5file_out.createTable(chr_group, "sample_" + str(s), cf.rpkm_value, "%s" % str(s)) | 
|  | 206             sample_tbl.append(out_data) | 
|  | 207 | 
|  | 208     print("[RUNNING] Saving sampleIDs to file...") | 
|  | 209     sample_group = h5file_out.createGroup("/", "samples", "samples") | 
|  | 210     sample_table = h5file_out.createTable(sample_group, "samples", cf.sample, "samples") | 
|  | 211     dt = np.dtype([('sampleID', np.str_, 100)]) | 
|  | 212     out_samples = np.empty(len(list(samples.keys())), dtype=dt) | 
|  | 213     out_samples['sampleID'] = np.array(list(samples.keys())) | 
|  | 214     sample_table.append(out_samples) | 
|  | 215 | 
|  | 216     if args.write_sd != "": | 
|  | 217         print("[RUNNING] Calculating standard deviations for all samples (this can take a while)...") | 
|  | 218 | 
|  | 219         sd_file = open(args.write_sd, 'w') | 
|  | 220 | 
|  | 221         for i, s in enumerate(samples): | 
|  | 222             # collect all SVD-ZRPKM values | 
|  | 223             count = 1 | 
|  | 224             for chr in chrs_to_process: | 
|  | 225                 if count == 1: | 
|  | 226                     sd_out = h5file_out.root._f_getChild("chr%d" % chr)._f_getChild("sample_%s" % s).read( | 
|  | 227                         field="rpkm").flatten() | 
|  | 228                 else: | 
|  | 229                     sd_out = np.hstack([sd_out, out.h5file_out.root._f_getChild("chr%d" % chr)._f_getChild( | 
|  | 230                         "sample_%s" % s).read(field="rpkm").flatten()]) | 
|  | 231 | 
|  | 232                 sd = np.std(sd_out) | 
|  | 233             sd_file.write("%s\t%f\n" % (s, sd)) | 
|  | 234 | 
|  | 235         sd_file.close() | 
|  | 236 | 
|  | 237     if args.plot_scree != "": | 
|  | 238         plt.title("Scree plot") | 
|  | 239         if len(samples) < 50: | 
|  | 240             plt.xlim([0, len(samples)]) | 
|  | 241             plt.xlabel("S values") | 
|  | 242         else: | 
|  | 243             plt.xlim([0, 50]) | 
|  | 244             plt.xlabel("S values (only first 50 plotted)") | 
|  | 245         plt.ylabel("Relative contributed variance") | 
|  | 246         plt.savefig(args.plot_scree) | 
|  | 247 | 
|  | 248     print("[FINISHED]") | 
|  | 249     h5file_out.close() | 
|  | 250     sys.exit(0) | 
|  | 251 | 
| 10 | 252 | 
|  | 253 def CF_export(args): | 
| 25 | 254     try: | 
|  | 255         h5file_in_fn = str(args.input) | 
|  | 256         h5file_in = OpenFile(h5file_in_fn, mode='r') | 
|  | 257     except IOError as e: | 
|  | 258         print('[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn) | 
|  | 259         sys.exit(0) | 
|  | 260 | 
|  | 261     # read probes | 
|  | 262     probes = {} | 
|  | 263     for probes_chr in h5file_in.root.probes: | 
|  | 264         probes[probes_chr.title] = probes_chr.read() | 
|  | 265 | 
|  | 266     if args.sample == 'all': | 
|  | 267         all_samples = list(h5file_in.root.samples.samples.read(field="sampleID")) | 
|  | 268 | 
|  | 269         out_path = os.path.abspath(args.output) | 
|  | 270 | 
|  | 271         print("[INIT] Preparing to export all samples (%d samples) to %s" % (len(all_samples), out_path)) | 
|  | 272         for sample in all_samples: | 
|  | 273             try: | 
|  | 274                 outfile_fn = out_path + "/" + sample + ".bed" | 
|  | 275                 outfile_f = open(outfile_fn, 'w') | 
|  | 276             except IOError as e: | 
|  | 277                 print('[ERROR] Cannot open output file for writing: ', outfile_fn) | 
|  | 278                 sys.exit(0) | 
|  | 279             print("[RUNNING] Exporting %s" % sample) | 
|  | 280 | 
|  | 281             cf.export_sample(h5file_in, sample, probes, outfile_f) | 
|  | 282             outfile_f.close() | 
|  | 283 | 
|  | 284     elif len(args.sample) == 1: | 
|  | 285         out_path = os.path.abspath(args.output) | 
|  | 286         sample = args.sample[0] | 
|  | 287         print("[INIT] Preparing to export sampleID %s to %s" % (args.sample[0], out_path)) | 
|  | 288         try: | 
|  | 289             if os.path.isdir(out_path): | 
|  | 290                 outfile_fn = out_path + "/" + sample + ".bed" | 
|  | 291             else: | 
|  | 292                 outfile_fn = out_path | 
|  | 293             outfile_f = open(outfile_fn, 'w') | 
|  | 294         except IOError as e: | 
|  | 295             print('[ERROR] Cannot open output file for writing: ', outfile_fn) | 
|  | 296             sys.exit(0) | 
|  | 297         print("[RUNNING] Exporting %s to %s" % (sample, outfile_fn)) | 
|  | 298 | 
|  | 299         cf.export_sample(h5file_in, sample, probes, outfile_f) | 
|  | 300         outfile_f.close() | 
|  | 301 | 
|  | 302     else: | 
|  | 303         out_path = os.path.abspath(args.output) | 
|  | 304         print("[INIT] Preparing to export %d samples to %s" % (len(args.sample), out_path)) | 
|  | 305         for sample in args.sample: | 
|  | 306             try: | 
|  | 307                 if os.path.isdir(out_path): | 
|  | 308                     outfile_fn = out_path + "/" + sample + ".bed" | 
|  | 309                 else: | 
|  | 310                     outfile_fn = out_path | 
|  | 311                 outfile_f = open(outfile_fn, 'w') | 
|  | 312             except IOError as e: | 
|  | 313                 print('[ERROR] Cannot open output file for writing: ', outfile_fn) | 
|  | 314                 sys.exit(0) | 
|  | 315             print("[RUNNING] Exporting %s to %s" % (sample, outfile_fn)) | 
|  | 316 | 
|  | 317             cf.export_sample(h5file_in, sample, probes, outfile_f) | 
|  | 318             outfile_f.close() | 
|  | 319     sys.exit(0) | 
|  | 320 | 
| 10 | 321 | 
|  | 322 def CF_call(args): | 
| 25 | 323     try: | 
|  | 324         h5file_in_fn = str(args.input) | 
|  | 325         h5file_in = OpenFile(h5file_in_fn, mode='r') | 
|  | 326     except IOError as e: | 
|  | 327         print('[ERROR] Cannot open CoNIFER input file for reading: ', h5file_in_fn) | 
|  | 328         sys.exit(0) | 
|  | 329 | 
|  | 330     try: | 
|  | 331         callfile_fn = str(args.output) | 
|  | 332         callfile_f = open(callfile_fn, mode='w') | 
|  | 333     except IOError as e: | 
|  | 334         print('[ERROR] Cannot open output file for writing: ', callfile_fn) | 
|  | 335         sys.exit(0) | 
|  | 336 | 
|  | 337     chrs_to_process = [] | 
|  | 338     for chr in h5file_in.root: | 
|  | 339         if chr._v_title not in ('probes', 'samples'): | 
|  | 340             chrs_to_process.append(chr._v_title.replace("chr", "")) | 
|  | 341 | 
|  | 342     h5file_in.close() | 
|  | 343 | 
|  | 344     print('[INIT] Initializing caller at threshold = %f' % (args.threshold)) | 
|  | 345 | 
|  | 346     r = cf.rpkm_reader(h5file_in_fn) | 
|  | 347 | 
|  | 348     all_calls = [] | 
|  | 349 | 
|  | 350     for chr in chrs_to_process: | 
|  | 351         print('[RUNNING] Now processing chr%s' % chr) | 
|  | 352         data = r.getExonValuesByRegion(chr) | 
|  | 353 | 
|  | 354         # raw_data = copy.copy(data) | 
|  | 355         _ = data.smooth() | 
|  | 356 | 
|  | 357         mean = np.mean(data.rpkm, axis=1) | 
|  | 358         sd = np.std(data.rpkm, axis=1) | 
|  | 359 | 
|  | 360         for sample in r.getSampleList(): | 
|  | 361             sample_data = data.getSample([sample]).flatten() | 
|  | 362             # sample_raw_data = raw_data.getSample([sample]).flatten() | 
|  | 363 | 
|  | 364             dup_mask = sample_data >= args.threshold | 
|  | 365             del_mask = sample_data <= -1 * args.threshold | 
|  | 366 | 
|  | 367             dup_bkpoints = cf.getbkpoints(dup_mask)  # returns exon coordinates for this chromosome (numpy array coords) | 
|  | 368             del_bkpoints = cf.getbkpoints(del_mask) | 
|  | 369 | 
|  | 370             dups = [] | 
|  | 371             for start, stop in dup_bkpoints: | 
|  | 372                 try: | 
|  | 373                     new_start = np.max(np.where(sample_data[:start] < (mean[:start] + 3 * sd[:start]))) | 
|  | 374                 except ValueError: | 
|  | 375                     new_start = 0 | 
|  | 376                 try: | 
|  | 377                     new_stop = stop + np.min(np.where(sample_data[stop:] < (mean[stop:] + 3 * sd[stop:]))) | 
|  | 378                 except ValueError: | 
|  | 379                     new_stop = data.shape[1] - 1 | 
|  | 380                 dups.append( | 
|  | 381                     {"sampleID": sample, "chromosome": cf.chrInt2Str(chr), "start": data.exons[new_start]["start"], | 
|  | 382                      "stop": data.exons[new_stop]["stop"], "state": "dup"}) | 
|  | 383 | 
|  | 384             dels = [] | 
|  | 385             for start, stop in del_bkpoints: | 
|  | 386                 try: | 
|  | 387                     new_start = np.max(np.where(sample_data[:start] > (-1 * mean[:start] - 3 * sd[:start]))) | 
|  | 388                 except ValueError: | 
|  | 389                     new_start = 0 | 
|  | 390                 try: | 
|  | 391                     new_stop = stop + np.min(np.where(sample_data[stop:] > (-1 * mean[stop:] - 3 * sd[stop:]))) | 
|  | 392                 except ValueError: | 
|  | 393                     new_stop = data.shape[1] - 1 | 
|  | 394                 dels.append( | 
|  | 395                     {"sampleID": sample, "chromosome": cf.chrInt2Str(chr), "start": data.exons[new_start]["start"], | 
|  | 396                      "stop": data.exons[new_stop]["stop"], "state": "del"}) | 
|  | 397 | 
|  | 398             dels = cf.mergeCalls(dels)  # merges overlapping calls | 
|  | 399             dups = cf.mergeCalls(dups) | 
|  | 400 | 
|  | 401             # print sampleID, len(dels), len(dups) | 
|  | 402 | 
|  | 403             all_calls.extend(list(dels)) | 
|  | 404             all_calls.extend(list(dups)) | 
|  | 405 | 
|  | 406     # print calls to file | 
|  | 407     header = ['sampleID', 'chromosome', 'start', 'stop', 'state'] | 
|  | 408 | 
|  | 409     callfile_f.write('\t'.join(header) + "\n") | 
|  | 410     for call in all_calls: | 
|  | 411         print("%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])) | 
|  | 412         callfile_f.write( | 
|  | 413             "%s\t%s\t%d\t%d\t%s\n" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])) | 
|  | 414 | 
|  | 415     sys.exit(0) | 
|  | 416 | 
| 10 | 417 | 
|  | 418 def CF_plot(args): | 
| 25 | 419     try: | 
|  | 420         import locale | 
|  | 421         import matplotlib | 
|  | 422         matplotlib.use('Agg') | 
|  | 423         import matplotlib.pyplot as plt | 
|  | 424         import pylab as P | 
|  | 425         from matplotlib.lines import Line2D | 
|  | 426         from matplotlib.patches import Rectangle | 
|  | 427         _ = locale.setlocale(locale.LC_ALL, '') | 
|  | 428     except: | 
|  | 429         print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?") | 
|  | 430         sys.exit(0) | 
|  | 431 | 
|  | 432     chr, start, stop = cf.parseLocString(args.region) | 
|  | 433 | 
|  | 434     r = cf.rpkm_reader(args.input) | 
|  | 435 | 
|  | 436     data = r.getExonValuesByRegion(chr, start, stop) | 
|  | 437     _ = data.smooth() | 
|  | 438 | 
|  | 439     plt.gcf().clear() | 
|  | 440     fig = plt.figure(figsize=(10, 5)) | 
|  | 441     ax = fig.add_subplot(111) | 
|  | 442 | 
|  | 443     ax.plot(data.rpkm, linewidth=0.3, c='k') | 
|  | 444 | 
|  | 445     if args.sample != 'none': | 
|  | 446         cnt = 1 | 
|  | 447         coloriter = iter(['r', 'b', 'g', 'y']) | 
|  | 448         for sample in args.sample: | 
|  | 449             try: | 
|  | 450                 color, sampleID = sample.split(":") | 
|  | 451             except: | 
|  | 452                 color = next(coloriter) | 
|  | 453                 sampleID = sample | 
|  | 454 | 
|  | 455             ax.plot(data.getSample([sampleID]), linewidth=1, c=color, label=sampleID) | 
|  | 456 | 
|  | 457             if cnt == 1: | 
|  | 458                 cf.plotRawData(ax, | 
|  | 459                                r.getExonValuesByRegion(chr, start, stop, sampleList=[sampleID]).getSample([sampleID]), | 
|  | 460                                color=color) | 
|  | 461             cnt += 1 | 
|  | 462         plt.legend(prop={'size': 10}, frameon=False) | 
|  | 463 | 
|  | 464     cf.plotGenes(ax, data) | 
|  | 465     cf.plotGenomicCoords(plt, data) | 
|  | 466     plt.xlim(0, data.shape[1]) | 
|  | 467     plt.ylim(-3, 3) | 
|  | 468 | 
|  | 469     plt.title("%s: %s - %s" % ( | 
|  | 470     cf.chrInt2Str(chr), locale.format("%d", start, grouping=True), locale.format("%d", stop, grouping=True))) | 
|  | 471     plt.xlabel("Position") | 
|  | 472     plt.ylabel("SVD-ZRPKM Values") | 
|  | 473 | 
|  | 474     plt.savefig(args.output) | 
|  | 475 | 
|  | 476     sys.exit(0) | 
|  | 477 | 
| 10 | 478 | 
|  | 479 def CF_plotcalls(args): | 
| 25 | 480     try: | 
|  | 481         import matplotlib | 
|  | 482         matplotlib.use('Agg') | 
|  | 483         import matplotlib.pyplot as plt | 
|  | 484         import pylab as P | 
|  | 485         from matplotlib.lines import Line2D | 
|  | 486         from matplotlib.patches import Rectangle | 
|  | 487     except: | 
|  | 488         print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?") | 
|  | 489         sys.exit(0) | 
|  | 490 | 
|  | 491     import locale | 
|  | 492     try: | 
|  | 493         _ = locale.setlocale(locale.LC_ALL, 'en_US') | 
|  | 494     except: | 
|  | 495         _ = locale.setlocale(locale.LC_ALL, '') | 
|  | 496 | 
|  | 497     try: | 
|  | 498         callfile_fn = str(args.calls) | 
|  | 499         callfile_f = open(callfile_fn, mode='r') | 
|  | 500     except IOError as e: | 
|  | 501         print('[ERROR] Cannot open call file for reading: ', callfile_fn) | 
|  | 502         sys.exit(0) | 
|  | 503 | 
|  | 504     all_calls = [] | 
|  | 505     header = callfile_f.readline() | 
|  | 506 | 
|  | 507     for line in callfile_f: | 
|  | 508         sampleID, chr, start, stop, state = line.strip().split() | 
|  | 509         chr = cf.chrStr2Int(chr) | 
|  | 510         all_calls.append({"chromosome": int(chr), "start": int(start), "stop": int(stop), "sampleID": sampleID}) | 
|  | 511 | 
|  | 512     r = cf.rpkm_reader(args.input) | 
|  | 513 | 
|  | 514     for call in all_calls: | 
|  | 515         chr = call["chromosome"] | 
|  | 516         start = call["start"] | 
|  | 517         stop = call["stop"] | 
|  | 518         sampleID = call["sampleID"] | 
|  | 519 | 
|  | 520         exons = r.getExonIDs(chr, int(start), int(stop)) | 
|  | 521 | 
|  | 522         window_start = max(exons[0] - args.window, 0) | 
|  | 523         window_stop = exons[-1] + args.window | 
|  | 524 | 
|  | 525         data = r.getExonValuesByExons(chr, window_start, window_stop) | 
|  | 526         _ = data.smooth() | 
|  | 527 | 
|  | 528         plt.gcf().clear() | 
|  | 529         fig = plt.figure(figsize=(10, 5)) | 
|  | 530         ax = fig.add_subplot(111) | 
|  | 531 | 
|  | 532         ax.plot(data.rpkm, linewidth=0.3, c='k') | 
|  | 533 | 
|  | 534         ax.plot(data.getSample([sampleID]), linewidth=1, c='r', label=sampleID) | 
|  | 535         cf.plotRawData(ax, r.getExonValuesByExons(chr, window_start, window_stop, sampleList=[sampleID]).getSample( | 
|  | 536             [sampleID]), color='r') | 
|  | 537 | 
|  | 538         plt.legend(prop={'size': 10}, frameon=False) | 
|  | 539 | 
|  | 540         cf.plotGenes(ax, data) | 
|  | 541         cf.plotGenomicCoords(plt, data) | 
|  | 542 | 
|  | 543         exon_start = np.where(data.exons["start"] == start)[0][0] | 
|  | 544         exon_stop = np.where(data.exons["stop"] == stop)[0][0] | 
|  | 545         _ = ax.add_line( | 
|  | 546             matplotlib.lines.Line2D([exon_start, exon_stop], [2, 2], color='k', lw=6, linestyle='-', alpha=1, | 
|  | 547                                     solid_capstyle='butt')) | 
|  | 548 | 
|  | 549         _ = plt.xlim(0, data.shape[1]) | 
|  | 550         _ = plt.ylim(-3, 3) | 
|  | 551 | 
|  | 552         plt.title("%s: %s - %s" % ( | 
|  | 553         cf.chrInt2Str(chr), locale.format("%d", start, grouping=True), locale.format("%d", stop, grouping=True))) | 
|  | 554         plt.xlabel("Position") | 
|  | 555         plt.ylabel("SVD-ZRPKM Values") | 
|  | 556         outfile = "%s_%d_%d_%s.png" % (cf.chrInt2Str(chr), start, stop, sampleID) | 
|  | 557         plt.savefig(args.outputdir + "/" + outfile) | 
|  | 558 | 
| 10 | 559 | 
|  | 560 def CF_bam2RPKM(args): | 
| 25 | 561     try: | 
|  | 562         import pysam | 
|  | 563     except: | 
|  | 564         print('[ERROR] Cannot load pysam module! Make sure it is insalled') | 
|  | 565         sys.exit(0) | 
|  | 566     try: | 
|  | 567         # read probes table | 
|  | 568         probe_fn = str(args.probes[0]) | 
|  | 569         probes = cf.loadProbeList(probe_fn) | 
|  | 570         num_probes = len(probes) | 
|  | 571         print('[INIT] Successfully read in %d probes from %s' % (num_probes, probe_fn)) | 
|  | 572     except IOError as e: | 
|  | 573         print('[ERROR] Cannot read probes file: ', probe_fn) | 
|  | 574         sys.exit(0) | 
|  | 575 | 
|  | 576     try: | 
|  | 577         rpkm_f = open(args.output[0], 'wb') | 
|  | 578     except IOError as e: | 
|  | 579         print('[ERROR] Cannot open rpkm file for writing: ', args.output) | 
|  | 580         sys.exit(0) | 
|  | 581 | 
|  | 582     print("[RUNNING] Counting total number of reads in bam file...") | 
|  | 583     total_reads = float(pysam.view("-c", args.input[0])[0].strip("\n")) | 
|  | 584     print("[RUNNING] Found %d reads" % total_reads) | 
|  | 585 | 
|  | 586     f = pysam.Samfile(args.input[0], "rb") | 
|  | 587 | 
|  | 588 #    if not f._hasIndex(): | 
|  | 589     if not f.has_index(): | 
|  | 590         print("[ERROR] No index found for bam file (%s)!\n[ERROR] You must first index the bam file and include the .bai file in the same directory as the bam file!" % \ | 
|  | 591               args.input[0]) | 
|  | 592         sys.exit(0) | 
|  | 593 | 
|  | 594     # will be storing values in these arrays | 
|  | 595     readcount = np.zeros(num_probes) | 
|  | 596     exon_bp = np.zeros(num_probes) | 
|  | 597     probeIDs = np.zeros(num_probes) | 
|  | 598     counter = 0 | 
|  | 599 | 
|  | 600     # detect contig naming scheme here # TODO, add an optional "contigs.txt" file or automatically handle contig naming | 
|  | 601     bam_contigs = f.references | 
|  | 602     probes_contigs = [str(p) for p in set(map(operator.itemgetter("chr"), probes))] | 
|  | 603 | 
|  | 604     probes2contigmap = {} | 
| 10 | 605 | 
| 25 | 606     for probes_contig in probes_contigs: | 
|  | 607         if probes_contig in bam_contigs: | 
|  | 608             probes2contigmap[probes_contig] = probes_contig | 
|  | 609         elif cf.chrInt2Str(probes_contig) in bam_contigs: | 
|  | 610             probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig) | 
|  | 611         elif cf.chrInt2Str(probes_contig).replace("chr", "") in bam_contigs: | 
|  | 612             probes2contigmap[probes_contig] = cf.chrInt2Str(probes_contig).replace("chr", "") | 
|  | 613         else: | 
|  | 614             print("[ERROR] Could not find contig '%s' from %s in bam file! \n[ERROR] Perhaps the contig names for the probes are incompatible with the bam file ('chr1' vs. '1'), or unsupported contig naming is used?" % ( | 
|  | 615             probes_contig, probe_fn)) | 
|  | 616             sys.exit(0) | 
|  | 617 | 
|  | 618     print("[RUNNING] Calculating RPKM values...") | 
|  | 619 | 
|  | 620     # loop through each probe | 
|  | 621     for p in probes: | 
|  | 622 | 
|  | 623         # f.fetch is a pysam method and returns an iterator for reads overlapping interval | 
|  | 624 | 
|  | 625         p_chr = probes2contigmap[str(p["chr"])] | 
|  | 626 | 
|  | 627         p_start = p["start"] | 
|  | 628         p_stop = p["stop"] | 
|  | 629         try: | 
|  | 630             iter = f.fetch(p_chr, p_start, p_stop) | 
|  | 631         except: | 
|  | 632             print("[ERROR] Could not retrieve mappings for region %s:%d-%d. Check that contigs are named correctly and the bam file is properly indexed" % ( | 
|  | 633             p_chr, p_start, p_stop)) | 
|  | 634             sys.exit(0) | 
|  | 635 | 
|  | 636         for i in iter: | 
|  | 637             if i.pos + 1 >= p_start:  # this checks to make sure a read actually starts in an interval | 
|  | 638                 readcount[counter] += 1 | 
|  | 639 | 
|  | 640         exon_bp[counter] = p_stop - p_start | 
|  | 641         probeIDs[counter] = counter + 1  # probeIDs are 1-based | 
|  | 642         counter += 1 | 
|  | 643 | 
|  | 644     # calcualte RPKM values for all probes | 
|  | 645     rpkm = (10 ** 9 * (readcount) / (exon_bp)) / (total_reads) | 
|  | 646 | 
|  | 647     out = np.vstack([probeIDs, readcount, rpkm]) | 
|  | 648 | 
|  | 649     np.savetxt(rpkm_f, out.transpose(), delimiter='\t', fmt=['%d', '%d', '%f']) | 
|  | 650 | 
|  | 651     rpkm_f.close() | 
|  | 652 | 
| 10 | 653 | 
|  | 654 VERSION = "0.2.2" | 
| 25 | 655 parser = argparse.ArgumentParser(prog="CoNIFER", | 
|  | 656                                  description="This is CoNIFER %s (Copy Number Inference From Exome Reads), designed to detect and genotype CNVs and CNPs from exome sequence read-depth data. See Krumm et al., Genome Research (2012) doi:10.1101/gr.138115.112 \nNiklas Krumm, 2012\n nkrumm@uw.edu" % VERSION) | 
| 10 | 657 parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION) | 
|  | 658 subparsers = parser.add_subparsers(help='Command to be run.') | 
|  | 659 | 
|  | 660 # rpkm command | 
| 25 | 661 rpkm_parser = subparsers.add_parser('rpkm', help='Create an RPKM file from a BAM file') | 
|  | 662 rpkm_parser.add_argument('--probes', action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1, | 
|  | 663                          help="Probe definition file") | 
|  | 664 rpkm_parser.add_argument('--input', action='store', required=True, metavar='sample.bam', nargs=1, | 
|  | 665                          help="Aligned BAM file") | 
|  | 666 rpkm_parser.add_argument('--output', action='store', required=True, metavar='sample.rpkm.txt', nargs=1, | 
|  | 667                          help="RPKM file to write") | 
| 10 | 668 rpkm_parser.set_defaults(func=CF_bam2RPKM) | 
|  | 669 | 
|  | 670 # analyze command | 
| 25 | 671 analyze_parser = subparsers.add_parser('analyze', | 
|  | 672                                        help='Basic CoNIFER analysis. Reads a directory of RPKM files and a probe list and outputs a HDF5 file containing SVD-ZRPKM values.') | 
|  | 673 analyze_parser.add_argument('--probes', action='store', required=True, metavar='/path/to/probes_file.txt', nargs=1, | 
|  | 674                             help="Probe definition file") | 
|  | 675 analyze_parser.add_argument('--rpkm_dir', action='store', required=True, | 
|  | 676                             metavar='/path/to/folder/containing/rpkm_files/', nargs=1, | 
|  | 677                             help="Location of folder containing RPKM files. Folder should contain ONLY contain RPKM files, and all readable RPKM files will used in analysis.") | 
|  | 678 analyze_parser.add_argument('--output', '-o', required=True, metavar='/path/to/output_file.hdf5', type=str, | 
|  | 679                             help="Output location of HDF5 file to contain SVD-ZRPKM values") | 
|  | 680 analyze_parser.add_argument('--svd', metavar='12', type=int, nargs='?', default=12, | 
|  | 681                             help="Number of components to remove") | 
|  | 682 analyze_parser.add_argument('--min_rpkm', metavar='1.00', type=float, nargs="?", default=1.00, | 
|  | 683                             help="Minimum population median RPKM per probe.") | 
|  | 684 analyze_parser.add_argument('--write_svals', metavar='SingularValues.txt', type=str, default="", | 
|  | 685                             help="Optional file to write singular values (S-values). Used for Scree Plot.") | 
|  | 686 analyze_parser.add_argument('--plot_scree', metavar='ScreePlot.png', type=str, default="", | 
|  | 687                             help="Optional graphical scree plot. Requires matplotlib.") | 
|  | 688 analyze_parser.add_argument('--write_sd', metavar='StandardDeviations.txt', type=str, default="", | 
|  | 689                             help="Optional file with sample SVD-ZRPKM standard deviations. Used to filter noisy samples.") | 
| 10 | 690 analyze_parser.set_defaults(func=CF_analyze) | 
|  | 691 | 
|  | 692 # export command | 
| 25 | 693 export_parser = subparsers.add_parser('export', help='Export SVD-ZRPKM values from a HDF5 file to bed or vcf format.') | 
|  | 694 export_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5', | 
|  | 695                            help="HDF5 file from CoNIFER 'analyze' step") | 
|  | 696 export_parser.add_argument('--output', '-o', action='store', required=False, default='.', metavar='output.bed', | 
|  | 697                            help="Location of output file[s]. When exporting multiple samples, top-level directory of this path will be used.") | 
|  | 698 export_parser.add_argument('--sample', '-s', action='store', required=False, metavar='sampleID', default='all', | 
|  | 699                            nargs="+", | 
|  | 700                            help="SampleID or comma-separated list of sampleIDs to export. Default: export all samples") | 
| 10 | 701 export_parser.set_defaults(func=CF_export) | 
|  | 702 | 
|  | 703 # plot command | 
| 25 | 704 plot_parser = subparsers.add_parser('plot', help='Plot SVD-ZRPKM values using matplotlib') | 
|  | 705 plot_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5', | 
|  | 706                          help="HDF5 file from CoNIFER 'analyze' step") | 
|  | 707 plot_parser.add_argument('--region', action='store', required=True, metavar='chr#:start-stop', help="Region to plot") | 
|  | 708 plot_parser.add_argument('--output', action='store', required=True, metavar='image.png', | 
|  | 709                          help="Output path and filetype. PDF, PNG, PS, EPS, and SVG are supported.") | 
|  | 710 plot_parser.add_argument('--sample', action='store', required=False, metavar='sampleID', nargs="+", default='none', | 
|  | 711                          help="Sample[s] to highlight. The following optional color spec can be used: <color>:<sampleID>. Available colors are r,b,g,y,c,m,y,k. The unsmoothed SVD-ZRPKM values for the first sample in this list will be drawn. Default: No samples highlighted.") | 
| 10 | 712 plot_parser.set_defaults(func=CF_plot) | 
|  | 713 | 
|  | 714 # make calls command | 
| 25 | 715 call_parser = subparsers.add_parser('call', help='Very rudimentary caller for CNVs using SVD-ZRPKM thresholding.') | 
|  | 716 call_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5', | 
|  | 717                          help="HDF5 file from CoNIFER 'analyze' step") | 
|  | 718 call_parser.add_argument('--output', action='store', required=True, metavar='calls.txt', help="Output file for calls") | 
|  | 719 call_parser.add_argument('--threshold', metavar='1.50', type=float, nargs='?', required=False, default=1.50, | 
|  | 720                          help="+/- Threshold for calling (minimum SVD-ZRPKM)") | 
| 10 | 721 call_parser.set_defaults(func=CF_call) | 
|  | 722 | 
|  | 723 # plotcalls command | 
| 25 | 724 plotcalls_parser = subparsers.add_parser('plotcalls', help='Make basic plots from call file from "call" command.') | 
|  | 725 plotcalls_parser.add_argument('--input', '-i', action='store', required=True, metavar='CoNIFER_SVD.hdf5', | 
|  | 726                               help="HDF5 file from CoNIFER 'analyze' step") | 
|  | 727 plotcalls_parser.add_argument('--calls', action='store', required=True, metavar='calls.txt', | 
|  | 728                               help="File with calls from 'call' command.") | 
|  | 729 plotcalls_parser.add_argument('--outputdir', action='store', required=True, metavar='/path/to/directory', | 
|  | 730                               help="Output directory for plots") | 
|  | 731 plotcalls_parser.add_argument('--window', action='store', required=False, metavar='50', default=50, | 
|  | 732                               help="In exons, the amount of padding to plot around each call") | 
| 10 | 733 plotcalls_parser.set_defaults(func=CF_plotcalls) | 
|  | 734 | 
|  | 735 args = parser.parse_args() | 
|  | 736 args.func(args) |