Mercurial > repos > scottx611x > cooler_convert
annotate recursive_agg_onefile.py @ 10:f0d4e43f95b5 draft
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
| author | scottx611x |
|---|---|
| date | Mon, 21 Nov 2016 08:49:12 -0500 |
| parents | 4543c0fdd563 |
| children | 1745b88127c0 |
| rev | line source |
|---|---|
|
0
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
1 from __future__ import division, print_function |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
2 from multiprocessing import Pool |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
3 import argparse |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
4 import sys |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
5 |
|
5
4543c0fdd563
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
4
diff
changeset
|
6 import numpy as np |
|
10
f0d4e43f95b5
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
5
diff
changeset
|
7 import cooler |
|
5
4543c0fdd563
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
4
diff
changeset
|
8 from cooler.io import CoolerAggregator |
|
4543c0fdd563
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
4
diff
changeset
|
9 import cooler.ice |
|
0
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
10 import h5py |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
11 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
12 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
13 FACTOR = 2 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
14 TILESIZE = 256 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
15 N_CPU = 8 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
16 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
17 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
18 def set_postmortem_hook(): |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
19 import sys, traceback, ipdb |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
20 def _excepthook(exc_type, value, tb): |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
21 traceback.print_exception(exc_type, value, tb) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
22 print() |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
23 ipdb.pm() |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
24 sys.excepthook = _excepthook |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
25 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
26 set_postmortem_hook() |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
27 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
28 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
29 def main(infile, outfile, chunksize): |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
30 c = cooler.Cooler(infile) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
31 binsize = c.info['bin-size'] |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
32 chromtable = c.chroms()[:] |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
33 chromsizes = chromtable.set_index('name')['length'] |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
34 chroms = chromtable['name'].values |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
35 lengths = chromtable['length'].values |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
36 total_length = np.sum(chromsizes.values) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
37 n_tiles = total_length / binsize / TILESIZE |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
38 n_zooms = int(np.ceil(np.log2(n_tiles))) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
39 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
40 print( |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
41 "Copying base matrix to level {0} and producing {0} zoom levels starting from 0...".format(n_zooms), |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
42 file=sys.stderr |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
43 ) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
44 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
45 # transfer base matrix |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
46 with h5py.File(outfile, 'w') as dest, \ |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
47 h5py.File(infile, 'r') as src: |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
48 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
49 zoomLevel = str(n_zooms) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
50 src.copy('/', dest, zoomLevel) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
51 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
52 print(zoomLevel, file=sys.stderr) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
53 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
54 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
55 # produce aggregations |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
56 with h5py.File(outfile, 'r+') as f: |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
57 grp = f[str(n_zooms)] |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
58 c = cooler.Cooler(grp) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
59 binsize = cooler.info(grp)['bin-size'] |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
60 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
61 for i in range(n_zooms - 1, -1, -1): |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
62 zoomLevel = str(i) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
63 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
64 # aggregate |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
65 new_binsize = binsize * FACTOR |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
66 new_bins = cooler.util.binnify(chromsizes, new_binsize) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
67 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
68 reader = CoolerAggregator(c, new_bins, chunksize) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
69 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
70 grp = f.create_group(zoomLevel) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
71 f.attrs[zoomLevel] = new_binsize |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
72 cooler.io.create(grp, chroms, lengths, new_bins, reader) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
73 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
74 # balance |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
75 #with Pool(N_CPU) as pool: |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
76 too_close = 20000 # for HindIII |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
77 ignore_diags = max(int(np.ceil(too_close / new_binsize)), 3) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
78 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
79 bias = cooler.ice.iterative_correction( |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
80 f, zoomLevel, |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
81 chunksize=chunksize, |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
82 min_nnz=10, |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
83 mad_max=3, |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
84 ignore_diags=ignore_diags, |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
85 map=map) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
86 h5opts = dict(compression='gzip', compression_opts=6) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
87 grp['bins'].create_dataset('weight', data=bias, **h5opts) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
88 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
89 print(zoomLevel, file=sys.stderr) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
90 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
91 c = cooler.Cooler(grp) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
92 binsize = new_binsize |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
93 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
94 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
95 if __name__ == '__main__': |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
96 parser = argparse.ArgumentParser( |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
97 description="Recursively aggregate a single resolution cooler file into a multi-resolution file.") |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
98 parser.add_argument( |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
99 "cooler_file", |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
100 help="Cooler file", |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
101 metavar="COOLER_PATH") |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
102 parser.add_argument( |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
103 "--out", "-o", |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
104 help="Output text file") |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
105 args = vars(parser.parse_args()) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
106 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
107 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
108 infile = args['cooler_file'] |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
109 if args['out'] is None: |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
110 outfile = infile.replace('.cool', '.multires.cool') |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
111 else: |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
112 outfile = args['out'] |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
113 |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
114 chunksize = int(1e6) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
115 main(infile, outfile, chunksize) |
|
c7d624661e8a
planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff
changeset
|
116 |
