annotate recursive_agg_onefile.py @ 0:a80d960a253b draft

planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
author scottx611x
date Tue, 22 Nov 2016 16:39:51 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
1 from __future__ import division, print_function
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
2 import sys
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
3
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
4 from multiprocessing import Pool
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
5 import argparse
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
6 import warnings
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
7
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
8 import numpy as np
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
9 import cooler
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
10 from cooler.io import CoolerAggregator
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
11 import cooler.ice
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
12 import h5py
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
13
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
14
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
15 FACTOR = 2
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
16 TILESIZE = 256
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
17 N_CPU = 8
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
18
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
19
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
20 def main(infile, outfile, chunksize):
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
21 c = cooler.Cooler(infile)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
22 binsize = c.info['bin-size']
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
23 chromtable = c.chroms()[:]
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
24 chromsizes = chromtable.set_index('name')['length']
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
25 chroms = chromtable['name'].values
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
26 lengths = chromtable['length'].values
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
27 total_length = np.sum(chromsizes.values)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
28 n_tiles = total_length / binsize / TILESIZE
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
29 n_zooms = int(np.ceil(np.log2(n_tiles)))
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
30
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
31 print(
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
32 "Copying base matrix to level {0} and producing {0} zoom levels starting from 0...".format(
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
33 n_zooms),
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
34 file=sys.stdout
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
35 )
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
36
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
37 # transfer base matrix
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
38 with h5py.File(outfile, 'w') as dest, \
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
39 h5py.File(infile, 'r') as src:
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
40 zoomLevel = str(n_zooms)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
41 src.copy('/', dest, zoomLevel)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
42 print(zoomLevel, file=sys.stdout)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
43
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
44 # produce aggregations
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
45 with h5py.File(outfile, 'r+') as f:
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
46 grp = f[str(n_zooms)]
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
47 c = cooler.Cooler(grp)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
48 binsize = cooler.info(grp)['bin-size']
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
49
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
50 for i in range(n_zooms - 1, -1, -1):
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
51 zoomLevel = str(i)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
52
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
53 # aggregate
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
54 new_binsize = binsize * FACTOR
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
55 new_bins = cooler.util.binnify(chromsizes, new_binsize)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
56
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
57 reader = CoolerAggregator(c, new_bins, chunksize)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
58
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
59 grp = f.create_group(zoomLevel)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
60 f.attrs[zoomLevel] = new_binsize
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
61 cooler.io.create(grp, chroms, lengths, new_bins, reader)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
62
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
63 # balance
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
64 # with Pool(N_CPU) as pool:
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
65 too_close = 20000 # for HindIII
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
66 ignore_diags = max(int(np.ceil(too_close / new_binsize)), 3)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
67
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
68 bias = cooler.ice.iterative_correction(
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
69 f, zoomLevel,
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
70 chunksize=chunksize,
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
71 min_nnz=10,
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
72 mad_max=3,
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
73 ignore_diags=ignore_diags,
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
74 map=map)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
75 h5opts = dict(compression='gzip', compression_opts=6)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
76 grp['bins'].create_dataset('weight', data=bias, **h5opts)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
77
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
78 print(zoomLevel, file=sys.stdout)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
79
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
80 c = cooler.Cooler(grp)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
81 binsize = new_binsize
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
82
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
83 if __name__ == '__main__':
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
84 parser = argparse.ArgumentParser(
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
85 description="Recursively aggregate a single resolution cooler file into a multi-resolution file.")
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
86 parser.add_argument(
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
87 "cooler_file",
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
88 help="Cooler file",
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
89 metavar="COOLER_PATH")
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
90 parser.add_argument(
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
91 "--out", "-o",
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
92 help="Output text file")
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
93 args = vars(parser.parse_args())
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
94
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
95 infile = args['cooler_file']
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
96 if args['out'] is None:
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
97 outfile = infile.replace('.cool', '.multires.cool')
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
98 else:
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
99 outfile = args['out']
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
100
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
101 chunksize = int(1e6)
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
102 with warnings.catch_warnings():
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
103 warnings.simplefilter("ignore")
a80d960a253b planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
104 main(infile, outfile, chunksize)