annotate recursive_agg_onefile.py @ 37:8e5ca9b2b63d draft

planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
author scottx611x
date Mon, 21 Nov 2016 17:33:22 -0500
parents 79176387b351
children 1daf2cd88f17
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
35
7ede301a13f8 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 34
diff changeset
1 from __future__ import division, print_function
33
51af070be3e5 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 22
diff changeset
2 import sys
36
79176387b351 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 35
diff changeset
3
33
51af070be3e5 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 22
diff changeset
4 print(sys.executable)
51af070be3e5 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 22
diff changeset
5
36
79176387b351 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 35
diff changeset
6
79176387b351 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 35
diff changeset
7 import pip
79176387b351 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 35
diff changeset
8 installed_packages = pip.get_installed_distributions()
79176387b351 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 35
diff changeset
9 installed_packages_list = sorted(["%s==%s" % (i.key, i.version) for i in installed_packages])
79176387b351 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 35
diff changeset
10 print(installed_packages_list)
37
8e5ca9b2b63d planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 36
diff changeset
11 print(pip.main(["--version"]))
36
79176387b351 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 35
diff changeset
12
79176387b351 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 35
diff changeset
13
0
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
14 from multiprocessing import Pool
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
15 import argparse
22
90005893d112 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 21
diff changeset
16 import warnings
0
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
17
5
4543c0fdd563 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 4
diff changeset
18 import numpy as np
10
f0d4e43f95b5 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 5
diff changeset
19 import cooler
5
4543c0fdd563 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 4
diff changeset
20 from cooler.io import CoolerAggregator
4543c0fdd563 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 4
diff changeset
21 import cooler.ice
0
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
22 import h5py
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
23
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
24
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
25 FACTOR = 2
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
26 TILESIZE = 256
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
27 N_CPU = 8
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
28
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
29 def main(infile, outfile, chunksize):
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
30 c = cooler.Cooler(infile)
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
31 binsize = c.info['bin-size']
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
32 chromtable = c.chroms()[:]
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
33 chromsizes = chromtable.set_index('name')['length']
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
34 chroms = chromtable['name'].values
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
35 lengths = chromtable['length'].values
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
36 total_length = np.sum(chromsizes.values)
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
37 n_tiles = total_length / binsize / TILESIZE
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
38 n_zooms = int(np.ceil(np.log2(n_tiles)))
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
39
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
40 print(
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
41 "Copying base matrix to level {0} and producing {0} zoom levels starting from 0...".format(n_zooms),
21
202ef48959d3 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 20
diff changeset
42 file=sys.stdout
0
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
43 )
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
44
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
45 # transfer base matrix
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
46 with h5py.File(outfile, 'w') as dest, \
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
47 h5py.File(infile, 'r') as src:
20
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
48 zoomLevel = str(n_zooms)
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
49 src.copy('/', dest, zoomLevel)
21
202ef48959d3 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 20
diff changeset
50 print(zoomLevel, file=sys.stdout)
0
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
51
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
52
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
53 # produce aggregations
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
54 with h5py.File(outfile, 'r+') as f:
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
55 grp = f[str(n_zooms)]
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
56 c = cooler.Cooler(grp)
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
57 binsize = cooler.info(grp)['bin-size']
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
58
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
59 for i in range(n_zooms - 1, -1, -1):
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
60 zoomLevel = str(i)
19
70d6350f982d planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 18
diff changeset
61
20
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
62 # aggregate
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
63 new_binsize = binsize * FACTOR
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
64 new_bins = cooler.util.binnify(chromsizes, new_binsize)
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
65
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
66 reader = CoolerAggregator(c, new_bins, chunksize)
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
67
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
68 grp = f.create_group(zoomLevel)
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
69 f.attrs[zoomLevel] = new_binsize
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
70 cooler.io.create(grp, chroms, lengths, new_bins, reader)
0
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
71
20
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
72 # balance
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
73 #with Pool(N_CPU) as pool:
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
74 too_close = 20000 # for HindIII
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
75 ignore_diags = max(int(np.ceil(too_close / new_binsize)), 3)
0
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
76
20
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
77 bias = cooler.ice.iterative_correction(
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
78 f, zoomLevel,
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
79 chunksize=chunksize,
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
80 min_nnz=10,
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
81 mad_max=3,
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
82 ignore_diags=ignore_diags,
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
83 map=map)
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
84 h5opts = dict(compression='gzip', compression_opts=6)
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
85 grp['bins'].create_dataset('weight', data=bias, **h5opts)
0
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
86
21
202ef48959d3 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 20
diff changeset
87 print(zoomLevel, file=sys.stdout)
0
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
88
20
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
89 c = cooler.Cooler(grp)
797818b4e2f6 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 19
diff changeset
90 binsize = new_binsize
0
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
91
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
92 if __name__ == '__main__':
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
93 parser = argparse.ArgumentParser(
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
94 description="Recursively aggregate a single resolution cooler file into a multi-resolution file.")
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
95 parser.add_argument(
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
96 "cooler_file",
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
97 help="Cooler file",
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
98 metavar="COOLER_PATH")
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
99 parser.add_argument(
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
100 "--out", "-o",
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
101 help="Output text file")
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
102 args = vars(parser.parse_args())
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
103
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
104
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
105 infile = args['cooler_file']
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
106 if args['out'] is None:
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
107 outfile = infile.replace('.cool', '.multires.cool')
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
108 else:
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
109 outfile = args['out']
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
110
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
111 chunksize = int(1e6)
22
90005893d112 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 21
diff changeset
112 with warnings.catch_warnings():
90005893d112 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 21
diff changeset
113 warnings.simplefilter("ignore")
90005893d112 planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents: 21
diff changeset
114 main(infile, outfile, chunksize)
0
c7d624661e8a planemo upload commit 9e980862c8b763cf3dd0209be49f0d2d75a90614-dirty
scottx611x
parents:
diff changeset
115