annotate chemfp_clustering/butina_clustering.py @ 11:9e2e43fde5fe

Uploaded
author bgruening
date Tue, 14 May 2013 17:02:11 -0400
parents 438bc12d591b
children 6c496b524b41
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
1 #!/usr/bin/env python
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
2 """
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
3 Modified version of code examples from the chemfp project.
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
4 http://code.google.com/p/chem-fingerprints/
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
5 Thanks to Andrew Dalke of Andrew Dalke Scientific!
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
6 """
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
7
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
8 import chemfp
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
9 import sys
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
10 import os
4
a4e261ee0a51 Uploaded
bgruening
parents: 0
diff changeset
11 import tempfile
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
12 import argparse
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
13 import subprocess
11
9e2e43fde5fe Uploaded
bgruening
parents: 6
diff changeset
14 from chemfp import search
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
15
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
16
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
17 def unix_sort(results):
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
18 temp_unsorted = tempfile.NamedTemporaryFile(delete=False)
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
19 for (i,indices) in enumerate( results.iter_indices() ):
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
20 temp_unsorted.write('%s %s\n' % (len(indices), i))
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
21 temp_unsorted.close()
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
22 temp_sorted = tempfile.NamedTemporaryFile(delete=False)
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
23 temp_sorted.close()
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
24 p = subprocess.Popen(['sort', '-n', '-r', '-k', '1,1'], stdin=open(temp_unsorted.name), stdout=open(temp_sorted.name, 'w+'))
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
25 stdout, stderr = p.communicate()
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
26 return_code = p.returncode
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
27
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
28 if return_code:
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
29 sys.stdout.write(stdout)
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
30 sys.stderr.write(stderr)
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
31 sys.stderr.write("Return error code %i from command:\n" % return_code)
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
32 temp_sorted.close()
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
33 os.remove(temp_unsorted.name)
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
34
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
35 for line in open(temp_sorted.name):
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
36 size, fp_idx = line.strip().split()
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
37 yield (int(size), int(fp_idx))
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
38
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
39 os.remove(temp_sorted.name)
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
40
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
41
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
42 def butina( args ):
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
43 """
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
44 Taylor-Butina clustering from the chemfp help.
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
45 """
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
46
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
47 # make sure that the file ending is fps
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
48 temp_file = tempfile.NamedTemporaryFile()
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
49 temp_link = "%s.%s" % (temp_file.name, 'fps')
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
50 temp_file.close()
11
9e2e43fde5fe Uploaded
bgruening
parents: 6
diff changeset
51 os.symlink(args.input_path, temp_link)
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
52 #os.system('ln -s %s %s' % (args.input_path, temp_link) )
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
53
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
54 out = args.output_path
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
55 arena = chemfp.load_fingerprints( temp_link )
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
56
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
57 chemfp.set_num_threads( args.processors )
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
58 results = search.threshold_tanimoto_search_symmetric(arena, threshold = args.tanimoto_threshold)
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
59 results.reorder_all("move-closest-first")
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
60
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
61 # TODO: more memory efficient search?
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
62 # Reorder so the centroid with the most hits comes first.
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
63 # (That's why I do a reverse search.)
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
64 # Ignore the arbitrariness of breaking ties by fingerprint index
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
65 """
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
66 results = sorted( ( (len(indices), i, indices)
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
67 for (i,indices) in enumerate(results.iter_indices()) ),
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
68 reverse=True)
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
69 """
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
70 sorted_ids = unix_sort(results)
11
9e2e43fde5fe Uploaded
bgruening
parents: 6
diff changeset
71
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
72 # Determine the true/false singletons and the clusters
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
73 true_singletons = []
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
74 false_singletons = []
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
75 clusters = []
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
76
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
77 seen = set()
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
78 #for (size, fp_idx, members) in results:
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
79 for (size, fp_idx) in sorted_ids:
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
80 members = results[fp_idx].get_indices()
11
9e2e43fde5fe Uploaded
bgruening
parents: 6
diff changeset
81 #print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members]
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
82 if fp_idx in seen:
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
83 # Can't use a centroid which is already assigned
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
84 continue
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
85 seen.add(fp_idx)
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
86
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
87 if size == 0:
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
88 # The only fingerprint in the exclusion sphere is itself
11
9e2e43fde5fe Uploaded
bgruening
parents: 6
diff changeset
89 true_singletons.append( fp_idx )
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
90 continue
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
91
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
92 # Figure out which ones haven't yet been assigned
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
93 unassigned = set(members) - seen
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
94
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
95 if not unassigned:
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
96 false_singletons.append(fp_idx)
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
97 continue
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
98
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
99 # this is a new cluster
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
100 clusters.append( (fp_idx, unassigned) )
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
101 seen.update(unassigned)
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
102
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
103 len_cluster = len(clusters)
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
104 #out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(arena.ids[idx] for idx in true_singletons)) ) )
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
105 #out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(arena.ids[idx] for idx in false_singletons)) ) )
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
106
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
107 out.write( "#%s true singletons\n" % len(true_singletons) )
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
108 out.write( "#%s false singletons\n" % len(false_singletons) )
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
109 out.write( "#clusters: %s\n" % len_cluster )
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
110
11
9e2e43fde5fe Uploaded
bgruening
parents: 6
diff changeset
111
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
112 # Sort so the cluster with the most compounds comes first,
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
113 # then by alphabetically smallest id
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
114 def cluster_sort_key(cluster):
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
115 centroid_idx, members = cluster
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
116 return -len(members), arena.ids[centroid_idx]
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
117
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
118 clusters.sort(key=cluster_sort_key)
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
119
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
120 for centroid_idx, members in clusters:
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
121 centroid_name = arena.ids[centroid_idx]
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
122 out.write("%s\t%s\t%s\n" % (centroid_name, len(members), " ".join(arena.ids[idx] for idx in members)))
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
123 #ToDo: len(members) need to be some biggest top 90% or something ...
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
124
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
125 for idx in true_singletons:
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
126 out.write("%s\t%s\n" % (arena.ids[idx], 0))
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
127
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
128 out.close()
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
129 os.remove( temp_link )
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
130
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
131
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
132 if __name__ == "__main__":
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
133 parser = argparse.ArgumentParser(description="""Taylor-Butina clustering for fps files.
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
134 For more details please see the original publication or the chemfp documentation:
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
135 http://www.chemomine.co.uk/dbclus-paper.pdf
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
136 https://chemfp.readthedocs.org
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
137 """)
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
138
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
139 parser.add_argument("-i", "--input", dest="input_path",
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
140 required=True,
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
141 help="Path to the input file.")
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
142
6
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
143 parser.add_argument("-o", "--output", dest="output_path", type=argparse.FileType('w'),
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
144 default=sys.stdout,
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
145 help="Path to the output file.")
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
146
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
147 parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", type=float,
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
148 default=0.8,
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
149 help="Tanimoto threshold [0.8]")
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
150
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
151 parser.add_argument('-p', '--processors', type=int,
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
152 default=4)
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
153
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
154 options = parser.parse_args()
438bc12d591b Uploaded
bgruening
parents: 4
diff changeset
155 butina( options )
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
156
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
157