Mercurial > repos > bgruening > chemfp
comparison chemfp_clustering/butina_clustering.py @ 0:a8ac5250d59c
Uploaded
author | bgruening |
---|---|
date | Tue, 26 Mar 2013 13:05:41 -0400 |
parents | |
children | a4e261ee0a51 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a8ac5250d59c |
---|---|
#!/usr/bin/env python
"""
Modified version of code examples from the chemfp project.
http://code.google.com/p/chem-fingerprints/
Thanks to Andrew Dalke of Andrew Dalke Scientific!
"""
7 | |
import os
import sys

import chemfp
11 | |
# Command-line arguments (all positional, all required):
#   1. fingerprint file to cluster
#   2. Tanimoto similarity threshold (parsed as a float)
#   3. path of the output file
#   4. number of threads handed to chemfp (parsed as an int)
if len(sys.argv) < 5:
    # Fail with a readable message instead of a bare IndexError traceback.
    sys.exit("usage: %s <fingerprint_file> <tanimoto_threshold> <outfile> <processors>"
             % sys.argv[0])

chemfp_fingerprint_file = sys.argv[1]
tanimoto_threshold = float(sys.argv[2])
outfile = sys.argv[3]
processors = int(sys.argv[4])
16 | |
17 | |
def get_hit_indicies(hits):
    """Return the indices from an iterable of ``(index, score)`` pairs.

    The scores are discarded and the order of ``hits`` is preserved.
    """
    # The loop variable is deliberately not called ``id`` so the builtin of
    # that name is not shadowed.
    return [target_idx for (target_idx, _score) in hits]
20 | |
# NOTE: the output file is opened (and therefore truncated) before the
# fingerprints are loaded, so a run that fails later leaves an empty file.
out = open(outfile, 'w')
dataset = chemfp.load_fingerprints(chemfp_fingerprint_file)

chemfp.set_num_threads(processors)
# Search the dataset against itself, keeping every pair at or above the
# similarity threshold.
search = dataset.threshold_tanimoto_search_arena(dataset, threshold=tanimoto_threshold)

# Rank the candidate centroids so the fingerprint with the most hits comes
# first (that is why the sort is reversed).
# Ties on the hit count are broken by fingerprint index, which is arbitrary;
# because every index is unique the hit objects themselves are never compared.
results = sorted(
    ((len(hits), i, hits) for (i, hits) in enumerate(search.iter_indices_and_scores())),
    reverse=True)
31 | |
32 | |
# Determine the true/false singletons and the clusters by walking the
# candidates from the most hits to the fewest.
true_singletons = []   # fingerprints whose only hit is themselves
false_singletons = []  # fingerprints whose neighbours were all claimed earlier
clusters = []          # (centroid index, [member indices]) pairs

# Indices already used as a centroid, a singleton or a cluster member.
seen = set()

for (size, fp_idx, hits) in results:
    if fp_idx in seen:
        # Can't use a centroid which is already assigned
        continue
    seen.add(fp_idx)

    if size == 1:
        # The only fingerprint in the exclusion sphere is itself
        true_singletons.append(fp_idx)
        continue

    members = get_hit_indicies(hits)
    # Figure out which ones haven't yet been assigned. The centroid was added
    # to ``seen`` above, so it never appears in its own member list.
    unassigned = [target_idx for target_idx in members if target_idx not in seen]

    if not unassigned:
        # It has neighbours, but every one of them already belongs elsewhere.
        false_singletons.append(fp_idx)
        continue

    # this is a new cluster
    clusters.append((fp_idx, unassigned))
    seen.update(unassigned)
62 | |
len_cluster = len(clusters)

# Header: three '#'-prefixed summary lines. Only the counts are written; the
# ids of the individual singletons are not listed here.
out.write("#%s true singletons\n" % len(true_singletons))
out.write("#%s false singletons\n" % len(false_singletons))
out.write("#clusters: %s\n" % len_cluster)
70 | |
71 | |
# Sort so the cluster with the most compounds comes first,
# then by alphabetically smallest id
def cluster_sort_key(cluster):
    """Return the sort key for one ``(centroid_idx, members)`` cluster.

    The key is ``(-len(members), centroid id)``: the negated size puts larger
    clusters first, and the centroid id (looked up in the module-level
    ``dataset.ids``) orders clusters of equal size.
    """
    centroid_idx, members = cluster
    return -len(members), dataset.ids[centroid_idx]


clusters.sort(key=cluster_sort_key)
79 | |
80 | |
# Write one tab-separated row per cluster (centroid id, member count,
# space-separated member ids), then one row per true singleton with a member
# count of 0. try/finally guarantees the file is closed even if a write fails.
# NOTE(review): false singletons are counted in the header but get no row of
# their own here -- confirm this is intentional.
try:
    for centroid_idx, members in clusters:
        centroid_name = dataset.ids[centroid_idx]
        member_names = " ".join(dataset.ids[idx] for idx in members)
        out.write("%s\t%s\t%s\n" % (centroid_name, len(members), member_names))
        # TODO: len(members) need to be some biggest top 90% or something ...

    for idx in true_singletons:
        out.write("%s\t%s\n" % (dataset.ids[idx], 0))
finally:
    out.close()
90 | |
91 |