#!/usr/bin/env python
"""
Modified version of code examples from the chemfp project.
http://code.google.com/p/chem-fingerprints/
Thanks to Andrew Dalke of Andrew Dalke Scientific!
"""

import chemfp
import sys

chemfp_fingerprint_file = sys.argv[1]
tanimoto_threshold = float(sys.argv[2])
outfile = sys.argv[3]
processors = int(sys.argv[4])
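# Positional arguments, as parsed above:
#   1: fingerprint file readable by chemfp (e.g. an FPS file)
#   2: Tanimoto threshold defining the exclusion sphere
#   3: path of the tab-separated output file
#   4: number of threads chemfp should use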


def get_hit_indices(hits):
    # Keep just the indices from the (index, score) hit pairs.
    return [idx for (idx, score) in hits]

out = open(outfile, 'w')
dataset = chemfp.load_fingerprints(chemfp_fingerprint_file)

chemfp.set_num_threads(processors)
search = dataset.threshold_tanimoto_search_arena(dataset, threshold=tanimoto_threshold)
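# The call above compares every fingerprint in the arena against the whole
# arena, so each result holds the (index, score) pairs of all neighbours at
# or above the Tanimoto threshold; each fingerprint normally appears in its
# own hit list, which is why a hit count of 1 means a true singleton below.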

# Reorder so the centroid with the most hits comes first.
# (That's why I do a reverse sort.)
# Ignore the arbitrariness of breaking ties by fingerprint index.
results = sorted(((len(hits), i, hits)
                  for (i, hits) in enumerate(search.iter_indices_and_scores())),
                 reverse=True)

# Determine the true/false singletons and the clusters.
true_singletons = []
false_singletons = []
clusters = []

seen = set()

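# Greedy exclusion-sphere (Taylor-Butina style) assignment: walk the candidate
# centroids from most to fewest neighbours; every fingerprint is assigned to
# at most one cluster, so later candidates only keep their unassigned hits.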
for (size, fp_idx, hits) in results:
    if fp_idx in seen:
        # Can't use a centroid which is already assigned.
        continue
    seen.add(fp_idx)

    if size == 1:
        # The only fingerprint in the exclusion sphere is itself.
        true_singletons.append(fp_idx)
        continue

    members = get_hit_indices(hits)
    # Figure out which ones haven't yet been assigned.
    unassigned = [target_idx for target_idx in members if target_idx not in seen]

    if not unassigned:
        false_singletons.append(fp_idx)
        continue

    # This is a new cluster.
    clusters.append((fp_idx, unassigned))
    seen.update(unassigned)

len_cluster = len(clusters)
#out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(dataset.ids[idx] for idx in true_singletons)) ) )
#out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(dataset.ids[idx] for idx in false_singletons)) ) )

out.write("#%s true singletons\n" % len(true_singletons))
out.write("#%s false singletons\n" % len(false_singletons))
out.write("#clusters: %s\n" % len_cluster)


# Sort so the cluster with the most compounds comes first,
# then by alphabetically smallest id.
def cluster_sort_key(cluster):
    centroid_idx, members = cluster
    return -len(members), dataset.ids[centroid_idx]

clusters.sort(key=cluster_sort_key)

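# Output format (tab-separated): centroid id, number of cluster members,
# then the space-separated ids of the members; true singletons are written
# afterwards with a member count of 0.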
for centroid_idx, members in clusters:
    centroid_name = dataset.ids[centroid_idx]
    out.write("%s\t%s\t%s\n" % (centroid_name, len(members),
                                " ".join(dataset.ids[idx] for idx in members)))
    # TODO: len(members) may need to be restricted to something like the biggest/top 90%.

for idx in true_singletons:
    out.write("%s\t%s\n" % (dataset.ids[idx], 0))

out.close()
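
# Example invocation (script and file names are placeholders, not from the original source):
#   python cluster_fps.py molecules.fps 0.8 clusters.tsv 4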