0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Modified version of code examples from the chemfp project.
|
|
4 http://code.google.com/p/chem-fingerprints/
|
|
5 Thanks to Andrew Dalke of Andrew Dalke Scientific!
|
|
6 """
|
|
7
|
|
8 import chemfp
|
|
9 import sys
|
|
10 import os
|
4
|
11 import tempfile
|
6
|
12 import argparse
|
|
13 import subprocess
|
11
|
14 from chemfp import search
|
0
|
15
|
|
16
|
6
|
17 def unix_sort(results):
|
|
18 temp_unsorted = tempfile.NamedTemporaryFile(delete=False)
|
|
19 for (i,indices) in enumerate( results.iter_indices() ):
|
|
20 temp_unsorted.write('%s %s\n' % (len(indices), i))
|
|
21 temp_unsorted.close()
|
|
22 temp_sorted = tempfile.NamedTemporaryFile(delete=False)
|
|
23 temp_sorted.close()
|
|
24 p = subprocess.Popen(['sort', '-n', '-r', '-k', '1,1'], stdin=open(temp_unsorted.name), stdout=open(temp_sorted.name, 'w+'))
|
|
25 stdout, stderr = p.communicate()
|
|
26 return_code = p.returncode
|
0
|
27
|
6
|
28 if return_code:
|
|
29 sys.stdout.write(stdout)
|
|
30 sys.stderr.write(stderr)
|
|
31 sys.stderr.write("Return error code %i from command:\n" % return_code)
|
|
32 temp_sorted.close()
|
|
33 os.remove(temp_unsorted.name)
|
0
|
34
|
6
|
35 for line in open(temp_sorted.name):
|
|
36 size, fp_idx = line.strip().split()
|
|
37 yield (int(size), int(fp_idx))
|
|
38
|
|
39 os.remove(temp_sorted.name)
|
0
|
40
|
|
41
|
6
|
42 def butina( args ):
|
|
43 """
|
|
44 Taylor-Butina clustering from the chemfp help.
|
|
45 """
|
0
|
46
|
6
|
47 # make sure that the file ending is fps
|
|
48 temp_file = tempfile.NamedTemporaryFile()
|
|
49 temp_link = "%s.%s" % (temp_file.name, 'fps')
|
|
50 temp_file.close()
|
11
|
51 os.symlink(args.input_path, temp_link)
|
6
|
52 #os.system('ln -s %s %s' % (args.input_path, temp_link) )
|
|
53
|
|
54 out = args.output_path
|
|
55 arena = chemfp.load_fingerprints( temp_link )
|
0
|
56
|
6
|
57 chemfp.set_num_threads( args.processors )
|
|
58 results = search.threshold_tanimoto_search_symmetric(arena, threshold = args.tanimoto_threshold)
|
|
59 results.reorder_all("move-closest-first")
|
0
|
60
|
6
|
61 # TODO: more memory efficient search?
|
|
62 # Reorder so the centroid with the most hits comes first.
|
|
63 # (That's why I do a reverse search.)
|
|
64 # Ignore the arbitrariness of breaking ties by fingerprint index
|
|
65 """
|
|
66 results = sorted( ( (len(indices), i, indices)
|
|
67 for (i,indices) in enumerate(results.iter_indices()) ),
|
|
68 reverse=True)
|
|
69 """
|
|
70 sorted_ids = unix_sort(results)
|
11
|
71
|
6
|
72 # Determine the true/false singletons and the clusters
|
|
73 true_singletons = []
|
|
74 false_singletons = []
|
|
75 clusters = []
|
0
|
76
|
6
|
77 seen = set()
|
|
78 #for (size, fp_idx, members) in results:
|
|
79 for (size, fp_idx) in sorted_ids:
|
|
80 members = results[fp_idx].get_indices()
|
11
|
81 #print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members]
|
6
|
82 if fp_idx in seen:
|
|
83 # Can't use a centroid which is already assigned
|
|
84 continue
|
|
85 seen.add(fp_idx)
|
0
|
86
|
6
|
87 if size == 0:
|
|
88 # The only fingerprint in the exclusion sphere is itself
|
11
|
89 true_singletons.append( fp_idx )
|
6
|
90 continue
|
|
91
|
|
92 # Figure out which ones haven't yet been assigned
|
|
93 unassigned = set(members) - seen
|
|
94
|
|
95 if not unassigned:
|
|
96 false_singletons.append(fp_idx)
|
|
97 continue
|
|
98
|
|
99 # this is a new cluster
|
|
100 clusters.append( (fp_idx, unassigned) )
|
|
101 seen.update(unassigned)
|
0
|
102
|
6
|
103 len_cluster = len(clusters)
|
|
104 #out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(arena.ids[idx] for idx in true_singletons)) ) )
|
|
105 #out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(arena.ids[idx] for idx in false_singletons)) ) )
|
|
106
|
|
107 out.write( "#%s true singletons\n" % len(true_singletons) )
|
|
108 out.write( "#%s false singletons\n" % len(false_singletons) )
|
|
109 out.write( "#clusters: %s\n" % len_cluster )
|
0
|
110
|
11
|
111
|
6
|
112 # Sort so the cluster with the most compounds comes first,
|
|
113 # then by alphabetically smallest id
|
|
114 def cluster_sort_key(cluster):
|
|
115 centroid_idx, members = cluster
|
|
116 return -len(members), arena.ids[centroid_idx]
|
|
117
|
|
118 clusters.sort(key=cluster_sort_key)
|
0
|
119
|
6
|
120 for centroid_idx, members in clusters:
|
|
121 centroid_name = arena.ids[centroid_idx]
|
|
122 out.write("%s\t%s\t%s\n" % (centroid_name, len(members), " ".join(arena.ids[idx] for idx in members)))
|
|
123 #ToDo: len(members) need to be some biggest top 90% or something ...
|
|
124
|
|
125 for idx in true_singletons:
|
|
126 out.write("%s\t%s\n" % (arena.ids[idx], 0))
|
|
127
|
|
128 out.close()
|
|
129 os.remove( temp_link )
|
0
|
130
|
|
131
|
6
|
132 if __name__ == "__main__":
|
|
133 parser = argparse.ArgumentParser(description="""Taylor-Butina clustering for fps files.
|
|
134 For more details please see the original publication or the chemfp documentation:
|
|
135 http://www.chemomine.co.uk/dbclus-paper.pdf
|
|
136 https://chemfp.readthedocs.org
|
|
137 """)
|
|
138
|
|
139 parser.add_argument("-i", "--input", dest="input_path",
|
|
140 required=True,
|
|
141 help="Path to the input file.")
|
0
|
142
|
6
|
143 parser.add_argument("-o", "--output", dest="output_path", type=argparse.FileType('w'),
|
|
144 default=sys.stdout,
|
|
145 help="Path to the output file.")
|
|
146
|
|
147 parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", type=float,
|
|
148 default=0.8,
|
|
149 help="Tanimoto threshold [0.8]")
|
|
150
|
|
151 parser.add_argument('-p', '--processors', type=int,
|
|
152 default=4)
|
|
153
|
|
154 options = parser.parse_args()
|
|
155 butina( options )
|
0
|
156
|
|
157
|