0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Modified version of code examples from the chemfp project.
|
|
4 http://code.google.com/p/chem-fingerprints/
|
|
5 Thanks to Andrew Dalke of Andrew Dalke Scientific!
|
|
6 """
|
|
7 import matplotlib
|
|
8 matplotlib.use('Agg')
|
6
|
9 import argparse
|
0
|
10 import os
|
|
11 import chemfp
|
|
12 import scipy.cluster.hierarchy as hcluster
|
|
13 import pylab
|
|
14 import numpy
|
|
15
|
|
16
|
6
|
def distance_matrix(arena, tanimoto_threshold=0.0, num_threads=None):
    """Return the NxN Tanimoto distance matrix for the fingerprints in *arena*.

    The distance between two fingerprints is ``1.0 - similarity``.  Pairs
    that are not present in the chemfp search results keep a similarity of
    0.0 and therefore get a distance of 1.0; the diagonal is always 0.0.

    Parameters:
      arena -- fingerprint collection; it must support ``len()`` and be
          accepted by ``chemfp.search.threshold_tanimoto_search_symmetric``.
      tanimoto_threshold -- threshold forwarded to the chemfp search.
      num_threads -- number of threads passed to ``chemfp.set_num_threads``.
          When None, the ``processors`` attribute of a module-level ``args``
          object is used if one exists (this is how the command-line script
          configures it); otherwise chemfp's thread setting is left alone.

    Returns:
      A NumPy array of shape (n, n) and dtype "d", where n = len(arena).

    Raises:
      Exception -- if the n x n similarity matrix cannot be allocated.
    """
    n = len(arena)

    # Start off a similarity matrix with 1.0s along the diagonal.
    # Only allocation failures are translated; anything else (including
    # KeyboardInterrupt) propagates unchanged.
    try:
        similarities = numpy.identity(n, "d")
    except (MemoryError, ValueError):
        raise Exception('Input dataset is to large!')

    if num_threads is None:
        # Backward compatibility: when run as a script the parsed command
        # line lives in the module-level name `args`.  Looking it up via
        # globals() avoids a NameError when this module is imported.
        num_threads = getattr(globals().get("args"), "processors", None)
    if num_threads is not None:
        chemfp.set_num_threads(num_threads)

    ## Compute the full similarity matrix.
    # The implementation computes the upper-triangle then copies
    # the upper-triangle into lower-triangle. It does not include
    # terms for the diagonal.
    results = chemfp.search.threshold_tanimoto_search_symmetric(
        arena, threshold=tanimoto_threshold)

    # Copy the results into the NumPy array.
    for row_index, row in enumerate(results.iter_indices_and_scores()):
        for target_index, target_score in row:
            similarities[row_index, target_index] = target_score

    # Return the distance matrix using the similarity matrix.
    return 1.0 - similarities
|
0
|
39
|
|
40
|
|
41
|
6
|
if __name__ == "__main__":
    # Command-line entry point: load an fps file, build the Tanimoto
    # distance matrix, run single-linkage clustering and save the
    # resulting dendrogram as an image.
    #
    # NOTE: `args` is deliberately kept as a module-level name (no main()
    # wrapper) because distance_matrix() looks up the global `args` to
    # find the requested number of processors.
    parser = argparse.ArgumentParser(description="""NxN clustering for fps files.
For more details please see the chemfp documentation:
https://chemfp.readthedocs.org
""")

    parser.add_argument("-i", "--input", dest="input_path",
                        required=True,
                        help="Path to the input file.")

    # The output path is required: without it savefig() has nowhere to
    # write, and the failure would only show up after the whole (possibly
    # expensive) computation has finished.
    parser.add_argument("-o", "--output", dest="output_path",
                        required=True,
                        help="Path to the output file.")

    parser.add_argument("-t", "--threshold", dest="tanimoto_threshold",
                        type=float, default=0.0,
                        help="Tanimoto threshold [0.0]")

    parser.add_argument("--oformat", default='png', help="Output format (png, svg).")

    parser.add_argument('-p', '--processors', type=int,
                        default=4,
                        help="Number of threads chemfp may use [4]")

    args = parser.parse_args()

    # Imported here so the fix is self-contained in this script block.
    from scipy.spatial.distance import squareform

    arena = chemfp.load_fingerprints(args.input_path)
    distances = distance_matrix(arena, args.tanimoto_threshold)

    # linkage() interprets a 2-D array as observation vectors, not as a
    # distance matrix.  Convert the square matrix to condensed form so the
    # clustering is done on the Tanimoto distances themselves.
    condensed_distances = squareform(distances)
    linkage = hcluster.linkage(condensed_distances, method="single")

    hcluster.dendrogram(linkage, labels=arena.ids)

    pylab.savefig(args.output_path, format=args.oformat)
|
0
|
74
|
|
75
|
|
76
|