0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Modified version of code examples from the chemfp project.
|
|
4 http://code.google.com/p/chem-fingerprints/
|
|
5 Thanks to Andrew Dalke of Andrew Dalke Scientific!
|
|
6 """
|
|
7 import matplotlib
|
|
8 matplotlib.use('Agg')
|
|
9 import sys
|
|
10 import os
|
|
11 import chemfp
|
|
12 import scipy.cluster.hierarchy as hcluster
|
|
13 import pylab
|
|
14 import numpy
|
|
15
|
|
16
|
|
def distance_matrix(arena, t):
    """Build a dense all-against-all distance matrix for *arena*.

    Each sub-arena produced by ``arena.iter_arenas()`` is used as a query
    against the full arena, and every reported hit is stored as
    ``1.0 - score``.

    Parameters:
        arena: object supporting ``len()``, ``iter_arenas()`` and
            ``threshold_tanimoto_search_arena()`` (the script passes the
            result of ``chemfp.load_fingerprints``).
        t: value forwarded as the ``threshold`` keyword of the search.

    Returns:
        A ``numpy.float64`` array of shape ``(len(arena), len(arena))``.
        Pairs the search does not report keep the initial distance 1.0.
    """
    size = len(arena)

    # The search result is sparse: with a threshold above 0.0 some pairs are
    # simply absent.  Starting from an all-ones matrix gives those pairs the
    # largest distance instead of leaving them undefined.
    dist = numpy.ones((size, size), numpy.float64)

    # Row of the full matrix at which the current query sub-arena begins.
    # NOTE(review): relies on iter_arenas() yielding consecutive slices of
    # the arena in order - confirm against the chemfp documentation.
    offset = 0

    for chunk in arena.iter_arenas():
        search = arena.threshold_tanimoto_search_arena(chunk, threshold=t)
        for row, row_hits in enumerate(search.iter_indices_and_scores(), offset):
            for col, similarity in row_hits:
                dist[row, col] = 1.0 - similarity
        offset += len(chunk)

    return dist
|
|
36
|
|
# Imported here because only this script section needs the
# square -> condensed distance-matrix conversion.
from scipy.spatial.distance import squareform

# Command line: <fingerprint file> <similarity threshold> <output SVG path>.
# All three arguments are read up front so that a missing or malformed one
# fails immediately instead of after the all-against-all search has run.
fingerprint_path = sys.argv[1]
threshold = float(sys.argv[2])
svg_path = sys.argv[3]

dataset = chemfp.load_fingerprints(fingerprint_path)
distances = distance_matrix(dataset, threshold)

# linkage() treats a 2-D array as a set of observation vectors and would
# cluster on the Euclidean distance between *rows* of the matrix.  To cluster
# on the precomputed distances themselves it must be given the condensed
# (upper-triangle) form, for which the `metric` argument is ignored.
# checks=False skips squareform's symmetry / zero-diagonal validation: a
# diagonal entry stays at the 1.0 default whenever the search does not
# report a self-hit, and that should not abort the run.
linkage = hcluster.linkage(squareform(distances, checks=False), method="single")

# Plot using matplotlib, which you must have installed
hcluster.dendrogram(linkage, labels=dataset.ids)

pylab.savefig(svg_path, format='svg')
|
|
45
|
|
46
|
|
47
|
|
48
|
|
49
|
|
50
|
|
51
|