annotate chemfp_clustering/nxn_clustering.py @ 0:a8ac5250d59c

Uploaded
author bgruening
date Tue, 26 Mar 2013 13:05:41 -0400
parents
children 438bc12d591b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
1 #!/usr/bin/env python
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
2 """
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
3 Modified version of code examples from the chemfp project.
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
4 http://code.google.com/p/chem-fingerprints/
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
5 Thanks to Andrew Dalke of Andrew Dalke Scientific!
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
6 """
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
7 import matplotlib
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
8 matplotlib.use('Agg')
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
9 import sys
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
10 import os
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
11 import chemfp
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
12 import scipy.cluster.hierarchy as hcluster
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
13 import pylab
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
14 import numpy
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
15
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
16
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
def distance_matrix(arena, t):
    """Build a dense N x N matrix of Tanimoto distances for ``arena``.

    Each query subarena produced by ``arena.iter_arenas()`` is searched
    against the whole arena, and every reported hit is stored as
    ``1.0 - score``.

    Args:
        arena: Fingerprint arena (as returned by ``chemfp.load_fingerprints``
            in this script).  It must support ``len()``, ``iter_arenas()``
            and ``threshold_tanimoto_search_arena()``.
        t: Value passed to the search as ``threshold``.  Pairs the search
            does not report keep the maximum distance of 1.0.

    Returns:
        A ``numpy.float64`` array of shape ``(len(arena), len(arena))`` whose
        diagonal is always 0.0.
    """
    n = len(arena)
    # The search result is sparse: only reported hits are written below.
    # Start from the maximum distance everywhere so that unreported pairs
    # stay at 1.0, which lets callers experiment with thresholds above 0.0.
    distances = numpy.ones((n, n), numpy.float64)

    # Row of the full matrix that corresponds to the first fingerprint of
    # the query subarena currently being processed.
    query_row = 0

    for query_arena in arena.iter_arenas():
        results = arena.threshold_tanimoto_search_arena(query_arena, threshold=t)
        for q_i, hits in enumerate(results.iter_indices_and_scores()):
            query_idx = query_row + q_i
            for target_idx, score in hits:
                distances[query_idx, target_idx] = 1.0 - score
        query_row += len(query_arena)

    # A fingerprint is at distance 0 from itself.  The loop above only
    # guarantees that when the search reports a self-hit with score 1.0,
    # which is not the case for e.g. a fingerprint with no bits set (chemfp
    # documents the Tanimoto of two empty fingerprints as 0.0), so enforce
    # the zero diagonal explicitly.
    numpy.fill_diagonal(distances, 0.0)

    return distances
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
36
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
# Command line: <fingerprint file> <Tanimoto threshold> <output SVG file>
if len(sys.argv) < 4:
    sys.exit("usage: %s <fingerprints> <tanimoto_threshold> <output.svg>" % sys.argv[0])

# Needed only by this script section, so it is imported here rather than at
# the top of the file.
from scipy.spatial.distance import squareform

dataset = chemfp.load_fingerprints(sys.argv[1])
distances = distance_matrix(dataset, float(sys.argv[2]))

# linkage() interprets a 2-D array as one observation vector per row and
# would cluster on Euclidean distances between the rows of the matrix.  To
# cluster on the Tanimoto distances themselves, hand it the condensed
# (upper-triangle) form instead; the `metric` argument does not apply then.
# checks=False: only the upper triangle is read, so the diagonal values do
# not matter here.
condensed = squareform(distances, checks=False)
linkage = hcluster.linkage(condensed, method="single")

# Plot using matplotlib, which you must have installed
hcluster.dendrogram(linkage, labels=dataset.ids)

# savefig() writes the current pylab figure, i.e. the dendrogram drawn above.
pylab.savefig(sys.argv[3], format='svg')
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
45
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
46
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
47
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
48
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
49
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
50
a8ac5250d59c Uploaded
bgruening
parents:
diff changeset
51