comparison chemfp_clustering/nxn_clustering.py @ 6:438bc12d591b

Uploaded
author bgruening
date Fri, 26 Apr 2013 08:02:45 -0400
parents a8ac5250d59c
children 7c84cfa515e0
comparison
equal deleted inserted replaced
5:39a856defcc2 6:438bc12d591b
4 http://code.google.com/p/chem-fingerprints/ 4 http://code.google.com/p/chem-fingerprints/
5 Thanks to Andrew Dalke of Andrew Dalke Scientific! 5 Thanks to Andrew Dalke of Andrew Dalke Scientific!
6 """ 6 """
7 import matplotlib 7 import matplotlib
8 matplotlib.use('Agg') 8 matplotlib.use('Agg')
9 import sys 9 import argparse
10 import os 10 import os
11 import chemfp 11 import chemfp
12 import scipy.cluster.hierarchy as hcluster 12 import scipy.cluster.hierarchy as hcluster
13 import pylab 13 import pylab
14 import numpy 14 import numpy
15 15
16 16
17 def distance_matrix(arena,t): 17 def distance_matrix(arena, tanimoto_threshold = 0.0):
18 n = len(arena) 18 n = len(arena)
19 # The Tanimoto search computes all of the scores when threshold=0.0. 19 # Start off a similarity matrix with 1.0s along the diagonal
20 # The SearchResult contains sparse data, so I set all values 20 try:
21 # now to 1.0 so you can experiment with higher thresholds. 21 similarities = numpy.identity(n, "d")
22 distances = numpy.ones((n, n), numpy.float64) 22 except:
23 raise Exception('Input dataset is to large!')
24 chemfp.set_num_threads( args.processors )
23 25
24 # Keep track of where the query subarena is in the query 26 ## Compute the full similarity matrix.
25 query_row = 0 27 # The implementation computes the upper-triangle then copies
28 # the upper-triangle into lower-triangle. It does not include
29 # terms for the diagonal.
30 results = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=tanimoto_threshold)
26 31
27 for query_arena in arena.iter_arenas(): 32 # Copy the results into the NumPy array.
28 results = arena.threshold_tanimoto_search_arena(query_arena, threshold=t) 33 for row_index, row in enumerate(results.iter_indices_and_scores()):
29 for q_i, hits in enumerate(results.iter_indices_and_scores()): 34 for target_index, target_score in row:
30 query_idx = query_row + q_i 35 similarities[row_index, target_index] = target_score
31 for target_idx, score in hits:
32 distances[query_idx, target_idx] = 1.0 - score
33 query_row += len(query_arena)
34 36
35 return distances 37 # Return the distance matrix using the similarity matrix
36 38 return 1.0 - similarities
37 dataset = chemfp.load_fingerprints( sys.argv[1] )
38 distances = distance_matrix( dataset,float( sys.argv[2] ) )
39 linkage = hcluster.linkage( distances, method="single", metric="euclidean" )
40
41 # Plot using matplotlib, which you must have installed
42 hcluster.dendrogram(linkage, labels=dataset.ids)
43
44 pylab.savefig( sys.argv[3], format='svg' )
45 39
46 40
47 41
42 if __name__ == "__main__":
43 parser = argparse.ArgumentParser(description="""NxN clustering for fps files.
44 For more details please see the chemfp documentation:
45 https://chemfp.readthedocs.org
46 """)
47
48 parser.add_argument("-i", "--input", dest="input_path",
49 required=True,
50 help="Path to the input file.")
51
52 parser.add_argument("-o", "--output", dest="output_path",
53 help="Path to the output file.")
54
55 parser.add_argument("-t", "--threshold", dest="tanimoto_threshold",
56 type=float, default=0.0,
57 help="Tanimoto threshold [0.0]")
58
59 parser.add_argument("--oformat", default='png', help="Output format (png, svg).")
60
61 parser.add_argument('-p', '--processors', type=int,
62 default=4)
63
64 args = parser.parse_args()
65
66
67 arena = chemfp.load_fingerprints( args.input_path )
68 distances = distance_matrix( arena, args.tanimoto_threshold )
69 linkage = hcluster.linkage( distances, method="single", metric="euclidean" )
70
71 hcluster.dendrogram(linkage, labels=arena.ids)
72
73 pylab.savefig( args.output_path, format=args.oformat )
48 74
49 75
50 76
51