Mercurial > repos > bgruening > chemfp
comparison chemfp_clustering/nxn_clustering.py @ 6:438bc12d591b
Uploaded
author | bgruening |
---|---|
date | Fri, 26 Apr 2013 08:02:45 -0400 |
parents | a8ac5250d59c |
children | 7c84cfa515e0 |
comparison
equal
deleted
inserted
replaced
5:39a856defcc2 | 6:438bc12d591b |
---|---|
4 http://code.google.com/p/chem-fingerprints/ | 4 http://code.google.com/p/chem-fingerprints/ |
5 Thanks to Andrew Dalke of Andrew Dalke Scientific! | 5 Thanks to Andrew Dalke of Andrew Dalke Scientific! |
6 """ | 6 """ |
7 import matplotlib | 7 import matplotlib |
8 matplotlib.use('Agg') | 8 matplotlib.use('Agg') |
9 import sys | 9 import argparse |
10 import os | 10 import os |
11 import chemfp | 11 import chemfp |
12 import scipy.cluster.hierarchy as hcluster | 12 import scipy.cluster.hierarchy as hcluster |
13 import pylab | 13 import pylab |
14 import numpy | 14 import numpy |
15 | 15 |
16 | 16 |
17 def distance_matrix(arena,t): | 17 def distance_matrix(arena, tanimoto_threshold = 0.0): |
18 n = len(arena) | 18 n = len(arena) |
19 # The Tanimoto search computes all of the scores when threshold=0.0. | 19 # Start off a similarity matrix with 1.0s along the diagonal |
20 # The SearchResult contains sparse data, so I set all values | 20 try: |
21 # now to 1.0 so you can experiment with higher thresholds. | 21 similarities = numpy.identity(n, "d") |
22 distances = numpy.ones((n, n), numpy.float64) | 22 except: |
23 raise Exception('Input dataset is to large!') | |
24 chemfp.set_num_threads( args.processors ) | |
23 | 25 |
24 # Keep track of where the query subarena is in the query | 26 ## Compute the full similarity matrix. |
25 query_row = 0 | 27 # The implementation computes the upper-triangle then copies |
28 # the upper-triangle into lower-triangle. It does not include | |
29 # terms for the diagonal. | |
30 results = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=tanimoto_threshold) | |
26 | 31 |
27 for query_arena in arena.iter_arenas(): | 32 # Copy the results into the NumPy array. |
28 results = arena.threshold_tanimoto_search_arena(query_arena, threshold=t) | 33 for row_index, row in enumerate(results.iter_indices_and_scores()): |
29 for q_i, hits in enumerate(results.iter_indices_and_scores()): | 34 for target_index, target_score in row: |
30 query_idx = query_row + q_i | 35 similarities[row_index, target_index] = target_score |
31 for target_idx, score in hits: | |
32 distances[query_idx, target_idx] = 1.0 - score | |
33 query_row += len(query_arena) | |
34 | 36 |
35 return distances | 37 # Return the distance matrix using the similarity matrix |
36 | 38 return 1.0 - similarities |
37 dataset = chemfp.load_fingerprints( sys.argv[1] ) | |
38 distances = distance_matrix( dataset,float( sys.argv[2] ) ) | |
39 linkage = hcluster.linkage( distances, method="single", metric="euclidean" ) | |
40 | |
41 # Plot using matplotlib, which you must have installed | |
42 hcluster.dendrogram(linkage, labels=dataset.ids) | |
43 | |
44 pylab.savefig( sys.argv[3], format='svg' ) | |
45 | 39 |
46 | 40 |
47 | 41 |
42 if __name__ == "__main__": | |
43 parser = argparse.ArgumentParser(description="""NxN clustering for fps files. | |
44 For more details please see the chemfp documentation: | |
45 https://chemfp.readthedocs.org | |
46 """) | |
47 | |
48 parser.add_argument("-i", "--input", dest="input_path", | |
49 required=True, | |
50 help="Path to the input file.") | |
51 | |
52 parser.add_argument("-o", "--output", dest="output_path", | |
53 help="Path to the output file.") | |
54 | |
55 parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", | |
56 type=float, default=0.0, | |
57 help="Tanimoto threshold [0.0]") | |
58 | |
59 parser.add_argument("--oformat", default='png', help="Output format (png, svg).") | |
60 | |
61 parser.add_argument('-p', '--processors', type=int, | |
62 default=4) | |
63 | |
64 args = parser.parse_args() | |
65 | |
66 | |
67 arena = chemfp.load_fingerprints( args.input_path ) | |
68 distances = distance_matrix( arena, args.tanimoto_threshold ) | |
69 linkage = hcluster.linkage( distances, method="single", metric="euclidean" ) | |
70 | |
71 hcluster.dendrogram(linkage, labels=arena.ids) | |
72 | |
73 pylab.savefig( args.output_path, format=args.oformat ) | |
48 | 74 |
49 | 75 |
50 | 76 |
51 |