Mercurial > repos > bgruening > chemfp
diff chemfp_clustering/nxn_clustering.py @ 0:a8ac5250d59c
Uploaded
author | bgruening |
---|---|
date | Tue, 26 Mar 2013 13:05:41 -0400 |
parents | |
children | 438bc12d591b |
line wrap: on
line diff
#!/usr/bin/env python
"""
    Modified version of code examples from the chemfp project.
    http://code.google.com/p/chem-fingerprints/
    Thanks to Andrew Dalke of Andrew Dalke Scientific!

    nxn_clustering.py: build the NxN Tanimoto distance matrix of a
    fingerprint file, run single-linkage hierarchical clustering on it and
    save the resulting dendrogram as an SVG image.

    Usage: nxn_clustering.py <fingerprints> <tanimoto_threshold> <output.svg>
"""
import os
import sys

import numpy
from scipy.spatial.distance import squareform

# NOTE: matplotlib, pylab, chemfp and scipy.cluster.hierarchy are imported
# inside the functions that need them. The 'Agg' backend has to be selected
# before anything pulls in pylab, and deferring the imports keeps
# distance_matrix() usable without the plotting stack installed.


def distance_matrix(arena, t):
    """Return the dense NxN Tanimoto distance matrix of *arena*.

    arena -- fingerprint arena; it must support len(), iter_arenas() and
             threshold_tanimoto_search_arena(query_arena, threshold=...).
    t     -- Tanimoto similarity threshold handed to the search. Pairs that
             the search does not report keep the maximum distance of 1.0.

    Returns a numpy.float64 array of shape (n, n) whose entry [i, j] is
    ``1.0 - score`` for every reported hit and 1.0 otherwise.
    """
    n = len(arena)
    # The search result is sparse, so start from "maximally distant"
    # everywhere and only overwrite the pairs that are actually reported.
    # (With threshold=0.0 the Tanimoto search computes all of the scores.)
    distances = numpy.ones((n, n), numpy.float64)

    # Offset of the current query sub-arena within the full arena.
    query_row = 0

    for query_arena in arena.iter_arenas():
        results = arena.threshold_tanimoto_search_arena(query_arena, threshold=t)
        for q_i, hits in enumerate(results.iter_indices_and_scores()):
            row = distances[query_row + q_i]
            for target_idx, score in hits:
                row[target_idx] = 1.0 - score
        query_row += len(query_arena)

    return distances


def single_linkage(distances):
    """Return the single-linkage matrix for a square distance matrix.

    distances -- square (n, n) array of pairwise distances, n >= 2.

    scipy's linkage() interprets a 2-D array as *observation vectors* and
    would compute Euclidean distances between the rows. To cluster on the
    supplied distances themselves, the matrix is first converted to the
    condensed (upper-triangle) form that linkage() accepts as distances.
    """
    import scipy.cluster.hierarchy as hcluster

    # checks=False: only the upper triangle is read, so a non-zero diagonal
    # entry (a row with no reported self-hit stays at 1.0) does not cause
    # the conversion to be rejected.
    condensed = squareform(numpy.asarray(distances), checks=False)
    return hcluster.linkage(condensed, method="single")


def main(argv=None):
    """Command line entry point; see the module docstring for the usage."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        sys.exit("Usage: %s <fingerprints> <tanimoto_threshold> <output.svg>"
                 % os.path.basename(argv[0]))

    # Select the non-interactive backend before pylab (or anything else
    # that imports it) is loaded, so the script works without a display.
    import matplotlib
    matplotlib.use('Agg')
    import chemfp
    import scipy.cluster.hierarchy as hcluster
    import pylab

    dataset = chemfp.load_fingerprints(argv[1])
    if len(dataset) < 2:
        sys.exit("At least two fingerprints are required for clustering.")

    distances = distance_matrix(dataset, float(argv[2]))
    linkage = single_linkage(distances)

    # Plot using matplotlib, which you must have installed
    hcluster.dendrogram(linkage, labels=dataset.ids)

    pylab.savefig(argv[3], format='svg')


if __name__ == "__main__":
    main()