diff chemfp_clustering/nxn_clustering.py @ 6:438bc12d591b

Uploaded
author bgruening
date Fri, 26 Apr 2013 08:02:45 -0400
parents a8ac5250d59c
children 7c84cfa515e0
line wrap: on
line diff
--- a/chemfp_clustering/nxn_clustering.py	Tue Apr 02 05:26:28 2013 -0400
+++ b/chemfp_clustering/nxn_clustering.py	Fri Apr 26 08:02:45 2013 -0400
@@ -6,7 +6,7 @@
 """
 import matplotlib
 matplotlib.use('Agg')
-import sys
+import argparse
 import os
 import chemfp
 import scipy.cluster.hierarchy as hcluster
@@ -14,38 +14,63 @@
 import numpy
 
 
-def distance_matrix(arena,t):
+def distance_matrix(arena, tanimoto_threshold = 0.0):
     n = len(arena)
-    # The Tanimoto search computes all of the scores when threshold=0.0.
-    # The SearchResult contains sparse data, so I set all values
-    # now to 1.0 so you can experiment with higher thresholds.
-    distances = numpy.ones((n, n), numpy.float64)
-
-    # Keep track of where the query subarena is in the query
-    query_row = 0
+    # Start off a similarity matrix with 1.0s along the diagonal
+    try:
+        similarities = numpy.identity(n, "d")
+    except:
+        raise Exception('Input dataset is to large!')
+    chemfp.set_num_threads( args.processors )
 
-    for query_arena in arena.iter_arenas():
-        results = arena.threshold_tanimoto_search_arena(query_arena, threshold=t)  
-    for q_i, hits in enumerate(results.iter_indices_and_scores()):
-            query_idx = query_row + q_i
-            for target_idx, score in hits:
-                distances[query_idx, target_idx] = 1.0 - score
-        query_row += len(query_arena)
+    ## Compute the full similarity matrix.
+    # The implementation computes the upper-triangle then copies
+    # the upper-triangle into lower-triangle. It does not include
+    # terms for the diagonal.
+    results = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=tanimoto_threshold)
 
-    return distances
+    # Copy the results into the NumPy array.
+    for row_index, row in enumerate(results.iter_indices_and_scores()):
+        for target_index, target_score in row:
+            similarities[row_index, target_index] = target_score
 
-dataset = chemfp.load_fingerprints( sys.argv[1] )
-distances  = distance_matrix( dataset,float( sys.argv[2] ) )
-linkage = hcluster.linkage( distances, method="single", metric="euclidean" )
-
-# Plot using matplotlib, which you must have installed
-hcluster.dendrogram(linkage, labels=dataset.ids)
-
-pylab.savefig( sys.argv[3], format='svg' )
+    # Return the distance matrix using the similarity matrix
+    return 1.0 - similarities
 
 
 
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="""NxN clustering for fps files.
+For more details please see the chemfp documentation:
+https://chemfp.readthedocs.org
+""")
+
+    parser.add_argument("-i", "--input", dest="input_path",
+                    required=True,
+                    help="Path to the input file.")
+
+    parser.add_argument("-o", "--output", dest="output_path",
+                    help="Path to the output file.")
+
+    parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", 
+                    type=float, default=0.0,
+                    help="Tanimoto threshold [0.0]")
+
+    parser.add_argument("--oformat", default='png', help="Output format (png, svg).")
+
+    parser.add_argument('-p', '--processors', type=int, 
+        default=4)
+
+    args = parser.parse_args()
+
+
+    arena = chemfp.load_fingerprints( args.input_path )
+    distances  = distance_matrix( arena, args.tanimoto_threshold )
+    linkage = hcluster.linkage( distances, method="single", metric="euclidean" )
+
+    hcluster.dendrogram(linkage, labels=arena.ids)
+
+    pylab.savefig( args.output_path, format=args.oformat )
 
 
 
-