changeset 11:9e2e43fde5fe

Uploaded
author bgruening
date Tue, 14 May 2013 17:02:11 -0400
parents 4b5cb4328146
children 64698400ddbb
files chemfp_clustering/butina_clustering.py repository_dependencies.xml
diffstat 2 files changed, 7 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/chemfp_clustering/butina_clustering.py	Sat May 11 17:31:10 2013 -0400
+++ b/chemfp_clustering/butina_clustering.py	Tue May 14 17:02:11 2013 -0400
@@ -6,19 +6,18 @@
 """
 
 import chemfp
-from chemfp import search
 import sys
 import os
 import tempfile
 import argparse
 import subprocess
+from chemfp import search
 
 
 def unix_sort(results):
     temp_unsorted = tempfile.NamedTemporaryFile(delete=False)
     for (i,indices) in enumerate( results.iter_indices() ):
         temp_unsorted.write('%s %s\n' % (len(indices), i))
-        print i, indices
     temp_unsorted.close()
     temp_sorted = tempfile.NamedTemporaryFile(delete=False)
     temp_sorted.close()
@@ -49,7 +48,7 @@
     temp_file = tempfile.NamedTemporaryFile()
     temp_link = "%s.%s" % (temp_file.name, 'fps')
     temp_file.close()
-    os.symlink(os.path.realpath(args.input_path), temp_link)
+    os.symlink(args.input_path, temp_link)
     #os.system('ln -s %s %s' % (args.input_path, temp_link) )
 
     out = args.output_path
@@ -58,7 +57,6 @@
     chemfp.set_num_threads( args.processors )
     results = search.threshold_tanimoto_search_symmetric(arena, threshold = args.tanimoto_threshold)
     results.reorder_all("move-closest-first")
-    print [r.get_indices() for r in results]
 
     # TODO: more memory efficient search?
     # Reorder so the centroid with the most hits comes first.
@@ -70,7 +68,7 @@
                       reverse=True)
     """
     sorted_ids = unix_sort(results)
-    
+
     # Determine the true/false singletons and the clusters
     true_singletons = []
     false_singletons = []
@@ -80,8 +78,7 @@
     #for (size, fp_idx, members) in results:
     for (size, fp_idx) in sorted_ids:
         members = results[fp_idx].get_indices()
-        print 'indices (s: %s, fp_idx:%s) -> %s' % (size, fp_idx, members)
-        print 'scores (s: %s, fp_idx:%s) -> %s' % (size, fp_idx,  results[fp_idx].get_scores())
+        #print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members]
         if fp_idx in seen:
             # Can't use a centroid which is already assigned
             continue
@@ -89,7 +86,7 @@
 
         if size == 0:
             # The only fingerprint in the exclusion sphere is itself
-            true_singletons.append(fp_idx)
+            true_singletons.append( fp_idx )
             continue
 
         # Figure out which ones haven't yet been assigned
@@ -111,6 +108,7 @@
     out.write( "#%s false singletons\n" % len(false_singletons) )
     out.write( "#clusters: %s\n" % len_cluster )
 
+
     # Sort so the cluster with the most compounds comes first,
     # then by alphabetically smallest id
     def cluster_sort_key(cluster):
--- a/repository_dependencies.xml	Sat May 11 17:31:10 2013 -0400
+++ b/repository_dependencies.xml	Tue May 14 17:02:11 2013 -0400
@@ -1,5 +1,5 @@
 <?xml version="1.0"?>
 <repositories description="This requires the Molecule datatype definitions (e.g. SMILES, InChI, SD-format) and the python numpy package.">
-    <repository toolshed="http://testtoolshed.g2.bx.psu.edu/" name="molecule_datatypes" owner="bgruening" changeset_revision="1a070566e9c6" />
+    <repository toolshed="http://testtoolshed.g2.bx.psu.edu/" name="molecule_datatypes" owner="bgruening" changeset_revision="1b63345907ec" />
     <repository toolshed="http://testtoolshed.g2.bx.psu.edu/" name="package_numpy_1_7" owner="bgruening" changeset_revision="ec80bba4bccb" />
 </repositories>