diff chemfp_clustering/butina_clustering.py @ 22:6c496b524b41

ChemicalToolBoX update.
author Bjoern Gruening <bjoern.gruening@gmail.com>
date Sun, 02 Jun 2013 19:53:56 +0200
parents 9e2e43fde5fe
children
line wrap: on
line diff
--- a/chemfp_clustering/butina_clustering.py	Sat Jun 01 20:03:04 2013 +0200
+++ b/chemfp_clustering/butina_clustering.py	Sun Jun 02 19:53:56 2013 +0200
@@ -13,7 +13,6 @@
 import subprocess
 from chemfp import search
 
-
 def unix_sort(results):
     temp_unsorted = tempfile.NamedTemporaryFile(delete=False)
     for (i,indices) in enumerate( results.iter_indices() ):
@@ -38,35 +37,18 @@
 
     os.remove(temp_sorted.name)
 
-
 def butina( args ):
     """
         Taylor-Butina clustering from the chemfp help.
     """
-
-    # make sure that the file ending is fps
-    temp_file = tempfile.NamedTemporaryFile()
-    temp_link = "%s.%s" % (temp_file.name, 'fps')
-    temp_file.close()
-    os.symlink(args.input_path, temp_link)
-    #os.system('ln -s %s %s' % (args.input_path, temp_link) )
-
     out = args.output_path
-    arena = chemfp.load_fingerprints( temp_link )
+    targets = chemfp.open( args.input_path, format='fps' )
+    arena = chemfp.load_fingerprints( targets )
 
     chemfp.set_num_threads( args.processors )
     results = search.threshold_tanimoto_search_symmetric(arena, threshold = args.tanimoto_threshold)
     results.reorder_all("move-closest-first")
 
-    # TODO: more memory efficient search?
-    # Reorder so the centroid with the most hits comes first.
-    # (That's why I do a reverse search.)
-    # Ignore the arbitrariness of breaking ties by fingerprint index
-    """
-    results = sorted( (  (len(indices), i, indices)
-                              for (i,indices) in enumerate(results.iter_indices()) ),
-                      reverse=True)
-    """
     sorted_ids = unix_sort(results)
 
     # Determine the true/false singletons and the clusters
@@ -108,7 +90,6 @@
     out.write( "#%s false singletons\n" % len(false_singletons) )
     out.write( "#clusters: %s\n" % len_cluster )
 
-
     # Sort so the cluster with the most compounds comes first,
     # then by alphabetically smallest id
     def cluster_sort_key(cluster):
@@ -126,7 +107,6 @@
         out.write("%s\t%s\n" % (arena.ids[idx], 0))
 
     out.close()
-    os.remove( temp_link )
 
 
 if __name__ == "__main__":
@@ -153,5 +133,3 @@
 
     options = parser.parse_args()
     butina( options )
-
-