Mercurial > repos > bgruening > chemfp

--- a/chemfp_clustering/butina_clustering.py	Sat Jun 01 20:03:04 2013 +0200
+++ b/chemfp_clustering/butina_clustering.py	Sun Jun 02 19:53:56 2013 +0200
@@ -13,7 +13,6 @@
 import subprocess
 from chemfp import search

-
 def unix_sort(results):
     temp_unsorted = tempfile.NamedTemporaryFile(delete=False)
     for (i,indices) in enumerate( results.iter_indices() ):
@@ -38,35 +37,18 @@

     os.remove(temp_sorted.name)

-
 def butina( args ):
     """
         Taylor-Butina clustering from the chemfp help.
     """
-
-    # make sure that the file ending is fps
-    temp_file = tempfile.NamedTemporaryFile()
-    temp_link = "%s.%s" % (temp_file.name, 'fps')
-    temp_file.close()
-    os.symlink(args.input_path, temp_link)
-    #os.system('ln -s %s %s' % (args.input_path, temp_link) )
-
     out = args.output_path
-    arena = chemfp.load_fingerprints( temp_link )
+    targets = chemfp.open( args.input_path, format='fps' )
+    arena = chemfp.load_fingerprints( targets )

     chemfp.set_num_threads( args.processors )
     results = search.threshold_tanimoto_search_symmetric(arena, threshold = args.tanimoto_threshold)
     results.reorder_all("move-closest-first")

-    # TODO: more memory efficient search?
-    # Reorder so the centroid with the most hits comes first.
-    # (That's why I do a reverse search.)
-    # Ignore the arbitrariness of breaking ties by fingerprint index
-    """
-    results = sorted( (  (len(indices), i, indices)
-                              for (i,indices) in enumerate(results.iter_indices()) ),
-                      reverse=True)
-    """
     sorted_ids = unix_sort(results)

     # Determine the true/false singletons and the clusters
@@ -108,7 +90,6 @@
     out.write( "#%s false singletons\n" % len(false_singletons) )
     out.write( "#clusters: %s\n" % len_cluster )

-
     # Sort so the cluster with the most compounds comes first,
     # then by alphabetically smallest id
     def cluster_sort_key(cluster):
@@ -126,7 +107,6 @@
         out.write("%s\t%s\n" % (arena.ids[idx], 0))

     out.close()
-    os.remove( temp_link )


 if __name__ == "__main__":
@@ -153,5 +133,3 @@

     options = parser.parse_args()
     butina( options )
-
-
--- a/chemfp_clustering/butina_clustering.xml	Sat Jun 01 20:03:04 2013 +0200
+++ b/chemfp_clustering/butina_clustering.xml	Sun Jun 02 19:53:56 2013 +0200
@@ -30,7 +30,9 @@
 **Note**. You need molecular fingerprints in FPS format. Open Babel Fastsearch index is not supported.

 **What it does**
-Molecule library clustering using the Taylor-Butina algorithm.
+Clustering of molecule libraries using the Taylor-Butina algorithm. This tool is based on the chemfp_ project.
+
+.. _chemfp: http://chemfp.com/

 -----

@@ -66,6 +68,13 @@
 	55091849 has 12 other members
 	=> 6499094 6485578 55079807 3153534 55102353 55091466 55091416 6485577 55169009 55091752 55091467 55168823

+
+**References**
+
+Please reference the chemfp_ project.
+
+.. _chemfp: http://chemfp.com/
+
  </help>

 </tool>
--- a/chemfp_clustering/nxn_clustering.py	Sat Jun 01 20:03:04 2013 +0200
+++ b/chemfp_clustering/nxn_clustering.py	Sun Jun 02 19:53:56 2013 +0200
@@ -12,8 +12,6 @@
 import scipy.cluster.hierarchy as hcluster
 import pylab
 import numpy
-import tempfile
-

 def distance_matrix(arena, tanimoto_threshold = 0.0):
     n = len(arena)
@@ -39,7 +37,6 @@
     return 1.0 - similarities


-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="""NxN clustering for fps files.
 For more details please see the chemfp documentation:
@@ -64,13 +61,8 @@

     args = parser.parse_args()

-    # make sure that the file ending is fps
-    temp_file = tempfile.NamedTemporaryFile()
-    temp_link = "%s.%s" % (temp_file.name, 'fps')
-    temp_file.close()
-    os.symlink(args.input_path, temp_link)
-
-    arena = chemfp.load_fingerprints( temp_link )
+    targets = chemfp.open( args.input_path, format='fps' )
+    arena = chemfp.load_fingerprints( targets )
     distances  = distance_matrix( arena, args.tanimoto_threshold )
     linkage = hcluster.linkage( distances, method="single", metric="euclidean" )

@@ -78,5 +70,3 @@

     pylab.savefig( args.output_path, format=args.oformat )

-
-
--- a/chemfp_clustering/nxn_clustering.xml	Sat Jun 01 20:03:04 2013 +0200
+++ b/chemfp_clustering/nxn_clustering.xml	Sun Jun 02 19:53:56 2013 +0200
@@ -8,9 +8,9 @@
         <requirement type="package" version="2.3.2">openbabel</requirement>
     </requirements>
     <command interpreter='python'>
-        nxn_clustering.py
-            -i $infile
-            -t $threshold
+        nxn_clustering.py
+            -i $infile
+            -t $threshold
             -o $outfile
             --oformat $oformat
     </command>
@@ -41,11 +41,14 @@

 **Note**. You need molecular fingerprints in FPS format. Open Babel Fastsearch index is not supported.

-**Note**. That tools is only useful for very small datasets.
+**Note**. Currently, this tool can only be used with small datasets.


 **What it does**
 Generating hierarchical clusters and visualizing clusters with dendrograms.
+For the clustering and the fingerprint handling the chemfp_ project is used.
+
+.. _chemfp: http://chemfp.com/

 -----

@@ -71,11 +74,10 @@

 * output::

-	plot for the clustring
+	clustering plot

-.. image:: ./static/images/chemfpclustoutput.svg
+.. image:: $PATH_TO_IMAGES/NxN_clustering.png

-
     </help>

 </tool>
--- a/chemfp_mol2fps/mol2fps.xml	Sat Jun 01 20:03:04 2013 +0200
+++ b/chemfp_mol2fps/mol2fps.xml	Sun Jun 02 19:53:56 2013 +0200
@@ -169,7 +169,12 @@

 **What it does**

-Generate fingerprints using OpenBabel
+Generates different types of fingerprints from the `Open Babel`_ and RDKit_ projects.
+This tool is using chemfp_. For more information please have a look at:
+
+    - http://code.google.com/p/rdkit/wiki/FingerprintsInTheRDKit
+    - http://openbabel.org/wiki/Tutorial:Fingerprints
+

 -----

@@ -238,5 +243,19 @@
 	10000000000080000000c0000060000c0000060810000010000000800102000000	28434379


+**References**
+
+Please reference the `Open Babel`_ or RDKit_ project and the chemfp_ project.
+
+N M O'Boyle, M Banck, C A James, C Morley, T Vandermeersch, and G R Hutchison. "Open Babel: An open chemical toolbox." J. Cheminf. (2011), 3, 33. `DOI:10.1186/1758-2946-3-33`_
+The Open Babel Package http://openbabel.sourceforge.net/
+
+
+.. _DOI:10.1186/1758-2946-3-33: http://www.jcheminf.com/content/3/1/33
+.. _chemfp: http://chemfp.com/
+.. _RDKit: http://www.rdkit.org/
+.. _`Open Babel`: http://openbabel.org/
+
+
     </help>
 </tool>
--- a/chemfp_sdf2fps/sdf2fps.xml	Sat Jun 01 20:03:04 2013 +0200
+++ b/chemfp_sdf2fps/sdf2fps.xml	Sun Jun 02 19:53:56 2013 +0200
@@ -21,8 +21,7 @@

 **What it does**

-Read a SDF file and extract the fingerprints, to stores them in a fps-file.
-TODO: currently it only works for PubChem
+Reads a PubChem_ SD file, extracts the fingerprints, and stores them in an FPS file.

 -----

@@ -90,5 +89,13 @@
 	0010000002000000000000	28434379


+**References**
+
+Please reference the chemfp_ project.
+
+.. _chemfp: http://chemfp.com/
+.. _PubChem: http://pubchem.ncbi.nlm.nih.gov/
+
+
     </help>
 </tool>
--- a/tool_dependencies.xml	Sat Jun 01 20:03:04 2013 +0200
+++ b/tool_dependencies.xml	Sun Jun 02 19:53:56 2013 +0200
@@ -10,7 +10,7 @@
         <repository changeset_revision="c888aa8ed318" name="package_matplotlib_1_2" owner="bgruening" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" />
     </package>
     <package name="chemfp" version="1.1p1">
-        <repository changeset_revision="3e3356b13281" name="package_chemfp_1_1" owner="bgruening" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" />
+        <repository changeset_revision="616ee8e4abf4" name="package_chemfp_1_1" owner="bgruening" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" />
     </package>
     <package name="scipy" version="0.12.0">
         <repository changeset_revision="e4f395310680" name="package_scipy_0_12" owner="bgruening" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" />