Mercurial > repos > bgruening > chemfp
changeset 22:6c496b524b41
ChemicalToolBoX update.
author | Bjoern Gruening <bjoern.gruening@gmail.com> |
---|---|
date | Sun, 02 Jun 2013 19:53:56 +0200 |
parents | 7c84cfa515e0 |
children | 1868005213a1 |
files | chemfp_clustering/butina_clustering.py chemfp_clustering/butina_clustering.xml chemfp_clustering/nxn_clustering.py chemfp_clustering/nxn_clustering.xml chemfp_mol2fps/mol2fps.xml chemfp_sdf2fps/sdf2fps.xml tool_dependencies.xml |
diffstat | 7 files changed, 53 insertions(+), 48 deletions(-) [+] |
line wrap: on
line diff
--- a/chemfp_clustering/butina_clustering.py Sat Jun 01 20:03:04 2013 +0200 +++ b/chemfp_clustering/butina_clustering.py Sun Jun 02 19:53:56 2013 +0200 @@ -13,7 +13,6 @@ import subprocess from chemfp import search - def unix_sort(results): temp_unsorted = tempfile.NamedTemporaryFile(delete=False) for (i,indices) in enumerate( results.iter_indices() ): @@ -38,35 +37,18 @@ os.remove(temp_sorted.name) - def butina( args ): """ Taylor-Butina clustering from the chemfp help. """ - - # make sure that the file ending is fps - temp_file = tempfile.NamedTemporaryFile() - temp_link = "%s.%s" % (temp_file.name, 'fps') - temp_file.close() - os.symlink(args.input_path, temp_link) - #os.system('ln -s %s %s' % (args.input_path, temp_link) ) - out = args.output_path - arena = chemfp.load_fingerprints( temp_link ) + targets = chemfp.open( args.input_path, format='fps' ) + arena = chemfp.load_fingerprints( targets ) chemfp.set_num_threads( args.processors ) results = search.threshold_tanimoto_search_symmetric(arena, threshold = args.tanimoto_threshold) results.reorder_all("move-closest-first") - # TODO: more memory efficient search? - # Reorder so the centroid with the most hits comes first. - # (That's why I do a reverse search.) - # Ignore the arbitrariness of breaking ties by fingerprint index - """ - results = sorted( ( (len(indices), i, indices) - for (i,indices) in enumerate(results.iter_indices()) ), - reverse=True) - """ sorted_ids = unix_sort(results) # Determine the true/false singletons and the clusters @@ -108,7 +90,6 @@ out.write( "#%s false singletons\n" % len(false_singletons) ) out.write( "#clusters: %s\n" % len_cluster ) - # Sort so the cluster with the most compounds comes first, # then by alphabetically smallest id def cluster_sort_key(cluster): @@ -126,7 +107,6 @@ out.write("%s\t%s\n" % (arena.ids[idx], 0)) out.close() - os.remove( temp_link ) if __name__ == "__main__": @@ -153,5 +133,3 @@ options = parser.parse_args() butina( options ) - -
--- a/chemfp_clustering/butina_clustering.xml Sat Jun 01 20:03:04 2013 +0200 +++ b/chemfp_clustering/butina_clustering.xml Sun Jun 02 19:53:56 2013 +0200 @@ -30,7 +30,9 @@ **Note**. You need molecular fingerprints in FPS format. Open Babel Fastsearch index is not supported. **What it does** -Molecule library clustering using the Taylor-Butina algorithm. +Clustering of molecule libraries using the Taylor-Butina algorithm. This tool is based on the chemfp_ project. + +.. _chemfp: http://chemfp.com/ ----- @@ -66,6 +68,13 @@ 55091849 has 12 other members => 6499094 6485578 55079807 3153534 55102353 55091466 55091416 6485577 55169009 55091752 55091467 55168823 + +**References** + +Please reference the chemfp_ project. + +.. _chemfp: http://chemfp.com/ + </help> </tool>
--- a/chemfp_clustering/nxn_clustering.py Sat Jun 01 20:03:04 2013 +0200 +++ b/chemfp_clustering/nxn_clustering.py Sun Jun 02 19:53:56 2013 +0200 @@ -12,8 +12,6 @@ import scipy.cluster.hierarchy as hcluster import pylab import numpy -import tempfile - def distance_matrix(arena, tanimoto_threshold = 0.0): n = len(arena) @@ -39,7 +37,6 @@ return 1.0 - similarities - if __name__ == "__main__": parser = argparse.ArgumentParser(description="""NxN clustering for fps files. For more details please see the chemfp documentation: @@ -64,13 +61,8 @@ args = parser.parse_args() - # make sure that the file ending is fps - temp_file = tempfile.NamedTemporaryFile() - temp_link = "%s.%s" % (temp_file.name, 'fps') - temp_file.close() - os.symlink(args.input_path, temp_link) - - arena = chemfp.load_fingerprints( temp_link ) + targets = chemfp.open( args.input_path, format='fps' ) + arena = chemfp.load_fingerprints( targets ) distances = distance_matrix( arena, args.tanimoto_threshold ) linkage = hcluster.linkage( distances, method="single", metric="euclidean" ) @@ -78,5 +70,3 @@ pylab.savefig( args.output_path, format=args.oformat ) - -
--- a/chemfp_clustering/nxn_clustering.xml Sat Jun 01 20:03:04 2013 +0200 +++ b/chemfp_clustering/nxn_clustering.xml Sun Jun 02 19:53:56 2013 +0200 @@ -8,9 +8,9 @@ <requirement type="package" version="2.3.2">openbabel</requirement> </requirements> <command interpreter='python'> - nxn_clustering.py - -i $infile - -t $threshold + nxn_clustering.py + -i $infile + -t $threshold -o $outfile --oformat $oformat </command> @@ -41,11 +41,14 @@ **Note**. You need molecular fingerprints in FPS format. Open Babel Fastsearch index is not supported. -**Note**. That tools is only useful for very small datasets. +**Note**. Currently, this tool can only be used with small datasets. **What it does** Generating hierarchical clusters and visualizing clusters with dendrograms. +For the clustering and the fingerprint handling the chemfp_ project is used. + +.. _chemfp: http://chemfp.com/ ----- @@ -71,11 +74,10 @@ * output:: - plot for the clustring + clustering plot -.. image:: ./static/images/chemfpclustoutput.svg +.. image:: $PATH_TO_IMAGES/NxN_clustering.png - </help> </tool>
--- a/chemfp_mol2fps/mol2fps.xml Sat Jun 01 20:03:04 2013 +0200 +++ b/chemfp_mol2fps/mol2fps.xml Sun Jun 02 19:53:56 2013 +0200 @@ -169,7 +169,12 @@ **What it does** -Generate fingerprints using OpenBabel +Generates different types of fingerprints from the `Open Babel`_ and RDKit_ projects. +This tool uses chemfp_. For more information please have a look at: + + - http://code.google.com/p/rdkit/wiki/FingerprintsInTheRDKit + - http://openbabel.org/wiki/Tutorial:Fingerprints + ----- @@ -238,5 +243,19 @@ 10000000000080000000c0000060000c0000060810000010000000800102000000 28434379 +**References** + +Please reference the `Open Babel`_ or RDKit_ project and the chemfp_ project. + +N M O'Boyle, M Banck, C A James, C Morley, T Vandermeersch, and G R Hutchison. "Open Babel: An open chemical toolbox." J. Cheminf. (2011), 3, 33. `DOI:10.1186/1758-2946-3-33`_ +The Open Babel Package http://openbabel.sourceforge.net/ + + +.. _DOI:10.1186/1758-2946-3-33: http://www.jcheminf.com/content/3/1/33 +.. _chemfp: http://chemfp.com/ +.. _RDKit: http://www.rdkit.org/ +.. _`Open Babel`: http://openbabel.org/ + + </help> </tool>
--- a/chemfp_sdf2fps/sdf2fps.xml Sat Jun 01 20:03:04 2013 +0200 +++ b/chemfp_sdf2fps/sdf2fps.xml Sun Jun 02 19:53:56 2013 +0200 @@ -21,8 +21,7 @@ **What it does** -Read a SDF file and extract the fingerprints, to stores them in a fps-file. -TODO: currently it only works for PubChem +Reads a PubChem_ SD file, extracts the fingerprints, and stores them in an FPS file. ----- @@ -90,5 +89,13 @@ 0010000002000000000000 28434379 +**References** + +Please reference the chemfp_ project. + +.. _chemfp: http://chemfp.com/ +.. _PubChem: http://pubchem.ncbi.nlm.nih.gov/ + + </help> </tool>
--- a/tool_dependencies.xml Sat Jun 01 20:03:04 2013 +0200 +++ b/tool_dependencies.xml Sun Jun 02 19:53:56 2013 +0200 @@ -10,7 +10,7 @@ <repository changeset_revision="c888aa8ed318" name="package_matplotlib_1_2" owner="bgruening" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" /> </package> <package name="chemfp" version="1.1p1"> - <repository changeset_revision="3e3356b13281" name="package_chemfp_1_1" owner="bgruening" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" /> + <repository changeset_revision="616ee8e4abf4" name="package_chemfp_1_1" owner="bgruening" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" /> </package> <package name="scipy" version="0.12.0"> <repository changeset_revision="e4f395310680" name="package_scipy_0_12" owner="bgruening" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" />