view chemfp_clustering/nxn_clustering.py @ 21:7c84cfa515e0

ChemicalToolBoX update.
author Bjoern Gruening <bjoern.gruening@gmail.com>
date Sat, 01 Jun 2013 20:03:04 +0200
parents 438bc12d591b
children 6c496b524b41
line wrap: on
line source

#!/usr/bin/env python
"""
    Modified version of code examples from the chemfp project.
    http://code.google.com/p/chem-fingerprints/
    Thanks to Andrew Dalke of Andrew Dalke Scientific!
"""
import matplotlib
matplotlib.use('Agg')
import argparse
import os
import chemfp
import scipy.cluster.hierarchy as hcluster
import pylab
import numpy
import tempfile


def distance_matrix(arena, tanimoto_threshold = 0.0):
    n = len(arena)
    # Start off a similarity matrix with 1.0s along the diagonal
    try:
        similarities = numpy.identity(n, "d")
    except:
        raise Exception('Input dataset is to large!')
    chemfp.set_num_threads( args.processors )

    ## Compute the full similarity matrix.
    # The implementation computes the upper-triangle then copies
    # the upper-triangle into lower-triangle. It does not include
    # terms for the diagonal.
    results = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=tanimoto_threshold)

    # Copy the results into the NumPy array.
    for row_index, row in enumerate(results.iter_indices_and_scores()):
        for target_index, target_score in row:
            similarities[row_index, target_index] = target_score

    # Return the distance matrix using the similarity matrix
    return 1.0 - similarities



if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="""NxN clustering for fps files.
For more details please see the chemfp documentation:
https://chemfp.readthedocs.org
""")

    parser.add_argument("-i", "--input", dest="input_path",
                    required=True,
                    help="Path to the input file.")

    parser.add_argument("-o", "--output", dest="output_path",
                    help="Path to the output file.")

    parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", 
                    type=float, default=0.0,
                    help="Tanimoto threshold [0.0]")

    parser.add_argument("--oformat", default='png', help="Output format (png, svg).")

    parser.add_argument('-p', '--processors', type=int, 
        default=4)

    args = parser.parse_args()

    # make sure that the file ending is fps
    temp_file = tempfile.NamedTemporaryFile()
    temp_link = "%s.%s" % (temp_file.name, 'fps')
    temp_file.close()
    os.symlink(args.input_path, temp_link)

    arena = chemfp.load_fingerprints( temp_link )
    distances  = distance_matrix( arena, args.tanimoto_threshold )
    linkage = hcluster.linkage( distances, method="single", metric="euclidean" )

    hcluster.dendrogram(linkage, labels=arena.ids)

    pylab.savefig( args.output_path, format=args.oformat )