view chemfp_clustering/old/butina_clustering_old.py @ 6:438bc12d591b

Uploaded
author bgruening
date Fri, 26 Apr 2013 08:02:45 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python
"""
    Modified version of code examples from the chemfp project.
    http://code.google.com/p/chem-fingerprints/
    Thanks to Andrew Dalke of Andrew Dalke Scientific!
"""

import chemfp
import sys
import os
import tempfile

temp_file = tempfile.NamedTemporaryFile()
temp_link = "%s.%s" % (temp_file.name, 'fps')
temp_file.close()
os.system('ln -s %s %s' % (os.path.realpath(sys.argv[1]), temp_link) )


chemfp_fingerprint_file = temp_link
tanimoto_threshold = float(sys.argv[2])
outfile = sys.argv[3]
processors = int(sys.argv[4])


def get_hit_indicies(hits):
    return [id for (id, score) in hits]

out = open(outfile, 'w')
dataset = chemfp.load_fingerprints( chemfp_fingerprint_file )

chemfp.set_num_threads( processors )
search = dataset.threshold_tanimoto_search_arena(dataset, threshold = tanimoto_threshold)
#search = chemfp.search.threshold_tanimoto_search_symmetric (dataset, threshold = tanimoto_threshold)

# Reorder so the centroid with the most hits comes first.
# (That's why I do a reverse search.)
# Ignore the arbitrariness of breaking ties by fingerprint index
results = sorted( (  (len(hits), i, hits) for (i, hits) in enumerate(search.iter_indices_and_scores())  ),reverse=True)


# Determine the true/false singletons and the clusters
true_singletons = []
false_singletons = []
clusters = []

seen = set()

for (size, fp_idx, hits) in results:
    if fp_idx in seen:
        # Can't use a centroid which is already assigned
        continue
    seen.add(fp_idx)
    print size, fp_idx, hits
    if size == 1:
        # The only fingerprint in the exclusion sphere is itself
        true_singletons.append(fp_idx)
        continue

    members = get_hit_indicies(hits)
    # Figure out which ones haven't yet been assigned
    unassigned = [target_idx for target_idx in members if target_idx not in seen]

    if not unassigned:
        false_singletons.append(fp_idx)
        continue

    # this is a new cluster
    clusters.append( (fp_idx, unassigned) )
    seen.update(unassigned)

len_cluster = len(clusters)
#out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(dataset.ids[idx] for idx in true_singletons)) ) )
#out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(dataset.ids[idx] for idx in false_singletons)) ) )

out.write( "#%s true singletons\n" % len(true_singletons) )
out.write( "#%s false singletons\n" % len(false_singletons) )
out.write( "#clusters: %s\n" % len_cluster )

# Sort so the cluster with the most compounds comes first,
# then by alphabetically smallest id
def cluster_sort_key(cluster):
    centroid_idx, members = cluster
    return -len(members), dataset.ids[centroid_idx]

clusters.sort(key=cluster_sort_key)


for centroid_idx, members in clusters:
    centroid_name = dataset.ids[centroid_idx]
    out.write("%s\t%s\t%s\n" % (centroid_name, len(members), " ".join(sorted(dataset.ids[idx] for idx in members))))
    #ToDo: len(members) need to be some biggest top 90% or something ...

for idx in sorted(true_singletons):
    out.write("%s\t%s\n" % (dataset.ids[idx], 0))

out.close()
os.remove( temp_link )