Mercurial > repos > bgruening > chemfp
annotate chemfp_clustering/nxn_clustering.py @ 21:7c84cfa515e0
ChemicalToolBoX update.
| author | Bjoern Gruening <bjoern.gruening@gmail.com> |
|---|---|
| date | Sat, 01 Jun 2013 20:03:04 +0200 |
| parents | 438bc12d591b |
| children | 6c496b524b41 |
| rev | line source |
|---|---|
| 0 | 1 #!/usr/bin/env python |
| 2 """ | |
| 3 Modified version of code examples from the chemfp project. | |
| 4 http://code.google.com/p/chem-fingerprints/ | |
| 5 Thanks to Andrew Dalke of Andrew Dalke Scientific! | |
| 6 """ | |
| 7 import matplotlib | |
| 8 matplotlib.use('Agg') | |
| 6 | 9 import argparse |
| 0 | 10 import os |
| 11 import chemfp | |
| 12 import scipy.cluster.hierarchy as hcluster | |
| 13 import pylab | |
| 14 import numpy | |
|
21
7c84cfa515e0
ChemicalToolBoX update.
Bjoern Gruening <bjoern.gruening@gmail.com>
parents:
6
diff
changeset
|
15 import tempfile |
| 0 | 16 |
| 17 | |
def distance_matrix(arena, tanimoto_threshold=0.0, processors=None):
    """Build the NxN Tanimoto distance matrix (1 - similarity) for an arena.

    Parameters
    ----------
    arena
        Fingerprint arena as returned by ``chemfp.load_fingerprints``.
    tanimoto_threshold : float
        Threshold handed to the symmetric Tanimoto search.  Pairs for which
        the search reports no score keep a similarity of 0.0 and therefore
        end up with a distance of 1.0.
    processors : int or None
        Number of threads chemfp should use.  When ``None`` the value is
        taken from a module-level ``args.processors`` if the script's
        command-line namespace exists; otherwise chemfp's current setting
        is left untouched.

    Returns
    -------
    numpy.ndarray
        ``n x n`` array of doubles with 0.0 on the diagonal.

    Raises
    ------
    Exception
        If the ``n x n`` matrix cannot be allocated.
    """
    n = len(arena)

    # Start off a similarity matrix with 1.0s along the diagonal.
    # Only allocation failures are translated; anything else (for example
    # KeyboardInterrupt) must propagate unchanged.
    try:
        similarities = numpy.identity(n, "d")
    except (MemoryError, ValueError) as err:
        raise Exception('Input dataset is too large! (%s)' % err)

    # Previously this function read the global ``args`` unconditionally,
    # which raised NameError when it was used outside the __main__ script.
    if processors is None:
        cli_args = globals().get("args")
        processors = getattr(cli_args, "processors", None)
    if processors is not None:
        chemfp.set_num_threads(processors)

    ## Compute the full similarity matrix.
    # The implementation computes the upper-triangle then copies
    # the upper-triangle into lower-triangle. It does not include
    # terms for the diagonal.
    results = chemfp.search.threshold_tanimoto_search_symmetric(
        arena, threshold=tanimoto_threshold)

    # Copy the results into the NumPy array.
    for row_index, row in enumerate(results.iter_indices_and_scores()):
        for target_index, target_score in row:
            similarities[row_index, target_index] = target_score

    # Return the distance matrix using the similarity matrix
    return 1.0 - similarities
| 0 | 40 |
| 41 | |
| 42 | |
if __name__ == "__main__":
    # Command-line entry point: load an fps file, cluster it on the NxN
    # Tanimoto distance matrix and write the dendrogram as an image.
    parser = argparse.ArgumentParser(description="""NxN clustering for fps files.
For more details please see the chemfp documentation:
https://chemfp.readthedocs.org
""")

    parser.add_argument("-i", "--input", dest="input_path",
                        required=True,
                        help="Path to the input file.")

    # The output path is passed straight to savefig(), so it must be given;
    # requiring it yields a usage error instead of a late failure.
    parser.add_argument("-o", "--output", dest="output_path",
                        required=True,
                        help="Path to the output file.")

    parser.add_argument("-t", "--threshold", dest="tanimoto_threshold",
                        type=float, default=0.0,
                        help="Tanimoto threshold [0.0]")

    parser.add_argument("--oformat", default='png', help="Output format (png, svg).")

    parser.add_argument('-p', '--processors', type=int,
                        default=4)

    args = parser.parse_args()

    # make sure that the file ending is fps
    # The link lives in a private temporary directory (no name race with
    # other processes) and points at the absolute input path, so a relative
    # --input no longer produces a dangling link.
    temp_dir = tempfile.mkdtemp()
    temp_link = os.path.join(temp_dir, 'input.fps')
    try:
        os.symlink(os.path.abspath(args.input_path), temp_link)

        arena = chemfp.load_fingerprints(temp_link)
        distances = distance_matrix(arena, args.tanimoto_threshold)
        # NOTE(review): linkage() is given the square matrix, which SciPy
        # treats as n observation vectors rather than as pairwise distances.
        # Passing the condensed form would change the resulting dendrogram,
        # so the call is left as-is -- confirm which behaviour is intended.
        linkage = hcluster.linkage(distances, method="single", metric="euclidean")

        hcluster.dendrogram(linkage, labels=arena.ids)

        pylab.savefig(args.output_path, format=args.oformat)
    finally:
        # Always remove the temporary link and its directory, even when
        # loading, clustering or plotting fails.
        if os.path.lexists(temp_link):
            os.unlink(temp_link)
        os.rmdir(temp_dir)
| 0 | 80 |
| 81 | |
| 82 |
