Mercurial > repos > bgruening > chemfp
annotate nxn_clustering.py @ 41:07e1a9d090e4 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit deb7ad38cefd13df312431b1138054a68381efdf"
author | bgruening |
---|---|
date | Fri, 18 Oct 2019 04:57:12 -0400 |
parents | 28c487eb8399 |
children | 0a1c281e9224 |
rev | line source |
---|---|
33
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
1 #!/usr/bin/env python |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
2 """ |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
3 Modified version of code examples from the chemfp project. |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
4 http://code.google.com/p/chem-fingerprints/ |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
5 Thanks to Andrew Dalke of Andrew Dalke Scientific! |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
6 """ |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
7 import matplotlib |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
8 matplotlib.use('Agg') |
39
28c487eb8399
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit e78367b77f2294891914151f642685644d43a5b7"
bgruening
parents:
38
diff
changeset
|
9 from matplotlib import rcParams |
28c487eb8399
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit e78367b77f2294891914151f642685644d43a5b7"
bgruening
parents:
38
diff
changeset
|
10 rcParams.update({'figure.autolayout': True}) |
33
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
11 import argparse |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
12 import os |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
13 import chemfp |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
14 import scipy.cluster.hierarchy as hcluster |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
15 import pylab |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
16 import numpy |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
17 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
18 def distance_matrix(arena, tanimoto_threshold = 0.0): |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
19 n = len(arena) |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
20 # Start off a similarity matrix with 1.0s along the diagonal |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
21 try: |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
22 similarities = numpy.identity(n, "d") |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
23 except: |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
24 raise Exception('Input dataset is to large!') |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
25 chemfp.set_num_threads( args.processors ) |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
26 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
27 ## Compute the full similarity matrix. |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
28 # The implementation computes the upper-triangle then copies |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
29 # the upper-triangle into lower-triangle. It does not include |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
30 # terms for the diagonal. |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
31 results = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=tanimoto_threshold) |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
32 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
33 # Copy the results into the NumPy array. |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
34 for row_index, row in enumerate(results.iter_indices_and_scores()): |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
35 for target_index, target_score in row: |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
36 similarities[row_index, target_index] = target_score |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
37 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
38 # Return the distance matrix using the similarity matrix |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
39 return 1.0 - similarities |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
40 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
41 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
42 if __name__ == "__main__": |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
43 parser = argparse.ArgumentParser(description="""NxN clustering for fps files. |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
44 For more details please see the chemfp documentation: |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
45 https://chemfp.readthedocs.org |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
46 """) |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
47 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
48 parser.add_argument("-i", "--input", dest="input_path", |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
49 required=True, |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
50 help="Path to the input file.") |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
51 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
52 parser.add_argument("-c", "--cluster", dest="cluster_image", |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
53 help="Path to the output cluster image.") |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
54 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
55 parser.add_argument("-s", "--smatrix", dest="similarity_matrix", |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
56 help="Path to the similarity matrix output file.") |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
57 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
58 parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
59 type=float, default=0.0, |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
60 help="Tanimoto threshold [0.0]") |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
61 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
62 parser.add_argument("--oformat", default='png', help="Output format (png, svg)") |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
63 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
64 parser.add_argument('-p', '--processors', type=int, |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
65 default=4) |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
66 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
67 args = parser.parse_args() |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
68 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
69 targets = chemfp.open( args.input_path, format='fps' ) |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
70 arena = chemfp.load_fingerprints( targets ) |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
71 distances = distance_matrix( arena, args.tanimoto_threshold ) |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
72 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
73 if args.similarity_matrix: |
38
02e03ac072cf
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit ed9b6859de648aa5f7cde483732f5df20aaff90e
bgruening
parents:
33
diff
changeset
|
74 numpy.savetxt(args.similarity_matrix, distances) |
33
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
75 |
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
76 if args.cluster_image: |
38
02e03ac072cf
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit ed9b6859de648aa5f7cde483732f5df20aaff90e
bgruening
parents:
33
diff
changeset
|
77 linkage = hcluster.linkage(distances, method="single", metric="euclidean") |
02e03ac072cf
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit ed9b6859de648aa5f7cde483732f5df20aaff90e
bgruening
parents:
33
diff
changeset
|
78 hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.) |
02e03ac072cf
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit ed9b6859de648aa5f7cde483732f5df20aaff90e
bgruening
parents:
33
diff
changeset
|
79 pylab.savefig(args.cluster_image, format=args.oformat) |
33
73b8c87779ae
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
bgruening
parents:
diff
changeset
|
80 |