comparison nxn_clustering.py @ 33:73b8c87779ae draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
author bgruening
date Sat, 20 May 2017 08:31:15 -0400
parents
children 02e03ac072cf
comparison
equal deleted inserted replaced
32:f48c4e271852 33:73b8c87779ae
1 #!/usr/bin/env python
2 """
3 Modified version of code examples from the chemfp project.
4 http://code.google.com/p/chem-fingerprints/
5 Thanks to Andrew Dalke of Andrew Dalke Scientific!
6 """
7 import matplotlib
8 matplotlib.use('Agg')
9 import argparse
10 import os
11 import chemfp
12 import scipy.cluster.hierarchy as hcluster
13 import pylab
14 import numpy
15
def distance_matrix(arena, tanimoto_threshold=0.0, processors=None):
    """Return the NxN Tanimoto distance matrix (1 - similarity) for *arena*.

    The similarity matrix starts as the identity (1.0 on the diagonal, 0.0
    elsewhere) and is filled with the scores reported by chemfp's symmetric
    threshold search.  Pairs the search does not report keep similarity 0.0
    and therefore end up with distance 1.0.

    Parameters
    ----------
    arena
        chemfp fingerprint arena; only ``len(arena)`` and the symmetric
        threshold search are used here.
    tanimoto_threshold : float
        Threshold forwarded to ``threshold_tanimoto_search_symmetric``.
    processors : int or None
        Number of threads handed to ``chemfp.set_num_threads``.  When None,
        the value is taken from a module-level ``args.processors`` if the
        script entry point defined one; otherwise chemfp's current setting
        is left untouched.

    Returns
    -------
    numpy.ndarray
        ``n x n`` array of doubles holding ``1.0 - similarity``.

    Raises
    ------
    MemoryError
        If the ``n x n`` matrix cannot be allocated.
    """
    n = len(arena)
    # Start off a similarity matrix with 1.0s along the diagonal.
    try:
        similarities = numpy.identity(n, "d")
    except (MemoryError, ValueError):
        # Only allocation failures mean "too large"; anything else (e.g.
        # KeyboardInterrupt) must propagate untouched.  MemoryError is still
        # an Exception subclass, so existing ``except Exception`` callers
        # keep working.
        raise MemoryError('Input dataset is too large!')

    if processors is None:
        # Backward compatibility: when run as a script, the parsed command
        # line lives in a module-level ``args``.  Look it up defensively so
        # the function also works when imported from another module.
        processors = getattr(globals().get("args"), "processors", None)
    if processors is not None:
        chemfp.set_num_threads(processors)

    ## Compute the full similarity matrix.
    # The implementation computes the upper-triangle then copies
    # the upper-triangle into lower-triangle. It does not include
    # terms for the diagonal.
    results = chemfp.search.threshold_tanimoto_search_symmetric(
        arena, threshold=tanimoto_threshold)

    # Copy the results into the NumPy array.
    for row_index, row in enumerate(results.iter_indices_and_scores()):
        for target_index, target_score in row:
            similarities[row_index, target_index] = target_score

    # Turn similarities into distances in place; this avoids allocating a
    # second n x n array for an input that may already be near the memory
    # limit.
    numpy.subtract(1.0, similarities, out=similarities)
    return similarities
38
39
if __name__ == "__main__":
    # ---- Command-line interface -------------------------------------------
    description = """NxN clustering for fps files.
For more details please see the chemfp documentation:
https://chemfp.readthedocs.org
"""
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument(
        "-i", "--input",
        dest="input_path",
        required=True,
        help="Path to the input file.",
    )
    parser.add_argument(
        "-c", "--cluster",
        dest="cluster_image",
        help="Path to the output cluster image.",
    )
    parser.add_argument(
        "-s", "--smatrix",
        dest="similarity_matrix",
        help="Path to the similarity matrix output file.",
    )
    parser.add_argument(
        "-t", "--threshold",
        dest="tanimoto_threshold",
        type=float,
        default=0.0,
        help="Tanimoto threshold [0.0]",
    )
    parser.add_argument(
        "--oformat",
        default='png',
        help="Output format (png, svg)",
    )
    parser.add_argument("-p", "--processors", type=int, default=4)

    # Keep the name ``args``: distance_matrix() looks up the thread count
    # through this module-level name.
    args = parser.parse_args()

    # ---- Load the fingerprints and build the distance matrix --------------
    reader = chemfp.open(args.input_path, format='fps')
    arena = chemfp.load_fingerprints(reader)
    distances = distance_matrix(arena, args.tanimoto_threshold)

    # ---- Optional outputs -------------------------------------------------
    if args.similarity_matrix:
        # NOTE(review): despite the option name, the array written here is
        # the distance matrix (1 - similarity); ndarray.tofile() without a
        # separator dumps raw binary values.
        distances.tofile(args.similarity_matrix)

    if args.cluster_image:
        # NOTE(review): ``distances`` is a square 2-D array, which SciPy's
        # linkage() interprets as observation vectors rather than as a
        # condensed distance matrix -- confirm this is intended.
        tree = hcluster.linkage(distances, method="single", metric="euclidean")

        # Draw the dendrogram, labelling each leaf with its fingerprint id.
        hcluster.dendrogram(tree, labels=arena.ids)

        pylab.savefig(args.cluster_image, format=args.oformat)
80