0
|
1 #!/usr/bin/env python
|
|
2 #
|
39
|
3 #Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)
|
|
4 #usage: phylorelatives.py [-h] [-i FASTA] [-b INT] [-p] [-r FASTA] [-j]
|
0
|
5 #
|
|
6 #Constructs relatedness of a set of sequences based on the pairwise proportion
|
39
|
7 #of different sites. It reports the test sequences relatives, NJ tree plot and
|
|
8 #Newick string. One or more test sequences are accepted as long as their name
|
|
9 #includes the strict suffix "_minor" or "_test" (i.e. >seq1_minor). IMPORTANT:
|
|
10 #Sequences must have the same length!
|
0
|
11 #
|
|
12 #optional arguments:
|
|
13 # -h, --help show this help message and exit
|
|
14 # -i FASTA, --input FASTA
|
|
15 # This option can be specified multiple times. Sequences
|
|
16 # will be added to "multi-fasta.fa". (e.g. -i major1.fa
|
|
17 # -i major2.fa -i minor1.fa)
|
|
18 # -b INT, --bootstrap INT
|
|
19 # Change number of replicas. 0 to deactivate. (Default:
|
|
20 # 1000)
|
|
21 # -p, --pairwise Use pairwise deletion of gaps/missing data. (Default:
|
|
22 # Complete deletion)
|
|
23 # -r FASTA, --root FASTA
|
|
24 # Root trees using FASTA sequence as outgroup. (Default:
|
|
25 # Display unrooted trees)
|
39
|
26 # -j, --major-only In major-only mode no minor allele sequence is
|
|
27 # required and each sequence is treated as a major
|
|
28 # allele sequence (Default: Require minor allele
|
|
29 # sequences)
|
0
|
30
|
|
31 import sys
|
|
32 import argparse
|
|
33 import array
|
|
34 import dendropy
|
|
35 import rpy2.rinterface
|
25
|
36 rpy2.rinterface.set_initoptions(('rpy2','--vanilla','--quiet'))
|
0
|
37 import rpy2.robjects as robjects
|
|
38 from rpy2.robjects.packages import importr
|
|
39
|
|
40
|
|
41 def ape_read_dna(infasta):
|
|
42 """Read multi-fasta into phylo object"""
|
|
43 ape = importr('ape')
|
|
44 data = ape.read_dna(file=infasta,format="fasta")
|
|
45 return data
|
|
46
|
|
47 def ape_nj(data,missing_info_option=0):
|
|
48 """Return ape nj tree"""
|
|
49 ape = importr('ape')
|
|
50 dist = ape.dist_dna(data,model="raw",pairwise=missing_info_option)
|
|
51 nj_tree = ape.nj(dist)
|
|
52 return nj_tree
|
|
53
|
35
|
54 def ape_bootstrap(tree,tree_function,data,iters=1000):
|
|
55 """Return bootstrap tree with bootstrap values on nodes"""
|
0
|
56 ape = importr('ape')
|
|
57 tree_func = robjects.r(tree_function)
|
|
58 bootstrap = ape.boot_phylo(tree, data, tree_func,
|
|
59 B=iters, quiet=1, trees=1)
|
35
|
60 robjects.r('''
|
|
61 add_bs_value = function(tree, boot, iters) {
|
|
62 tree$node.label = ((boot$BP)/iters)*100
|
|
63 return(tree)
|
|
64 }''')
|
|
65 add_bs_value = robjects.r['add_bs_value']
|
|
66 bs_tree = add_bs_value(tree, bootstrap, iters)
|
|
67 #bs_trees = bootstrap.rx('trees')[0]
|
|
68 #bs_tree = ape.consensus(bs_trees,p=0.5)
|
|
69 return bs_tree
|
0
|
70
|
|
71
|
|
72 def dendro_relatives(tree,minor):
|
|
73 """Return minor allele sequence relatives in tree"""
|
|
74 ape = importr('ape')
|
|
75 newick = list(ape.write_tree(tree))[0]
|
|
76 t = dendropy.Tree.get_from_string(newick,"newick")
|
|
77 minor_leaf = [node for node in t.leaf_nodes()
|
|
78 if node.get_node_str() == minor][0]
|
|
79 parent = minor_leaf.parent_node
|
|
80 relatives = []
|
|
81 while len(relatives) == 0:
|
|
82 output = [relative.get_node_str() for relative in parent.leaf_nodes()]
|
|
83 relatives = [relative for relative in output if not (relative.endswith('minor') or relative.endswith('test'))]
|
|
84 parent = parent.parent_node
|
|
85 return output
|
|
86
|
31
|
87
|
|
88 def dendro_plot(tree, root=False ):
|
|
89 """Plot tree to file in ascii format"""
|
0
|
90 ape = importr('ape')
|
31
|
91 if root:
|
|
92 newick = list(ape.write_tree(ape.root(tree,root)))[0]
|
|
93 else:
|
|
94 newick = list(ape.write_tree(tree))[0]
|
|
95 t = dendropy.Tree.get_from_string(newick,"newick")
|
|
96 ascii_tree = t.as_ascii_plot()
|
|
97 return ascii_tree
|
|
98
|
0
|
99
|
35
|
100 def ape_plot_tree(outfile, tree1, tree2=None, root=False):
|
|
101 """Plot tree to png file"""
|
|
102 ape = importr('ape')
|
|
103 graphics = importr('graphics')
|
|
104 grdevices = importr('grDevices')
|
|
105 if tree2 is None:
|
|
106 grdevices.png(file=outfile, width=1024, height=768)
|
|
107 if root:
|
|
108 ape.plot_phylo(ape.root(tree1,root),use_edge_length=0,
|
|
109 show_node_label=1,edge_width=2, font=2,
|
|
110 cex=1,underscore=0, no_margin=1)
|
|
111 else:
|
|
112 ape.plot_phylo(tree1,use_edge_length=0,
|
|
113 show_node_label=1,edge_width=2, font=2,
|
|
114 cex=1,underscore=0, no_margin=1)
|
|
115 #graphics.title(main='Neighbor Joining')
|
|
116 grdevices.dev_off()
|
|
117 elif tree2 is not None:
|
|
118 grdevices.png(file=outfile, width=1024, height=768)
|
|
119 graphics.par(mfcol=array.array('i',[1,2]))
|
|
120 if root:
|
|
121 ape.plot_phylo(ape.root(tree1,root),use_edge_length=0,
|
|
122 show_node_label=1,edge_width=2, font=2,
|
|
123 cex=1,underscore=0, no_margin=1)
|
|
124 else:
|
|
125 ape.plot_phylo(tree1,use_edge_length=0,
|
|
126 show_node_label=1,edge_width=2, font=2,
|
|
127 cex=1,underscore=0, no_margin=1)
|
|
128 graphics.title(main='Neighbor Joining', cex=1.5, font=2)
|
|
129 if root:
|
|
130 ape.plot_phylo(ape.root(tree2,root),use_edge_length=0,
|
|
131 show_node_label=1,edge_width=2, font=2,
|
|
132 cex=1,underscore=0, no_margin=1)
|
|
133 else:
|
|
134 ape.plot_phylo(tree2,use_edge_length=0,
|
|
135 show_node_label=1,edge_width=2, font=2,
|
|
136 cex=1,underscore=0, no_margin=1)
|
|
137 graphics.title(main='Maximum Parsimony',cex=1.5, font=2)
|
|
138 grdevices.dev_off()
|
|
139 return
|
|
140
|
|
141
|
0
|
142 def write_nwk(tree):
|
|
143 "Write proper Newick string"
|
|
144 ape = importr('ape')
|
|
145 nwk = list(ape.write_tree(tree))[0]
|
|
146 return nwk
|
|
147
|
|
148 def main():
|
|
149 # Parse command line options
|
39
|
150 parser = argparse.ArgumentParser(description='Constructs relatedness of a set of sequences based on the pairwise proportion of different sites. It reports the test sequences relatives, NJ tree plot and Newick string. One or more test sequences are accepted as long as their name includes the strict suffix "_minor" or "_test" (i.e. >seq1_minor). IMPORTANT: Sequences must have the same length!', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)')
|
0
|
151 parser.add_argument('-i', '--input', metavar='FASTA', action='append', type=str, help='This option can be specified multiple times. Sequences will be added to "multi-fasta.fa". (e.g. -i major1.fa -i major2.fa -i minor1.fa)')
|
|
152 parser.add_argument('-b', '--bootstrap', type=int, metavar='INT',default=1000, help='Change number of replicas. 0 to deactivate. (Default: 1000)')
|
|
153 parser.add_argument('-p', '--pairwise', action='store_true', help='Use pairwise deletion of gaps/missing data. (Default: Complete deletion)')
|
|
154 parser.add_argument('-r', '--root', type=str, metavar='FASTA', default=False, help='Root trees using FASTA sequence as outgroup. (Default: Display unrooted trees)')
|
39
|
155 parser.add_argument('-j', '--major-only', action='store_true', help='In major-only mode no minor allele sequence is required and each sequence is treated as a major allele sequence (Default: Require minor allele sequences)')
|
0
|
156 parser.add_argument('--relatives-out', type=str, metavar='FILE', default=None, help=argparse.SUPPRESS)
|
|
157 parser.add_argument('--newick-out', type=str, metavar='FILE', default=None, help=argparse.SUPPRESS)
|
|
158 parser.add_argument('--trees-out', type=str, metavar='FILE', default=None, help=argparse.SUPPRESS)
|
|
159 parser.add_argument('-m', '--multi-fasta', type=str, default='multi-fasta.fa', help=argparse.SUPPRESS)
|
33
|
160 args = parser.parse_args()
|
35
|
161 #parser.print_help()
|
0
|
162
|
|
163 if args.input:
|
|
164 for fasta in args.input:
|
|
165 try:
|
|
166 open(fasta)
|
|
167 except:
|
|
168 sys.exit("\nERROR: Could not open %s\n" % fasta)
|
|
169 try:
|
|
170 multi = open(args.multi_fasta, 'w+')
|
|
171 except:
|
|
172 sys.exit("\nERROR: Could not create %s\n" % args.multi_fasta)
|
|
173
|
|
174 for fasta in args.input:
|
|
175 for line in list(open(fasta)):
|
|
176 multi.write(line.replace('-', '_'))
|
|
177
|
|
178
|
|
179 if args.root:
|
|
180 try:
|
|
181 root = list(open(args.root))
|
|
182 root_id = [line.strip()[1:].replace('-', '_') for line in root if line.strip().startswith(">")][0]
|
|
183 for line in root:
|
|
184 multi.write(line.replace('-', '_'))
|
|
185 except:
|
|
186 sys.exit("\nERROR: Could not open %s\n" % args.root)
|
|
187 else:
|
|
188 root_id = args.root
|
|
189 multi.close()
|
|
190
|
|
191 try:
|
|
192 data = ape_read_dna(args.multi_fasta)
|
|
193 except:
|
|
194 sys.exit("\nERROR: Check existence or proper format of %s\n" % args.multi_fasta)
|
|
195
|
|
196 # Get sequence ids in alignment and identify test sequences
|
|
197 fasta_ids = [seqname.strip()[1:] for seqname in list(open(args.multi_fasta)) if seqname.startswith('>')]
|
|
198 minor_ids = [seqname for seqname in fasta_ids if seqname.endswith('minor') or seqname.endswith('test')]
|
|
199
|
35
|
200 if len(minor_ids) == 0 and not args.major_only:
|
0
|
201 sys.exit("\nERROR: No test sequences found. _minor or _test suffixes are required in the sequence name!\n")
|
|
202 else:
|
|
203 pass
|
|
204
|
|
205 if args.pairwise:
|
|
206 nj_tree = ape_nj(data,1)
|
|
207 nj_func = 'function (xx) nj(dist.dna(xx, model="raw", pair=1))'
|
|
208 else:
|
|
209 nj_tree = ape_nj(data)
|
|
210 nj_func = 'function (xx) nj(dist.dna(xx, model="raw"))'
|
|
211
|
|
212 if args.bootstrap == 0:
|
35
|
213 bs_tree = nj_tree
|
0
|
214 elif args.bootstrap !=1000:
|
35
|
215 bs_tree = ape_bootstrap(nj_tree,nj_func,data,iters=args.bootstrap)
|
0
|
216 else:
|
35
|
217 bs_tree = ape_bootstrap(nj_tree,nj_func,data)
|
0
|
218
|
|
219 # Generate report, trees, and Newick strings
|
|
220 if args.relatives_out is not None:
|
|
221 relatives = open(args.relatives_out,'w+')
|
|
222 else:
|
|
223 relatives = open(args.multi_fasta+'-relatives.tab','w+')
|
|
224 if args.newick_out is not None:
|
|
225 newick = open(args.newick_out,'w+')
|
|
226 else:
|
35
|
227 newick = open(args.multi_fasta+'-newick.nwk','w+')
|
0
|
228 if args.trees_out is not None:
|
37
|
229 tree_plot_file = args.trees_out
|
0
|
230 else:
|
35
|
231 #tree_plot_file = open(args.multi_fasta+'-tree-ascii.txt', 'w+')
|
|
232 tree_plot_file = args.multi_fasta+'-tree.png'
|
0
|
233
|
35
|
234 newick.write('%s\n' % (write_nwk(bs_tree)))
|
|
235 #tree_plot_file.write(dendro_plot(bs_tree,root=root_id))
|
|
236 ape_plot_tree(tree_plot_file,bs_tree,root=root_id)
|
|
237
|
|
238 if args.major_only:
|
|
239 relatives.write('Major allele only mode cannot generate a report')
|
|
240 else:
|
|
241 relatives.write('#source\tsample\trelatives\n')
|
|
242 for node in minor_ids:
|
|
243 nj_relatives = [relative for relative in dendro_relatives(bs_tree,node) if relative != node]
|
|
244 relatives.write( 'Neighbor_Joining_tree\t%s\t%s\n' % (node,','.join(sorted(nj_relatives))) )
|
0
|
245
|
|
246 newick.close()
|
|
247 relatives.close()
|
|
248
|
|
249 if __name__ == "__main__": main()
|