0
|
1 #!/usr/bin/env python
|
|
2 #
|
|
3 # Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)
|
|
4 #usage: phylorelatives.py [-h] [-i FASTA] [-b INT] [-p] [-r FASTA]
|
|
5 #
|
|
6 #Constructs relatedness of a set of sequences based on the pairwise proportion
|
31
|
7 #of different sites. It reports the test sequences relatives, tree plot and
|
|
8 #tree in Newick format. One or more test sequences are accepted as long as
|
0
|
9 #their name includes the strict suffix "_minor" or "_test" (i.e. >seq1_minor).
|
|
10 #IMPORTANT: Sequences must have the same length!
|
|
11 #
|
|
12 #optional arguments:
|
|
13 # -h, --help show this help message and exit
|
|
14 # -i FASTA, --input FASTA
|
|
15 # This option can be specified multiple times. Sequences
|
|
16 # will be added to "multi-fasta.fa". (e.g. -i major1.fa
|
|
17 # -i major2.fa -i minor1.fa)
|
|
18 # -b INT, --bootstrap INT
|
|
19 # Change number of replicas. 0 to deactivate. (Default:
|
|
20 # 1000)
|
|
21 # -p, --pairwise Use pairwise deletion of gaps/missing data. (Default:
|
|
22 # Complete deletion)
|
|
23 # -r FASTA, --root FASTA
|
|
24 # Root trees using FASTA sequence as outgroup. (Default:
|
|
25 # Display unrooted trees)
|
|
26
|
|
27 import sys
|
|
28 import argparse
|
|
29 import array
|
|
30 import dendropy
|
|
31 import rpy2.rinterface
|
25
|
32 rpy2.rinterface.set_initoptions(('rpy2','--vanilla','--quiet'))
|
0
|
33 import rpy2.robjects as robjects
|
|
34 from rpy2.robjects.packages import importr
|
|
35
|
|
36
|
|
37 def ape_read_dna(infasta):
|
|
38 """Read multi-fasta into phylo object"""
|
|
39 ape = importr('ape')
|
|
40 data = ape.read_dna(file=infasta,format="fasta")
|
|
41 return data
|
|
42
|
|
43 def ape_nj(data,missing_info_option=0):
|
|
44 """Return ape nj tree"""
|
|
45 ape = importr('ape')
|
|
46 dist = ape.dist_dna(data,model="raw",pairwise=missing_info_option)
|
|
47 nj_tree = ape.nj(dist)
|
|
48 return nj_tree
|
|
49
|
35
|
50 def ape_bootstrap(tree,tree_function,data,iters=1000):
|
|
51 """Return bootstrap tree with bootstrap values on nodes"""
|
0
|
52 ape = importr('ape')
|
|
53 tree_func = robjects.r(tree_function)
|
|
54 bootstrap = ape.boot_phylo(tree, data, tree_func,
|
|
55 B=iters, quiet=1, trees=1)
|
35
|
56 robjects.r('''
|
|
57 add_bs_value = function(tree, boot, iters) {
|
|
58 tree$node.label = ((boot$BP)/iters)*100
|
|
59 return(tree)
|
|
60 }''')
|
|
61 add_bs_value = robjects.r['add_bs_value']
|
|
62 bs_tree = add_bs_value(tree, bootstrap, iters)
|
|
63 #bs_trees = bootstrap.rx('trees')[0]
|
|
64 #bs_tree = ape.consensus(bs_trees,p=0.5)
|
|
65 return bs_tree
|
0
|
66
|
|
67
|
|
68 def dendro_relatives(tree,minor):
|
|
69 """Return minor allele sequence relatives in tree"""
|
|
70 ape = importr('ape')
|
|
71 newick = list(ape.write_tree(tree))[0]
|
|
72 t = dendropy.Tree.get_from_string(newick,"newick")
|
|
73 minor_leaf = [node for node in t.leaf_nodes()
|
|
74 if node.get_node_str() == minor][0]
|
|
75 parent = minor_leaf.parent_node
|
|
76 relatives = []
|
|
77 while len(relatives) == 0:
|
|
78 output = [relative.get_node_str() for relative in parent.leaf_nodes()]
|
|
79 relatives = [relative for relative in output if not (relative.endswith('minor') or relative.endswith('test'))]
|
|
80 parent = parent.parent_node
|
|
81 return output
|
|
82
|
31
|
83
|
|
84 def dendro_plot(tree, root=False ):
|
|
85 """Plot tree to file in ascii format"""
|
0
|
86 ape = importr('ape')
|
31
|
87 if root:
|
|
88 newick = list(ape.write_tree(ape.root(tree,root)))[0]
|
|
89 else:
|
|
90 newick = list(ape.write_tree(tree))[0]
|
|
91 t = dendropy.Tree.get_from_string(newick,"newick")
|
|
92 ascii_tree = t.as_ascii_plot()
|
|
93 return ascii_tree
|
|
94
|
0
|
95
|
35
|
96 def ape_plot_tree(outfile, tree1, tree2=None, root=False):
|
|
97 """Plot tree to png file"""
|
|
98 ape = importr('ape')
|
|
99 graphics = importr('graphics')
|
|
100 grdevices = importr('grDevices')
|
|
101 if tree2 is None:
|
|
102 grdevices.png(file=outfile, width=1024, height=768)
|
|
103 if root:
|
|
104 ape.plot_phylo(ape.root(tree1,root),use_edge_length=0,
|
|
105 show_node_label=1,edge_width=2, font=2,
|
|
106 cex=1,underscore=0, no_margin=1)
|
|
107 else:
|
|
108 ape.plot_phylo(tree1,use_edge_length=0,
|
|
109 show_node_label=1,edge_width=2, font=2,
|
|
110 cex=1,underscore=0, no_margin=1)
|
|
111 #graphics.title(main='Neighbor Joining')
|
|
112 grdevices.dev_off()
|
|
113 elif tree2 is not None:
|
|
114 grdevices.png(file=outfile, width=1024, height=768)
|
|
115 graphics.par(mfcol=array.array('i',[1,2]))
|
|
116 if root:
|
|
117 ape.plot_phylo(ape.root(tree1,root),use_edge_length=0,
|
|
118 show_node_label=1,edge_width=2, font=2,
|
|
119 cex=1,underscore=0, no_margin=1)
|
|
120 else:
|
|
121 ape.plot_phylo(tree1,use_edge_length=0,
|
|
122 show_node_label=1,edge_width=2, font=2,
|
|
123 cex=1,underscore=0, no_margin=1)
|
|
124 graphics.title(main='Neighbor Joining', cex=1.5, font=2)
|
|
125 if root:
|
|
126 ape.plot_phylo(ape.root(tree2,root),use_edge_length=0,
|
|
127 show_node_label=1,edge_width=2, font=2,
|
|
128 cex=1,underscore=0, no_margin=1)
|
|
129 else:
|
|
130 ape.plot_phylo(tree2,use_edge_length=0,
|
|
131 show_node_label=1,edge_width=2, font=2,
|
|
132 cex=1,underscore=0, no_margin=1)
|
|
133 graphics.title(main='Maximum Parsimony',cex=1.5, font=2)
|
|
134 grdevices.dev_off()
|
|
135 return
|
|
136
|
|
137
|
0
|
138 def write_nwk(tree):
|
|
139 "Write proper Newick string"
|
|
140 ape = importr('ape')
|
|
141 nwk = list(ape.write_tree(tree))[0]
|
|
142 return nwk
|
|
143
|
|
144 def main():
|
|
145 # Parse command line options
|
|
146 parser = argparse.ArgumentParser(description='Constructs relatedness of a set of sequences based on the pairwise proportion of different sites. It reports the test sequences relatives, tree plots and trees in Newick format. One or more test sequences are accepted as long as their name includes the strict suffix "_minor" or "_test" (i.e. >seq1_minor). IMPORTANT: Sequences must have the same length!', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)')
|
|
147 parser.add_argument('-i', '--input', metavar='FASTA', action='append', type=str, help='This option can be specified multiple times. Sequences will be added to "multi-fasta.fa". (e.g. -i major1.fa -i major2.fa -i minor1.fa)')
|
|
148 parser.add_argument('-b', '--bootstrap', type=int, metavar='INT',default=1000, help='Change number of replicas. 0 to deactivate. (Default: 1000)')
|
|
149 parser.add_argument('-p', '--pairwise', action='store_true', help='Use pairwise deletion of gaps/missing data. (Default: Complete deletion)')
|
|
150 parser.add_argument('-r', '--root', type=str, metavar='FASTA', default=False, help='Root trees using FASTA sequence as outgroup. (Default: Display unrooted trees)')
|
35
|
151 parser.add_argument('-j', '--major-only', action='store_true', help='In major-only mode no minor allele sequence is required and only a NJ major alleles tree is generated (Default: Require minor allele sequences)')
|
0
|
152 parser.add_argument('--relatives-out', type=str, metavar='FILE', default=None, help=argparse.SUPPRESS)
|
|
153 parser.add_argument('--newick-out', type=str, metavar='FILE', default=None, help=argparse.SUPPRESS)
|
|
154 parser.add_argument('--trees-out', type=str, metavar='FILE', default=None, help=argparse.SUPPRESS)
|
|
155 parser.add_argument('-m', '--multi-fasta', type=str, default='multi-fasta.fa', help=argparse.SUPPRESS)
|
33
|
156 args = parser.parse_args()
|
35
|
157 #parser.print_help()
|
0
|
158
|
|
159 if args.input:
|
|
160 for fasta in args.input:
|
|
161 try:
|
|
162 open(fasta)
|
|
163 except:
|
|
164 sys.exit("\nERROR: Could not open %s\n" % fasta)
|
|
165 try:
|
|
166 multi = open(args.multi_fasta, 'w+')
|
|
167 except:
|
|
168 sys.exit("\nERROR: Could not create %s\n" % args.multi_fasta)
|
|
169
|
|
170 for fasta in args.input:
|
|
171 for line in list(open(fasta)):
|
|
172 multi.write(line.replace('-', '_'))
|
|
173
|
|
174
|
|
175 if args.root:
|
|
176 try:
|
|
177 root = list(open(args.root))
|
|
178 root_id = [line.strip()[1:].replace('-', '_') for line in root if line.strip().startswith(">")][0]
|
|
179 for line in root:
|
|
180 multi.write(line.replace('-', '_'))
|
|
181 except:
|
|
182 sys.exit("\nERROR: Could not open %s\n" % args.root)
|
|
183 else:
|
|
184 root_id = args.root
|
|
185 multi.close()
|
|
186
|
|
187 try:
|
|
188 data = ape_read_dna(args.multi_fasta)
|
|
189 except:
|
|
190 sys.exit("\nERROR: Check existence or proper format of %s\n" % args.multi_fasta)
|
|
191
|
|
192 # Get sequence ids in alignment and identify test sequences
|
|
193 fasta_ids = [seqname.strip()[1:] for seqname in list(open(args.multi_fasta)) if seqname.startswith('>')]
|
|
194 minor_ids = [seqname for seqname in fasta_ids if seqname.endswith('minor') or seqname.endswith('test')]
|
|
195
|
35
|
196 if len(minor_ids) == 0 and not args.major_only:
|
0
|
197 sys.exit("\nERROR: No test sequences found. _minor or _test suffixes are required in the sequence name!\n")
|
|
198 else:
|
|
199 pass
|
|
200
|
|
201 if args.pairwise:
|
|
202 nj_tree = ape_nj(data,1)
|
|
203 nj_func = 'function (xx) nj(dist.dna(xx, model="raw", pair=1))'
|
|
204 else:
|
|
205 nj_tree = ape_nj(data)
|
|
206 nj_func = 'function (xx) nj(dist.dna(xx, model="raw"))'
|
|
207
|
|
208 if args.bootstrap == 0:
|
35
|
209 bs_tree = nj_tree
|
0
|
210 elif args.bootstrap !=1000:
|
35
|
211 bs_tree = ape_bootstrap(nj_tree,nj_func,data,iters=args.bootstrap)
|
0
|
212 else:
|
35
|
213 bs_tree = ape_bootstrap(nj_tree,nj_func,data)
|
0
|
214
|
|
215 # Generate report, trees, and Newick strings
|
|
216 if args.relatives_out is not None:
|
|
217 relatives = open(args.relatives_out,'w+')
|
|
218 else:
|
|
219 relatives = open(args.multi_fasta+'-relatives.tab','w+')
|
|
220 if args.newick_out is not None:
|
|
221 newick = open(args.newick_out,'w+')
|
|
222 else:
|
35
|
223 newick = open(args.multi_fasta+'-newick.nwk','w+')
|
0
|
224 if args.trees_out is not None:
|
31
|
225 tree_plot_file = open(args.trees_out, 'w+')
|
0
|
226 else:
|
35
|
227 #tree_plot_file = open(args.multi_fasta+'-tree-ascii.txt', 'w+')
|
|
228 tree_plot_file = args.multi_fasta+'-tree.png'
|
0
|
229
|
35
|
230 newick.write('%s\n' % (write_nwk(bs_tree)))
|
|
231 #tree_plot_file.write(dendro_plot(bs_tree,root=root_id))
|
|
232 ape_plot_tree(tree_plot_file,bs_tree,root=root_id)
|
|
233
|
|
234 if args.major_only:
|
|
235 relatives.write('Major allele only mode cannot generate a report')
|
|
236 else:
|
|
237 relatives.write('#source\tsample\trelatives\n')
|
|
238 for node in minor_ids:
|
|
239 nj_relatives = [relative for relative in dendro_relatives(bs_tree,node) if relative != node]
|
|
240 relatives.write( 'Neighbor_Joining_tree\t%s\t%s\n' % (node,','.join(sorted(nj_relatives))) )
|
0
|
241
|
|
242 newick.close()
|
|
243 relatives.close()
|
|
244
|
|
245 if __name__ == "__main__": main()
|