0
|
1 #!/usr/bin/env python
|
|
2 #
|
39
|
3 #Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)
|
|
4 #usage: phylorelatives.py [-h] [-i FASTA] [-b INT] [-p] [-r FASTA] [-j]
|
0
|
5 #
|
|
6 #Constructs relatedness of a set of sequences based on the pairwise proportion
|
39
|
7 #of different sites. It reports the test sequences relatives, NJ tree plot and
|
|
8 #Newick string. One or more test sequences are accepted as long as their name
|
|
9 #includes the strict suffix "_minor" or "_test" (i.e. >seq1_minor). IMPORTANT:
|
|
10 #Sequences must have the same length!
|
0
|
11 #
|
|
12 #optional arguments:
|
|
13 # -h, --help show this help message and exit
|
|
14 # -i FASTA, --input FASTA
|
|
15 # This option can be specified multiple times. Sequences
|
|
16 # will be added to "multi-fasta.fa". (e.g. -i major1.fa
|
|
17 # -i major2.fa -i minor1.fa)
|
|
18 # -b INT, --bootstrap INT
|
|
19 # Change number of replicas. 0 to deactivate. (Default:
|
|
20 # 1000)
|
|
21 # -p, --pairwise Use pairwise deletion of gaps/missing data. (Default:
|
|
22 # Complete deletion)
|
|
23 # -r FASTA, --root FASTA
|
|
24 # Root trees using FASTA sequence as outgroup. (Default:
|
|
25 # Display unrooted trees)
|
39
|
26 # -j, --major-only In major-only mode no minor allele sequence is
|
|
27 # required and each sequence is treated as a major
|
|
28 # allele sequence (Default: Require minor allele
|
|
29 # sequences)
|
0
|
30
|
|
31 import sys
|
|
32 import argparse
|
|
33 import array
|
|
34 import dendropy
|
|
35 import rpy2.rinterface
|
25
|
36 rpy2.rinterface.set_initoptions(('rpy2','--vanilla','--quiet'))
|
0
|
37 import rpy2.robjects as robjects
|
|
38 from rpy2.robjects.packages import importr
|
|
39
|
|
40
|
|
41 def ape_read_dna(infasta):
|
|
42 """Read multi-fasta into phylo object"""
|
|
43 ape = importr('ape')
|
|
44 data = ape.read_dna(file=infasta,format="fasta")
|
|
45 return data
|
|
46
|
|
47 def ape_nj(data,missing_info_option=0):
|
|
48 """Return ape nj tree"""
|
|
49 ape = importr('ape')
|
|
50 dist = ape.dist_dna(data,model="raw",pairwise=missing_info_option)
|
|
51 nj_tree = ape.nj(dist)
|
|
52 return nj_tree
|
|
53
|
35
|
54 def ape_bootstrap(tree,tree_function,data,iters=1000):
|
|
55 """Return bootstrap tree with bootstrap values on nodes"""
|
0
|
56 ape = importr('ape')
|
|
57 tree_func = robjects.r(tree_function)
|
|
58 bootstrap = ape.boot_phylo(tree, data, tree_func,
|
|
59 B=iters, quiet=1, trees=1)
|
35
|
60 robjects.r('''
|
|
61 add_bs_value = function(tree, boot, iters) {
|
|
62 tree$node.label = ((boot$BP)/iters)*100
|
|
63 return(tree)
|
|
64 }''')
|
|
65 add_bs_value = robjects.r['add_bs_value']
|
|
66 bs_tree = add_bs_value(tree, bootstrap, iters)
|
|
67 #bs_trees = bootstrap.rx('trees')[0]
|
|
68 #bs_tree = ape.consensus(bs_trees,p=0.5)
|
|
69 return bs_tree
|
0
|
70
|
|
71
|
|
72 def dendro_relatives(tree,minor):
|
|
73 """Return minor allele sequence relatives in tree"""
|
|
74 ape = importr('ape')
|
|
75 newick = list(ape.write_tree(tree))[0]
|
|
76 t = dendropy.Tree.get_from_string(newick,"newick")
|
|
77 minor_leaf = [node for node in t.leaf_nodes()
|
|
78 if node.get_node_str() == minor][0]
|
|
79 parent = minor_leaf.parent_node
|
|
80 relatives = []
|
|
81 while len(relatives) == 0:
|
|
82 output = [relative.get_node_str() for relative in parent.leaf_nodes()]
|
|
83 relatives = [relative for relative in output if not (relative.endswith('minor') or relative.endswith('test'))]
|
|
84 parent = parent.parent_node
|
|
85 return output
|
|
86
|
31
|
87
|
|
88 def dendro_plot(tree, root=False ):
|
|
89 """Plot tree to file in ascii format"""
|
0
|
90 ape = importr('ape')
|
31
|
91 if root:
|
|
92 newick = list(ape.write_tree(ape.root(tree,root)))[0]
|
|
93 else:
|
|
94 newick = list(ape.write_tree(tree))[0]
|
|
95 t = dendropy.Tree.get_from_string(newick,"newick")
|
|
96 ascii_tree = t.as_ascii_plot()
|
|
97 return ascii_tree
|
|
98
|
0
|
99
|
40
|
100 def ape_plot_tree(outfile, tree1, root=False):
|
35
|
101 """Plot tree to png file"""
|
|
102 ape = importr('ape')
|
|
103 graphics = importr('graphics')
|
|
104 grdevices = importr('grDevices')
|
42
|
105 grdevices.png(file=outfile, width=1024, height=768,type="cairo")
|
40
|
106 if root:
|
|
107 tree = ape.root(tree1,root)
|
|
108 labels = list(tree.rx("tip.label")[0])
|
|
109 colors = robjects.StrVector(["red" if tip.endswith("_minor") else "black" for tip in labels])
|
|
110 ape.plot_phylo(tree,tip_color=colors,use_edge_length=0,
|
|
111 show_node_label=1,edge_width=2, font=2,
|
|
112 cex=1,underscore=0, no_margin=1)
|
|
113 else:
|
|
114 tree = tree1
|
|
115 labels = list(tree.rx("tip.label")[0])
|
|
116 colors = robjects.StrVector(["red" if tip.endswith("_minor") else "black" for tip in labels])
|
|
117 ape.plot_phylo(tree,tip_color=colors,use_edge_length=0,
|
|
118 show_node_label=1,edge_width=2, font=2,
|
|
119 cex=1,underscore=0, no_margin=1)
|
|
120 #graphics.title(main='Neighbor Joining')
|
35
|
121 return
|
|
122
|
|
123
|
0
|
124 def write_nwk(tree):
|
|
125 "Write proper Newick string"
|
|
126 ape = importr('ape')
|
|
127 nwk = list(ape.write_tree(tree))[0]
|
|
128 return nwk
|
|
129
|
|
130 def main():
|
|
131 # Parse command line options
|
39
|
132 parser = argparse.ArgumentParser(description='Constructs relatedness of a set of sequences based on the pairwise proportion of different sites. It reports the test sequences relatives, NJ tree plot and Newick string. One or more test sequences are accepted as long as their name includes the strict suffix "_minor" or "_test" (i.e. >seq1_minor). IMPORTANT: Sequences must have the same length!', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)')
|
0
|
133 parser.add_argument('-i', '--input', metavar='FASTA', action='append', type=str, help='This option can be specified multiple times. Sequences will be added to "multi-fasta.fa". (e.g. -i major1.fa -i major2.fa -i minor1.fa)')
|
|
134 parser.add_argument('-b', '--bootstrap', type=int, metavar='INT',default=1000, help='Change number of replicas. 0 to deactivate. (Default: 1000)')
|
|
135 parser.add_argument('-p', '--pairwise', action='store_true', help='Use pairwise deletion of gaps/missing data. (Default: Complete deletion)')
|
|
136 parser.add_argument('-r', '--root', type=str, metavar='FASTA', default=False, help='Root trees using FASTA sequence as outgroup. (Default: Display unrooted trees)')
|
39
|
137 parser.add_argument('-j', '--major-only', action='store_true', help='In major-only mode no minor allele sequence is required and each sequence is treated as a major allele sequence (Default: Require minor allele sequences)')
|
0
|
138 parser.add_argument('--relatives-out', type=str, metavar='FILE', default=None, help=argparse.SUPPRESS)
|
|
139 parser.add_argument('--newick-out', type=str, metavar='FILE', default=None, help=argparse.SUPPRESS)
|
|
140 parser.add_argument('--trees-out', type=str, metavar='FILE', default=None, help=argparse.SUPPRESS)
|
|
141 parser.add_argument('-m', '--multi-fasta', type=str, default='multi-fasta.fa', help=argparse.SUPPRESS)
|
33
|
142 args = parser.parse_args()
|
35
|
143 #parser.print_help()
|
0
|
144
|
|
145 if args.input:
|
|
146 for fasta in args.input:
|
|
147 try:
|
|
148 open(fasta)
|
|
149 except:
|
|
150 sys.exit("\nERROR: Could not open %s\n" % fasta)
|
|
151 try:
|
|
152 multi = open(args.multi_fasta, 'w+')
|
|
153 except:
|
|
154 sys.exit("\nERROR: Could not create %s\n" % args.multi_fasta)
|
|
155
|
|
156 for fasta in args.input:
|
|
157 for line in list(open(fasta)):
|
|
158 multi.write(line.replace('-', '_'))
|
|
159
|
|
160
|
|
161 if args.root:
|
|
162 try:
|
|
163 root = list(open(args.root))
|
|
164 root_id = [line.strip()[1:].replace('-', '_') for line in root if line.strip().startswith(">")][0]
|
|
165 for line in root:
|
|
166 multi.write(line.replace('-', '_'))
|
|
167 except:
|
|
168 sys.exit("\nERROR: Could not open %s\n" % args.root)
|
|
169 else:
|
|
170 root_id = args.root
|
|
171 multi.close()
|
|
172
|
|
173 try:
|
|
174 data = ape_read_dna(args.multi_fasta)
|
|
175 except:
|
|
176 sys.exit("\nERROR: Check existence or proper format of %s\n" % args.multi_fasta)
|
|
177
|
|
178 # Get sequence ids in alignment and identify test sequences
|
|
179 fasta_ids = [seqname.strip()[1:] for seqname in list(open(args.multi_fasta)) if seqname.startswith('>')]
|
|
180 minor_ids = [seqname for seqname in fasta_ids if seqname.endswith('minor') or seqname.endswith('test')]
|
|
181
|
35
|
182 if len(minor_ids) == 0 and not args.major_only:
|
0
|
183 sys.exit("\nERROR: No test sequences found. _minor or _test suffixes are required in the sequence name!\n")
|
|
184 else:
|
|
185 pass
|
|
186
|
|
187 if args.pairwise:
|
|
188 nj_tree = ape_nj(data,1)
|
|
189 nj_func = 'function (xx) nj(dist.dna(xx, model="raw", pair=1))'
|
|
190 else:
|
|
191 nj_tree = ape_nj(data)
|
|
192 nj_func = 'function (xx) nj(dist.dna(xx, model="raw"))'
|
|
193
|
|
194 if args.bootstrap == 0:
|
35
|
195 bs_tree = nj_tree
|
0
|
196 elif args.bootstrap !=1000:
|
35
|
197 bs_tree = ape_bootstrap(nj_tree,nj_func,data,iters=args.bootstrap)
|
0
|
198 else:
|
35
|
199 bs_tree = ape_bootstrap(nj_tree,nj_func,data)
|
0
|
200
|
|
201 # Generate report, trees, and Newick strings
|
|
202 if args.relatives_out is not None:
|
|
203 relatives = open(args.relatives_out,'w+')
|
|
204 else:
|
|
205 relatives = open(args.multi_fasta+'-relatives.tab','w+')
|
|
206 if args.newick_out is not None:
|
|
207 newick = open(args.newick_out,'w+')
|
|
208 else:
|
35
|
209 newick = open(args.multi_fasta+'-newick.nwk','w+')
|
0
|
210 if args.trees_out is not None:
|
37
|
211 tree_plot_file = args.trees_out
|
0
|
212 else:
|
35
|
213 #tree_plot_file = open(args.multi_fasta+'-tree-ascii.txt', 'w+')
|
|
214 tree_plot_file = args.multi_fasta+'-tree.png'
|
0
|
215
|
35
|
216 newick.write('%s\n' % (write_nwk(bs_tree)))
|
|
217 #tree_plot_file.write(dendro_plot(bs_tree,root=root_id))
|
|
218 ape_plot_tree(tree_plot_file,bs_tree,root=root_id)
|
|
219
|
|
220 if args.major_only:
|
|
221 relatives.write('Major allele only mode cannot generate a report')
|
|
222 else:
|
|
223 relatives.write('#source\tsample\trelatives\n')
|
|
224 for node in minor_ids:
|
|
225 nj_relatives = [relative for relative in dendro_relatives(bs_tree,node) if relative != node]
|
|
226 relatives.write( 'Neighbor_Joining_tree\t%s\t%s\n' % (node,','.join(sorted(nj_relatives))) )
|
0
|
227
|
|
228 newick.close()
|
|
229 relatives.close()
|
|
230
|
|
231 if __name__ == "__main__": main()
|