0
|
1 #!/usr/bin/env python
|
|
2 #
|
|
3 # Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)
|
|
4 #usage: phylorelatives.py [-h] [-i FASTA] [-b INT] [-p] [-r FASTA]
|
|
5 #
|
|
6 #Constructs relatedness of a set of sequences based on the pairwise proportion
|
|
7 #of different sites. It reports the test sequences relatives, tree plots and
|
|
8 #trees in Newick format. One or more test sequences are accepted as long as
|
|
9 #their name includes the strict suffix "_minor" or "_test" (i.e. >seq1_minor).
|
|
10 #IMPORTANT: Sequences must have the same length!
|
|
11 #
|
|
12 #optional arguments:
|
|
13 # -h, --help show this help message and exit
|
|
14 # -i FASTA, --input FASTA
|
|
15 # This option can be specified multiple times. Sequences
|
|
16 # will be added to "multi-fasta.fa". (e.g. -i major1.fa
|
|
17 # -i major2.fa -i minor1.fa)
|
|
18 # -b INT, --bootstrap INT
|
|
19 # Change number of replicas. 0 to deactivate. (Default:
|
|
20 # 1000)
|
|
21 # -p, --pairwise Use pairwise deletion of gaps/missing data. (Default:
|
|
22 # Complete deletion)
|
|
23 # -r FASTA, --root FASTA
|
|
24 # Root trees using FASTA sequence as outgroup. (Default:
|
|
25 # Display unrooted trees)
|
|
26
|
|
27 import sys
|
|
28 import argparse
|
|
29 import array
|
|
30 import dendropy
|
|
31 import rpy2.rinterface
|
25
|
32 rpy2.rinterface.set_initoptions(('rpy2','--vanilla','--quiet'))
|
0
|
33 import rpy2.robjects as robjects
|
|
34 from rpy2.robjects.packages import importr
|
|
35
|
|
36
|
|
37 def ape_read_dna(infasta):
|
|
38 """Read multi-fasta into phylo object"""
|
|
39 ape = importr('ape')
|
|
40 data = ape.read_dna(file=infasta,format="fasta")
|
|
41 return data
|
|
42
|
|
43 def ape_nj(data,missing_info_option=0):
|
|
44 """Return ape nj tree"""
|
|
45 ape = importr('ape')
|
|
46 dist = ape.dist_dna(data,model="raw",pairwise=missing_info_option)
|
|
47 nj_tree = ape.nj(dist)
|
|
48 return nj_tree
|
|
49
|
|
50 def ape_consensus(tree,tree_function,data,iters=1000):
|
|
51 """Return majority rule consensus tree"""
|
|
52 ape = importr('ape')
|
|
53 tree_func = robjects.r(tree_function)
|
|
54 bootstrap = ape.boot_phylo(tree, data, tree_func,
|
|
55 B=iters, quiet=1, trees=1)
|
|
56 bs_trees = bootstrap.rx('trees')[0]
|
|
57 cons_tree = ape.consensus(bs_trees,p=0.5)
|
|
58 return cons_tree
|
|
59
|
|
60 #def parsimony_tree(tree,data):
|
|
61 # """Return Maximum Parsimony tree from supplied tree"""
|
|
62 # phangorn = importr('phangorn')
|
|
63 # mp_tree = phangorn.optim_parsimony(tree,phangorn.phyDat(data), trace=0)
|
|
64 # return mp_tree
|
|
65
|
|
66
|
|
67 def dendro_relatives(tree,minor):
|
|
68 """Return minor allele sequence relatives in tree"""
|
|
69 ape = importr('ape')
|
|
70 newick = list(ape.write_tree(tree))[0]
|
|
71 t = dendropy.Tree.get_from_string(newick,"newick")
|
|
72 minor_leaf = [node for node in t.leaf_nodes()
|
|
73 if node.get_node_str() == minor][0]
|
|
74 parent = minor_leaf.parent_node
|
|
75 relatives = []
|
|
76 while len(relatives) == 0:
|
|
77 output = [relative.get_node_str() for relative in parent.leaf_nodes()]
|
|
78 relatives = [relative for relative in output if not (relative.endswith('minor') or relative.endswith('test'))]
|
|
79 parent = parent.parent_node
|
|
80 return output
|
|
81
|
|
82 def plot_tree(outfile, tree1, tree2=None, root=False):
|
|
83 """Generate tree(s) plot"""
|
|
84 ape = importr('ape')
|
|
85 graphics = importr('graphics')
|
|
86 grdevices = importr('grDevices')
|
|
87 if tree2 is None:
|
|
88 grdevices.png(file=outfile, width=1024, height=768)
|
|
89 if root:
|
|
90 ape.plot_phylo(ape.root(tree1,root),edge_width=2,cex=1,underscore=1)
|
|
91 else:
|
|
92 ape.plot_phylo(tree1,edge_width=2,cex=1,underscore=1)
|
|
93 graphics.title(main='Neighbor Joining')
|
|
94 grdevices.dev_off()
|
|
95 elif tree2 is not None:
|
|
96 grdevices.png(file=outfile, width=1024, height=768)
|
|
97 graphics.par(mfcol=array.array('i',[1,2]))
|
|
98 if root:
|
|
99 ape.plot_phylo(ape.root(tree1,root),edge_width=2,cex=1,underscore=1)
|
|
100 else:
|
|
101 ape.plot_phylo(tree1,edge_width=2,cex=1,underscore=1)
|
|
102 graphics.title(main='Neighbor Joining', cex=1.5, font=2)
|
|
103 if root:
|
|
104 ape.plot_phylo(ape.root(tree2,root),edge_width=2,cex=1,underscore=1)
|
|
105 else:
|
|
106 ape.plot_phylo(tree2,edge_width=2,cex=1,underscore=1)
|
|
107 graphics.title(main='Maximum Parsimony',cex=1.5, font=2)
|
|
108 grdevices.dev_off()
|
|
109 return
|
|
110
|
|
111 def write_nwk(tree):
|
|
112 "Write proper Newick string"
|
|
113 ape = importr('ape')
|
|
114 nwk = list(ape.write_tree(tree))[0]
|
|
115 return nwk
|
|
116
|
|
117 def main():
|
|
118 # Parse command line options
|
|
119 parser = argparse.ArgumentParser(description='Constructs relatedness of a set of sequences based on the pairwise proportion of different sites. It reports the test sequences relatives, tree plots and trees in Newick format. One or more test sequences are accepted as long as their name includes the strict suffix "_minor" or "_test" (i.e. >seq1_minor). IMPORTANT: Sequences must have the same length!', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)')
|
|
120 parser.add_argument('-i', '--input', metavar='FASTA', action='append', type=str, help='This option can be specified multiple times. Sequences will be added to "multi-fasta.fa". (e.g. -i major1.fa -i major2.fa -i minor1.fa)')
|
|
121 parser.add_argument('-b', '--bootstrap', type=int, metavar='INT',default=1000, help='Change number of replicas. 0 to deactivate. (Default: 1000)')
|
|
122 parser.add_argument('-p', '--pairwise', action='store_true', help='Use pairwise deletion of gaps/missing data. (Default: Complete deletion)')
|
|
123 parser.add_argument('-r', '--root', type=str, metavar='FASTA', default=False, help='Root trees using FASTA sequence as outgroup. (Default: Display unrooted trees)')
|
|
124 parser.add_argument('--relatives-out', type=str, metavar='FILE', default=None, help=argparse.SUPPRESS)
|
|
125 parser.add_argument('--newick-out', type=str, metavar='FILE', default=None, help=argparse.SUPPRESS)
|
|
126 parser.add_argument('--trees-out', type=str, metavar='FILE', default=None, help=argparse.SUPPRESS)
|
|
127 parser.add_argument('-m', '--multi-fasta', type=str, default='multi-fasta.fa', help=argparse.SUPPRESS)
|
|
128 args = parser.parse_args()
|
|
129
|
|
130
|
|
131 if args.input:
|
|
132 for fasta in args.input:
|
|
133 try:
|
|
134 open(fasta)
|
|
135 except:
|
|
136 sys.exit("\nERROR: Could not open %s\n" % fasta)
|
|
137 try:
|
|
138 multi = open(args.multi_fasta, 'w+')
|
|
139 except:
|
|
140 sys.exit("\nERROR: Could not create %s\n" % args.multi_fasta)
|
|
141
|
|
142 for fasta in args.input:
|
|
143 for line in list(open(fasta)):
|
|
144 multi.write(line.replace('-', '_'))
|
|
145
|
|
146
|
|
147 if args.root:
|
|
148 try:
|
|
149 root = list(open(args.root))
|
|
150 root_id = [line.strip()[1:].replace('-', '_') for line in root if line.strip().startswith(">")][0]
|
|
151 for line in root:
|
|
152 multi.write(line.replace('-', '_'))
|
|
153 except:
|
|
154 sys.exit("\nERROR: Could not open %s\n" % args.root)
|
|
155 else:
|
|
156 root_id = args.root
|
|
157 multi.close()
|
|
158
|
|
159 try:
|
|
160 data = ape_read_dna(args.multi_fasta)
|
|
161 except:
|
|
162 sys.exit("\nERROR: Check existence or proper format of %s\n" % args.multi_fasta)
|
|
163
|
|
164 # Get sequence ids in alignment and identify test sequences
|
|
165 fasta_ids = [seqname.strip()[1:] for seqname in list(open(args.multi_fasta)) if seqname.startswith('>')]
|
|
166 minor_ids = [seqname for seqname in fasta_ids if seqname.endswith('minor') or seqname.endswith('test')]
|
|
167
|
|
168 if len(minor_ids) == 0:
|
|
169 sys.exit("\nERROR: No test sequences found. _minor or _test suffixes are required in the sequence name!\n")
|
|
170 else:
|
|
171 pass
|
|
172
|
|
173 if args.pairwise:
|
|
174 nj_tree = ape_nj(data,1)
|
|
175 nj_func = 'function (xx) nj(dist.dna(xx, model="raw", pair=1))'
|
|
176 else:
|
|
177 nj_tree = ape_nj(data)
|
|
178 nj_func = 'function (xx) nj(dist.dna(xx, model="raw"))'
|
|
179
|
|
180 if args.bootstrap == 0:
|
|
181 cons_tree = nj_tree
|
|
182 elif args.bootstrap !=1000:
|
|
183 cons_tree = ape_consensus(nj_tree,nj_func,data,iters=args.bootstrap)
|
|
184 else:
|
|
185 cons_tree = ape_consensus(nj_tree,nj_func,data)
|
|
186
|
|
187 # Generate report, trees, and Newick strings
|
|
188 if args.relatives_out is not None:
|
|
189 relatives = open(args.relatives_out,'w+')
|
|
190 else:
|
|
191 relatives = open(args.multi_fasta+'-relatives.tab','w+')
|
|
192 if args.newick_out is not None:
|
|
193 newick = open(args.newick_out,'w+')
|
|
194 else:
|
|
195 newick = open(args.multi_fasta+'-newick.txt','w+')
|
|
196 if args.trees_out is not None:
|
|
197 tree_plot_file = args.trees_out
|
|
198 else:
|
|
199 tree_plot_file = args.multi_fasta+'-tree.png'
|
|
200
|
|
201 newick.write('>Neighbor_Joining\n%s\n' % (write_nwk(cons_tree)))
|
|
202 relatives.write('#source\tsample\trelatives\n')
|
|
203 for node in minor_ids:
|
|
204 nj_relatives = [relative for relative in dendro_relatives(cons_tree,node) if relative != node]
|
|
205 relatives.write( 'Neighbor_Joining_tree\t%s\t%s\n' % (node,','.join(sorted(nj_relatives))) )
|
|
206 plot_tree(tree_plot_file,cons_tree,root=root_id)
|
|
207
|
|
208
|
|
209 # Parsimony requires X11 related dependencies. Tricky
|
|
210 # if args.max_parsimony:
|
|
211 # mp_tree = parsimony_tree(cons_tree,data)
|
|
212 # newick.write('>Maximum_Parsimony\n%s\n' % (write_nwk(mp_tree)))
|
|
213 # for node in minor_ids:
|
|
214 # mp_relatives = [relative for relative in dendro_leaves(mp_tree,node) if relative != node]
|
|
215 # relatives.write( 'Maximum_Parsimony_tree\t%s\t%s\n' % (node,','.join(sorted(mp_relatives))) )
|
|
216 # plot_tree(tree_plot_file, tree1=cons_tree, tree2=mp_tree, root=args.root)
|
|
217 # else:
|
|
218 # plot_tree(tree_plot_file,cons_tree,root=args.root)
|
|
219
|
|
220 newick.close()
|
|
221 relatives.close()
|
|
222
|
|
223 if __name__ == "__main__": main()
|