0
|
1 #!/usr/bin/env python
|
|
2 #
|
|
3 # Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)
|
|
4 #usage: phylorelatives.py [-h] [-i FASTA] [-b INT] [-p] [-r FASTA]
|
|
5 #
|
|
6 #Constructs relatedness of a set of sequences based on the pairwise proportion
|
31
|
7 #of different sites. It reports the test sequences relatives, tree plot and
|
|
8 #tree in Newick format. One or more test sequences are accepted as long as
|
0
|
9 #their name includes the strict suffix "_minor" or "_test" (i.e. >seq1_minor).
|
|
10 #IMPORTANT: Sequences must have the same length!
|
|
11 #
|
|
12 #optional arguments:
|
|
13 # -h, --help show this help message and exit
|
|
14 # -i FASTA, --input FASTA
|
|
15 # This option can be specified multiple times. Sequences
|
|
16 # will be added to "multi-fasta.fa". (e.g. -i major1.fa
|
|
17 # -i major2.fa -i minor1.fa)
|
|
18 # -b INT, --bootstrap INT
|
|
19 # Change number of replicas. 0 to deactivate. (Default:
|
|
20 # 1000)
|
|
21 # -p, --pairwise Use pairwise deletion of gaps/missing data. (Default:
|
|
22 # Complete deletion)
|
|
23 # -r FASTA, --root FASTA
|
|
24 # Root trees using FASTA sequence as outgroup. (Default:
|
|
25 # Display unrooted trees)
|
|
26
|
|
27 import sys
|
|
28 import argparse
|
|
29 import array
|
|
30 import dendropy
|
|
31 import rpy2.rinterface
|
25
|
32 rpy2.rinterface.set_initoptions(('rpy2','--vanilla','--quiet'))
|
0
|
33 import rpy2.robjects as robjects
|
|
34 from rpy2.robjects.packages import importr
|
|
35
|
|
36
|
|
37 def ape_read_dna(infasta):
|
|
38 """Read multi-fasta into phylo object"""
|
|
39 ape = importr('ape')
|
|
40 data = ape.read_dna(file=infasta,format="fasta")
|
|
41 return data
|
|
42
|
|
43 def ape_nj(data,missing_info_option=0):
|
|
44 """Return ape nj tree"""
|
|
45 ape = importr('ape')
|
|
46 dist = ape.dist_dna(data,model="raw",pairwise=missing_info_option)
|
|
47 nj_tree = ape.nj(dist)
|
|
48 return nj_tree
|
|
49
|
|
50 def ape_consensus(tree,tree_function,data,iters=1000):
|
|
51 """Return majority rule consensus tree"""
|
|
52 ape = importr('ape')
|
|
53 tree_func = robjects.r(tree_function)
|
|
54 bootstrap = ape.boot_phylo(tree, data, tree_func,
|
|
55 B=iters, quiet=1, trees=1)
|
|
56 bs_trees = bootstrap.rx('trees')[0]
|
|
57 cons_tree = ape.consensus(bs_trees,p=0.5)
|
|
58 return cons_tree
|
|
59
|
|
60
|
|
61 def dendro_relatives(tree,minor):
|
|
62 """Return minor allele sequence relatives in tree"""
|
|
63 ape = importr('ape')
|
|
64 newick = list(ape.write_tree(tree))[0]
|
|
65 t = dendropy.Tree.get_from_string(newick,"newick")
|
|
66 minor_leaf = [node for node in t.leaf_nodes()
|
|
67 if node.get_node_str() == minor][0]
|
|
68 parent = minor_leaf.parent_node
|
|
69 relatives = []
|
|
70 while len(relatives) == 0:
|
|
71 output = [relative.get_node_str() for relative in parent.leaf_nodes()]
|
|
72 relatives = [relative for relative in output if not (relative.endswith('minor') or relative.endswith('test'))]
|
|
73 parent = parent.parent_node
|
|
74 return output
|
|
75
|
31
|
76
|
|
77 def dendro_plot(tree, root=False ):
|
|
78 """Plot tree to file in ascii format"""
|
0
|
79 ape = importr('ape')
|
31
|
80 if root:
|
|
81 newick = list(ape.write_tree(ape.root(tree,root)))[0]
|
|
82 else:
|
|
83 newick = list(ape.write_tree(tree))[0]
|
|
84 t = dendropy.Tree.get_from_string(newick,"newick")
|
|
85 ascii_tree = t.as_ascii_plot()
|
|
86 return ascii_tree
|
|
87
|
0
|
88
|
|
89 def write_nwk(tree):
|
|
90 "Write proper Newick string"
|
|
91 ape = importr('ape')
|
|
92 nwk = list(ape.write_tree(tree))[0]
|
|
93 return nwk
|
|
94
|
|
95 def main():
|
|
96 # Parse command line options
|
|
97 parser = argparse.ArgumentParser(description='Constructs relatedness of a set of sequences based on the pairwise proportion of different sites. It reports the test sequences relatives, tree plots and trees in Newick format. One or more test sequences are accepted as long as their name includes the strict suffix "_minor" or "_test" (i.e. >seq1_minor). IMPORTANT: Sequences must have the same length!', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)')
|
|
98 parser.add_argument('-i', '--input', metavar='FASTA', action='append', type=str, help='This option can be specified multiple times. Sequences will be added to "multi-fasta.fa". (e.g. -i major1.fa -i major2.fa -i minor1.fa)')
|
|
99 parser.add_argument('-b', '--bootstrap', type=int, metavar='INT',default=1000, help='Change number of replicas. 0 to deactivate. (Default: 1000)')
|
|
100 parser.add_argument('-p', '--pairwise', action='store_true', help='Use pairwise deletion of gaps/missing data. (Default: Complete deletion)')
|
|
101 parser.add_argument('-r', '--root', type=str, metavar='FASTA', default=False, help='Root trees using FASTA sequence as outgroup. (Default: Display unrooted trees)')
|
|
102 parser.add_argument('--relatives-out', type=str, metavar='FILE', default=None, help=argparse.SUPPRESS)
|
|
103 parser.add_argument('--newick-out', type=str, metavar='FILE', default=None, help=argparse.SUPPRESS)
|
|
104 parser.add_argument('--trees-out', type=str, metavar='FILE', default=None, help=argparse.SUPPRESS)
|
|
105 parser.add_argument('-m', '--multi-fasta', type=str, default='multi-fasta.fa', help=argparse.SUPPRESS)
|
31
|
106 args = parser.parse_args('-i multi.fa -b 0 -r RSRS.fasta'.split())
|
0
|
107
|
|
108
|
|
109 if args.input:
|
|
110 for fasta in args.input:
|
|
111 try:
|
|
112 open(fasta)
|
|
113 except:
|
|
114 sys.exit("\nERROR: Could not open %s\n" % fasta)
|
|
115 try:
|
|
116 multi = open(args.multi_fasta, 'w+')
|
|
117 except:
|
|
118 sys.exit("\nERROR: Could not create %s\n" % args.multi_fasta)
|
|
119
|
|
120 for fasta in args.input:
|
|
121 for line in list(open(fasta)):
|
|
122 multi.write(line.replace('-', '_'))
|
|
123
|
|
124
|
|
125 if args.root:
|
|
126 try:
|
|
127 root = list(open(args.root))
|
|
128 root_id = [line.strip()[1:].replace('-', '_') for line in root if line.strip().startswith(">")][0]
|
|
129 for line in root:
|
|
130 multi.write(line.replace('-', '_'))
|
|
131 except:
|
|
132 sys.exit("\nERROR: Could not open %s\n" % args.root)
|
|
133 else:
|
|
134 root_id = args.root
|
|
135 multi.close()
|
|
136
|
|
137 try:
|
|
138 data = ape_read_dna(args.multi_fasta)
|
|
139 except:
|
|
140 sys.exit("\nERROR: Check existence or proper format of %s\n" % args.multi_fasta)
|
|
141
|
|
142 # Get sequence ids in alignment and identify test sequences
|
|
143 fasta_ids = [seqname.strip()[1:] for seqname in list(open(args.multi_fasta)) if seqname.startswith('>')]
|
|
144 minor_ids = [seqname for seqname in fasta_ids if seqname.endswith('minor') or seqname.endswith('test')]
|
|
145
|
|
146 if len(minor_ids) == 0:
|
|
147 sys.exit("\nERROR: No test sequences found. _minor or _test suffixes are required in the sequence name!\n")
|
|
148 else:
|
|
149 pass
|
|
150
|
|
151 if args.pairwise:
|
|
152 nj_tree = ape_nj(data,1)
|
|
153 nj_func = 'function (xx) nj(dist.dna(xx, model="raw", pair=1))'
|
|
154 else:
|
|
155 nj_tree = ape_nj(data)
|
|
156 nj_func = 'function (xx) nj(dist.dna(xx, model="raw"))'
|
|
157
|
|
158 if args.bootstrap == 0:
|
|
159 cons_tree = nj_tree
|
|
160 elif args.bootstrap !=1000:
|
|
161 cons_tree = ape_consensus(nj_tree,nj_func,data,iters=args.bootstrap)
|
|
162 else:
|
|
163 cons_tree = ape_consensus(nj_tree,nj_func,data)
|
|
164
|
|
165 # Generate report, trees, and Newick strings
|
|
166 if args.relatives_out is not None:
|
|
167 relatives = open(args.relatives_out,'w+')
|
|
168 else:
|
|
169 relatives = open(args.multi_fasta+'-relatives.tab','w+')
|
|
170 if args.newick_out is not None:
|
|
171 newick = open(args.newick_out,'w+')
|
|
172 else:
|
|
173 newick = open(args.multi_fasta+'-newick.txt','w+')
|
|
174 if args.trees_out is not None:
|
31
|
175 tree_plot_file = open(args.trees_out, 'w+')
|
0
|
176 else:
|
31
|
177 tree_plot_file = open(args.multi_fasta+'-tree-ascii.txt', 'w+')
|
0
|
178
|
31
|
179 newick.write('%s\n' % (write_nwk(cons_tree)))
|
0
|
180 relatives.write('#source\tsample\trelatives\n')
|
|
181 for node in minor_ids:
|
|
182 nj_relatives = [relative for relative in dendro_relatives(cons_tree,node) if relative != node]
|
|
183 relatives.write( 'Neighbor_Joining_tree\t%s\t%s\n' % (node,','.join(sorted(nj_relatives))) )
|
|
184
|
31
|
185 tree_plot_file.write(dendro_plot(cons_tree,root=root_id))
|
0
|
186
|
|
187 newick.close()
|
|
188 relatives.close()
|
|
189
|
|
190 if __name__ == "__main__": main()
|