0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 Read metaphaln output summarizing taxonomic distribution and format in PhyloXML format
|
|
5
|
|
6 usage: %prog metaphlan.txt phylo.xml
|
|
7 """
|
|
8
|
|
9 import sys
|
|
10
|
|
11 # Metaphlan output looks like:
|
|
12 # k__Bacteria 99.07618
|
|
13 # k__Archaea 0.92382
|
|
14 # k__Bacteria|p__Proteobacteria 82.50732
|
|
15 # k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 81.64905
|
|
16
|
|
17 rank_map = { 'k__': 'kingdom', 'p__': 'phylum', 'c__': 'class', 'o__': 'order', 'f__': 'family', 'g__': 'genus', 's__': 'species' }
|
|
18
|
|
19 class Node( object ):
|
|
20 """Node in a taxonomy"""
|
|
21 def __init__( self, rank=None, name=None ):
|
|
22 self.rank = rank
|
|
23 self.name = name
|
|
24 self.value = None
|
|
25 self.children = dict()
|
|
26 @staticmethod
|
|
27 def from_metaphlan_file( file ):
|
|
28 """
|
|
29 Build tree from metaphlan output
|
|
30 """
|
|
31 root = Node()
|
|
32 for line in file:
|
|
33 taxa, abundance = line.split()
|
|
34 parts = taxa.split( "|" )
|
|
35 root.add( parts, abundance )
|
|
36 return root
|
|
37 def add( self, parts, value ):
|
|
38 """
|
|
39 Parts is a list of node names, recursively add nodes until we reach
|
|
40 the last part, and then attach the value to that node.
|
|
41 """
|
|
42 if len( parts ) == 0:
|
|
43 self.value = value
|
|
44 else:
|
|
45 next_part = parts.pop(0)
|
|
46 rank = rank_map[ next_part[:3] ]
|
|
47 name = next_part[3:]
|
|
48 if name not in self.children:
|
|
49 self.children[name] = Node( rank, name )
|
|
50 self.children[name].add( parts, value )
|
|
51 def __str__( self ):
|
|
52 if self.children:
|
|
53 return "(" + ",".join( str( child ) for child in self.children.itervalues() ) + "):" + self.name
|
|
54 else:
|
|
55 return self.name
|
|
56 def to_phyloxml( self, out ):
|
|
57 print >>out, "<clade>"
|
|
58 if self.name:
|
|
59 print >>out, "<name>%s</name>" % self.name
|
|
60 print >>out, "<taxonomy><scientific_name>%s</scientific_name><rank>%s</rank></taxonomy>" % ( self.name, self.rank )
|
|
61 if self.value:
|
|
62 print >>out, "<property datatype='xsd:float' ref='metaphlan:abundance' applies_to='node'>%s</property>" % self.value
|
|
63 ## print >>out, "<confidence type='abundance'>%s</confidence>" % self.value
|
|
64 for child in self.children.itervalues():
|
|
65 child.to_phyloxml( out )
|
|
66 print >>out, "</clade>"
|
|
67
|
|
68 out = open( sys.argv[2], 'w' )
|
|
69
|
|
70 print >>out, '<phyloxml xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.phyloxml.org" xsi:schemaLocation="http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd">'
|
|
71 print >>out, '<phylogeny rooted="true">'
|
|
72
|
|
73 Node.from_metaphlan_file( open( sys.argv[1] ) ).to_phyloxml( out )
|
|
74
|
|
75 print >>out, '</phylogeny>'
|
|
76 print >>out, '</phyloxml>'
|