comparison metaphlan_to_phyloxml.py @ 0:0ec6c5781381 draft

Uploaded
author jjohnson
date Tue, 09 Oct 2012 12:13:27 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0ec6c5781381
1 #!/usr/bin/env python
2
3 """
4 Read metaphaln output summarizing taxonomic distribution and format in PhyloXML format
5
6 usage: %prog metaphlan.txt phylo.xml
7 """
8
9 import sys
10
11 # Metaphlan output looks like:
12 # k__Bacteria 99.07618
13 # k__Archaea 0.92382
14 # k__Bacteria|p__Proteobacteria 82.50732
15 # k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 81.64905
16
17 rank_map = { 'k__': 'kingdom', 'p__': 'phylum', 'c__': 'class', 'o__': 'order', 'f__': 'family', 'g__': 'genus', 's__': 'species' }
18
19 class Node( object ):
20 """Node in a taxonomy"""
21 def __init__( self, rank=None, name=None ):
22 self.rank = rank
23 self.name = name
24 self.value = None
25 self.children = dict()
26 @staticmethod
27 def from_metaphlan_file( file ):
28 """
29 Build tree from metaphlan output
30 """
31 root = Node()
32 for line in file:
33 taxa, abundance = line.split()
34 parts = taxa.split( "|" )
35 root.add( parts, abundance )
36 return root
37 def add( self, parts, value ):
38 """
39 Parts is a list of node names, recursively add nodes until we reach
40 the last part, and then attach the value to that node.
41 """
42 if len( parts ) == 0:
43 self.value = value
44 else:
45 next_part = parts.pop(0)
46 rank = rank_map[ next_part[:3] ]
47 name = next_part[3:]
48 if name not in self.children:
49 self.children[name] = Node( rank, name )
50 self.children[name].add( parts, value )
51 def __str__( self ):
52 if self.children:
53 return "(" + ",".join( str( child ) for child in self.children.itervalues() ) + "):" + self.name
54 else:
55 return self.name
56 def to_phyloxml( self, out ):
57 print >>out, "<clade>"
58 if self.name:
59 print >>out, "<name>%s</name>" % self.name
60 print >>out, "<taxonomy><scientific_name>%s</scientific_name><rank>%s</rank></taxonomy>" % ( self.name, self.rank )
61 if self.value:
62 print >>out, "<property datatype='xsd:float' ref='metaphlan:abundance' applies_to='node'>%s</property>" % self.value
63 ## print >>out, "<confidence type='abundance'>%s</confidence>" % self.value
64 for child in self.children.itervalues():
65 child.to_phyloxml( out )
66 print >>out, "</clade>"
67
68 out = open( sys.argv[2], 'w' )
69
70 print >>out, '<phyloxml xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.phyloxml.org" xsi:schemaLocation="http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd">'
71 print >>out, '<phylogeny rooted="true">'
72
73 Node.from_metaphlan_file( open( sys.argv[1] ) ).to_phyloxml( out )
74
75 print >>out, '</phylogeny>'
76 print >>out, '</phyloxml>'