annotate metaphlan_to_phyloxml.py @ 4:3468e70d3ed0 draft

Indents
author Jim Johnson <jj@umn.edu>
date Wed, 10 Oct 2012 08:49:28 -0500
parents 0ec6c5781381
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
1 #!/usr/bin/env python
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
2
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
3 """
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
4 Read MetaPhlAn output summarizing taxonomic distribution and write it in PhyloXML format
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
5
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
6 usage: %prog metaphlan.txt phylo.xml
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
7 """
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
8
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
9 import sys
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
10
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
11 # Metaphlan output looks like:
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
12 # k__Bacteria 99.07618
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
13 # k__Archaea 0.92382
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
14 # k__Bacteria|p__Proteobacteria 82.50732
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
15 # k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 81.64905
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
16
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
# Maps the three-character prefix that starts each MetaPhlAn clade label
# (e.g. the 'k__' in 'k__Bacteria') to the taxonomic rank name stored on
# the corresponding tree node.
rank_map = dict(
    k__='kingdom',
    p__='phylum',
    c__='class',
    o__='order',
    f__='family',
    g__='genus',
    s__='species',
)
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
18
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
class Node(object):
    """
    Node in a taxonomy tree.

    Attributes:
        rank: rank name taken from ``rank_map`` (None for the root).
        name: clade name without its rank prefix (None for the root).
        value: abundance attached to this node, or None if none was given.
        children: dict mapping child clade name -> child ``Node``.
    """

    def __init__(self, rank=None, name=None):
        self.rank = rank
        self.name = name
        self.value = None
        self.children = dict()

    @staticmethod
    def from_metaphlan_file(file):
        """
        Build a tree from MetaPhlAn output and return its (unnamed) root.

        ``file`` is any iterable of text lines of the form
        ``k__Bacteria|p__Proteobacteria 82.50732``.  Blank lines and lines
        starting with ``#`` are skipped.

        Raises:
            ValueError: a data line does not have exactly two
                whitespace-separated fields.
            KeyError: a clade label starts with a prefix missing from
                ``rank_map``.
        """
        root = Node()
        for line_number, line in enumerate(file, 1):
            fields = line.split()
            # Tolerate blank lines and '#' header/comment lines instead of
            # failing on the tuple unpacking / rank lookup.
            if not fields or fields[0].startswith("#"):
                continue
            if len(fields) != 2:
                raise ValueError(
                    "line %d: expected '<taxonomy> <abundance>', got %r"
                    % (line_number, line)
                )
            taxa, abundance = fields
            root.add(taxa.split("|"), abundance)
        return root

    def add(self, parts, value):
        """
        Attach ``value`` to the node reached by following ``parts``.

        ``parts`` is a sequence of prefixed clade labels (``'k__Bacteria'``,
        ``'p__Proteobacteria'``, ...).  Missing nodes along the path are
        created.  An empty ``parts`` sets the value on this node.  The
        sequence passed in is not modified.

        Raises:
            KeyError: a label's three-character prefix is not in ``rank_map``.
        """
        node = self
        for part in parts:
            rank = rank_map[part[:3]]
            name = part[3:]
            if name not in node.children:
                node.children[name] = Node(rank, name)
            node = node.children[name]
        node.value = value

    def __str__(self):
        """
        Return a Newick-like string: ``(child,child):name`` for inner nodes
        and ``name`` for leaves.  An unnamed node (the root) contributes an
        empty name instead of raising TypeError.
        """
        label = self.name or ""
        if self.children:
            inner = ",".join(str(child) for child in self.children.values())
            return "(" + inner + "):" + label
        return label

    def to_phyloxml(self, out):
        """
        Write this node and its subtree as nested PhyloXML ``<clade>``
        elements to ``out`` (any object with a ``write`` method), one
        element per line.
        """
        # Imported here so the block does not rely on a module-level import.
        from xml.sax.saxutils import escape

        out.write("<clade>\n")
        if self.name:
            name = escape(self.name)
            out.write("<name>%s</name>\n" % name)
            taxonomy = "<scientific_name>%s</scientific_name>" % name
            # Never emit a literal "None" rank.
            if self.rank is not None:
                taxonomy += "<rank>%s</rank>" % escape(str(self.rank))
            out.write("<taxonomy>%s</taxonomy>\n" % taxonomy)
        # Compare against None so a numeric abundance of 0 is still written.
        if self.value is not None:
            out.write(
                "<property datatype='xsd:float' ref='metaphlan:abundance'"
                " applies_to='node'>%s</property>\n" % escape(str(self.value))
            )
        for child in self.children.values():
            child.to_phyloxml(out)
        out.write("</clade>\n")
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
67
0ec6c5781381 Uploaded
jjohnson
parents:
diff changeset
# Script body: convert the MetaPhlAn table named by argv[1] into a PhyloXML
# document written to argv[2].
if len(sys.argv) < 3:
    sys.exit("usage: %s metaphlan.txt phylo.xml" % sys.argv[0])

# Parse the whole input before touching the output file, so an unreadable or
# malformed input does not leave a truncated, half-written PhyloXML file.
with open(sys.argv[1]) as metaphlan_file:
    tree = Node.from_metaphlan_file(metaphlan_file)

# 'with' guarantees the output is flushed and closed even if writing fails.
with open(sys.argv[2], 'w') as out:
    out.write('<phyloxml xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.phyloxml.org" xsi:schemaLocation="http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd">\n')
    out.write('<phylogeny rooted="true">\n')
    tree.to_phyloxml(out)
    out.write('</phylogeny>\n')
    out.write('</phyloxml>\n')