annotate kraken_summarize.py @ 0:0916697409ea draft

Uploaded
author cschu
date Mon, 18 May 2015 15:29:05 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0916697409ea Uploaded
cschu
parents:
diff changeset
1 #!/usr/bin/env python
0916697409ea Uploaded
cschu
parents:
diff changeset
2 import sys
0916697409ea Uploaded
cschu
parents:
diff changeset
3 import subprocess
0916697409ea Uploaded
cschu
parents:
diff changeset
4 import argparse
0916697409ea Uploaded
cschu
parents:
diff changeset
5
0916697409ea Uploaded
cschu
parents:
diff changeset
6 from collections import Counter
0916697409ea Uploaded
cschu
parents:
diff changeset
7
0916697409ea Uploaded
cschu
parents:
diff changeset
8 from Bio import Entrez
0916697409ea Uploaded
cschu
parents:
diff changeset
9
0916697409ea Uploaded
cschu
parents:
diff changeset
10
0916697409ea Uploaded
cschu
parents:
diff changeset
11
0916697409ea Uploaded
cschu
parents:
diff changeset
12 # U HWI-ST933:109:C2A2NACXX:5:1101:1311:2013 0 50 Q:0
0916697409ea Uploaded
cschu
parents:
diff changeset
13 def fetchTaxonomyData(ids, email='christian.schudoma@tsl.ac.uk'):
0916697409ea Uploaded
cschu
parents:
diff changeset
14 Entrez.email = email
0916697409ea Uploaded
cschu
parents:
diff changeset
15 handle = Entrez.efetch(db='Taxonomy', id=','.join(ids), retmode='xml')
0916697409ea Uploaded
cschu
parents:
diff changeset
16 records = Entrez.read(handle)
0916697409ea Uploaded
cschu
parents:
diff changeset
17 return records
0916697409ea Uploaded
cschu
parents:
diff changeset
18
0916697409ea Uploaded
cschu
parents:
diff changeset
19 def writeKronaInput(fi, taxInfo, unclassified=0):
0916697409ea Uploaded
cschu
parents:
diff changeset
20 if unclassified:
0916697409ea Uploaded
cschu
parents:
diff changeset
21 fi.write('%i\tUnclassified\n' % unclassified)
0916697409ea Uploaded
cschu
parents:
diff changeset
22 for tid in sorted(taxInfo, key=lambda x:taxInfo[x][0]['Lineage']):
0916697409ea Uploaded
cschu
parents:
diff changeset
23 fi.write('%i\t%s\n' % (taxInfo[tid][1], '; '.join([taxInfo[tid][0]['Lineage'].strip(), taxInfoDict[tid][0]['ScientificName']]).replace('; ', '\t').strip('\t')))
0916697409ea Uploaded
cschu
parents:
diff changeset
24 pass
0916697409ea Uploaded
cschu
parents:
diff changeset
25
0916697409ea Uploaded
cschu
parents:
diff changeset
26 def writeOutput(out, taxInfoDict, c):
0916697409ea Uploaded
cschu
parents:
diff changeset
27 for tid in sorted(taxInfoDict, reverse=True, key=lambda x:taxInfoDict[x][1]):
0916697409ea Uploaded
cschu
parents:
diff changeset
28 data = [tid, taxInfoDict[tid][1], taxInfoDict[tid][0]['TaxId'], taxInfoDict[tid][0]['Lineage'], taxInfoDict[tid][0]['ScientificName']]
0916697409ea Uploaded
cschu
parents:
diff changeset
29 out.write('\t'.join(data) + '\n')
0916697409ea Uploaded
cschu
parents:
diff changeset
30 data = (sum(c.values()) - c['0'], sum(c.values()), (sum(c.values()) - c['0']) / float(sum(c.values())) * 100, c['0'])
0916697409ea Uploaded
cschu
parents:
diff changeset
31 out.write('%i/%i (%.5f%%) classified, %i unclassified\n' % data)
0916697409ea Uploaded
cschu
parents:
diff changeset
32
0916697409ea Uploaded
cschu
parents:
diff changeset
33
0916697409ea Uploaded
cschu
parents:
diff changeset
34 def main(argv):
0916697409ea Uploaded
cschu
parents:
diff changeset
35
0916697409ea Uploaded
cschu
parents:
diff changeset
36 parser = argparse.ArgumentParser(description='')
0916697409ea Uploaded
cschu
parents:
diff changeset
37 parser.add_argument('--krona-output', type=str, default='')
0916697409ea Uploaded
cschu
parents:
diff changeset
38 parser.add_argument('--output', type=str)
0916697409ea Uploaded
cschu
parents:
diff changeset
39 parser.add_argument('--call-krona', type=str)
0916697409ea Uploaded
cschu
parents:
diff changeset
40 parser.add_argument('--include-unclassified')
0916697409ea Uploaded
cschu
parents:
diff changeset
41 parser.add_argument('kraken_summary_tsv', type=str)
0916697409ea Uploaded
cschu
parents:
diff changeset
42 args = parser.parse_args()
0916697409ea Uploaded
cschu
parents:
diff changeset
43
0916697409ea Uploaded
cschu
parents:
diff changeset
44 c = Counter(line.strip().split()[2] for line in open(sys.argv[1]))
0916697409ea Uploaded
cschu
parents:
diff changeset
45 taxids = sorted(c.keys(), key=lambda x:c[x], reverse=True)
0916697409ea Uploaded
cschu
parents:
diff changeset
46 taxData = fetchTaxonomyData(taxids)
0916697409ea Uploaded
cschu
parents:
diff changeset
47 taxInfoDict = {tinfo['TaxId']: [tinfo, c[tinfo['TaxId']]] for tinfo in taxData}
0916697409ea Uploaded
cschu
parents:
diff changeset
48
0916697409ea Uploaded
cschu
parents:
diff changeset
49 kr_out = None
0916697409ea Uploaded
cschu
parents:
diff changeset
50 if 'krona_output' in args:
0916697409ea Uploaded
cschu
parents:
diff changeset
51 with open(args.krona_output, 'wb') as kr_out:
0916697409ea Uploaded
cschu
parents:
diff changeset
52 if 'include_unclassified' in args:
0916697409ea Uploaded
cschu
parents:
diff changeset
53 writeKronaInput(kr_out, taxInfoDict, unclassified=c['0'])
0916697409ea Uploaded
cschu
parents:
diff changeset
54 else:
0916697409ea Uploaded
cschu
parents:
diff changeset
55 writeKronaInput(kr_out, taxInfoDict)
0916697409ea Uploaded
cschu
parents:
diff changeset
56
0916697409ea Uploaded
cschu
parents:
diff changeset
57 if 'call_krona' in args:
0916697409ea Uploaded
cschu
parents:
diff changeset
58 p = subprocess.Popen('source krona_tools-2.4; ktImportText -o %s %s' % (args.call_krona, args.krona_output), shell=True, stdout=subprocess.PIPE)
0916697409ea Uploaded
cschu
parents:
diff changeset
59 p.communicate()
0916697409ea Uploaded
cschu
parents:
diff changeset
60
0916697409ea Uploaded
cschu
parents:
diff changeset
61 if 'output' in args:
0916697409ea Uploaded
cschu
parents:
diff changeset
62 if args.output == '-':
0916697409ea Uploaded
cschu
parents:
diff changeset
63 writeOutput(sys.stdout, taxInfoDict, c)
0916697409ea Uploaded
cschu
parents:
diff changeset
64 else:
0916697409ea Uploaded
cschu
parents:
diff changeset
65 with open(args.output, 'wb') as out:
0916697409ea Uploaded
cschu
parents:
diff changeset
66 writeOutput(out, taxInfoDict, c)
0916697409ea Uploaded
cschu
parents:
diff changeset
67
0916697409ea Uploaded
cschu
parents:
diff changeset
68
0916697409ea Uploaded
cschu
parents:
diff changeset
69
0916697409ea Uploaded
cschu
parents:
diff changeset
70
0916697409ea Uploaded
cschu
parents:
diff changeset
71
0916697409ea Uploaded
cschu
parents:
diff changeset
72
0916697409ea Uploaded
cschu
parents:
diff changeset
73
0916697409ea Uploaded
cschu
parents:
diff changeset
74
0916697409ea Uploaded
cschu
parents:
diff changeset
75 """
0916697409ea Uploaded
cschu
parents:
diff changeset
76 2 Fats Saturated fat
0916697409ea Uploaded
cschu
parents:
diff changeset
77 3 Fats Unsaturated fat Monounsaturated fat
0916697409ea Uploaded
cschu
parents:
diff changeset
78 3 Fats Unsaturated fat Polyunsaturated fat
0916697409ea Uploaded
cschu
parents:
diff changeset
79 13 Carbohydrates Sugars
0916697409ea Uploaded
cschu
parents:
diff changeset
80 4 Carbohydrates Dietary fiber
0916697409ea Uploaded
cschu
parents:
diff changeset
81 21 Carbohydrates
0916697409ea Uploaded
cschu
parents:
diff changeset
82 5 Protein
0916697409ea Uploaded
cschu
parents:
diff changeset
83 4
0916697409ea Uploaded
cschu
parents:
diff changeset
84 """
0916697409ea Uploaded
cschu
parents:
diff changeset
85
0916697409ea Uploaded
cschu
parents:
diff changeset
86
0916697409ea Uploaded
cschu
parents:
diff changeset
87
0916697409ea Uploaded
cschu
parents:
diff changeset
88 """
0916697409ea Uploaded
cschu
parents:
diff changeset
89 handle = Entrez.efetch(db="Taxonomy", id="1323524", retmode="xml")
0916697409ea Uploaded
cschu
parents:
diff changeset
90 records = Entrez.read(handle)
0916697409ea Uploaded
cschu
parents:
diff changeset
91 records[0]["Lineage"]
0916697409ea Uploaded
cschu
parents:
diff changeset
92 'Viruses; dsRNA viruses; Partitiviridae; Betapartitivirus'
0916697409ea Uploaded
cschu
parents:
diff changeset
93 records[0]["LineageEx"]
0916697409ea Uploaded
cschu
parents:
diff changeset
94 [{u'ScientificName': 'Viruses', u'TaxId': '10239', u'Rank': 'superkingdom'}, {u'ScientificName': 'dsRNA viruses', u'TaxId': '35325', u'Rank': 'no rank'}, {u'ScientificName': 'Partitiviridae', u'TaxId': '11012', u'Rank': 'family'}, {u'ScientificName': 'Betapartitivirus', u'TaxId': '1511809', u'Rank': 'genus'}]
0916697409ea Uploaded
cschu
parents:
diff changeset
95 """
0916697409ea Uploaded
cschu
parents:
diff changeset
96
0916697409ea Uploaded
cschu
parents:
diff changeset
97 """
0916697409ea Uploaded
cschu
parents:
diff changeset
98 {u'Lineage': 'Viruses; dsDNA viruses, no RNA stage; Iridoviridae; unclassified Iridoviridae', u'Division': 'Viruses', u'ParentTaxId': '180169', u'PubDate': '2014/03/16 07:01:27', u'LineageEx': [{u'ScientificName': 'Viruses', u'TaxId': '10239', u'Rank': 'superkingdom'}, {u'ScientificName': 'dsDNA viruses, no RNA stage', u'TaxId': '35237', u'Rank': 'no rank'}, {u'ScientificName': 'Iridoviridae', u'TaxId': '10486', u'Rank': 'family'}, {u'ScientificName': 'unclassified Iridoviridae', u'TaxId': '180169', u'Rank': 'no rank'}], u'CreateDate': '2014/02/24 15:56:28', u'TaxId': '1465751', u'Rank': 'species', u'GeneticCode': {u'GCId': '1', u'GCName': 'Standard'},
0916697409ea Uploaded
cschu
parents:
diff changeset
99 u'ScientificName': 'Anopheles minimus irodovirus', u'MitoGeneticCode': {u'MGCId': '0', u'MGCName': 'Unspecified'}, u'UpdateDate': '2014/02/24 15:56:29'}
0916697409ea Uploaded
cschu
parents:
diff changeset
100
0916697409ea Uploaded
cschu
parents:
diff changeset
101 """