Mercurial > repos > cschu > kraken_tools
comparison kraken_summarize.py @ 14:fd27c97c8366 draft default tip
Uploaded
author | cschu |
---|---|
date | Mon, 18 May 2015 15:55:47 -0400 |
parents | 0916697409ea |
children |
comparison
equal
deleted
inserted
replaced
13:dc02dbf5e1a9 | 14:fd27c97c8366 |
---|---|
1 #!/usr/bin/env python | |
2 import sys | |
3 import subprocess | |
4 import argparse | |
5 | |
6 from collections import Counter | |
7 | |
8 from Bio import Entrez | |
9 | |
10 | |
11 | |
12 # U HWI-ST933:109:C2A2NACXX:5:1101:1311:2013 0 50 Q:0 | |
13 def fetchTaxonomyData(ids, email='christian.schudoma@tsl.ac.uk'): | |
14 Entrez.email = email | |
15 handle = Entrez.efetch(db='Taxonomy', id=','.join(ids), retmode='xml') | |
16 records = Entrez.read(handle) | |
17 return records | |
18 | |
19 def writeKronaInput(fi, taxInfo, unclassified=0): | |
20 if unclassified: | |
21 fi.write('%i\tUnclassified\n' % unclassified) | |
22 for tid in sorted(taxInfo, key=lambda x:taxInfo[x][0]['Lineage']): | |
23 fi.write('%i\t%s\n' % (taxInfo[tid][1], '; '.join([taxInfo[tid][0]['Lineage'].strip(), taxInfoDict[tid][0]['ScientificName']]).replace('; ', '\t').strip('\t'))) | |
24 pass | |
25 | |
26 def writeOutput(out, taxInfoDict, c): | |
27 for tid in sorted(taxInfoDict, reverse=True, key=lambda x:taxInfoDict[x][1]): | |
28 data = [tid, taxInfoDict[tid][1], taxInfoDict[tid][0]['TaxId'], taxInfoDict[tid][0]['Lineage'], taxInfoDict[tid][0]['ScientificName']] | |
29 out.write('\t'.join(data) + '\n') | |
30 data = (sum(c.values()) - c['0'], sum(c.values()), (sum(c.values()) - c['0']) / float(sum(c.values())) * 100, c['0']) | |
31 out.write('%i/%i (%.5f%%) classified, %i unclassified\n' % data) | |
32 | |
33 | |
34 def main(argv): | |
35 | |
36 parser = argparse.ArgumentParser(description='') | |
37 parser.add_argument('--krona-output', type=str, default='') | |
38 parser.add_argument('--output', type=str) | |
39 parser.add_argument('--call-krona', type=str) | |
40 parser.add_argument('--include-unclassified') | |
41 parser.add_argument('kraken_summary_tsv', type=str) | |
42 args = parser.parse_args() | |
43 | |
44 c = Counter(line.strip().split()[2] for line in open(sys.argv[1])) | |
45 taxids = sorted(c.keys(), key=lambda x:c[x], reverse=True) | |
46 taxData = fetchTaxonomyData(taxids) | |
47 taxInfoDict = {tinfo['TaxId']: [tinfo, c[tinfo['TaxId']]] for tinfo in taxData} | |
48 | |
49 kr_out = None | |
50 if 'krona_output' in args: | |
51 with open(args.krona_output, 'wb') as kr_out: | |
52 if 'include_unclassified' in args: | |
53 writeKronaInput(kr_out, taxInfoDict, unclassified=c['0']) | |
54 else: | |
55 writeKronaInput(kr_out, taxInfoDict) | |
56 | |
57 if 'call_krona' in args: | |
58 p = subprocess.Popen('source krona_tools-2.4; ktImportText -o %s %s' % (args.call_krona, args.krona_output), shell=True, stdout=subprocess.PIPE) | |
59 p.communicate() | |
60 | |
61 if 'output' in args: | |
62 if args.output == '-': | |
63 writeOutput(sys.stdout, taxInfoDict, c) | |
64 else: | |
65 with open(args.output, 'wb') as out: | |
66 writeOutput(out, taxInfoDict, c) | |
67 | |
68 | |
69 | |
70 | |
71 | |
72 | |
73 | |
74 | |
75 """ | |
76 2 Fats Saturated fat | |
77 3 Fats Unsaturated fat Monounsaturated fat | |
78 3 Fats Unsaturated fat Polyunsaturated fat | |
79 13 Carbohydrates Sugars | |
80 4 Carbohydrates Dietary fiber | |
81 21 Carbohydrates | |
82 5 Protein | |
83 4 | |
84 """ | |
85 | |
86 | |
87 | |
88 """ | |
89 handle = Entrez.efetch(db="Taxonomy", id="1323524", retmode="xml") | |
90 records = Entrez.read(handle) | |
91 records[0]["Lineage"] | |
92 'Viruses; dsRNA viruses; Partitiviridae; Betapartitivirus' | |
93 records[0]["LineageEx"] | |
94 [{u'ScientificName': 'Viruses', u'TaxId': '10239', u'Rank': 'superkingdom'}, {u'ScientificName': 'dsRNA viruses', u'TaxId': '35325', u'Rank': 'no rank'}, {u'ScientificName': 'Partitiviridae', u'TaxId': '11012', u'Rank': 'family'}, {u'ScientificName': 'Betapartitivirus', u'TaxId': '1511809', u'Rank': 'genus'}] | |
95 """ | |
96 | |
97 """ | |
98 {u'Lineage': 'Viruses; dsDNA viruses, no RNA stage; Iridoviridae; unclassified Iridoviridae', u'Division': 'Viruses', u'ParentTaxId': '180169', u'PubDate': '2014/03/16 07:01:27', u'LineageEx': [{u'ScientificName': 'Viruses', u'TaxId': '10239', u'Rank': 'superkingdom'}, {u'ScientificName': 'dsDNA viruses, no RNA stage', u'TaxId': '35237', u'Rank': 'no rank'}, {u'ScientificName': 'Iridoviridae', u'TaxId': '10486', u'Rank': 'family'}, {u'ScientificName': 'unclassified Iridoviridae', u'TaxId': '180169', u'Rank': 'no rank'}], u'CreateDate': '2014/02/24 15:56:28', u'TaxId': '1465751', u'Rank': 'species', u'GeneticCode': {u'GCId': '1', u'GCName': 'Standard'}, | |
99 u'ScientificName': 'Anopheles minimus irodovirus', u'MitoGeneticCode': {u'MGCId': '0', u'MGCName': 'Unspecified'}, u'UpdateDate': '2014/02/24 15:56:29'} | |
100 | |
101 """ |