Mercurial > repos > cschu > kraken_tools
comparison kraken_summarize.py @ 14:fd27c97c8366 draft default tip
Uploaded
| author | cschu |
|---|---|
| date | Mon, 18 May 2015 15:55:47 -0400 |
| parents | 0916697409ea |
| children |
comparison
equal
deleted
inserted
replaced
| 13:dc02dbf5e1a9 | 14:fd27c97c8366 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import sys | |
| 3 import subprocess | |
| 4 import argparse | |
| 5 | |
| 6 from collections import Counter | |
| 7 | |
| 8 from Bio import Entrez | |
| 9 | |
| 10 | |
| 11 | |
| 12 # U HWI-ST933:109:C2A2NACXX:5:1101:1311:2013 0 50 Q:0 | |
| 13 def fetchTaxonomyData(ids, email='christian.schudoma@tsl.ac.uk'): | |
| 14 Entrez.email = email | |
| 15 handle = Entrez.efetch(db='Taxonomy', id=','.join(ids), retmode='xml') | |
| 16 records = Entrez.read(handle) | |
| 17 return records | |
| 18 | |
| 19 def writeKronaInput(fi, taxInfo, unclassified=0): | |
| 20 if unclassified: | |
| 21 fi.write('%i\tUnclassified\n' % unclassified) | |
| 22 for tid in sorted(taxInfo, key=lambda x:taxInfo[x][0]['Lineage']): | |
| 23 fi.write('%i\t%s\n' % (taxInfo[tid][1], '; '.join([taxInfo[tid][0]['Lineage'].strip(), taxInfoDict[tid][0]['ScientificName']]).replace('; ', '\t').strip('\t'))) | |
| 24 pass | |
| 25 | |
| 26 def writeOutput(out, taxInfoDict, c): | |
| 27 for tid in sorted(taxInfoDict, reverse=True, key=lambda x:taxInfoDict[x][1]): | |
| 28 data = [tid, taxInfoDict[tid][1], taxInfoDict[tid][0]['TaxId'], taxInfoDict[tid][0]['Lineage'], taxInfoDict[tid][0]['ScientificName']] | |
| 29 out.write('\t'.join(data) + '\n') | |
| 30 data = (sum(c.values()) - c['0'], sum(c.values()), (sum(c.values()) - c['0']) / float(sum(c.values())) * 100, c['0']) | |
| 31 out.write('%i/%i (%.5f%%) classified, %i unclassified\n' % data) | |
| 32 | |
| 33 | |
| 34 def main(argv): | |
| 35 | |
| 36 parser = argparse.ArgumentParser(description='') | |
| 37 parser.add_argument('--krona-output', type=str, default='') | |
| 38 parser.add_argument('--output', type=str) | |
| 39 parser.add_argument('--call-krona', type=str) | |
| 40 parser.add_argument('--include-unclassified') | |
| 41 parser.add_argument('kraken_summary_tsv', type=str) | |
| 42 args = parser.parse_args() | |
| 43 | |
| 44 c = Counter(line.strip().split()[2] for line in open(sys.argv[1])) | |
| 45 taxids = sorted(c.keys(), key=lambda x:c[x], reverse=True) | |
| 46 taxData = fetchTaxonomyData(taxids) | |
| 47 taxInfoDict = {tinfo['TaxId']: [tinfo, c[tinfo['TaxId']]] for tinfo in taxData} | |
| 48 | |
| 49 kr_out = None | |
| 50 if 'krona_output' in args: | |
| 51 with open(args.krona_output, 'wb') as kr_out: | |
| 52 if 'include_unclassified' in args: | |
| 53 writeKronaInput(kr_out, taxInfoDict, unclassified=c['0']) | |
| 54 else: | |
| 55 writeKronaInput(kr_out, taxInfoDict) | |
| 56 | |
| 57 if 'call_krona' in args: | |
| 58 p = subprocess.Popen('source krona_tools-2.4; ktImportText -o %s %s' % (args.call_krona, args.krona_output), shell=True, stdout=subprocess.PIPE) | |
| 59 p.communicate() | |
| 60 | |
| 61 if 'output' in args: | |
| 62 if args.output == '-': | |
| 63 writeOutput(sys.stdout, taxInfoDict, c) | |
| 64 else: | |
| 65 with open(args.output, 'wb') as out: | |
| 66 writeOutput(out, taxInfoDict, c) | |
| 67 | |
| 68 | |
| 69 | |
| 70 | |
| 71 | |
| 72 | |
| 73 | |
| 74 | |
| 75 """ | |
| 76 2 Fats Saturated fat | |
| 77 3 Fats Unsaturated fat Monounsaturated fat | |
| 78 3 Fats Unsaturated fat Polyunsaturated fat | |
| 79 13 Carbohydrates Sugars | |
| 80 4 Carbohydrates Dietary fiber | |
| 81 21 Carbohydrates | |
| 82 5 Protein | |
| 83 4 | |
| 84 """ | |
| 85 | |
| 86 | |
| 87 | |
| 88 """ | |
| 89 handle = Entrez.efetch(db="Taxonomy", id="1323524", retmode="xml") | |
| 90 records = Entrez.read(handle) | |
| 91 records[0]["Lineage"] | |
| 92 'Viruses; dsRNA viruses; Partitiviridae; Betapartitivirus' | |
| 93 records[0]["LineageEx"] | |
| 94 [{u'ScientificName': 'Viruses', u'TaxId': '10239', u'Rank': 'superkingdom'}, {u'ScientificName': 'dsRNA viruses', u'TaxId': '35325', u'Rank': 'no rank'}, {u'ScientificName': 'Partitiviridae', u'TaxId': '11012', u'Rank': 'family'}, {u'ScientificName': 'Betapartitivirus', u'TaxId': '1511809', u'Rank': 'genus'}] | |
| 95 """ | |
| 96 | |
| 97 """ | |
| 98 {u'Lineage': 'Viruses; dsDNA viruses, no RNA stage; Iridoviridae; unclassified Iridoviridae', u'Division': 'Viruses', u'ParentTaxId': '180169', u'PubDate': '2014/03/16 07:01:27', u'LineageEx': [{u'ScientificName': 'Viruses', u'TaxId': '10239', u'Rank': 'superkingdom'}, {u'ScientificName': 'dsDNA viruses, no RNA stage', u'TaxId': '35237', u'Rank': 'no rank'}, {u'ScientificName': 'Iridoviridae', u'TaxId': '10486', u'Rank': 'family'}, {u'ScientificName': 'unclassified Iridoviridae', u'TaxId': '180169', u'Rank': 'no rank'}], u'CreateDate': '2014/02/24 15:56:28', u'TaxId': '1465751', u'Rank': 'species', u'GeneticCode': {u'GCId': '1', u'GCName': 'Standard'}, | |
| 99 u'ScientificName': 'Anopheles minimus irodovirus', u'MitoGeneticCode': {u'MGCId': '0', u'MGCName': 'Unspecified'}, u'UpdateDate': '2014/02/24 15:56:29'} | |
| 100 | |
| 101 """ |
