Mercurial > repos > cschu > kraken_tools
changeset 14:fd27c97c8366 draft default tip
Uploaded
author | cschu |
---|---|
date | Mon, 18 May 2015 15:55:47 -0400 |
parents | dc02dbf5e1a9 |
children | |
files | kraken.py kraken.xml krakenDBs.loc.sample kraken_combine.py kraken_combine.xml kraken_count.py kraken_filter.py kraken_filter.xml kraken_summarize.py kraken_summarize.xml kraken_visualize.py repository_dependencies.xml tool_data_table_conf.xml.sample tool_dependencies.xml |
diffstat | 14 files changed, 1100 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kraken.py Mon May 18 15:55:47 2015 -0400 @@ -0,0 +1,150 @@ +#!/usr/bin/env python + +import os +import sys +import argparse +import subprocess +import struct +import shutil +import tempfile + +# from collections import Counter + +from checkFileFormat import verifyFileFormat +DEFAULT_KRAKEN_DB = '/tsl/data/krakendb/ktest/db1' + +def main(argv): + + descr = '' + parser = argparse.ArgumentParser(description=descr) + parser.add_argument('--dbtype', default='builtinDB', help='Is database built-in or supplied externally?') + parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db') + parser.add_argument('--in1', help='Single-end reads or left paired-end reads') + parser.add_argument('--in2', help='Right paired-end reads') + parser.add_argument('--quick', action='store_true', help='Use quick mode?') + parser.add_argument('--min-hits', default=1, help='Minimum number of hits required.') + parser.add_argument('--input-format', help='Input sequences stored in fa or fq file(s).', default='fq') + parser.add_argument('kraken_summary_tsv', type=str, help='The output file.') + # parser.add_argument('classified_seqs_fq', type=str, help='The output file.') + # parser.add_argument('unclassified_seqs_fq', type=str, help='The output file.') + args = parser.parse_args() + + # kraken --preload --db /tsl/scratch/macleand/ktest/db --threads 12 --quick --classified-out classified_seqs.fq --unclassified-out unclassified.fq --fastq-input --min-hits 1 --output classification.txt left_reads.fq right_reads.fq + + kraken_params = ['--preload', '--threads', '8', + '--unclassified-out', args.unclassified_seqs_fq, + '--output', args.kraken_summary_tsv] + # '--classified-out', args.classified_seqs_fq, + kraken_input = [] + + if 'db' not in args or not os.path.exists(args.db): + sys.stderr.write('Error: database is missing.\n') + sys.exit(1) + kraken_params.extend(['--db', args.db]) + # check whether input file(s) exist(s) + if not ('in1' in args and os.path.exists(args.in1)): + sys.stderr.write('Error: fwd/single input file (%s) is missing.\n' % args.in1) + sys.exit(1) + if not verifyFileFormat(args.in1, args.input_format): + fc = open(args.in1).read(1) + sys.stderr.write('Error: fwd/single input file has the wrong format assigned (%s). Starts with \'%s\'\n' % (args.input_format, fc)) + sys.exit(1) + kraken_input.append(args.in1) + if 'in2' in args: + if args.in2 is not None and not os.path.exists(args.in2): + sys.stderr.write('Error: right input file (%s) is missing.\n' % args.in2) + sys.exit(1) + elif args.in2 is not None and not verifyFileFormat(args.in2, args.input_format): + sys.stderr.write('Error: rev input file has the wrong format assigned.\n') + sys.exit(1) + elif args.in2 is not None: + kraken_params.append('--paired') + kraken_input.append(args.in2) + else: + pass + else: + pass + + if 'quick' in args: + kraken_params.append('--quick') + if 'min_hits' in args: + kraken_params.extend(['--min-hits', str(args.min_hits)]) + + if args.input_format == 'fq': + kraken_params.append('--fastq-input') + else: + kraken_params.append('--fasta-input') + + + + + + + + + + + + + + + # check whether file is gzipped + header = '' + + cmd = 'source kraken-0.10.5; ' + cmd += ' '.join(['kraken'] + kraken_params + kraken_input) + '\n' + # out = open(argv[-1], 'wb').write(str(cmd) + '\n')#.write('KRAKEN OUTPUT, HAS COUNTER\n') + # out = open(argv[-1], 'wb').write(str(args) + '\n')#.write('KRAKEN OUTPUT, HAS COUNTER\n') + + # proc = subprocess.Popen(args=cmd, shell=True, stderr=sys.stderr) + # returncode = proc.wait() + + + tmp_dir = tempfile.mkdtemp() + try: + tmp = tempfile.NamedTemporaryFile(dir=tmp_dir).name + tmp_stderr = open(tmp, 'wb') + proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno()) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open(tmp, 'rb') + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += tmp_stderr.read(buffsize) + if not stderr or len(stderr) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + # clean up temp dirs + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + sys.stderr.write('Error running kraken: %s\n' % str(e)) + sys.exit(1) + + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + """ + open(args.kraken_summary_tsv, 'wb').write('\t'.join(list('ACGT')) + '\n') + open(args.classified_seqs_fq, 'wb').write(cmd + '\n') + open(args.unclassified_seqs_fq, 'wb').write('blruah\n') # check whether the database exists, if not exit with error + """ + """ + for arg in args: + out.write(str(arg) + '\n') + out.close() + """ + pass + + + + +# main(sys.argv[1:]) + +if __name__ == '__main__': main(sys.argv[1:])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kraken.xml Mon May 18 15:55:47 2015 -0400 @@ -0,0 +1,112 @@ +<tool id="kraken" name="kraken"> + <description>taxonomic sequence classifier</description> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="0.10.5">kraken</requirement> + </requirements> + <command interpreter="python">kraken.py + --dbtype="${databaseChoice.database}" + #if $databaseChoice.database == "builtinDB": + --db="${databaseChoice.builtinDatabases.fields.path}" + #else: + --db="NOT_SUPPORTED_YET" + #end if + + --in1="${dataFormat.input1}" + #if $dataFormat.inputFormat == "pairedFastQ" or $dataFormat.inputFormat == "pairedFastA": + --in2="${dataFormat.input2}" + #end if + + #if $dataFormat.inputFormat == "singleFastQ" or $dataFormat.inputFormat == "pairedFastQ": + --input-format="fq" + #else: + --input-format="fa" + #end if + + #if $quickMode.useQuickMode == "useQuick": + --quick + --min-hits="${quickMode.minHits.value}" + #end if + + <!-- $output --> + $kraken_summary_tsv + <!-- $classified_seqs_fq + $unclassified_seqs_fq --> + </command> + <inputs> + <conditional name="databaseChoice"> + <param name="database" type="select" label="Do you want to use your own kraken database or use the built-in ktest database?"> + <option value="builtinDB">Use built-in db</option> + <option value="myownDB">Use a db from my history</option> + </param> + <when value="builtinDB"> + <param name="builtinDatabases" type="select" label="Select a built-in kraken database"> + <options from_data_table="krakenDBs"> + <filter type="sort_by" column="2" /> + <validator type="no_options" message="No databases are available" /> + </options> + </param> + </when> + <when value="myownDB"> + <param name="myownDB" type="data" format="fasta" metadata_name="dbkey" label="Select database" /> + </when> + </conditional> + + <conditional name="quickMode"> + <param name="useQuickMode" type="select" label="Use Quick operation? (use first hit or hits)"> + <option value="useQuick">Use Quick operation</option> + <option value="dontUseQuick">Do not use Quick operation</option> + </param> + <when value="useQuick"> + <param name="minHits" type="integer" label="Minimum number of hits required for classification (default:1)" value="1" default="1" /> + </when> + </conditional> + +<!-- <conditional name="libraryType"> + <param name="libtype" type="select" label="Is this library single- or paired-end?"> + <option value="single">Single-end</option> + <option value="paired">Paired-end</option> + </param> + <when value="single"> + <param name="input1" type="data" format="fasta,fastq,fasta.gz,fasta.bz2,fa.gz,fq.gz,fa,fq" label="Fasta/Fastq file" /> + </when> + <when value="paired"> + <param name="input1" type="data" format="fasta,fastq,fasta.gz,fasta.bz2,fa.gz,fq.gz,fa,fq" label="Left Fasta/Fastq file" /> + <param name="input2" type="data" format="fasta,fastq,fasta.gz,fasta.bz2,fa.gz,fq.gz,fa,fq" label="Right Fasta/Fastq file" /> + </when> + </conditional> --> + <conditional name="dataFormat"> + <param name="inputFormat" type="select" label="Please select input file type and library type."> + <option value="singleFastQ">Single-end FastQ</option> + <option value="pairedFastQ">Paired-end FastQ</option> + <option value="singleFastA">Single-end FastA</option> + <option value="pairedFastA">Paired-end FastA</option> + </param> + <when value="singleFastQ"> + <param name="input1" type="data" format="fastq,fastqillumina,fastqsanger,fq" label="FastQ file" /> + </when> + <when value="pairedFastQ"> + <param name="input1" type="data" format="fastq,fastqillumina,fastqsanger,fq" label="Forward/Left FastQ file" /> + <param name="input2" type="data" format="fastq,fastqillumina,fastqsanger,fq" label="Reverse/Right FastQ file" /> + </when> + <when value="singleFastA"> + <param name="input1" type="data" format="fasta, fa" label="FastA file" /> + </when> + <when value="pairedFastA"> + <param name="input1" type="data" format="fasta, fa" label="Forward/Left FASTA file" /> + <param name="input2" type="data" format="fasta, fa" label="Reverse/Right FASTA file" /> + </when> + </conditional> + + + </inputs> + <outputs> + <data format="tabular" name="kraken_summary_tsv" label="${tool.name} output summary of ${on_string}" /> + <!-- <data name="classified_seqs_fq" format_source="input1" label="${tool.name}-classified sequences from ${on_string}" /> + <data name="unclassified_seqs_fq" format_source="input1" label="${tool.name}-unclassified sequences from ${on_string}" /> --> + </outputs> + + <help> + This tool wraps the kraken taxonomic sequence classifier. + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/krakenDBs.loc.sample Mon May 18 15:55:47 2015 -0400 @@ -0,0 +1,1 @@ +#<unique_build_id> <dbkey> <display_name> <file_path>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kraken_combine.py Mon May 18 15:55:47 2015 -0400 @@ -0,0 +1,157 @@ +#!/usr/bin/env python +import sys +import argparse + +def anabl_getContigsFromFASTQ(fn): + with open(fn) as fi: + for head in fi: + try: + seq, sep, qual = map(lambda x:x.strip(), [fi.next() for i in xrange(3)]) + yield head[1:], seq, qual + except: + break + pass + +def anabl_getContigsFromFASTA(fn): + with open(fn) as fi: + for head in fi: + try: + yield head[1:], fi.next().strip() + except: + break + pass + +def toFASTQ(fields): + return '@%s\n%s\n+\n%s\n' % fields +def toFASTA(fields): + return '>%s\n%s\n' % fields +def getFASTQID(head): + return head.strip().split()[0].strip('@').strip('/1') +def getFASTAID(head): + return head.strip().split()[0].strip('>').strip('/1') + + + +def main(argv): + DEFAULT_KRAKEN_DB= '' + parser = argparse.ArgumentParser(description='') + parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db.') # we don't need taxonomy here + parser.add_argument('--in1', default='', help='Forward read file from metagenomic sample.') + parser.add_argument('--in2', default='', help='Reverse read file from metagenomic sample.') + parser.add_argument('--kraken-results1', help='File containing kraken classification for set1.') + parser.add_argument('--kraken-results2', help='File containing kraken classification for set2.') + parser.add_argument('--input-format', default='fa', help='fa (FASTA) or fq (FASTQ)') + parser.add_argument('--set1-output-left', type=str) + parser.add_argument('--set1-output-right', type=str) + parser.add_argument('--set2-output-left', type=str) + parser.add_argument('--set2-output-right', type=str) + parser.add_argument('--unclassified-output-left', type=str) + parser.add_argument('--unclassified-output-right', type=str) + parser.add_argument('--intersection-output-left', type=str) + parser.add_argument('--intersection-output-right', type=str) + args = parser.parse_args() + + if args.input_format == 'fa': + getContigs = anabl_getContigsFromFASTA + writeFXRecord = toFASTA + getFXID = getFASTAID + elif args.input_format == 'fq': + getContigs = anabl_getContigsFromFASTQ + writeFXRecord = toFASTQ + getFXID = getFASTQID + else: + sys.stderr.write('Error: Unknown input format (%s).\n' % args.input_format) + sys.exit() + + try: + set1 = [line.strip().split()[:2] for line in open(args.kraken_results1)] + except: + sys.stderr.write('Error: Set1 is missing, please specify --kraken_results1 parameter.\n') + sys.exit() + + nReads = len(set1) + set1 = set(map(lambda x:x[1], filter(lambda x:x[0].startswith('C'), set1))) + + try: + set2 = set([line.strip().split()[1] for line in open(args.kraken_results2) if line.startswith('C')]) + except: + sys.stderr.write('Error: Set2 is missing, please specify --kraken_results2 parameter.\n') + sys.exit() + + try: + set1_fwd = open(args.set1_output_left, 'wb') + set2_fwd = open(args.set2_output_left, 'wb') + noclass_fwd = open(args.unclassified_output_left, 'wb') + undecided_fwd = open(args.intersection_output_left, 'wb') + except: + sys.stderr.write('Error: Cannot open fwd outputfile(s).\n') + sys.exit() + + try: + fwd = getContigs(args.in1) + except: + sys.stderr.write('Error: Cannot open file %s.\n' % args.in1) + sys.exit() + + rev = None + if args.in2: + try: + rev = getContigs(args.in2) + except: + sys.stderr.write('Error: Cannot open file %s.\n' % args.in2) + sys.exit() + try: + set1_rev = open(args.set1_output_right, 'wb') + set2_rev = open(args.set2_output_right, 'wb') + noclass_rev = open(args.unclassified_output_right, 'wb') + undecided_rev = open(args.intersection_output_right, 'wb') + except: + sys.stderr.write('Error: Cannot open rev outputfile(s).\n') + sys.exit() + + + for i in xrange(nReads): + try: + r1 = fwd.next() + except: + break + r2 = None + if rev is not None: + try: + r2 = rev.next() + except: + break + + id_ = getFXID(r1[0]) + + if id_ in set1: + if id_ not in set2: + set1_fwd.write(writeFXRecord(r1)) + if set1_rev: + set1_rev.write(writeFXRecord(r2)) + else: + undecided_fwd.write(writeFXRecord(r1)) + if undecided_rev: + undecided_rev.write(writeFXRecord(r2)) + + elif id_ in set2: + set2_fwd.write(writeFXRecord(r1)) + if set2_rev: + set2_rev.write(writeFXRecord(r2)) + else: + noclass_fwd.write(writeFXRecord(r1)) + if noclass_rev: + noclass_rev.write(writeFXRecord(r2)) + pass + + set1_fwd.close() + set1_rev.close() + set2_fwd.close() + set2_rev.close() + noclass_fwd.close() + noclass_rev.close() + undecided_fwd.close() + undecided_rev.close() + pass + +if __name__ == '__main__': main(sys.argv)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kraken_combine.xml Mon May 18 15:55:47 2015 -0400 @@ -0,0 +1,83 @@ +<tool id="kraken_combine" name="kraken_combine"> + <description>extract unique and common sequences from two kraken runs</description> + <requirements> + <requirement type="package" version="2.7">python</requirement> + </requirements> + <command interpreter="python">kraken_combine.py + --in1="${dataFormat.input1}" + --set1-output-left=${set1OutputLeft} + --set2-output-left=${set2OutputLeft} + --unclassified-output-left=${unclassifiedOutputLeft} + --intersection-output-left=${intersectionOutputLeft} + --kraken-results1="${classificationSet1}" + --kraken-results2="${classificationSet2}" + + #if $dataFormat.inputFormat == "pairedFASTQ" or $dataFormat.inputFormat == "pairedFASTA": + --in2="${dataFormat.input2}" + --set1-output-right=${set1OutputRight} + --set2-output-right=${set2OutputRight} + --unclassified-output-right=${unclassifiedOutputRight} + --intersection-output-right=${intersectionOutputRight} + #end if + + #if $dataFormat.inputFormat == "singleFASTQ" or $dataFormat.inputFormat == "pairedFASTQ": + --input-format="fq" + #else: + --input-format="fa" + #end if + + + + </command> + <inputs> + <param name="classificationSet1" type="data" format="tabular" label="Kraken classification output set1" /> + <param name="classificationSet2" type="data" format="tabular" label="Kraken classification output set2" /> + + <conditional name="dataFormat"> + <param name="inputFormat" type="select" label="Please select input file type and library type."> + <option value="singleFASTQ">Single-end FASTQ</option> + <option value="pairedFASTQ">Paired-end FASTQ</option> + <option value="singleFASTA">Single-end FASTA</option> + <option value="pairedFASTA">Paired-end FASTA</option> + </param> + <when value="singleFASTQ"> + <param name="input1" type="data" format="fastq,fq.gz,fq" label="FASTQ file" /> + </when> + <when value="pairedFASTQ"> + <param name="input1" type="data" format="fastq,fq.gz,fq" label="Forward/Left FASTQ file" /> + <param name="input2" type="data" format="fastq,fq.gz,fq" label="Reverse/Right FASTQ file" /> + </when> + <when value="singleFASTA"> + <param name="input1" type="data" format="fasta,fasta.gz,fasta.bz2,fa.gz" label="FASTA file" /> + </when> + <when value="pairedFASTA"> + <param name="input1" type="data" format="fasta,fasta.gz,fasta.bz2,fa.gz" label="Forward/Left FASTA file" /> + <param name="input2" type="data" format="fasta,fasta.gz,fasta.bz2,fa.gz" label="Reverse/Right FASTA file" /> + </when> + </conditional> + </inputs> + + <outputs> + <data format="input1" name="set1OutputLeft" label="Taxonomy1-unique sequences (R1) of ${on_string}" /> + <data format="input1" name="set2OutputLeft" label="Taxonomy2-unique sequences (R1) of ${on_string}" /> + <data format="input1" name="intersectionOutputLeft" label="Common sequences (R1) of ${on_string}" /> + <data format="input1" name="unclassifiedOutputLeft" label="Unclassified sequences (R1) of ${on_string}" /> + + <data format="input2" name="set1OutputRight" label="Taxonomy1-unique sequences (R2) of ${on_string}"> + <filter>inputFormat == "pairedFASTQ" or inputFormat == "pairedFASTA"</filter> + </data> + <data format="input2" name="set2OutputRight" label="Taxonomy2-unique sequences (R2) of ${on_string}"> + <filter>inputFormat == "pairedFASTQ" or inputFormat == "pairedFASTA"</filter> + </data> + <data format="input2" name="intersectionOutputRight" label="Common sequences (R2) of ${on_string}"> + <filter>inputFormat == "pairedFASTQ" or inputFormat == "pairedFASTA"</filter> + </data> + <data format="input2" name="unclassifiedOutputRight" label="Unclassified sequences (R2) of ${on_string}"> + <filter>inputFormat == "pairedFASTQ" or inputFormat == "pairedFASTA"</filter> + </data> + </outputs> + + <help> + This tool compares the classification results on a dataset using two different kraken-databases. + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kraken_count.py Mon May 18 15:55:47 2015 -0400 @@ -0,0 +1,27 @@ +#!/usr/bin/env python + +import os +import sys +import csv + +from collections import Counter + +from kraken_visualize import readTaxonomyNames, readTaxonomyNodes, getDescendents + + +taxons = readTaxonomyNames(os.path.join(sys.argv[2], 'names.dmp')) + +taxID = int(sys.argv[3]) +validTaxons = getDescendents(taxID, readTaxonomyNodes(os.path.join(sys.argv[2], 'nodes.dmp'))[1]) + +c = Counter([int(row[2]) + for row in csv.reader(open(sys.argv[1]), delimiter='\t', quotechar='"')]) + +N = float(sum(c.values())) +ct = 0 +for k in sorted(c, key=lambda x:c[x], reverse=True): + if k in validTaxons: + print k, taxons.get(k), c[k], '%.10f' % (c[k]/N) + ct += c[k] + # print k, taxons.get(k, 'N/A'), c[k], 'VALID' if k in validTaxons else '' +print ct, ct/N
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kraken_filter.py Mon May 18 15:55:47 2015 -0400 @@ -0,0 +1,254 @@ +#!/usr/bin/env python + +import os +import sys +import argparse +import subprocess +import struct +import shutil +import tempfile + +from checkFileFormat import verifyFileFormat + +DEFAULT_KRAKEN_DB='/tsl/data/krakendb/ktest/db' +logfile = None + + +def getDescendents(taxID, tree): + descendents = set([taxID]) + queue = [taxID] + while queue: + node = queue.pop() + + children = tree.get(node, set()) + if children: + descendents = descendents.union(children) + queue.extend(children) + pass + return descendents + +def anabl_getSeqsFromFastX(fn, X=2): + it = open(fn) + for head in it: + try: + yield (head.strip(), tuple(map(lambda x:x.strip(), + ([it.next() for i in xrange(X - 1)])))) + except: + break + pass + it.close() + pass + +def readTaxonomy(db): + nodeNames = [] + for line in open(os.path.join(db, 'taxonomy', 'names.dmp')): + line = line.strip().strip('|').strip() + if not line: break + line = line.split('\t|\t') + + if line[3].strip() == 'scientific name': + nodeNames.append((int(line[0].strip()), line[1].strip())) + # break + pass + + nodeRanks = [] + nodeChildren = {} + for line in open(os.path.join(db, 'taxonomy', 'nodes.dmp')): + line = line.strip().strip('|').strip() + if not line: break + line = map(lambda x:x.strip(), line.split('\t|\t')) + line[:2] = map(int, line[:2]) + if line[0] == 1: + line[1] = 0 + try: + nodeChildren[line[1]].add(line[0]) + except: + nodeChildren[line[1]] = set([line[0]]) + nodeRanks.append((line[0], line[2])) + # break + + return dict(nodeNames), dict(nodeRanks), nodeChildren + +def getFastqIdentifier(string): + if string.endswith('/1') or string.endswith('/2'): + return string[1:-2] + else: + return string.split()[0][1:] + +def runFilter(db, taxID, inputClassification, inputR1, outputR1, inputR2=None, outputR2=None, fastx=4, debug=False): + global logfile + logfile.write('Reading taxonomy...\n') + logfile.flush() + taxonomyInfo = readTaxonomy(db) + keepSequences = set() + logfile.write('Traversing requested taxonomy branch...\n') + logfile.flush() + validIDs = getDescendents(taxID, taxonomyInfo[2]) + logfile.write('Family tree of %s has %i descendents.\n' % (str(taxID), len(validIDs))) + logfile.flush() + if debug: + for vid in validIDs: + logfile.write('Added %s to validIDs.\n' % vid) + logfile.flush() + + + logfile.write('Filtering sequences...\n') + logfile.flush() + nseqs = 0 + for line in open(inputClassification): + nseqs += 1 + line = line.strip().split() + takeUnclassified = taxID == 0 and line[0] == 'U' + takeClassified = line[0] == 'C' and int(line[2]) in validIDs + if takeUnclassified or takeClassified: + keepSequences.add(line[1].strip()) + if debug: + logfile.write('Added %s to keepSequences.\n' % line[1].strip()) + logfile.flush() + logfile.write('Keeping %i of %i sequences (%.1f).\n' % (len(keepSequences), nseqs, float(len(keepSequences))/nseqs)) + logfile.flush() + logfile.write('Writing filtered sequence sets...\n') + logfile.flush() + fwdOut = open(outputR1, 'wb') + fwdGen = anabl_getSeqsFromFastX(inputR1, X=fastx) + + revOut, revGen = None, None + revSid, revSeq = None, None + if outputR2 is not None and inputR2 is not None: + revOut = open(outputR2, 'wb') + revGen = anabl_getSeqsFromFastX(inputR2, X=fastx) + + fqid1, fqid2 = None, None + while True: + try: + fwdSid, fwdSeq = fwdGen.next() + fqid1 = getFastqIdentifier(fwdSid) + if revGen is not None: + revSid, revSeq = revGen.next() + fqid2 = getFastqIdentifier(revSid) + except: + break + if fqid1 != fqid2 and fqid2 is not None: + sys.stderr.write('Error: fqid-mismatch %s %s.\n' % (fqid1, fqid2)) + sys.exit(1) + if fqid1 in keepSequences: + fwdOut.write(('%s\n' * fastx) % ((fwdSid,) + fwdSeq)) + if revOut is not None: + revOut.write(('%s\n' * fastx) % ((revSid,) + revSeq)) + else: + # sys.stdout.write('%s is not in keepSequences\n' % fqid1) + pass + fwdOut.close() + if revOut is not None: + revOut.close() + pass + + +def main(argv): + + descr = '' + parser = argparse.ArgumentParser(description=descr) + parser.add_argument('--dbtype', default='builtinDB', help='Is database built-in or supplied externally?') + parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db') + parser.add_argument('--in1', help='The r1-file (single-end reads or left paired-end reads).') + parser.add_argument('--in2', help='The r2-file (right paired-end reads)') + parser.add_argument('--taxid', default=1, help='The root taxon id of the requested branch', type=int) + parser.add_argument('--kraken-results', type=str, help='A file containing kraken classification results for the input sequences.') + parser.add_argument('--input-format', help='Input sequences stored in fa or fq file(s).', default='fq') + + parser.add_argument('--out1', type=str, help='The r1-output file.') + parser.add_argument('--out2', type=str, help='The r2-output file.') + parser.add_argument('--debug', action='store_true') + parser.add_argument('--logfile', type=str, help='A logfile.', default='kraken_filter.log') + args = parser.parse_args() + + kraken_params = [] + kraken_input = [] + + global logfile + logfile = open(args.logfile, 'wb') + if 'db' not in args or not os.path.exists(args.db): + sys.stderr.write('Error: database is missing.\n') + sys.exit(1) + kraken_params.extend(['--db', args.db]) + # check whether input file(s) exist(s) + + if 'kraken_results' in args and os.path.exists(args.kraken_results): + kraken_input.append(args.kraken_results) + else: + sys.stderr.write('Error: kraken-classification is missing.\n') + sys.exit(1) + pass + + if not ('in1' in args and os.path.exists(args.in1)): + sys.stderr.write('Error: left/single input file (%s) is missing.\n' % args.in1) + sys.exit(1) + if not verifyFileFormat(args.in1, args.input_format): + sys.stderr.write('Error: fwd/single input file has the wrong format assigned.\n') + sys.exit(1) + + kraken_input.append(args.in1) + if 'in2' in args: + if args.in2 is not None and not os.path.exists(args.in2): + sys.stderr.write('Error: right input file (%s) is missing.\n' % args.in2) + sys.exit(1) + elif args.in2 is not None and not verifyFileFormat(args.in2, args.input_format): + sys.stderr.write('Error: rev input file has the wrong format assigned.\n') + sys.exit(1) + elif args.in2 is not None: + kraken_params.append('--paired') + kraken_input.append(args.in2) + else: + pass + else: + pass + + if 'out2' in args and 'in2' in args: + input2, output2 = args.in2, args.out2 + else: + input2, output2 = None, None + + if args.taxid < 0: + sys.stderr.write('Error: invalid taxon id %i\n' % args.taxid) + sys.exit(1) + kraken_params.extend(['--taxid', args.taxid]) + + if args.input_format == 'fq': + kraken_params.append('--fastq-input') + fastx = 4 + else: + kraken_params.append('--fasta-input') + fastx = 2 + + + + # firstChar = open(args.in1).read(1) + # if firstChar == '>': + # kraken_params.append('--fasta-input') + # fastx = 2 + # elif firstChar == '@': + # kraken_params.append('--fastq-input') + # fastx = 4 + #else: + # """ This will currently disable working with gzipped/bzipped files. """ + # sys.stderr.write('Error: Input file starts with unknown symbol %s.\n' % firstChar) + # sys.exit(1) + # pass + + #open(args.kraken_filtered_r1, 'wb').write('\n'.join(map(str, kraken_params + kraken_input))) + + runFilter(args.db, int(args.taxid), args.kraken_results, + args.in1, args.out1, + inputR2=input2, outputR2=output2, fastx=fastx, + debug=args.debug) + + logfile.close() + + pass + + + + +# main(sys.argv[1:]) + +if __name__ == '__main__': main(sys.argv[1:])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kraken_filter.xml Mon May 18 15:55:47 2015 -0400 @@ -0,0 +1,123 @@ +<tool id="kraken_filter" name="kraken_filter"> + <description>kraken result filter</description> + <requirements> + <requirement type="package" version="2.7">python</requirement> + </requirements> + <command interpreter="python">kraken_filter.py + --dbtype="${databaseChoice.database}" + #if $databaseChoice.database == "builtinDB": + --db="${databaseChoice.builtinDatabases.fields.path}" + #else: + --db="NOT_SUPPORTED_YET" + #end if + + --in1="${dataFormat.input1}" + --out1="${kraken_filtered_r1}" + + #if $dataFormat.inputFormat == "pairedFastQ" or $dataFormat.inputFormat == "pairedFastA": + --in2="${dataFormat.input2}" + --out2="${kraken_filtered_r2}" + #end if + + #if $dataFormat.inputFormat == "singleFastQ" or $dataFormat.inputFormat == "pairedFastQ": + --input-format="fq" + #else: + --input-format="fa" + #end if + + #if $whichFilter.whichFilterp == "filterClassified": + --taxid="${whichFilter.taxonID}" + #else: + --taxid="0" + #end if + + --kraken-results="${kraken_classification}" + + --logfile="${kraken_filter_log}" + </command> + <inputs> + <conditional name="databaseChoice"> + <param name="database" type="select" label="Do you want to use your own kraken database or use the built-in ktest database?"> + <option value="builtinDB">Use built-in db (ktest)</option> + <option value="myownDB">Use a db from my history</option> + </param> + <when value="builtinDB"> + <param name="builtinDatabases" type="select" label="Select a built-in kraken database"> + <options from_data_table="krakenDBs"> + <filter type="sort_by" column="2" /> + <validator type="no_options" message="No databases are available" /> + </options> + </param> + </when> + <when value="myownDB"> + <param name="myownDB" type="data" format="fasta" metadata_name="dbkey" label="Select database" /> + </when> + </conditional> + + <conditional name="whichFilter"> + <param name="whichFilterp" type="select" label="Filter classified or unclassified reads?"> + <option value="filterClassified">Filter classified reads</option> + <option value="filterUnclassified">Filter unclassified reads</option> + </param> + <when value="filterClassified"> + <param name="taxonID" type="integer" label="Taxonomy ID of root taxon (default:1 - 'all')" value="1" default="1" /> + </when> + </conditional> + + <param name = "kraken_classification" type="data" format="tabular" label="Kraken classification output" /> +<!-- + <conditional name="libraryType"> + <param name="libtype" type="select" label="Is this library single- or paired-end?"> + <option value="single">Single-end</option> + <option value="paired">Paired-end</option> + </param> + <when value="single"> + <param name="input1" type="data" format="fasta,fastq,fasta.gz,fasta.bz2,fa.gz,fq.gz,fa,fq" label="Fasta/Fastq file" /> + </when> + <when value="paired"> + <param name="input1" type="data" format="fasta,fastq,fasta.gz,fasta.bz2,fa.gz,fq.gz,fa,fq" label="Left Fasta/Fastq file" /> + <param name="input2" type="data" format="fasta,fastq,fasta.gz,fasta.bz2,fa.gz,fq.gz,fa,fq" label="Right Fasta/Fastq file" /> + </when> + </conditional> +--> +<!-- new --> + + <conditional name="dataFormat"> + <param name="inputFormat" type="select" label="Please select input file type and library type."> + <option value="singleFastQ">Single-end FASTQ</option> + <option value="pairedFastQ">Paired-end FASTQ</option> + <option value="singleFastA">Single-end FASTA</option> + <option value="pairedFastA">Paired-end FASTA</option> + </param> + <when value="singleFastQ"> + <param name="input1" type="data" format="fastq,fq.gz,fq" label="FASTQ file" /> + </when> + <when value="pairedFastQ"> + <param name="input1" type="data" format="fastq,fq.gz,fq" label="Forward/Left FASTQ file" /> + <param name="input2" type="data" format="fastq,fq.gz,fq" label="Reverse/Right FASTQ file" /> + </when> + <when value="singleFastA"> + <param name="input1" type="data" format="fasta,fasta.gz,fasta.bz2,fa.gz" label="FASTA file" /> + </when> + <when value="pairedFastA"> + <param name="input1" type="data" format="fasta,fasta.gz,fasta.bz2,fa.gz" label="Forward/Left FASTA file" /> + <param name="input2" type="data" format="fasta,fasta.gz,fasta.bz2,fa.gz" label="Reverse/Right FASTA file" /> + </when> + </conditional> + +<!-- new --> + + </inputs> + <outputs> + <data format="input1" name="kraken_filtered_r1" label="${tool.name} filtered fwd-reads of ${on_string}" /> + <data format="input2" name="kraken_filtered_r2" label="${tool.name} filtered rev-reads of ${on_string}"> + <filter>inputFormat == "pairedFastQ" or inputFormat == "pairedFastA</filter> + + </data> + <data format="txt" name="kraken_filter_log" label="${tool.name} logfile (${on_string})" /> + </outputs> + + <help> + This tool is a downstream filter for kraken-classified sequences. + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kraken_summarize.py Mon May 18 15:55:47 2015 -0400 @@ -0,0 +1,101 @@ +#!/usr/bin/env python +import sys +import subprocess +import argparse + +from collections import Counter + +from Bio import Entrez + + + +# U HWI-ST933:109:C2A2NACXX:5:1101:1311:2013 0 50 Q:0 +def fetchTaxonomyData(ids, email='christian.schudoma@tsl.ac.uk'): + Entrez.email = email + handle = Entrez.efetch(db='Taxonomy', id=','.join(ids), retmode='xml') + records = Entrez.read(handle) + return records + +def writeKronaInput(fi, taxInfo, unclassified=0): + if unclassified: + fi.write('%i\tUnclassified\n' % unclassified) + for tid in sorted(taxInfo, key=lambda x:taxInfo[x][0]['Lineage']): + fi.write('%i\t%s\n' % (taxInfo[tid][1], '; '.join([taxInfo[tid][0]['Lineage'].strip(), taxInfoDict[tid][0]['ScientificName']]).replace('; ', '\t').strip('\t'))) + pass + +def writeOutput(out, taxInfoDict, c): + for tid in sorted(taxInfoDict, reverse=True, key=lambda x:taxInfoDict[x][1]): + data = [tid, taxInfoDict[tid][1], taxInfoDict[tid][0]['TaxId'], taxInfoDict[tid][0]['Lineage'], taxInfoDict[tid][0]['ScientificName']] + out.write('\t'.join(data) + '\n') + data = (sum(c.values()) - c['0'], sum(c.values()), (sum(c.values()) - c['0']) / float(sum(c.values())) * 100, c['0']) + out.write('%i/%i (%.5f%%) classified, %i unclassified\n' % data) + + +def main(argv): + + parser = argparse.ArgumentParser(description='') + parser.add_argument('--krona-output', type=str, default='') + parser.add_argument('--output', type=str) + parser.add_argument('--call-krona', type=str) + parser.add_argument('--include-unclassified') + parser.add_argument('kraken_summary_tsv', type=str) + args = parser.parse_args() + + c = Counter(line.strip().split()[2] for line in open(sys.argv[1])) + taxids = sorted(c.keys(), key=lambda x:c[x], reverse=True) + taxData = fetchTaxonomyData(taxids) + taxInfoDict = {tinfo['TaxId']: [tinfo, c[tinfo['TaxId']]] for tinfo in taxData} + + kr_out = None + if 'krona_output' in args: + with open(args.krona_output, 'wb') as kr_out: + if 'include_unclassified' in args: + writeKronaInput(kr_out, taxInfoDict, unclassified=c['0']) + else: + writeKronaInput(kr_out, taxInfoDict) + + if 'call_krona' in args: + p = subprocess.Popen('source krona_tools-2.4; ktImportText -o %s %s' % (args.call_krona, args.krona_output), shell=True, stdout=subprocess.PIPE) + p.communicate() + + if 'output' in args: + if args.output == '-': + writeOutput(sys.stdout, taxInfoDict, c) + else: + with open(args.output, 'wb') as out: + writeOutput(out, taxInfoDict, c) + + + + + + + + +""" +2 Fats Saturated fat +3 Fats Unsaturated fat Monounsaturated fat +3 Fats Unsaturated fat Polyunsaturated fat +13 Carbohydrates Sugars +4 Carbohydrates Dietary fiber +21 Carbohydrates +5 Protein +4 +""" + + + +""" +handle = Entrez.efetch(db="Taxonomy", id="1323524", retmode="xml") +records = Entrez.read(handle) +records[0]["Lineage"] +'Viruses; dsRNA viruses; Partitiviridae; Betapartitivirus' +records[0]["LineageEx"] +[{u'ScientificName': 'Viruses', u'TaxId': '10239', u'Rank': 'superkingdom'}, {u'ScientificName': 'dsRNA viruses', u'TaxId': '35325', u'Rank': 'no rank'}, {u'ScientificName': 'Partitiviridae', u'TaxId': '11012', u'Rank': 'family'}, {u'ScientificName': 'Betapartitivirus', u'TaxId': '1511809', u'Rank': 'genus'}] +""" + +""" +{u'Lineage': 'Viruses; dsDNA viruses, no RNA stage; Iridoviridae; unclassified Iridoviridae', u'Division': 'Viruses', u'ParentTaxId': '180169', u'PubDate': '2014/03/16 07:01:27', u'LineageEx': [{u'ScientificName': 'Viruses', u'TaxId': '10239', u'Rank': 'superkingdom'}, {u'ScientificName': 'dsDNA viruses, no RNA stage', u'TaxId': '35237', u'Rank': 'no rank'}, {u'ScientificName': 'Iridoviridae', u'TaxId': '10486', u'Rank': 'family'}, {u'ScientificName': 'unclassified Iridoviridae', u'TaxId': '180169', u'Rank': 'no rank'}], u'CreateDate': '2014/02/24 15:56:28', u'TaxId': '1465751', u'Rank': 'species', u'GeneticCode': {u'GCId': '1', u'GCName': 'Standard'}, +u'ScientificName': 'Anopheles minimus irodovirus', u'MitoGeneticCode': {u'MGCId': '0', u'MGCName': 'Unspecified'}, u'UpdateDate': '2014/02/24 15:56:29'} + +"""
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kraken_summarize.xml Mon May 18 15:55:47 2015 -0400 @@ -0,0 +1,27 @@ +<tool id="kraken_summarize" name="kraken_summarize"> + <description></description> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.65">biopython<requirement> + </requirements> + <command interpreter="python">kraken_summarize.py + --krona-output=$krona_output + --output=$tax_summary + --call-krona=$krona_html + $kraken_summary_tsv + </command> + + + <inputs> + <param format="tabular" name="kraken_summary_tsv" type="data" label="Kraken summary table"/> + </inputs> + <outputs> + <data format="html" name="krona_html" label="Krona visualization of ${tool.name}" /> + <data format="tabular" name="krona_output" label="Krona input" /> + <data format="tabular" name="tax_summary" label="Taxonomy class summary" /> + </outputs> + + <help> + Krona visualisation and classification summary of Kraken runs. + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kraken_visualize.py Mon May 18 15:55:47 2015 -0400 @@ -0,0 +1,48 @@ +#!/usr/bin/env python + +import sys + +def readTaxonomyNames(names_dmp): + nodeNames = [] + for line in open(names_dmp): + line = line.strip().strip('|').strip() + if not line: break + line = line.split('\t|\t') + if line[3].strip() == 'scientific name': + nodeNames.append((int(line[0].strip()), line[1].strip())) + pass + return dict(nodeNames) + +def readTaxonomyNodes(nodes_dmp): + nodeRanks = [] + nodeChildren = {} + nodeParents = {} + for line in open(nodes_dmp): + line = line.strip().strip('|').strip() + if not line: break + line = map(lambda x:x.strip(), line.split('\t|\t')) + line[:2] = map(int, line[:2]) + if line[0] == 1: + line[1] = 1 + + nodeParents[line[0]] = line[1] + try: + nodeChildren[line[1]].add(line[0]) + except: + nodeChildren[line[1]] = set([line[0]]) + nodeRanks.append((line[0], line[2])) + + return dict(nodeRanks), nodeChildren, nodeParents + +def getDescendents(taxID, tree): + descendents = set([taxID]) + queue = [taxID] + while queue: + node = queue.pop() + + children = tree.get(node, set()) + if children: + descendents = descendents.union(children) + queue.extend(children) + pass + return descendents \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/repository_dependencies.xml Mon May 18 15:55:47 2015 -0400 @@ -0,0 +1,4 @@ +<?xml version="1.0"?> +<repositories description="Requires Biopython as a dependency."> + <repository changeset_revision="f8d72690eeae" name="package_biopython_1_65" owner="biopython" toolshed="https://testtoolshed.g2.bx.psu.edu" /> +</repositories>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Mon May 18 15:55:47 2015 -0400 @@ -0,0 +1,7 @@ +<tables> + <!-- Locations of kraken databases --> + <table name="krakenDBs" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/krakenDBs.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon May 18 15:55:47 2015 -0400 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="python" version="2.7"> + <repository changeset_revision="25b20b323706" name="package_python_2_7" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>