Mercurial > repos > cschu > kraken_tools

--- a/kraken.py	Mon May 18 15:29:05 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,150 +0,0 @@
-#!/usr/bin/env python
-
-import os
-import sys
-import argparse
-import subprocess
-import struct
-import shutil
-import tempfile
-
-# from collections import Counter
-
-from checkFileFormat import verifyFileFormat
-DEFAULT_KRAKEN_DB = '/tsl/data/krakendb/ktest/db1'
-
-def main(argv):
-
-    descr = ''
-    parser = argparse.ArgumentParser(description=descr)
-    parser.add_argument('--dbtype', default='builtinDB', help='Is database built-in or supplied externally?')
-    parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db')
-    parser.add_argument('--in1', help='Single-end reads or left paired-end reads')
-    parser.add_argument('--in2', help='Right paired-end reads')
-    parser.add_argument('--quick', action='store_true', help='Use quick mode?')
-    parser.add_argument('--min-hits', default=1, help='Minimum number of hits required.')
-    parser.add_argument('--input-format', help='Input sequences stored in fa or fq file(s).', default='fq')
-    parser.add_argument('kraken_summary_tsv', type=str, help='The output file.')
-    # parser.add_argument('classified_seqs_fq', type=str, help='The output file.')
-    # parser.add_argument('unclassified_seqs_fq', type=str, help='The output file.')
-    args = parser.parse_args()
-
-    #  kraken --preload --db /tsl/scratch/macleand/ktest/db --threads 12 --quick --classified-out classified_seqs.fq --unclassified-out unclassified.fq --fastq-input --min-hits 1 --output classification.txt left_reads.fq right_reads.fq
-
-    kraken_params = ['--preload', '--threads', '8',
-                     '--unclassified-out', args.unclassified_seqs_fq,
-                     '--output', args.kraken_summary_tsv]
-    #                '--classified-out', args.classified_seqs_fq,
-    kraken_input = []
-
-    if 'db' not in args or not os.path.exists(args.db):
-        sys.stderr.write('Error: database is missing.\n')
-        sys.exit(1)
-    kraken_params.extend(['--db', args.db])
-    # check whether input file(s) exist(s)
-    if not ('in1' in args and os.path.exists(args.in1)):
-        sys.stderr.write('Error: fwd/single input file (%s) is missing.\n' % args.in1)
-        sys.exit(1)
-    if not verifyFileFormat(args.in1, args.input_format):
-        fc = open(args.in1).read(1)
-        sys.stderr.write('Error: fwd/single input file has the wrong format assigned (%s). Starts with \'%s\'\n' % (args.input_format, fc))
-        sys.exit(1)
-    kraken_input.append(args.in1)
-    if 'in2' in args:
-        if args.in2 is not None and not os.path.exists(args.in2):
-            sys.stderr.write('Error: right input file (%s) is missing.\n' % args.in2)
-            sys.exit(1)
-        elif args.in2 is not None and not verifyFileFormat(args.in2, args.input_format):
-            sys.stderr.write('Error: rev input file has the wrong format assigned.\n')
-            sys.exit(1)
-        elif args.in2 is not None:
-            kraken_params.append('--paired')
-            kraken_input.append(args.in2)
-        else:
-            pass
-    else:
-        pass
-
-    if 'quick' in args:
-        kraken_params.append('--quick')
-        if 'min_hits' in args:
-            kraken_params.extend(['--min-hits', str(args.min_hits)])
-
-    if args.input_format == 'fq':
-        kraken_params.append('--fastq-input')
-    else:
-        kraken_params.append('--fasta-input')
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # check whether file is gzipped
-    header = ''
-
-    cmd = 'source kraken-0.10.5; '
-    cmd += ' '.join(['kraken'] + kraken_params + kraken_input) + '\n'
-    # out = open(argv[-1], 'wb').write(str(cmd) + '\n')#.write('KRAKEN OUTPUT, HAS COUNTER\n')
-    # out = open(argv[-1], 'wb').write(str(args) + '\n')#.write('KRAKEN OUTPUT, HAS COUNTER\n')
-
-    # proc = subprocess.Popen(args=cmd, shell=True, stderr=sys.stderr)
-    # returncode = proc.wait()
-
-
-    tmp_dir = tempfile.mkdtemp()
-    try:
-        tmp = tempfile.NamedTemporaryFile(dir=tmp_dir).name
-        tmp_stderr = open(tmp, 'wb')
-        proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno())
-        returncode = proc.wait()
-        tmp_stderr.close()
-        # get stderr, allowing for case where it's very large
-        tmp_stderr = open(tmp, 'rb')
-        stderr = ''
-        buffsize = 1048576
-        try:
-            while True:
-                stderr += tmp_stderr.read(buffsize)
-                if not stderr or len(stderr) % buffsize != 0:
-                    break
-        except OverflowError:
-            pass
-        tmp_stderr.close()
-        if returncode != 0:
-            raise Exception, stderr
-    except Exception, e:
-        # clean up temp dirs
-        if os.path.exists(tmp_dir):
-            shutil.rmtree(tmp_dir)
-        sys.stderr.write('Error running kraken: %s\n' % str(e))
-        sys.exit(1)
-
-    if os.path.exists(tmp_dir):
-        shutil.rmtree(tmp_dir)
-    """
-    open(args.kraken_summary_tsv, 'wb').write('\t'.join(list('ACGT')) + '\n')
-    open(args.classified_seqs_fq, 'wb').write(cmd + '\n')
-    open(args.unclassified_seqs_fq, 'wb').write('blruah\n')                      # check whether the database exists, if not exit with error
-    """
-    """
-    for arg in args:
-        out.write(str(arg) + '\n')
-    out.close()
-    """
-    pass
-
-
-
-
-# main(sys.argv[1:])
-
-if __name__ == '__main__': main(sys.argv[1:])
--- a/kraken.xml	Mon May 18 15:29:05 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,112 +0,0 @@
-<tool id="kraken" name="kraken">
-	<description>taxonomic sequence classifier</description>
-	<requirements>
-	  <requirement type="package" version="2.7.4">python</requirement>
-	  <requirement type="package" version="0.10.5">kraken</requirement>
-	</requirements>
-	<command interpreter="python">kraken.py
-		--dbtype="${databaseChoice.database}"
-		#if $databaseChoice.database == "builtinDB":
-		  --db="${databaseChoice.builtinDatabases.fields.path}"
-		#else:
-		  --db="NOT_SUPPORTED_YET"
-		#end if
-
-		--in1="${dataFormat.input1}"
-		#if $dataFormat.inputFormat == "pairedFastQ" or $dataFormat.inputFormat == "pairedFastA":
-		  --in2="${dataFormat.input2}"
-		#end if
-
-		#if $dataFormat.inputFormat == "singleFastQ" or $dataFormat.inputFormat == "pairedFastQ":
-		  --input-format="fq"
-		#else:
-		  --input-format="fa"
-		#end if
-
-                #if $quickMode.useQuickMode == "useQuick":
-                  --quick
-                  --min-hits="${quickMode.minHits.value}"
-                #end if
-
-	<!--		$output -->
-                $kraken_summary_tsv
-                <!-- $classified_seqs_fq
-                $unclassified_seqs_fq -->
-		</command>
-	<inputs>
-		<conditional name="databaseChoice">
-			<param name="database" type="select" label="Do you want to use your own kraken database or use the built-in ktest database?">
-				<option value="builtinDB">Use built-in db</option>
-				<option value="myownDB">Use a db from my history</option>
-			</param>
-			<when value="builtinDB">
-				<param name="builtinDatabases" type="select" label="Select a built-in kraken database">
-					<options from_data_table="krakenDBs">
-						<filter type="sort_by" column="2" />
-						<validator type="no_options" message="No databases are available" />
-					</options>
-				</param>
-			</when>
-			<when value="myownDB">
-				<param name="myownDB" type="data" format="fasta" metadata_name="dbkey" label="Select database" />
-			</when>
-		</conditional>
-
-		<conditional name="quickMode">
-			<param name="useQuickMode" type="select" label="Use Quick operation? (use first hit or hits)">
-				<option value="useQuick">Use Quick operation</option>
-				<option value="dontUseQuick">Do not use Quick operation</option>
-			</param>
-			<when value="useQuick">
-				<param name="minHits" type="integer" label="Minimum number of hits required for classification (default:1)" value="1" default="1" />
-			</when>
-		</conditional>
-
-<!--		<conditional name="libraryType">
-			<param name="libtype" type="select" label="Is this library single- or paired-end?">
-				<option value="single">Single-end</option>
-				<option value="paired">Paired-end</option>
-			</param>
-			<when value="single">
-				<param name="input1" type="data" format="fasta,fastq,fasta.gz,fasta.bz2,fa.gz,fq.gz,fa,fq" label="Fasta/Fastq file" />
-			</when>
-			<when value="paired">
-				<param name="input1" type="data" format="fasta,fastq,fasta.gz,fasta.bz2,fa.gz,fq.gz,fa,fq" label="Left Fasta/Fastq file" />
-				<param name="input2" type="data" format="fasta,fastq,fasta.gz,fasta.bz2,fa.gz,fq.gz,fa,fq" label="Right Fasta/Fastq file" />
-			</when>
-		</conditional> -->
-		<conditional name="dataFormat">
-			<param name="inputFormat" type="select" label="Please select input file type and library type.">
-				<option value="singleFastQ">Single-end FastQ</option>
-				<option value="pairedFastQ">Paired-end FastQ</option>
-				<option value="singleFastA">Single-end FastA</option>
-				<option value="pairedFastA">Paired-end FastA</option>
-			</param>
-			<when value="singleFastQ">
-				<param name="input1" type="data" format="fastq,fastqillumina,fastqsanger,fq" label="FastQ file" />
-			</when>
-			<when value="pairedFastQ">
-				<param name="input1" type="data" format="fastq,fastqillumina,fastqsanger,fq" label="Forward/Left FastQ file" />
-				<param name="input2" type="data" format="fastq,fastqillumina,fastqsanger,fq" label="Reverse/Right FastQ file" />
-			</when>
-			<when value="singleFastA">
-				<param name="input1" type="data" format="fasta, fa" label="FastA file" />
-			</when>
-			<when value="pairedFastA">
-				<param name="input1" type="data" format="fasta, fa" label="Forward/Left FASTA file" />
-				<param name="input2" type="data" format="fasta, fa" label="Reverse/Right FASTA file" />
-			</when>
-		</conditional>
-
-
-	</inputs>
-	<outputs>
-		<data format="tabular" name="kraken_summary_tsv" label="${tool.name} output summary of ${on_string}" />
-		<!-- <data name="classified_seqs_fq" format_source="input1" label="${tool.name}-classified sequences from ${on_string}" />
-		<data name="unclassified_seqs_fq" format_source="input1" label="${tool.name}-unclassified sequences from ${on_string}" /> -->
-	</outputs>
-
-	<help>
-	This tool wraps the kraken taxonomic sequence classifier.
-	</help>
-</tool>
--- a/kraken_combine.py	Mon May 18 15:29:05 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,157 +0,0 @@
-#!/usr/bin/env python
-import sys
-import argparse
-
-def anabl_getContigsFromFASTQ(fn):
-    with open(fn) as fi:
-        for head in fi:
-            try:
-                seq, sep, qual = map(lambda x:x.strip(), [fi.next() for i in xrange(3)])
-                yield head[1:], seq, qual
-            except:
-                    break
-    pass
-
-def anabl_getContigsFromFASTA(fn):
-    with open(fn) as fi:
-            for head in fi:
-                try:
-                    yield head[1:], fi.next().strip()
-                except:
-                    break
-    pass
-
-def toFASTQ(fields):
-    return '@%s\n%s\n+\n%s\n' % fields
-def toFASTA(fields):
-    return '>%s\n%s\n' % fields
-def getFASTQID(head):
-    return head.strip().split()[0].strip('@').strip('/1')
-def getFASTAID(head):
-    return head.strip().split()[0].strip('>').strip('/1')
-
-
-
-def main(argv):
-    DEFAULT_KRAKEN_DB= ''
-    parser = argparse.ArgumentParser(description='')
-    parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db.')  # we don't need taxonomy here
-    parser.add_argument('--in1', default='', help='Forward read file from metagenomic sample.')
-    parser.add_argument('--in2', default='', help='Reverse read file from metagenomic sample.')
-    parser.add_argument('--kraken-results1', help='File containing kraken classification for set1.')
-    parser.add_argument('--kraken-results2', help='File containing kraken classification for set2.')
-    parser.add_argument('--input-format', default='fa', help='fa (FASTA) or fq (FASTQ)')
-    parser.add_argument('--set1-output-left', type=str)
-    parser.add_argument('--set1-output-right', type=str)
-    parser.add_argument('--set2-output-left', type=str)
-    parser.add_argument('--set2-output-right', type=str)
-    parser.add_argument('--unclassified-output-left', type=str)
-    parser.add_argument('--unclassified-output-right', type=str)
-    parser.add_argument('--intersection-output-left', type=str)
-    parser.add_argument('--intersection-output-right', type=str)
-    args = parser.parse_args()
-
-    if args.input_format == 'fa':
-        getContigs = anabl_getContigsFromFASTA
-        writeFXRecord = toFASTA
-        getFXID = getFASTAID
-    elif args.input_format == 'fq':
-        getContigs = anabl_getContigsFromFASTQ
-        writeFXRecord = toFASTQ
-        getFXID = getFASTQID
-    else:
-        sys.stderr.write('Error: Unknown input format (%s).\n' % args.input_format)
-        sys.exit()
-
-    try:
-        set1 = [line.strip().split()[:2] for line in open(args.kraken_results1)]
-    except:
-        sys.stderr.write('Error: Set1 is missing, please specify --kraken_results1 parameter.\n')
-        sys.exit()
-
-    nReads = len(set1)
-    set1 = set(map(lambda x:x[1], filter(lambda x:x[0].startswith('C'), set1)))
-
-    try:
-        set2 = set([line.strip().split()[1] for line in open(args.kraken_results2) if line.startswith('C')])
-    except:
-        sys.stderr.write('Error: Set2 is missing, please specify --kraken_results2 parameter.\n')
-        sys.exit()
-
-    try:
-        set1_fwd = open(args.set1_output_left, 'wb')
-        set2_fwd = open(args.set2_output_left, 'wb')
-        noclass_fwd = open(args.unclassified_output_left, 'wb')
-        undecided_fwd = open(args.intersection_output_left, 'wb')
-    except:
-        sys.stderr.write('Error: Cannot open fwd outputfile(s).\n')
-        sys.exit()
-
-    try:
-        fwd = getContigs(args.in1)
-    except:
-        sys.stderr.write('Error: Cannot open file %s.\n' % args.in1)
-        sys.exit()
-
-    rev = None
-    if args.in2:
-        try:
-            rev = getContigs(args.in2)
-        except:
-            sys.stderr.write('Error: Cannot open file %s.\n' % args.in2)
-            sys.exit()
-        try:
-            set1_rev = open(args.set1_output_right, 'wb')
-            set2_rev = open(args.set2_output_right, 'wb')
-            noclass_rev = open(args.unclassified_output_right, 'wb')
-            undecided_rev = open(args.intersection_output_right, 'wb')
-        except:
-            sys.stderr.write('Error: Cannot open rev outputfile(s).\n')
-            sys.exit()
-
-
-    for i in xrange(nReads):
-        try:
-            r1 = fwd.next()
-        except:
-            break
-        r2 = None
-        if rev is not None:
-            try:
-                r2 = rev.next()
-            except:
-                break
-
-        id_ = getFXID(r1[0])
-
-        if id_ in set1:
-            if id_ not in set2:
-                set1_fwd.write(writeFXRecord(r1))
-                if set1_rev:
-                    set1_rev.write(writeFXRecord(r2))
-            else:
-                undecided_fwd.write(writeFXRecord(r1))
-                if undecided_rev:
-                    undecided_rev.write(writeFXRecord(r2))
-
-        elif id_ in set2:
-            set2_fwd.write(writeFXRecord(r1))
-            if set2_rev:
-                set2_rev.write(writeFXRecord(r2))
-        else:
-            noclass_fwd.write(writeFXRecord(r1))
-            if noclass_rev:
-                noclass_rev.write(writeFXRecord(r2))
-        pass
-
-    set1_fwd.close()
-    set1_rev.close()
-    set2_fwd.close()
-    set2_rev.close()
-    noclass_fwd.close()
-    noclass_rev.close()
-    undecided_fwd.close()
-    undecided_rev.close()
-    pass
-
-if __name__ == '__main__': main(sys.argv)
--- a/kraken_combine.xml	Mon May 18 15:29:05 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,83 +0,0 @@
-<tool id="kraken_combine" name="kraken_combine">
-	<description>extract unique and common sequences from two kraken runs</description>
-	<requirements>
-		<requirement type="package" version="2.7.4">python</requirement>
-	</requirements>
-	<command interpreter="python">kraken_combine.py
-		--in1="${dataFormat.input1}"
-		--set1-output-left=${set1OutputLeft}
-		--set2-output-left=${set2OutputLeft}
-		--unclassified-output-left=${unclassifiedOutputLeft}
-		--intersection-output-left=${intersectionOutputLeft}
-		--kraken-results1="${classificationSet1}"
-		--kraken-results2="${classificationSet2}"
-
-		#if $dataFormat.inputFormat == "pairedFASTQ" or $dataFormat.inputFormat == "pairedFASTA":
-			--in2="${dataFormat.input2}"
-			--set1-output-right=${set1OutputRight}
-			--set2-output-right=${set2OutputRight}
-			--unclassified-output-right=${unclassifiedOutputRight}
-			--intersection-output-right=${intersectionOutputRight}
-		#end if
-
-		#if $dataFormat.inputFormat == "singleFASTQ" or $dataFormat.inputFormat == "pairedFASTQ":
-		  --input-format="fq"
-		#else:
-		  --input-format="fa"
-		#end if
-
-
-
-	</command>
-	<inputs>
-		<param name="classificationSet1" type="data" format="tabular" label="Kraken classification output set1" />
-		<param name="classificationSet2" type="data" format="tabular" label="Kraken classification output set2" />
-
-		<conditional name="dataFormat">
-			<param name="inputFormat" type="select" label="Please select input file type and library type.">
-				<option value="singleFASTQ">Single-end FASTQ</option>
-				<option value="pairedFASTQ">Paired-end FASTQ</option>
-				<option value="singleFASTA">Single-end FASTA</option>
-				<option value="pairedFASTA">Paired-end FASTA</option>
-			</param>
-			<when value="singleFASTQ">
-				<param name="input1" type="data" format="fastq,fq.gz,fq" label="FASTQ file" />
-			</when>
-			<when value="pairedFASTQ">
-				<param name="input1" type="data" format="fastq,fq.gz,fq" label="Forward/Left FASTQ file" />
-				<param name="input2" type="data" format="fastq,fq.gz,fq" label="Reverse/Right FASTQ file" />
-			</when>
-			<when value="singleFASTA">
-				<param name="input1" type="data" format="fasta,fasta.gz,fasta.bz2,fa.gz" label="FASTA file" />
-			</when>
-			<when value="pairedFASTA">
-				<param name="input1" type="data" format="fasta,fasta.gz,fasta.bz2,fa.gz" label="Forward/Left FASTA file" />
-				<param name="input2" type="data" format="fasta,fasta.gz,fasta.bz2,fa.gz" label="Reverse/Right FASTA file" />
-			</when>
-		</conditional>
-	</inputs>
-
-	<outputs>
-		<data format="input1" name="set1OutputLeft" label="Taxonomy1-unique sequences (R1) of ${on_string}" />
-		<data format="input1" name="set2OutputLeft" label="Taxonomy2-unique sequences (R1) of ${on_string}" />
-		<data format="input1" name="intersectionOutputLeft" label="Common sequences (R1) of ${on_string}" />
-		<data format="input1" name="unclassifiedOutputLeft" label="Unclassified sequences (R1) of ${on_string}" />
-
-		<data format="input2" name="set1OutputRight" label="Taxonomy1-unique sequences (R2) of ${on_string}">
-			<filter>inputFormat == "pairedFASTQ" or inputFormat == "pairedFASTA"</filter>
-		</data>
-		<data format="input2" name="set2OutputRight" label="Taxonomy2-unique sequences (R2) of ${on_string}">
-			<filter>inputFormat == "pairedFASTQ" or inputFormat == "pairedFASTA"</filter>
-		</data>
-		<data format="input2" name="intersectionOutputRight" label="Common sequences (R2) of ${on_string}">
-			<filter>inputFormat == "pairedFASTQ" or inputFormat == "pairedFASTA"</filter>
-		</data>
-		<data format="input2" name="unclassifiedOutputRight" label="Unclassified sequences (R2) of ${on_string}">
-			<filter>inputFormat == "pairedFASTQ" or inputFormat == "pairedFASTA"</filter>
-		</data>
-	</outputs>
-
-	<help>
-	This tool compares the classification results on a dataset using two different kraken-databases.
-	</help>
-</tool>
--- a/kraken_count.py	Mon May 18 15:29:05 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,27 +0,0 @@
-#!/usr/bin/env python
-
-import os
-import sys
-import csv
-
-from collections import Counter
-
-from kraken_visualize import readTaxonomyNames, readTaxonomyNodes, getDescendents
-
-
-taxons = readTaxonomyNames(os.path.join(sys.argv[2], 'names.dmp'))
-
-taxID = int(sys.argv[3])
-validTaxons = getDescendents(taxID, readTaxonomyNodes(os.path.join(sys.argv[2], 'nodes.dmp'))[1])
-
-c = Counter([int(row[2])
-	         for row in csv.reader(open(sys.argv[1]), delimiter='\t', quotechar='"')])
-
-N = float(sum(c.values()))
-ct = 0
-for k in sorted(c, key=lambda x:c[x], reverse=True):
-	if k in validTaxons:
-		print k, taxons.get(k), c[k], '%.10f' % (c[k]/N)
-		ct += c[k]
-	# print k, taxons.get(k, 'N/A'), c[k], 'VALID' if k in validTaxons else ''
-print ct, ct/N
--- a/kraken_filter.py	Mon May 18 15:29:05 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,254 +0,0 @@
-#!/usr/bin/env python
-
-import os
-import sys
-import argparse
-import subprocess
-import struct
-import shutil
-import tempfile
-
-from checkFileFormat import verifyFileFormat
-
-DEFAULT_KRAKEN_DB='/tsl/data/krakendb/ktest/db'
-logfile = None
-
-
-def getDescendents(taxID, tree):
-    descendents = set([taxID])
-    queue = [taxID]
-    while queue:
-        node = queue.pop()
-
-        children = tree.get(node, set())
-        if children:
-            descendents = descendents.union(children)
-            queue.extend(children)
-        pass
-    return descendents
-
-def anabl_getSeqsFromFastX(fn, X=2):
-    it = open(fn)
-    for head in it:
-        try:
-            yield (head.strip(), tuple(map(lambda x:x.strip(),
-                                       ([it.next() for i in xrange(X - 1)]))))
-        except:
-            break
-        pass
-    it.close()
-    pass
-
-def readTaxonomy(db):
-    nodeNames = []
-    for line in open(os.path.join(db, 'taxonomy', 'names.dmp')):
-        line = line.strip().strip('|').strip()
-        if not line: break
-        line = line.split('\t|\t')
-
-        if line[3].strip() == 'scientific name':
-            nodeNames.append((int(line[0].strip()), line[1].strip()))
-        # break
-        pass
-
-    nodeRanks = []
-    nodeChildren = {}
-    for line in open(os.path.join(db, 'taxonomy', 'nodes.dmp')):
-        line = line.strip().strip('|').strip()
-        if not line: break
-        line = map(lambda x:x.strip(), line.split('\t|\t'))
-        line[:2] = map(int, line[:2])
-        if line[0] == 1:
-            line[1] = 0
-        try:
-            nodeChildren[line[1]].add(line[0])
-        except:
-            nodeChildren[line[1]] = set([line[0]])
-        nodeRanks.append((line[0], line[2]))
-        # break
-
-    return dict(nodeNames), dict(nodeRanks), nodeChildren
-
-def getFastqIdentifier(string):
-    if string.endswith('/1') or string.endswith('/2'):
-        return string[1:-2]
-    else:
-        return string.split()[0][1:]
-
-def runFilter(db, taxID, inputClassification, inputR1, outputR1, inputR2=None, outputR2=None, fastx=4, debug=False):
-    global logfile
-    logfile.write('Reading taxonomy...\n')
-    logfile.flush()
-    taxonomyInfo = readTaxonomy(db)
-    keepSequences = set()
-    logfile.write('Traversing requested taxonomy branch...\n')
-    logfile.flush()
-    validIDs = getDescendents(taxID, taxonomyInfo[2])
-    logfile.write('Family tree of %s has %i descendents.\n' % (str(taxID), len(validIDs)))
-    logfile.flush()
-    if debug:
-        for vid in validIDs:
-            logfile.write('Added %s to validIDs.\n' % vid)
-            logfile.flush()
-
-
-    logfile.write('Filtering sequences...\n')
-    logfile.flush()
-    nseqs = 0
-    for line in open(inputClassification):
-        nseqs += 1
-        line = line.strip().split()
-        takeUnclassified = taxID == 0 and line[0] == 'U'
-        takeClassified = line[0] == 'C' and int(line[2]) in validIDs
-        if takeUnclassified or takeClassified:
-            keepSequences.add(line[1].strip())
-            if debug:
-                logfile.write('Added %s to keepSequences.\n' % line[1].strip())
-                logfile.flush()
-    logfile.write('Keeping %i of %i sequences (%.1f).\n' % (len(keepSequences), nseqs, float(len(keepSequences))/nseqs))
-    logfile.flush()
-    logfile.write('Writing filtered sequence sets...\n')
-    logfile.flush()
-    fwdOut = open(outputR1, 'wb')
-    fwdGen = anabl_getSeqsFromFastX(inputR1, X=fastx)
-
-    revOut, revGen = None, None
-    revSid, revSeq = None, None
-    if outputR2 is not None and inputR2 is not None:
-        revOut = open(outputR2, 'wb')
-        revGen = anabl_getSeqsFromFastX(inputR2, X=fastx)
-
-    fqid1, fqid2 = None, None
-    while True:
-        try:
-            fwdSid, fwdSeq = fwdGen.next()
-            fqid1 = getFastqIdentifier(fwdSid)
-            if revGen is not None:
-                revSid, revSeq = revGen.next()
-                fqid2 = getFastqIdentifier(revSid)
-        except:
-            break
-        if fqid1 != fqid2 and fqid2 is not None:
-            sys.stderr.write('Error: fqid-mismatch %s %s.\n' % (fqid1, fqid2))
-            sys.exit(1)
-        if fqid1 in keepSequences:
-            fwdOut.write(('%s\n' * fastx) % ((fwdSid,) + fwdSeq))
-            if revOut is not None:
-                revOut.write(('%s\n' * fastx) % ((revSid,) + revSeq))
-        else:
-            # sys.stdout.write('%s is not in keepSequences\n' % fqid1)
-            pass
-    fwdOut.close()
-    if revOut is not None:
-        revOut.close()
-    pass
-
-
-def main(argv):
-
-    descr = ''
-    parser = argparse.ArgumentParser(description=descr)
-    parser.add_argument('--dbtype', default='builtinDB', help='Is database built-in or supplied externally?')
-    parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db')
-    parser.add_argument('--in1', help='The r1-file (single-end reads or left paired-end reads).')
-    parser.add_argument('--in2', help='The r2-file (right paired-end reads)')
-    parser.add_argument('--taxid', default=1, help='The root taxon id of the requested branch', type=int)
-    parser.add_argument('--kraken-results', type=str, help='A file containing kraken classification results for the input sequences.')
-    parser.add_argument('--input-format', help='Input sequences stored in fa or fq file(s).', default='fq')
-
-    parser.add_argument('--out1', type=str, help='The r1-output file.')
-    parser.add_argument('--out2', type=str, help='The r2-output file.')
-    parser.add_argument('--debug', action='store_true')
-    parser.add_argument('--logfile', type=str, help='A logfile.', default='kraken_filter.log')
-    args = parser.parse_args()
-
-    kraken_params = []
-    kraken_input = []
-
-    global logfile
-    logfile = open(args.logfile, 'wb')
-    if 'db' not in args or not os.path.exists(args.db):
-        sys.stderr.write('Error: database is missing.\n')
-        sys.exit(1)
-    kraken_params.extend(['--db', args.db])
-    # check whether input file(s) exist(s)
-
-    if 'kraken_results' in args and os.path.exists(args.kraken_results):
-        kraken_input.append(args.kraken_results)
-    else:
-        sys.stderr.write('Error: kraken-classification is missing.\n')
-        sys.exit(1)
-        pass
-
-    if not ('in1' in args and os.path.exists(args.in1)):
-        sys.stderr.write('Error: left/single input file (%s) is missing.\n' % args.in1)
-        sys.exit(1)
-    if not verifyFileFormat(args.in1, args.input_format):
-        sys.stderr.write('Error: fwd/single input file has the wrong format assigned.\n')
-        sys.exit(1)
-
-    kraken_input.append(args.in1)
-    if 'in2' in args:
-        if args.in2 is not None and not os.path.exists(args.in2):
-            sys.stderr.write('Error: right input file (%s) is missing.\n' % args.in2)
-            sys.exit(1)
-        elif args.in2 is not None and not verifyFileFormat(args.in2, args.input_format):
-            sys.stderr.write('Error: rev input file has the wrong format assigned.\n')
-            sys.exit(1)
-        elif args.in2 is not None:
-            kraken_params.append('--paired')
-            kraken_input.append(args.in2)
-        else:
-            pass
-    else:
-        pass
-
-    if 'out2' in args and 'in2' in args:
-        input2, output2 = args.in2, args.out2
-    else:
-        input2, output2 = None, None
-
-    if args.taxid < 0:
-        sys.stderr.write('Error: invalid taxon id %i\n' % args.taxid)
-        sys.exit(1)
-    kraken_params.extend(['--taxid', args.taxid])
-
-    if args.input_format == 'fq':
-        kraken_params.append('--fastq-input')
-        fastx = 4
-    else:
-        kraken_params.append('--fasta-input')
-        fastx = 2
-
-
-
-    # firstChar = open(args.in1).read(1)
-    # if firstChar == '>':
-    #    kraken_params.append('--fasta-input')
-    #    fastx = 2
-    # elif firstChar == '@':
-    #    kraken_params.append('--fastq-input')
-    #    fastx = 4
-    #else:
-    #    """ This will currently disable working with gzipped/bzipped files. """
-    #    sys.stderr.write('Error: Input file starts with unknown symbol %s.\n' % firstChar)
-    #    sys.exit(1)
-    #    pass
-
-    #open(args.kraken_filtered_r1, 'wb').write('\n'.join(map(str, kraken_params + kraken_input)))
-
-    runFilter(args.db, int(args.taxid), args.kraken_results,
-              args.in1, args.out1,
-              inputR2=input2, outputR2=output2, fastx=fastx,
-              debug=args.debug)
-
-    logfile.close()
-
-    pass
-
-
-
-
-# main(sys.argv[1:])
-
-if __name__ == '__main__': main(sys.argv[1:])
--- a/kraken_filter.xml	Mon May 18 15:29:05 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,123 +0,0 @@
-<tool id="kraken_filter" name="kraken_filter">
-	<description>kraken result filter</description>
-	<requirements>
-	  <requirement type="package" version="2.7.4">python</requirement>
-	</requirements>
-	<command interpreter="python">kraken_filter.py
-		--dbtype="${databaseChoice.database}"
-		#if $databaseChoice.database == "builtinDB":
-		  --db="${databaseChoice.builtinDatabases.fields.path}"
-		#else:
-		  --db="NOT_SUPPORTED_YET"
-		#end if
-
-		--in1="${dataFormat.input1}"
-		--out1="${kraken_filtered_r1}"
-
-		#if $dataFormat.inputFormat == "pairedFastQ" or $dataFormat.inputFormat == "pairedFastA":
-		  --in2="${dataFormat.input2}"
-		  --out2="${kraken_filtered_r2}"
-		#end if
-
-		#if $dataFormat.inputFormat == "singleFastQ" or $dataFormat.inputFormat == "pairedFastQ":
-		  --input-format="fq"
-		#else:
-		  --input-format="fa"
-		#end if
-
-		#if $whichFilter.whichFilterp == "filterClassified":
-		  --taxid="${whichFilter.taxonID}"
-		#else:
-		  --taxid="0"
-		#end if
-
-                --kraken-results="${kraken_classification}"
-
-                --logfile="${kraken_filter_log}"
-	</command>
-	<inputs>
-		<conditional name="databaseChoice">
-			<param name="database" type="select" label="Do you want to use your own kraken database or use the built-in ktest database?">
-				<option value="builtinDB">Use built-in db (ktest)</option>
-				<option value="myownDB">Use a db from my history</option>
-			</param>
-			<when value="builtinDB">
-				<param name="builtinDatabases" type="select" label="Select a built-in kraken database">
-					<options from_data_table="krakenDBs">
-						<filter type="sort_by" column="2" />
-						<validator type="no_options" message="No databases are available" />
-					</options>
-				</param>
-			</when>
-			<when value="myownDB">
-				<param name="myownDB" type="data" format="fasta" metadata_name="dbkey" label="Select database" />
-			</when>
-		</conditional>
-
-                <conditional name="whichFilter">
-			<param name="whichFilterp" type="select" label="Filter classified or unclassified reads?">
-				<option value="filterClassified">Filter classified reads</option>
-				<option value="filterUnclassified">Filter unclassified reads</option>
-			</param>
-			<when value="filterClassified">
-				<param name="taxonID" type="integer" label="Taxonomy ID of root taxon (default:1 - 'all')" value="1" default="1" />
-			</when>
-                </conditional>
-
-		<param name = "kraken_classification" type="data" format="tabular" label="Kraken classification output" />
-<!--
-		<conditional name="libraryType">
-			<param name="libtype" type="select" label="Is this library single- or paired-end?">
-				<option value="single">Single-end</option>
-				<option value="paired">Paired-end</option>
-			</param>
-			<when value="single">
-				<param name="input1" type="data" format="fasta,fastq,fasta.gz,fasta.bz2,fa.gz,fq.gz,fa,fq" label="Fasta/Fastq file" />
-			</when>
-			<when value="paired">
-				<param name="input1" type="data" format="fasta,fastq,fasta.gz,fasta.bz2,fa.gz,fq.gz,fa,fq" label="Left Fasta/Fastq file" />
-				<param name="input2" type="data" format="fasta,fastq,fasta.gz,fasta.bz2,fa.gz,fq.gz,fa,fq" label="Right Fasta/Fastq file" />
-			</when>
-		</conditional>
--->
-<!-- new -->
-
-		<conditional name="dataFormat">
-			<param name="inputFormat" type="select" label="Please select input file type and library type.">
-				<option value="singleFastQ">Single-end FASTQ</option>
-				<option value="pairedFastQ">Paired-end FASTQ</option>
-				<option value="singleFastA">Single-end FASTA</option>
-				<option value="pairedFastA">Paired-end FASTA</option>
-			</param>
-			<when value="singleFastQ">
-				<param name="input1" type="data" format="fastq,fq.gz,fq" label="FASTQ file" />
-			</when>
-			<when value="pairedFastQ">
-				<param name="input1" type="data" format="fastq,fq.gz,fq" label="Forward/Left FASTQ file" />
-				<param name="input2" type="data" format="fastq,fq.gz,fq" label="Reverse/Right FASTQ file" />
-			</when>
-			<when value="singleFastA">
-				<param name="input1" type="data" format="fasta,fasta.gz,fasta.bz2,fa.gz" label="FASTA file" />
-			</when>
-			<when value="pairedFastA">
-				<param name="input1" type="data" format="fasta,fasta.gz,fasta.bz2,fa.gz" label="Forward/Left FASTA file" />
-				<param name="input2" type="data" format="fasta,fasta.gz,fasta.bz2,fa.gz" label="Reverse/Right FASTA file" />
-			</when>
-		</conditional>
-
-<!-- new -->
-
-	</inputs>
-	<outputs>
-                <data format="input1" name="kraken_filtered_r1" label="${tool.name} filtered fwd-reads of ${on_string}" />
-                <data format="input2" name="kraken_filtered_r2" label="${tool.name} filtered rev-reads of ${on_string}">
-                	<filter>inputFormat == "pairedFastQ" or inputFormat == "pairedFastA</filter>
-
-                </data>
-                <data format="txt" name="kraken_filter_log" label="${tool.name} logfile (${on_string})" />
-	</outputs>
-
-	<help>
-	This tool is a downstream filter for kraken-classified sequences.
-	</help>
-</tool>
--- a/kraken_summarize.py	Mon May 18 15:29:05 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,101 +0,0 @@
-#!/usr/bin/env python
-import sys
-import subprocess
-import argparse
-
-from collections import Counter
-
-from Bio import Entrez
-
-
-
-# U	HWI-ST933:109:C2A2NACXX:5:1101:1311:2013	0	50	Q:0
-def fetchTaxonomyData(ids, email='christian.schudoma@tsl.ac.uk'):
-    Entrez.email = email
-    handle = Entrez.efetch(db='Taxonomy', id=','.join(ids), retmode='xml')
-    records = Entrez.read(handle)
-    return records
-
-def writeKronaInput(fi, taxInfo, unclassified=0):
-    if unclassified:
-        fi.write('%i\tUnclassified\n' % unclassified)
-    for tid in sorted(taxInfo, key=lambda x:taxInfo[x][0]['Lineage']):
-        fi.write('%i\t%s\n' % (taxInfo[tid][1], '; '.join([taxInfo[tid][0]['Lineage'].strip(), taxInfoDict[tid][0]['ScientificName']]).replace('; ', '\t').strip('\t')))
-    pass
-
-def writeOutput(out, taxInfoDict, c):
-    for tid in sorted(taxInfoDict, reverse=True, key=lambda x:taxInfoDict[x][1]):
-        data = [tid, taxInfoDict[tid][1], taxInfoDict[tid][0]['TaxId'], taxInfoDict[tid][0]['Lineage'], taxInfoDict[tid][0]['ScientificName']]
-        out.write('\t'.join(data) + '\n')
-    data = (sum(c.values()) - c['0'], sum(c.values()), (sum(c.values()) - c['0']) / float(sum(c.values())) * 100, c['0'])
-    out.write('%i/%i (%.5f%%) classified, %i unclassified\n' % data)
-
-
-def main(argv):
-
-    parser = argparse.ArgumentParser(description='')
-    parser.add_argument('--krona-output', type=str, default='')
-    parser.add_argument('--output', type=str)
-    parser.add_argument('--call-krona', type=str)
-    parser.add_argument('--include-unclassified')
-    parser.add_argument('kraken_summary_tsv', type=str)
-    args = parser.parse_args()
-
-    c = Counter(line.strip().split()[2] for line in open(sys.argv[1]))
-    taxids = sorted(c.keys(), key=lambda x:c[x], reverse=True)
-    taxData = fetchTaxonomyData(taxids)
-    taxInfoDict = {tinfo['TaxId']: [tinfo, c[tinfo['TaxId']]] for tinfo in taxData}
-
-    kr_out = None
-    if 'krona_output' in args:
-        with open(args.krona_output, 'wb') as kr_out:
-            if 'include_unclassified' in args:
-                writeKronaInput(kr_out, taxInfoDict, unclassified=c['0'])
-            else:
-                writeKronaInput(kr_out, taxInfoDict)
-
-        if 'call_krona' in args:
-            p = subprocess.Popen('source krona_tools-2.4; ktImportText -o %s %s' % (args.call_krona, args.krona_output), shell=True, stdout=subprocess.PIPE)
-            p.communicate()
-
-    if 'output' in args:
-        if args.output == '-':
-            writeOutput(sys.stdout, taxInfoDict, c)
-        else:
-            with open(args.output, 'wb') as out:
-                writeOutput(out, taxInfoDict, c)
-
-
-
-
-
-
-
-
-"""
-2	Fats	Saturated fat
-3	Fats	Unsaturated fat	Monounsaturated fat
-3	Fats	Unsaturated fat	Polyunsaturated fat
-13	Carbohydrates	Sugars
-4	Carbohydrates	Dietary fiber
-21	Carbohydrates
-5	Protein
-4
-"""
-
-
-
-"""
-handle = Entrez.efetch(db="Taxonomy", id="1323524", retmode="xml")
-records  = Entrez.read(handle)
-records[0]["Lineage"]
-'Viruses; dsRNA viruses; Partitiviridae; Betapartitivirus'
-records[0]["LineageEx"]
-[{u'ScientificName': 'Viruses', u'TaxId': '10239', u'Rank': 'superkingdom'}, {u'ScientificName': 'dsRNA viruses', u'TaxId': '35325', u'Rank': 'no rank'}, {u'ScientificName': 'Partitiviridae', u'TaxId': '11012', u'Rank': 'family'}, {u'ScientificName': 'Betapartitivirus', u'TaxId': '1511809', u'Rank': 'genus'}]
-"""
-
-"""
-{u'Lineage': 'Viruses; dsDNA viruses, no RNA stage; Iridoviridae; unclassified Iridoviridae', u'Division': 'Viruses', u'ParentTaxId': '180169', u'PubDate': '2014/03/16 07:01:27', u'LineageEx': [{u'ScientificName': 'Viruses', u'TaxId': '10239', u'Rank': 'superkingdom'}, {u'ScientificName': 'dsDNA viruses, no RNA stage', u'TaxId': '35237', u'Rank': 'no rank'}, {u'ScientificName': 'Iridoviridae', u'TaxId': '10486', u'Rank': 'family'}, {u'ScientificName': 'unclassified Iridoviridae', u'TaxId': '180169', u'Rank': 'no rank'}], u'CreateDate': '2014/02/24 15:56:28', u'TaxId': '1465751', u'Rank': 'species', u'GeneticCode': {u'GCId': '1', u'GCName': 'Standard'},
-u'ScientificName': 'Anopheles minimus irodovirus', u'MitoGeneticCode': {u'MGCId': '0', u'MGCName': 'Unspecified'}, u'UpdateDate': '2014/02/24 15:56:29'}
-
-"""
--- a/kraken_summarize.xml	Mon May 18 15:29:05 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-<tool id="kraken_summarize" name="kraken_summarize">
-	<description></description>
-	<requirements>
-	  <requirement type="package" version="2.7.4">python</requirement>
-	</requirements>
-	<command interpreter="python">kraken_summarize.py
-		  --krona-output=$krona_output
-			--output=$tax_summary
-			--call-krona=$krona_html
-			$kraken_summary_tsv
-	</command>
-
-
-	<inputs>
-		<param format="tabular" name="kraken_summary_tsv" type="data" label="Kraken summary table"/>
-		</inputs>
-	<outputs>
-		<data format="html" name="krona_html" label="Krona visualization of ${tool.name}" />
-		<data format="tabular" name="krona_output" label="Krona input" />
-		<data format="tabular" name="tax_summary" label="Taxonomy class summary" />
-	</outputs>
-
-	<help>
-	Krona visualisation and classification summary of Kraken runs.
-	</help>
-</tool>
--- a/kraken_visualize.py	Mon May 18 15:29:05 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,48 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-
-def readTaxonomyNames(names_dmp):
-    nodeNames = []
-    for line in open(names_dmp):
-        line = line.strip().strip('|').strip()
-        if not line: break
-        line = line.split('\t|\t')
-        if line[3].strip() == 'scientific name':
-            nodeNames.append((int(line[0].strip()), line[1].strip()))
-        pass
-    return dict(nodeNames)
-
-def readTaxonomyNodes(nodes_dmp):
-    nodeRanks = []
-    nodeChildren = {}
-    nodeParents = {}
-    for line in open(nodes_dmp):
-        line = line.strip().strip('|').strip()
-        if not line: break
-        line = map(lambda x:x.strip(), line.split('\t|\t'))
-        line[:2] = map(int, line[:2])
-        if line[0] == 1:
-            line[1] = 1
-
-        nodeParents[line[0]] = line[1]
-        try:
-            nodeChildren[line[1]].add(line[0])
-        except:
-            nodeChildren[line[1]] = set([line[0]])
-        nodeRanks.append((line[0], line[2]))
-
-    return dict(nodeRanks), nodeChildren, nodeParents
-
-def getDescendents(taxID, tree):
-    descendents = set([taxID])
-    queue = [taxID]
-    while queue:
-        node = queue.pop()
-
-        children = tree.get(node, set())
-        if children:
-            descendents = descendents.union(children)
-            queue.extend(children)
-        pass
-    return descendents
\ No newline at end of file
--- a/tool_dependencies.xml	Mon May 18 15:29:05 2015 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,9 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="python" version="2.7">
-        <repository changeset_revision="25b20b323706" name="package_python_2_7" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-    </package>
-    <package name="biopython" version="1.65">
-        <repository changeset_revision="f8d72690eeae" name="package_biopython_1_65" owner="biopython" toolshed="https://testtoolshed.g2.bx.psu.edu" />
-    </package>
-</tool_dependency>