# HG changeset patch
# User cschu
# Date 1431977345 14400
# Node ID 0916697409eaf837b3fa89319cb5e2ea10d42002
Uploaded
diff -r 000000000000 -r 0916697409ea kraken.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kraken.py	Mon May 18 15:29:05 2015 -0400
@@ -0,0 +1,150 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import argparse
+import subprocess
+import struct
+import shutil
+import tempfile
+
+# from collections import Counter
+
+from checkFileFormat import verifyFileFormat 
+DEFAULT_KRAKEN_DB = '/tsl/data/krakendb/ktest/db1'
+
+def main(argv):
+    
+    descr = ''
+    parser = argparse.ArgumentParser(description=descr)
+    parser.add_argument('--dbtype', default='builtinDB', help='Is database built-in or supplied externally?')
+    parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db')
+    parser.add_argument('--in1', help='Single-end reads or left paired-end reads')
+    parser.add_argument('--in2', help='Right paired-end reads')
+    parser.add_argument('--quick', action='store_true', help='Use quick mode?')
+    parser.add_argument('--min-hits', default=1, help='Minimum number of hits required.')
+    parser.add_argument('--input-format', help='Input sequences stored in fa or fq file(s).', default='fq')
+    parser.add_argument('kraken_summary_tsv', type=str, help='The output file.')
+    # parser.add_argument('classified_seqs_fq', type=str, help='The output file.')
+    # parser.add_argument('unclassified_seqs_fq', type=str, help='The output file.')
+    args = parser.parse_args()
+
+    #  kraken --preload --db /tsl/scratch/macleand/ktest/db --threads 12 --quick --classified-out classified_seqs.fq --unclassified-out unclassified.fq --fastq-input --min-hits 1 --output classification.txt left_reads.fq right_reads.fq
+
+    kraken_params = ['--preload', '--threads', '8', 
+                     '--unclassified-out', args.unclassified_seqs_fq,
+                     '--output', args.kraken_summary_tsv]
+    #                '--classified-out', args.classified_seqs_fq,
+    kraken_input = []
+
+    if 'db' not in args or not os.path.exists(args.db):
+        sys.stderr.write('Error: database is missing.\n')
+        sys.exit(1)
+    kraken_params.extend(['--db', args.db])
+    # check whether input file(s) exist(s)
+    if not ('in1' in args and os.path.exists(args.in1)):
+        sys.stderr.write('Error: fwd/single input file (%s) is missing.\n' % args.in1)
+        sys.exit(1)
+    if not verifyFileFormat(args.in1, args.input_format):
+        fc = open(args.in1).read(1)
+        sys.stderr.write('Error: fwd/single input file has the wrong format assigned (%s). Starts with \'%s\'\n' % (args.input_format, fc))
+        sys.exit(1)
+    kraken_input.append(args.in1)
+    if 'in2' in args:
+        if args.in2 is not None and not os.path.exists(args.in2):
+            sys.stderr.write('Error: right input file (%s) is missing.\n' % args.in2)
+            sys.exit(1)
+        elif args.in2 is not None and not verifyFileFormat(args.in2, args.input_format):
+            sys.stderr.write('Error: rev input file has the wrong format assigned.\n')
+            sys.exit(1)            
+        elif args.in2 is not None:
+            kraken_params.append('--paired')
+            kraken_input.append(args.in2)
+        else:
+            pass
+    else:
+        pass
+
+    if 'quick' in args:
+        kraken_params.append('--quick')
+        if 'min_hits' in args:
+            kraken_params.extend(['--min-hits', str(args.min_hits)])
+
+    if args.input_format == 'fq': 
+        kraken_params.append('--fastq-input')
+    else:
+        kraken_params.append('--fasta-input')
+      
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+
+    # check whether file is gzipped
+    header = ''
+      
+    cmd = 'source kraken-0.10.5; '
+    cmd += ' '.join(['kraken'] + kraken_params + kraken_input) + '\n'
+    # out = open(argv[-1], 'wb').write(str(cmd) + '\n')#.write('KRAKEN OUTPUT, HAS COUNTER\n')
+    # out = open(argv[-1], 'wb').write(str(args) + '\n')#.write('KRAKEN OUTPUT, HAS COUNTER\n')
+
+    # proc = subprocess.Popen(args=cmd, shell=True, stderr=sys.stderr)
+    # returncode = proc.wait()
+
+
+    tmp_dir = tempfile.mkdtemp()
+    try:
+        tmp = tempfile.NamedTemporaryFile(dir=tmp_dir).name
+        tmp_stderr = open(tmp, 'wb')
+        proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno())
+        returncode = proc.wait()
+        tmp_stderr.close()
+        # get stderr, allowing for case where it's very large
+        tmp_stderr = open(tmp, 'rb')
+        stderr = ''
+        buffsize = 1048576
+        try:
+            while True:                    
+                stderr += tmp_stderr.read(buffsize)
+                if not stderr or len(stderr) % buffsize != 0:
+                    break
+        except OverflowError:
+            pass
+        tmp_stderr.close()
+        if returncode != 0:
+            raise Exception, stderr
+    except Exception, e:
+        # clean up temp dirs
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+        sys.stderr.write('Error running kraken: %s\n' % str(e))
+        sys.exit(1)
+
+    if os.path.exists(tmp_dir):
+        shutil.rmtree(tmp_dir)
+    """
+    open(args.kraken_summary_tsv, 'wb').write('\t'.join(list('ACGT')) + '\n')
+    open(args.classified_seqs_fq, 'wb').write(cmd + '\n')
+    open(args.unclassified_seqs_fq, 'wb').write('blruah\n')                      # check whether the database exists, if not exit with error
+    """
+    """
+    for arg in args:
+        out.write(str(arg) + '\n') 
+    out.close()
+    """
+    pass
+
+
+ 
+    
+# main(sys.argv[1:])
+
+if __name__ == '__main__': main(sys.argv[1:])
diff -r 000000000000 -r 0916697409ea kraken.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kraken.xml	Mon May 18 15:29:05 2015 -0400
@@ -0,0 +1,112 @@
+
+	taxonomic sequence classifier
+	
+	  python
+	  kraken
+	
+	kraken.py 
+		--dbtype="${databaseChoice.database}"
+		#if $databaseChoice.database == "builtinDB":
+		  --db="${databaseChoice.builtinDatabases.fields.path}"
+		#else:
+		  --db="NOT_SUPPORTED_YET"
+		#end if
+
+		--in1="${dataFormat.input1}"
+		#if $dataFormat.inputFormat == "pairedFastQ" or $dataFormat.inputFormat == "pairedFastA":
+		  --in2="${dataFormat.input2}"
+		#end if
+
+		#if $dataFormat.inputFormat == "singleFastQ" or $dataFormat.inputFormat == "pairedFastQ":
+		  --input-format="fq"
+		#else:
+		  --input-format="fa"
+		#end if
+
+                #if $quickMode.useQuickMode == "useQuick":
+                  --quick
+                  --min-hits="${quickMode.minHits.value}"
+                #end if
+
+	
+                $kraken_summary_tsv
+                
+		
+	
+		
+			
+				
+				
+			
+			
+				
+					
+						
+						
+					
+				
+			
+			
+				
+			
+		
+
+		
+			
+				
+				
+			
+			
+				
+			
+		
+
+
+		
+			 
+				
+				
+				
+				
+			
+			
+				
+			
+			
+				
+				
+			
+			
+				
+			
+			
+				
+				
+			
+		
+
+
+	
+	
+		
+		
+	
+
+	
+	This tool wraps the kraken taxonomic sequence classifier.
+	
+
diff -r 000000000000 -r 0916697409ea kraken_combine.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kraken_combine.py	Mon May 18 15:29:05 2015 -0400
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+import sys
+import argparse
+
+def anabl_getContigsFromFASTQ(fn):
+    with open(fn) as fi:
+        for head in fi:
+            try:
+                seq, sep, qual = map(lambda x:x.strip(), [fi.next() for i in xrange(3)])
+                yield head[1:], seq, qual
+            except:
+                    break
+    pass
+
+def anabl_getContigsFromFASTA(fn):
+    with open(fn) as fi:
+            for head in fi:
+                try:                
+                    yield head[1:], fi.next().strip()
+                except:
+                    break
+    pass
+
+def toFASTQ(fields):
+    return '@%s\n%s\n+\n%s\n' % fields
+def toFASTA(fields):
+    return '>%s\n%s\n' % fields
+def getFASTQID(head):
+    return head.strip().split()[0].strip('@').strip('/1')
+def getFASTAID(head):
+    return head.strip().split()[0].strip('>').strip('/1')
+
+
+
+def main(argv):
+    DEFAULT_KRAKEN_DB= ''
+    parser = argparse.ArgumentParser(description='')
+    parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db.')  # we don't need taxonomy here
+    parser.add_argument('--in1', default='', help='Forward read file from metagenomic sample.')
+    parser.add_argument('--in2', default='', help='Reverse read file from metagenomic sample.')
+    parser.add_argument('--kraken-results1', help='File containing kraken classification for set1.')
+    parser.add_argument('--kraken-results2', help='File containing kraken classification for set2.')
+    parser.add_argument('--input-format', default='fa', help='fa (FASTA) or fq (FASTQ)')
+    parser.add_argument('--set1-output-left', type=str)
+    parser.add_argument('--set1-output-right', type=str)
+    parser.add_argument('--set2-output-left', type=str)
+    parser.add_argument('--set2-output-right', type=str)
+    parser.add_argument('--unclassified-output-left', type=str)
+    parser.add_argument('--unclassified-output-right', type=str)
+    parser.add_argument('--intersection-output-left', type=str)
+    parser.add_argument('--intersection-output-right', type=str)    
+    args = parser.parse_args()
+
+    if args.input_format == 'fa':
+        getContigs = anabl_getContigsFromFASTA  
+        writeFXRecord = toFASTA
+        getFXID = getFASTAID
+    elif args.input_format == 'fq':
+        getContigs = anabl_getContigsFromFASTQ
+        writeFXRecord = toFASTQ
+        getFXID = getFASTQID
+    else:
+        sys.stderr.write('Error: Unknown input format (%s).\n' % args.input_format)
+        sys.exit()
+
+    try:
+        set1 = [line.strip().split()[:2] for line in open(args.kraken_results1)]
+    except:
+        sys.stderr.write('Error: Set1 is missing, please specify --kraken_results1 parameter.\n')
+        sys.exit()
+
+    nReads = len(set1)
+    set1 = set(map(lambda x:x[1], filter(lambda x:x[0].startswith('C'), set1)))
+
+    try:
+        set2 = set([line.strip().split()[1] for line in open(args.kraken_results2) if line.startswith('C')])
+    except:
+        sys.stderr.write('Error: Set2 is missing, please specify --kraken_results2 parameter.\n')
+        sys.exit()
+
+    try:
+        set1_fwd = open(args.set1_output_left, 'wb')        
+        set2_fwd = open(args.set2_output_left, 'wb')
+        noclass_fwd = open(args.unclassified_output_left, 'wb')
+        undecided_fwd = open(args.intersection_output_left, 'wb')        
+    except:
+        sys.stderr.write('Error: Cannot open fwd outputfile(s).\n')
+        sys.exit()
+
+    try:
+        fwd = getContigs(args.in1) 
+    except:
+        sys.stderr.write('Error: Cannot open file %s.\n' % args.in1)
+        sys.exit()
+
+    rev = None
+    if args.in2:
+        try:
+            rev = getContigs(args.in2)
+        except:
+            sys.stderr.write('Error: Cannot open file %s.\n' % args.in2)
+            sys.exit()            
+        try:
+            set1_rev = open(args.set1_output_right, 'wb')
+            set2_rev = open(args.set2_output_right, 'wb')
+            noclass_rev = open(args.unclassified_output_right, 'wb')
+            undecided_rev = open(args.intersection_output_right, 'wb')
+        except:
+            sys.stderr.write('Error: Cannot open rev outputfile(s).\n')
+            sys.exit()            
+
+
+    for i in xrange(nReads):
+        try:
+            r1 = fwd.next()
+        except:
+            break
+        r2 = None
+        if rev is not None:
+            try:
+                r2 = rev.next()
+            except:
+                break
+    
+        id_ = getFXID(r1[0])
+
+        if id_ in set1:
+            if id_ not in set2:
+                set1_fwd.write(writeFXRecord(r1))
+                if set1_rev: 
+                    set1_rev.write(writeFXRecord(r2))
+            else:
+                undecided_fwd.write(writeFXRecord(r1))
+                if undecided_rev: 
+                    undecided_rev.write(writeFXRecord(r2))
+            
+        elif id_ in set2:
+            set2_fwd.write(writeFXRecord(r1))
+            if set2_rev: 
+                set2_rev.write(writeFXRecord(r2))
+        else:
+            noclass_fwd.write(writeFXRecord(r1))
+            if noclass_rev: 
+                noclass_rev.write(writeFXRecord(r2))
+        pass
+
+    set1_fwd.close()
+    set1_rev.close()
+    set2_fwd.close()
+    set2_rev.close()
+    noclass_fwd.close()
+    noclass_rev.close()
+    undecided_fwd.close()
+    undecided_rev.close()
+    pass
+
+if __name__ == '__main__': main(sys.argv)
diff -r 000000000000 -r 0916697409ea kraken_combine.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kraken_combine.xml	Mon May 18 15:29:05 2015 -0400
@@ -0,0 +1,83 @@
+
+	extract unique and common sequences from two kraken runs
+	
+		python
+	
+	kraken_combine.py
+		--in1="${dataFormat.input1}"
+		--set1-output-left=${set1OutputLeft}
+		--set2-output-left=${set2OutputLeft}
+		--unclassified-output-left=${unclassifiedOutputLeft}
+		--intersection-output-left=${intersectionOutputLeft}
+		--kraken-results1="${classificationSet1}"
+		--kraken-results2="${classificationSet2}"
+
+		#if $dataFormat.inputFormat == "pairedFASTQ" or $dataFormat.inputFormat == "pairedFASTA":
+			--in2="${dataFormat.input2}"
+			--set1-output-right=${set1OutputRight}
+			--set2-output-right=${set2OutputRight}
+			--unclassified-output-right=${unclassifiedOutputRight}
+			--intersection-output-right=${intersectionOutputRight}
+		#end if
+
+		#if $dataFormat.inputFormat == "singleFASTQ" or $dataFormat.inputFormat == "pairedFASTQ":
+		  --input-format="fq"
+		#else:
+		  --input-format="fa"
+		#end if
+
+
+
+	
+	
+		
+		
+	
+		
+			 
+				
+				
+				
+				
+			
+			
+				
+			
+			
+				
+				
+			
+			
+				
+			
+			
+				
+				
+			
+		
+	
+
+	
+		
+		
+		
+		
+
+		
+			inputFormat == "pairedFASTQ" or inputFormat == "pairedFASTA"
+		
+		
+			inputFormat == "pairedFASTQ" or inputFormat == "pairedFASTA"
+		
+		
+			inputFormat == "pairedFASTQ" or inputFormat == "pairedFASTA"
+		
+		
+			inputFormat == "pairedFASTQ" or inputFormat == "pairedFASTA"
+		
+	
+
+	
+	This tool compares the classification results on a dataset using two different kraken-databases.
+	
+
diff -r 000000000000 -r 0916697409ea kraken_count.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kraken_count.py	Mon May 18 15:29:05 2015 -0400
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import csv
+
+from collections import Counter
+
+from kraken_visualize import readTaxonomyNames, readTaxonomyNodes, getDescendents
+
+
+taxons = readTaxonomyNames(os.path.join(sys.argv[2], 'names.dmp'))
+
+taxID = int(sys.argv[3])
+validTaxons = getDescendents(taxID, readTaxonomyNodes(os.path.join(sys.argv[2], 'nodes.dmp'))[1])
+
+c = Counter([int(row[2]) 
+	         for row in csv.reader(open(sys.argv[1]), delimiter='\t', quotechar='"')])
+
+N = float(sum(c.values()))
+ct = 0
+for k in sorted(c, key=lambda x:c[x], reverse=True):
+	if k in validTaxons:
+		print k, taxons.get(k), c[k], '%.10f' % (c[k]/N)
+		ct += c[k]
+	# print k, taxons.get(k, 'N/A'), c[k], 'VALID' if k in validTaxons else ''
+print ct, ct/N
diff -r 000000000000 -r 0916697409ea kraken_filter.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kraken_filter.py	Mon May 18 15:29:05 2015 -0400
@@ -0,0 +1,254 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import argparse
+import subprocess
+import struct
+import shutil
+import tempfile
+
+from checkFileFormat import verifyFileFormat
+
+DEFAULT_KRAKEN_DB='/tsl/data/krakendb/ktest/db'
+logfile = None
+
+
+def getDescendents(taxID, tree):
+    descendents = set([taxID])
+    queue = [taxID]
+    while queue:
+        node = queue.pop()
+
+        children = tree.get(node, set())
+        if children:
+            descendents = descendents.union(children)
+            queue.extend(children)
+        pass
+    return descendents
+
+def anabl_getSeqsFromFastX(fn, X=2):
+    it = open(fn)
+    for head in it:
+        try:
+            yield (head.strip(), tuple(map(lambda x:x.strip(),
+                                       ([it.next() for i in xrange(X - 1)]))))
+        except:
+            break
+        pass
+    it.close()
+    pass
+
+def readTaxonomy(db):
+    nodeNames = []
+    for line in open(os.path.join(db, 'taxonomy', 'names.dmp')):
+        line = line.strip().strip('|').strip()
+        if not line: break
+        line = line.split('\t|\t')
+
+        if line[3].strip() == 'scientific name':
+            nodeNames.append((int(line[0].strip()), line[1].strip()))
+        # break
+        pass
+
+    nodeRanks = []
+    nodeChildren = {}
+    for line in open(os.path.join(db, 'taxonomy', 'nodes.dmp')):
+        line = line.strip().strip('|').strip()
+        if not line: break
+        line = map(lambda x:x.strip(), line.split('\t|\t'))
+        line[:2] = map(int, line[:2])
+        if line[0] == 1:
+            line[1] = 0
+        try:
+            nodeChildren[line[1]].add(line[0])
+        except:
+            nodeChildren[line[1]] = set([line[0]])
+        nodeRanks.append((line[0], line[2]))
+        # break
+
+    return dict(nodeNames), dict(nodeRanks), nodeChildren
+
+def getFastqIdentifier(string):
+    if string.endswith('/1') or string.endswith('/2'):
+        return string[1:-2]
+    else:
+        return string.split()[0][1:]
+
+def runFilter(db, taxID, inputClassification, inputR1, outputR1, inputR2=None, outputR2=None, fastx=4, debug=False):
+    global logfile
+    logfile.write('Reading taxonomy...\n')
+    logfile.flush()
+    taxonomyInfo = readTaxonomy(db)
+    keepSequences = set()
+    logfile.write('Traversing requested taxonomy branch...\n')
+    logfile.flush()
+    validIDs = getDescendents(taxID, taxonomyInfo[2])
+    logfile.write('Family tree of %s has %i descendents.\n' % (str(taxID), len(validIDs)))
+    logfile.flush()
+    if debug:
+        for vid in validIDs:
+            logfile.write('Added %s to validIDs.\n' % vid)
+            logfile.flush()
+	
+
+    logfile.write('Filtering sequences...\n')
+    logfile.flush()
+    nseqs = 0
+    for line in open(inputClassification):
+        nseqs += 1
+        line = line.strip().split()
+        takeUnclassified = taxID == 0 and line[0] == 'U'
+        takeClassified = line[0] == 'C' and int(line[2]) in validIDs
+        if takeUnclassified or takeClassified:            
+            keepSequences.add(line[1].strip())
+            if debug:
+                logfile.write('Added %s to keepSequences.\n' % line[1].strip())
+                logfile.flush()
+    logfile.write('Keeping %i of %i sequences (%.1f).\n' % (len(keepSequences), nseqs, float(len(keepSequences))/nseqs))
+    logfile.flush()
+    logfile.write('Writing filtered sequence sets...\n')
+    logfile.flush()
+    fwdOut = open(outputR1, 'wb')
+    fwdGen = anabl_getSeqsFromFastX(inputR1, X=fastx)
+  
+    revOut, revGen = None, None
+    revSid, revSeq = None, None
+    if outputR2 is not None and inputR2 is not None:
+        revOut = open(outputR2, 'wb')
+        revGen = anabl_getSeqsFromFastX(inputR2, X=fastx)
+    
+    fqid1, fqid2 = None, None
+    while True:
+        try:
+            fwdSid, fwdSeq = fwdGen.next()
+            fqid1 = getFastqIdentifier(fwdSid)
+            if revGen is not None:
+                revSid, revSeq = revGen.next()
+                fqid2 = getFastqIdentifier(revSid)
+        except:
+            break
+        if fqid1 != fqid2 and fqid2 is not None:
+            sys.stderr.write('Error: fqid-mismatch %s %s.\n' % (fqid1, fqid2))
+            sys.exit(1)
+        if fqid1 in keepSequences:
+            fwdOut.write(('%s\n' * fastx) % ((fwdSid,) + fwdSeq))
+            if revOut is not None:
+                revOut.write(('%s\n' * fastx) % ((revSid,) + revSeq))
+        else:
+            # sys.stdout.write('%s is not in keepSequences\n' % fqid1)
+            pass
+    fwdOut.close()
+    if revOut is not None:
+        revOut.close()
+    pass
+
+
+def main(argv):
+    
+    descr = ''
+    parser = argparse.ArgumentParser(description=descr)
+    parser.add_argument('--dbtype', default='builtinDB', help='Is database built-in or supplied externally?')
+    parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db')
+    parser.add_argument('--in1', help='The r1-file (single-end reads or left paired-end reads).')
+    parser.add_argument('--in2', help='The r2-file (right paired-end reads)')
+    parser.add_argument('--taxid', default=1, help='The root taxon id of the requested branch', type=int)
+    parser.add_argument('--kraken-results', type=str, help='A file containing kraken classification results for the input sequences.')
+    parser.add_argument('--input-format', help='Input sequences stored in fa or fq file(s).', default='fq')
+
+    parser.add_argument('--out1', type=str, help='The r1-output file.')
+    parser.add_argument('--out2', type=str, help='The r2-output file.')
+    parser.add_argument('--debug', action='store_true')
+    parser.add_argument('--logfile', type=str, help='A logfile.', default='kraken_filter.log')
+    args = parser.parse_args()
+
+    kraken_params = []
+    kraken_input = []
+
+    global logfile
+    logfile = open(args.logfile, 'wb')
+    if 'db' not in args or not os.path.exists(args.db):
+        sys.stderr.write('Error: database is missing.\n')
+        sys.exit(1)
+    kraken_params.extend(['--db', args.db])
+    # check whether input file(s) exist(s)
+
+    if 'kraken_results' in args and os.path.exists(args.kraken_results):
+        kraken_input.append(args.kraken_results)
+    else:
+        sys.stderr.write('Error: kraken-classification is missing.\n')
+        sys.exit(1)
+        pass
+
+    if not ('in1' in args and os.path.exists(args.in1)):
+        sys.stderr.write('Error: left/single input file (%s) is missing.\n' % args.in1)
+        sys.exit(1)
+    if not verifyFileFormat(args.in1, args.input_format):
+        sys.stderr.write('Error: fwd/single input file has the wrong format assigned.\n')
+        sys.exit(1)
+
+    kraken_input.append(args.in1)
+    if 'in2' in args:
+        if args.in2 is not None and not os.path.exists(args.in2):
+            sys.stderr.write('Error: right input file (%s) is missing.\n' % args.in2)
+            sys.exit(1)
+        elif args.in2 is not None and not verifyFileFormat(args.in2, args.input_format):
+            sys.stderr.write('Error: rev input file has the wrong format assigned.\n')
+            sys.exit(1)            
+        elif args.in2 is not None:
+            kraken_params.append('--paired')
+            kraken_input.append(args.in2)
+        else:
+            pass
+    else:
+        pass
+    
+    if 'out2' in args and 'in2' in args:
+        input2, output2 = args.in2, args.out2
+    else:
+        input2, output2 = None, None
+
+    if args.taxid < 0:
+        sys.stderr.write('Error: invalid taxon id %i\n' % args.taxid)
+        sys.exit(1)
+    kraken_params.extend(['--taxid', args.taxid])
+
+    if args.input_format == 'fq': 
+        kraken_params.append('--fastq-input')
+        fastx = 4
+    else:
+        kraken_params.append('--fasta-input')
+        fastx = 2
+
+
+
+    # firstChar = open(args.in1).read(1)
+    # if firstChar == '>':
+    #    kraken_params.append('--fasta-input')
+    #    fastx = 2
+    # elif firstChar == '@':
+    #    kraken_params.append('--fastq-input')
+    #    fastx = 4
+    #else:
+    #    """ This will currently disable working with gzipped/bzipped files. """
+    #    sys.stderr.write('Error: Input file starts with unknown symbol %s.\n' % firstChar)
+    #    sys.exit(1)
+    #    pass
+
+    #open(args.kraken_filtered_r1, 'wb').write('\n'.join(map(str, kraken_params + kraken_input)))
+   
+    runFilter(args.db, int(args.taxid), args.kraken_results, 
+              args.in1, args.out1,
+              inputR2=input2, outputR2=output2, fastx=fastx,
+              debug=args.debug)
+
+    logfile.close()
+
+    pass
+
+
+ 
+    
+# main(sys.argv[1:])
+
+if __name__ == '__main__': main(sys.argv[1:])
diff -r 000000000000 -r 0916697409ea kraken_filter.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kraken_filter.xml	Mon May 18 15:29:05 2015 -0400
@@ -0,0 +1,123 @@
+
+	kraken result filter
+	
+	  python
+	
+	kraken_filter.py 
+		--dbtype="${databaseChoice.database}"
+		#if $databaseChoice.database == "builtinDB":
+		  --db="${databaseChoice.builtinDatabases.fields.path}"
+		#else:
+		  --db="NOT_SUPPORTED_YET"
+		#end if
+	
+		--in1="${dataFormat.input1}"
+		--out1="${kraken_filtered_r1}"        
+
+		#if $dataFormat.inputFormat == "pairedFastQ" or $dataFormat.inputFormat == "pairedFastA":
+		  --in2="${dataFormat.input2}"
+		  --out2="${kraken_filtered_r2}"
+		#end if
+
+		#if $dataFormat.inputFormat == "singleFastQ" or $dataFormat.inputFormat == "pairedFastQ":
+		  --input-format="fq"
+		#else:
+		  --input-format="fa"
+		#end if
+
+		#if $whichFilter.whichFilterp == "filterClassified":
+		  --taxid="${whichFilter.taxonID}"
+		#else:
+		  --taxid="0"
+		#end if
+
+                --kraken-results="${kraken_classification}"
+        
+                --logfile="${kraken_filter_log}"
+	
+	
+		
+			
+				
+				
+			
+			
+				
+					
+						
+						
+					
+				
+			
+			
+				
+			
+		
+
+                
+			
+				
+				
+			
+			
+								
+			
+                
+
+		
+
+
+
+		
+			 
+				
+				
+				
+				
+			
+			
+				
+			
+			
+				
+				
+			
+			
+				
+			
+			
+				
+				
+			
+		
+
+
+
+	
+	
+                
+                
+                	inputFormat == "pairedFastQ" or inputFormat == "pairedFastA
+
+                
+                
+	
+
+	
+	This tool is a downstream filter for kraken-classified sequences. 
+	
+
diff -r 000000000000 -r 0916697409ea kraken_summarize.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kraken_summarize.py	Mon May 18 15:29:05 2015 -0400
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+import sys
+import subprocess
+import argparse
+
+from collections import Counter
+
+from Bio import Entrez
+
+
+
+# U	HWI-ST933:109:C2A2NACXX:5:1101:1311:2013	0	50	Q:0
+def fetchTaxonomyData(ids, email='christian.schudoma@tsl.ac.uk'):
+    Entrez.email = email
+    handle = Entrez.efetch(db='Taxonomy', id=','.join(ids), retmode='xml')
+    records = Entrez.read(handle)
+    return records
+
+def writeKronaInput(fi, taxInfo, unclassified=0):
+    if unclassified:
+        fi.write('%i\tUnclassified\n' % unclassified)
+    for tid in sorted(taxInfo, key=lambda x:taxInfo[x][0]['Lineage']):
+        fi.write('%i\t%s\n' % (taxInfo[tid][1], '; '.join([taxInfo[tid][0]['Lineage'].strip(), taxInfoDict[tid][0]['ScientificName']]).replace('; ', '\t').strip('\t')))
+    pass
+
+def writeOutput(out, taxInfoDict, c):
+    for tid in sorted(taxInfoDict, reverse=True, key=lambda x:taxInfoDict[x][1]):
+        data = [tid, taxInfoDict[tid][1], taxInfoDict[tid][0]['TaxId'], taxInfoDict[tid][0]['Lineage'], taxInfoDict[tid][0]['ScientificName']]
+        out.write('\t'.join(data) + '\n')
+    data = (sum(c.values()) - c['0'], sum(c.values()), (sum(c.values()) - c['0']) / float(sum(c.values())) * 100, c['0'])
+    out.write('%i/%i (%.5f%%) classified, %i unclassified\n' % data)
+
+
+def main(argv):
+
+    parser = argparse.ArgumentParser(description='')
+    parser.add_argument('--krona-output', type=str, default='')
+    parser.add_argument('--output', type=str)
+    parser.add_argument('--call-krona', type=str)
+    parser.add_argument('--include-unclassified')
+    parser.add_argument('kraken_summary_tsv', type=str)
+    args = parser.parse_args()
+
+    c = Counter(line.strip().split()[2] for line in open(sys.argv[1]))
+    taxids = sorted(c.keys(), key=lambda x:c[x], reverse=True)
+    taxData = fetchTaxonomyData(taxids)
+    taxInfoDict = {tinfo['TaxId']: [tinfo, c[tinfo['TaxId']]] for tinfo in taxData}
+
+    kr_out = None
+    if 'krona_output' in args:
+        with open(args.krona_output, 'wb') as kr_out:
+            if 'include_unclassified' in args:
+                writeKronaInput(kr_out, taxInfoDict, unclassified=c['0'])
+            else:
+                writeKronaInput(kr_out, taxInfoDict)
+
+        if 'call_krona' in args:
+            p = subprocess.Popen('source krona_tools-2.4; ktImportText -o %s %s' % (args.call_krona, args.krona_output), shell=True, stdout=subprocess.PIPE)
+            p.communicate()
+
+    if 'output' in args:
+        if args.output == '-':
+            writeOutput(sys.stdout, taxInfoDict, c)
+        else:
+            with open(args.output, 'wb') as out:
+                writeOutput(out, taxInfoDict, c)
+
+
+
+
+
+
+
+
+"""
+2	Fats	Saturated fat
+3	Fats	Unsaturated fat	Monounsaturated fat
+3	Fats	Unsaturated fat	Polyunsaturated fat
+13	Carbohydrates	Sugars
+4	Carbohydrates	Dietary fiber
+21	Carbohydrates
+5	Protein
+4
+"""
+
+
+
+"""
+handle = Entrez.efetch(db="Taxonomy", id="1323524", retmode="xml")
+records  = Entrez.read(handle)
+records[0]["Lineage"]
+'Viruses; dsRNA viruses; Partitiviridae; Betapartitivirus'
+records[0]["LineageEx"]
+[{u'ScientificName': 'Viruses', u'TaxId': '10239', u'Rank': 'superkingdom'}, {u'ScientificName': 'dsRNA viruses', u'TaxId': '35325', u'Rank': 'no rank'}, {u'ScientificName': 'Partitiviridae', u'TaxId': '11012', u'Rank': 'family'}, {u'ScientificName': 'Betapartitivirus', u'TaxId': '1511809', u'Rank': 'genus'}]
+"""
+
+"""
+{u'Lineage': 'Viruses; dsDNA viruses, no RNA stage; Iridoviridae; unclassified Iridoviridae', u'Division': 'Viruses', u'ParentTaxId': '180169', u'PubDate': '2014/03/16 07:01:27', u'LineageEx': [{u'ScientificName': 'Viruses', u'TaxId': '10239', u'Rank': 'superkingdom'}, {u'ScientificName': 'dsDNA viruses, no RNA stage', u'TaxId': '35237', u'Rank': 'no rank'}, {u'ScientificName': 'Iridoviridae', u'TaxId': '10486', u'Rank': 'family'}, {u'ScientificName': 'unclassified Iridoviridae', u'TaxId': '180169', u'Rank': 'no rank'}], u'CreateDate': '2014/02/24 15:56:28', u'TaxId': '1465751', u'Rank': 'species', u'GeneticCode': {u'GCId': '1', u'GCName': 'Standard'},
+u'ScientificName': 'Anopheles minimus irodovirus', u'MitoGeneticCode': {u'MGCId': '0', u'MGCName': 'Unspecified'}, u'UpdateDate': '2014/02/24 15:56:29'}
+
+"""
diff -r 000000000000 -r 0916697409ea kraken_summarize.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kraken_summarize.xml	Mon May 18 15:29:05 2015 -0400
@@ -0,0 +1,26 @@
+
+	
+	
+	  python
+	
+	kraken_summarize.py
+		  --krona-output=$krona_output
+			--output=$tax_summary
+			--call-krona=$krona_html
+			$kraken_summary_tsv
+	
+
+
+	
+		
+		
+	
+		
+		
+		
+	
+
+	
+	Krona visualisation and classification summary of Kraken runs.
+	
+
diff -r 000000000000 -r 0916697409ea kraken_visualize.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kraken_visualize.py	Mon May 18 15:29:05 2015 -0400
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+
+import sys
+
+def readTaxonomyNames(names_dmp):
+    nodeNames = []
+    for line in open(names_dmp):
+        line = line.strip().strip('|').strip()
+        if not line: break
+        line = line.split('\t|\t')
+        if line[3].strip() == 'scientific name':
+            nodeNames.append((int(line[0].strip()), line[1].strip()))        
+        pass
+    return dict(nodeNames)
+
+def readTaxonomyNodes(nodes_dmp):    
+    nodeRanks = []
+    nodeChildren = {}
+    nodeParents = {}
+    for line in open(nodes_dmp):
+        line = line.strip().strip('|').strip()
+        if not line: break
+        line = map(lambda x:x.strip(), line.split('\t|\t'))
+        line[:2] = map(int, line[:2])
+        if line[0] == 1:
+            line[1] = 1    
+
+        nodeParents[line[0]] = line[1]
+        try:
+            nodeChildren[line[1]].add(line[0])
+        except:
+            nodeChildren[line[1]] = set([line[0]])
+        nodeRanks.append((line[0], line[2]))
+
+    return dict(nodeRanks), nodeChildren, nodeParents
+
+def getDescendents(taxID, tree):
+    descendents = set([taxID])
+    queue = [taxID]
+    while queue:
+        node = queue.pop()
+
+        children = tree.get(node, set())
+        if children:
+            descendents = descendents.union(children)
+            queue.extend(children)
+        pass
+    return descendents
\ No newline at end of file
diff -r 000000000000 -r 0916697409ea tool_dependencies.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon May 18 15:29:05 2015 -0400
@@ -0,0 +1,9 @@
+
+
+    
+        
+    
+    
+        
+    
+