diff kraken_filter.py @ 14:fd27c97c8366 draft default tip

Uploaded
author cschu
date Mon, 18 May 2015 15:55:47 -0400
parents 0916697409ea
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kraken_filter.py	Mon May 18 15:55:47 2015 -0400
@@ -0,0 +1,254 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import argparse
+import subprocess
+import struct
+import shutil
+import tempfile
+
+from checkFileFormat import verifyFileFormat
+
+DEFAULT_KRAKEN_DB='/tsl/data/krakendb/ktest/db'
+logfile = None
+
+
+def getDescendents(taxID, tree):
+    descendents = set([taxID])
+    queue = [taxID]
+    while queue:
+        node = queue.pop()
+
+        children = tree.get(node, set())
+        if children:
+            descendents = descendents.union(children)
+            queue.extend(children)
+        pass
+    return descendents
+
+def anabl_getSeqsFromFastX(fn, X=2):
+    it = open(fn)
+    for head in it:
+        try:
+            yield (head.strip(), tuple(map(lambda x:x.strip(),
+                                       ([it.next() for i in xrange(X - 1)]))))
+        except:
+            break
+        pass
+    it.close()
+    pass
+
+def readTaxonomy(db):
+    nodeNames = []
+    for line in open(os.path.join(db, 'taxonomy', 'names.dmp')):
+        line = line.strip().strip('|').strip()
+        if not line: break
+        line = line.split('\t|\t')
+
+        if line[3].strip() == 'scientific name':
+            nodeNames.append((int(line[0].strip()), line[1].strip()))
+        # break
+        pass
+
+    nodeRanks = []
+    nodeChildren = {}
+    for line in open(os.path.join(db, 'taxonomy', 'nodes.dmp')):
+        line = line.strip().strip('|').strip()
+        if not line: break
+        line = map(lambda x:x.strip(), line.split('\t|\t'))
+        line[:2] = map(int, line[:2])
+        if line[0] == 1:
+            line[1] = 0
+        try:
+            nodeChildren[line[1]].add(line[0])
+        except:
+            nodeChildren[line[1]] = set([line[0]])
+        nodeRanks.append((line[0], line[2]))
+        # break
+
+    return dict(nodeNames), dict(nodeRanks), nodeChildren
+
+def getFastqIdentifier(string):
+    if string.endswith('/1') or string.endswith('/2'):
+        return string[1:-2]
+    else:
+        return string.split()[0][1:]
+
+def runFilter(db, taxID, inputClassification, inputR1, outputR1, inputR2=None, outputR2=None, fastx=4, debug=False):
+    global logfile
+    logfile.write('Reading taxonomy...\n')
+    logfile.flush()
+    taxonomyInfo = readTaxonomy(db)
+    keepSequences = set()
+    logfile.write('Traversing requested taxonomy branch...\n')
+    logfile.flush()
+    validIDs = getDescendents(taxID, taxonomyInfo[2])
+    logfile.write('Family tree of %s has %i descendents.\n' % (str(taxID), len(validIDs)))
+    logfile.flush()
+    if debug:
+        for vid in validIDs:
+            logfile.write('Added %s to validIDs.\n' % vid)
+            logfile.flush()
+	
+
+    logfile.write('Filtering sequences...\n')
+    logfile.flush()
+    nseqs = 0
+    for line in open(inputClassification):
+        nseqs += 1
+        line = line.strip().split()
+        takeUnclassified = taxID == 0 and line[0] == 'U'
+        takeClassified = line[0] == 'C' and int(line[2]) in validIDs
+        if takeUnclassified or takeClassified:            
+            keepSequences.add(line[1].strip())
+            if debug:
+                logfile.write('Added %s to keepSequences.\n' % line[1].strip())
+                logfile.flush()
+    logfile.write('Keeping %i of %i sequences (%.1f).\n' % (len(keepSequences), nseqs, float(len(keepSequences))/nseqs))
+    logfile.flush()
+    logfile.write('Writing filtered sequence sets...\n')
+    logfile.flush()
+    fwdOut = open(outputR1, 'wb')
+    fwdGen = anabl_getSeqsFromFastX(inputR1, X=fastx)
+  
+    revOut, revGen = None, None
+    revSid, revSeq = None, None
+    if outputR2 is not None and inputR2 is not None:
+        revOut = open(outputR2, 'wb')
+        revGen = anabl_getSeqsFromFastX(inputR2, X=fastx)
+    
+    fqid1, fqid2 = None, None
+    while True:
+        try:
+            fwdSid, fwdSeq = fwdGen.next()
+            fqid1 = getFastqIdentifier(fwdSid)
+            if revGen is not None:
+                revSid, revSeq = revGen.next()
+                fqid2 = getFastqIdentifier(revSid)
+        except:
+            break
+        if fqid1 != fqid2 and fqid2 is not None:
+            sys.stderr.write('Error: fqid-mismatch %s %s.\n' % (fqid1, fqid2))
+            sys.exit(1)
+        if fqid1 in keepSequences:
+            fwdOut.write(('%s\n' * fastx) % ((fwdSid,) + fwdSeq))
+            if revOut is not None:
+                revOut.write(('%s\n' * fastx) % ((revSid,) + revSeq))
+        else:
+            # sys.stdout.write('%s is not in keepSequences\n' % fqid1)
+            pass
+    fwdOut.close()
+    if revOut is not None:
+        revOut.close()
+    pass
+
+
+def main(argv):
+    
+    descr = ''
+    parser = argparse.ArgumentParser(description=descr)
+    parser.add_argument('--dbtype', default='builtinDB', help='Is database built-in or supplied externally?')
+    parser.add_argument('--db', default=DEFAULT_KRAKEN_DB, help='Path to kraken db')
+    parser.add_argument('--in1', help='The r1-file (single-end reads or left paired-end reads).')
+    parser.add_argument('--in2', help='The r2-file (right paired-end reads)')
+    parser.add_argument('--taxid', default=1, help='The root taxon id of the requested branch', type=int)
+    parser.add_argument('--kraken-results', type=str, help='A file containing kraken classification results for the input sequences.')
+    parser.add_argument('--input-format', help='Input sequences stored in fa or fq file(s).', default='fq')
+
+    parser.add_argument('--out1', type=str, help='The r1-output file.')
+    parser.add_argument('--out2', type=str, help='The r2-output file.')
+    parser.add_argument('--debug', action='store_true')
+    parser.add_argument('--logfile', type=str, help='A logfile.', default='kraken_filter.log')
+    args = parser.parse_args()
+
+    kraken_params = []
+    kraken_input = []
+
+    global logfile
+    logfile = open(args.logfile, 'wb')
+    if 'db' not in args or not os.path.exists(args.db):
+        sys.stderr.write('Error: database is missing.\n')
+        sys.exit(1)
+    kraken_params.extend(['--db', args.db])
+    # check whether input file(s) exist(s)
+
+    if 'kraken_results' in args and os.path.exists(args.kraken_results):
+        kraken_input.append(args.kraken_results)
+    else:
+        sys.stderr.write('Error: kraken-classification is missing.\n')
+        sys.exit(1)
+        pass
+
+    if not ('in1' in args and os.path.exists(args.in1)):
+        sys.stderr.write('Error: left/single input file (%s) is missing.\n' % args.in1)
+        sys.exit(1)
+    if not verifyFileFormat(args.in1, args.input_format):
+        sys.stderr.write('Error: fwd/single input file has the wrong format assigned.\n')
+        sys.exit(1)
+
+    kraken_input.append(args.in1)
+    if 'in2' in args:
+        if args.in2 is not None and not os.path.exists(args.in2):
+            sys.stderr.write('Error: right input file (%s) is missing.\n' % args.in2)
+            sys.exit(1)
+        elif args.in2 is not None and not verifyFileFormat(args.in2, args.input_format):
+            sys.stderr.write('Error: rev input file has the wrong format assigned.\n')
+            sys.exit(1)            
+        elif args.in2 is not None:
+            kraken_params.append('--paired')
+            kraken_input.append(args.in2)
+        else:
+            pass
+    else:
+        pass
+    
+    if 'out2' in args and 'in2' in args:
+        input2, output2 = args.in2, args.out2
+    else:
+        input2, output2 = None, None
+
+    if args.taxid < 0:
+        sys.stderr.write('Error: invalid taxon id %i\n' % args.taxid)
+        sys.exit(1)
+    kraken_params.extend(['--taxid', args.taxid])
+
+    if args.input_format == 'fq': 
+        kraken_params.append('--fastq-input')
+        fastx = 4
+    else:
+        kraken_params.append('--fasta-input')
+        fastx = 2
+
+
+
+    # firstChar = open(args.in1).read(1)
+    # if firstChar == '>':
+    #    kraken_params.append('--fasta-input')
+    #    fastx = 2
+    # elif firstChar == '@':
+    #    kraken_params.append('--fastq-input')
+    #    fastx = 4
+    #else:
+    #    """ This will currently disable working with gzipped/bzipped files. """
+    #    sys.stderr.write('Error: Input file starts with unknown symbol %s.\n' % firstChar)
+    #    sys.exit(1)
+    #    pass
+
+    #open(args.kraken_filtered_r1, 'wb').write('\n'.join(map(str, kraken_params + kraken_input)))
+   
+    runFilter(args.db, int(args.taxid), args.kraken_results, 
+              args.in1, args.out1,
+              inputR2=input2, outputR2=output2, fastx=fastx,
+              debug=args.debug)
+
+    logfile.close()
+
+    pass
+
+
+ 
+    
+# main(sys.argv[1:])
+
+if __name__ == '__main__': main(sys.argv[1:])