Mercurial > repos > rnateam > bctools
diff remove_tail.py @ 2:de4ea3aa1090 draft
Uploaded
author | rnateam |
---|---|
date | Thu, 22 Oct 2015 10:26:45 -0400 |
parents | |
children | 0b9aab6aaebf |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/remove_tail.py Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,95 @@ +#!/usr/bin/env python + +tool_description = """ +Remove a certain number of nucleotides from the 3'-tails of sequences in fastq +format. + +Example usage: +- remove the last 7 nucleotides from file input.fastq, write result to file + output.fastq: +remove_tail.py input.fastq 7 --out output.fastq +""" + +epilog = """ +Author: Daniel Maticzka +Copyright: 2015 +License: Apache +Email: maticzkd@informatik.uni-freiburg.de +Status: Testing +""" + +import argparse +import logging +from sys import stdout +from Bio.SeqIO.QualityIO import FastqGeneralIterator + +# avoid ugly python IOError when stdout output is piped into another program +# and then truncated (such as piping to head) +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) + +# parse command line arguments +parser = argparse.ArgumentParser(description=tool_description, + epilog=epilog, + formatter_class=argparse.RawDescriptionHelpFormatter) +# positional arguments +parser.add_argument( + "infile", + help="Path to fastq file.") +parser.add_argument( + "length", + type=int, + help="Remove this many nts.") +# optional arguments +parser.add_argument( + "-o", "--outfile", + help="Write results to this file.") +parser.add_argument( + "-v", "--verbose", + help="Be verbose.", + action="store_true") +parser.add_argument( + "-d", "--debug", + help="Print lots of debugging information", + action="store_true") +parser.add_argument( + '--version', + action='version', + version='0.1.0') + +args = parser.parse_args() +if args.debug: + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") +elif args.verbose: + logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") +else: + logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") +logging.info("Parsed arguments:") +logging.info(" infile: '{}'".format(args.infile)) +logging.info(" length: '{}'".format(args.length)) +if args.outfile: + logging.info(" outfile: enabled writing to file") + logging.info(" outfile: '{}'".format(args.outfile)) +logging.info("") + +# check length parameter +if args.length < 0: + raise ValueError("Length must be a positive integer, is '{}'.".format(args.length)) + +# remove tail +with (open(args.outfile, "w") if args.outfile is not None else stdout) as samout: + for header, seq, qual in FastqGeneralIterator(open(args.infile)): + + # if removing tail would lead to an empty sequence, + # set sequence to a single N to keep fastq synchronized + if len(seq) <= args.length: + seq = "N" + qual = "B" + logging.debug("read '{}' was too short to remove full tail".format(header)) + logging.debug("seq: {}".format(seq)) + logging.debug("len(seq): {}".format(len(seq))) + else: + seq = seq[0:-args.length] + qual = qual[0:-args.length] + + samout.write("@%s\n%s\n+\n%s\n" % (header, seq, qual))