diff remove_tail.py @ 2:de4ea3aa1090 draft

Uploaded
author rnateam
date Thu, 22 Oct 2015 10:26:45 -0400
parents
children 0b9aab6aaebf
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/remove_tail.py	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+tool_description = """
+Remove a certain number of nucleotides from the 3'-tails of sequences in fastq
+format.
+
+Example usage:
+- remove the last 7 nucleotides from file input.fastq, write result to file
+  output.fastq:
+remove_tail.py input.fastq 7 --out output.fastq
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+"""
+
+import argparse
+import logging
+from sys import stdout
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+                                 epilog=epilog,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+    "infile",
+    help="Path to fastq file.")
+parser.add_argument(
+    "length",
+    type=int,
+    help="Remove this many nts.")
+# optional arguments
+parser.add_argument(
+    "-o", "--outfile",
+    help="Write results to this file.")
+parser.add_argument(
+    "-v", "--verbose",
+    help="Be verbose.",
+    action="store_true")
+parser.add_argument(
+    "-d", "--debug",
+    help="Print lots of debugging information",
+    action="store_true")
+parser.add_argument(
+    '--version',
+    action='version',
+    version='0.1.0')
+
+args = parser.parse_args()
+if args.debug:
+    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info("  infile: '{}'".format(args.infile))
+logging.info("  length: '{}'".format(args.length))
+if args.outfile:
+    logging.info("  outfile: enabled writing to file")
+    logging.info("  outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# check length parameter
+if args.length < 0:
+    raise ValueError("Length must be a positive integer, is '{}'.".format(args.length))
+
+# remove tail
+with (open(args.outfile, "w") if args.outfile is not None else stdout) as samout:
+    for header, seq, qual in FastqGeneralIterator(open(args.infile)):
+
+        # if removing tail would lead to an empty sequence,
+        # set sequence to a single N to keep fastq synchronized
+        if len(seq) <= args.length:
+            seq = "N"
+            qual = "B"
+            logging.debug("read '{}' was too short to remove full tail".format(header))
+            logging.debug("seq: {}".format(seq))
+            logging.debug("len(seq): {}".format(len(seq)))
+        else:
+            seq = seq[0:-args.length]
+            qual = qual[0:-args.length]
+
+        samout.write("@%s\n%s\n+\n%s\n" % (header, seq, qual))