Mercurial > repos > rnateam > bctools
view remove_tail.py @ 19:885602df4554 draft
another try with bedtools dependency
author | rnateam |
---|---|
date | Mon, 30 Nov 2015 11:43:33 -0500 |
parents | de4ea3aa1090 |
children | 0b9aab6aaebf |
line wrap: on
line source
#!/usr/bin/env python tool_description = """ Remove a certain number of nucleotides from the 3'-tails of sequences in fastq format. Example usage: - remove the last 7 nucleotides from file input.fastq, write result to file output.fastq: remove_tail.py input.fastq 7 --out output.fastq """ epilog = """ Author: Daniel Maticzka Copyright: 2015 License: Apache Email: maticzkd@informatik.uni-freiburg.de Status: Testing """ import argparse import logging from sys import stdout from Bio.SeqIO.QualityIO import FastqGeneralIterator # avoid ugly python IOError when stdout output is piped into another program # and then truncated (such as piping to head) from signal import signal, SIGPIPE, SIG_DFL signal(SIGPIPE, SIG_DFL) # parse command line arguments parser = argparse.ArgumentParser(description=tool_description, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter) # positional arguments parser.add_argument( "infile", help="Path to fastq file.") parser.add_argument( "length", type=int, help="Remove this many nts.") # optional arguments parser.add_argument( "-o", "--outfile", help="Write results to this file.") parser.add_argument( "-v", "--verbose", help="Be verbose.", action="store_true") parser.add_argument( "-d", "--debug", help="Print lots of debugging information", action="store_true") parser.add_argument( '--version', action='version', version='0.1.0') args = parser.parse_args() if args.debug: logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") elif args.verbose: logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") else: logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") logging.info("Parsed arguments:") logging.info(" infile: '{}'".format(args.infile)) logging.info(" length: '{}'".format(args.length)) if args.outfile: logging.info(" outfile: enabled writing to file") logging.info(" outfile: '{}'".format(args.outfile)) logging.info("") # check length parameter if args.length < 0: raise ValueError("Length must be a positive integer, is '{}'.".format(args.length)) # remove tail with (open(args.outfile, "w") if args.outfile is not None else stdout) as samout: for header, seq, qual in FastqGeneralIterator(open(args.infile)): # if removing tail would lead to an empty sequence, # set sequence to a single N to keep fastq synchronized if len(seq) <= args.length: seq = "N" qual = "B" logging.debug("read '{}' was too short to remove full tail".format(header)) logging.debug("seq: {}".format(seq)) logging.debug("len(seq): {}".format(len(seq))) else: seq = seq[0:-args.length] qual = qual[0:-args.length] samout.write("@%s\n%s\n+\n%s\n" % (header, seq, qual))