2
|
1 #!/usr/bin/env python
|
|
2
|
|
3 tool_description = """
|
|
4 Remove a certain number of nucleotides from the 3'-tails of sequences in fastq
|
|
5 format.
|
|
6
|
|
7 Example usage:
|
|
8 - remove the last 7 nucleotides from file input.fastq, write result to file
|
|
9 output.fastq:
|
|
10 remove_tail.py input.fastq 7 --out output.fastq
|
|
11 """
|
|
12
|
|
13 epilog = """
|
|
14 Author: Daniel Maticzka
|
|
15 Copyright: 2015
|
|
16 License: Apache
|
|
17 Email: maticzkd@informatik.uni-freiburg.de
|
|
18 Status: Testing
|
|
19 """
|
|
20
|
|
21 import argparse
|
|
22 import logging
|
|
23 from sys import stdout
|
|
24 from Bio.SeqIO.QualityIO import FastqGeneralIterator
|
|
25
|
|
26 # avoid ugly python IOError when stdout output is piped into another program
|
|
27 # and then truncated (such as piping to head)
|
|
28 from signal import signal, SIGPIPE, SIG_DFL
|
|
29 signal(SIGPIPE, SIG_DFL)
|
|
30
|
|
31 # parse command line arguments
|
|
32 parser = argparse.ArgumentParser(description=tool_description,
|
|
33 epilog=epilog,
|
|
34 formatter_class=argparse.RawDescriptionHelpFormatter)
|
|
35 # positional arguments
|
|
36 parser.add_argument(
|
|
37 "infile",
|
|
38 help="Path to fastq file.")
|
|
39 parser.add_argument(
|
|
40 "length",
|
|
41 type=int,
|
|
42 help="Remove this many nts.")
|
|
43 # optional arguments
|
|
44 parser.add_argument(
|
|
45 "-o", "--outfile",
|
|
46 help="Write results to this file.")
|
|
47 parser.add_argument(
|
|
48 "-v", "--verbose",
|
|
49 help="Be verbose.",
|
|
50 action="store_true")
|
|
51 parser.add_argument(
|
|
52 "-d", "--debug",
|
|
53 help="Print lots of debugging information",
|
|
54 action="store_true")
|
|
55 parser.add_argument(
|
|
56 '--version',
|
|
57 action='version',
|
|
58 version='0.1.0')
|
|
59
|
|
60 args = parser.parse_args()
|
|
61 if args.debug:
|
|
62 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
|
|
63 elif args.verbose:
|
|
64 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
|
|
65 else:
|
|
66 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
|
|
67 logging.info("Parsed arguments:")
|
|
68 logging.info(" infile: '{}'".format(args.infile))
|
|
69 logging.info(" length: '{}'".format(args.length))
|
|
70 if args.outfile:
|
|
71 logging.info(" outfile: enabled writing to file")
|
|
72 logging.info(" outfile: '{}'".format(args.outfile))
|
|
73 logging.info("")
|
|
74
|
|
75 # check length parameter
|
|
76 if args.length < 0:
|
|
77 raise ValueError("Length must be a positive integer, is '{}'.".format(args.length))
|
|
78
|
|
79 # remove tail
|
|
80 with (open(args.outfile, "w") if args.outfile is not None else stdout) as samout:
|
|
81 for header, seq, qual in FastqGeneralIterator(open(args.infile)):
|
|
82
|
|
83 # if removing tail would lead to an empty sequence,
|
|
84 # set sequence to a single N to keep fastq synchronized
|
|
85 if len(seq) <= args.length:
|
|
86 seq = "N"
|
|
87 qual = "B"
|
|
88 logging.debug("read '{}' was too short to remove full tail".format(header))
|
|
89 logging.debug("seq: {}".format(seq))
|
|
90 logging.debug("len(seq): {}".format(len(seq)))
|
|
91 else:
|
|
92 seq = seq[0:-args.length]
|
|
93 qual = qual[0:-args.length]
|
|
94
|
|
95 samout.write("@%s\n%s\n+\n%s\n" % (header, seq, qual))
|