annotate remove_tail.py @ 2:de4ea3aa1090 draft

Uploaded
author rnateam
date Thu, 22 Oct 2015 10:26:45 -0400
parents
children 0b9aab6aaebf
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
1 #!/usr/bin/env python
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
3 tool_description = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
4 Remove a certain number of nucleotides from the 3'-tails of sequences in fastq
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
5 format.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
6
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
7 Example usage:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
8 - remove the last 7 nucleotides from file input.fastq, write result to file
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
9 output.fastq:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
10 remove_tail.py input.fastq 7 --out output.fastq
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
11 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
12
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
13 epilog = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
14 Author: Daniel Maticzka
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
15 Copyright: 2015
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
16 License: Apache
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
17 Email: maticzkd@informatik.uni-freiburg.de
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
18 Status: Testing
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
19 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
20
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
21 import argparse
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
22 import logging
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
23 from sys import stdout
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
24 from Bio.SeqIO.QualityIO import FastqGeneralIterator
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
25
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
26 # avoid ugly python IOError when stdout output is piped into another program
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
27 # and then truncated (such as piping to head)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
28 from signal import signal, SIGPIPE, SIG_DFL
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
29 signal(SIGPIPE, SIG_DFL)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
30
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
31 # parse command line arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
32 parser = argparse.ArgumentParser(description=tool_description,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
33 epilog=epilog,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
34 formatter_class=argparse.RawDescriptionHelpFormatter)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
35 # positional arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
36 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
37 "infile",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
38 help="Path to fastq file.")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
39 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
40 "length",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
41 type=int,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
42 help="Remove this many nts.")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
43 # optional arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
44 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
45 "-o", "--outfile",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
46 help="Write results to this file.")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
47 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
48 "-v", "--verbose",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
49 help="Be verbose.",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
50 action="store_true")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
51 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
52 "-d", "--debug",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
53 help="Print lots of debugging information",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
54 action="store_true")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
55 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
56 '--version',
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
57 action='version',
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
58 version='0.1.0')
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
59
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
60 args = parser.parse_args()
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
61 if args.debug:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
62 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
63 elif args.verbose:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
64 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
65 else:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
66 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
67 logging.info("Parsed arguments:")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
68 logging.info(" infile: '{}'".format(args.infile))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
69 logging.info(" length: '{}'".format(args.length))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
70 if args.outfile:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
71 logging.info(" outfile: enabled writing to file")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
72 logging.info(" outfile: '{}'".format(args.outfile))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
73 logging.info("")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
74
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
75 # check length parameter
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
76 if args.length < 0:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
77 raise ValueError("Length must be a positive integer, is '{}'.".format(args.length))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
78
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
79 # remove tail
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
80 with (open(args.outfile, "w") if args.outfile is not None else stdout) as samout:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
81 for header, seq, qual in FastqGeneralIterator(open(args.infile)):
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
82
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
83 # if removing tail would lead to an empty sequence,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
84 # set sequence to a single N to keep fastq synchronized
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
85 if len(seq) <= args.length:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
86 seq = "N"
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
87 qual = "B"
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
88 logging.debug("read '{}' was too short to remove full tail".format(header))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
89 logging.debug("seq: {}".format(seq))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
90 logging.debug("len(seq): {}".format(len(seq)))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
91 else:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
92 seq = seq[0:-args.length]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
93 qual = qual[0:-args.length]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
94
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
95 samout.write("@%s\n%s\n+\n%s\n" % (header, seq, qual))