2
|
1 #!/usr/bin/env python
|
|
2
|
|
3 tool_description = """
|
|
4 Exract barcodes from a FASTQ file according to a user-specified pattern.
|
|
5
|
|
6 By default output is written to stdout.
|
|
7
|
|
8 Example usage:
|
5
|
9 - remove barcode nucleotides at positions 1-3 and 6-7 from FASTQ; write modified
|
|
10 FASTQ entries to output.fastq and barcode nucleotides to barcodes.fa:
|
7
|
11 fastq_extract_barcodes.py barcoded_input.fastq XXXNNXX --out output.fastq --bcs barcodes.fastq
|
2
|
12 """
|
|
13
|
|
14 epilog = """
|
|
15 Author: Daniel Maticzka
|
|
16 Copyright: 2015
|
|
17 License: Apache
|
|
18 Email: maticzkd@informatik.uni-freiburg.de
|
|
19 Status: Testing
|
|
20 """
|
|
21
|
|
22 import argparse
|
|
23 import logging
|
|
24 import re
|
|
25 from sys import stdout
|
|
26 from Bio.SeqIO.QualityIO import FastqGeneralIterator
|
|
27
|
|
28 # avoid ugly python IOError when stdout output is piped into another program
|
|
29 # and then truncated (such as piping to head)
|
|
30 from signal import signal, SIGPIPE, SIG_DFL
|
|
31 signal(SIGPIPE, SIG_DFL)
|
|
32
|
|
33 # parse command line arguments
|
|
34 parser = argparse.ArgumentParser(description=tool_description,
|
|
35 epilog=epilog,
|
|
36 formatter_class=argparse.RawDescriptionHelpFormatter)
|
|
37 # positional arguments
|
|
38 parser.add_argument(
|
|
39 "infile",
|
|
40 help="Path to fastq file.")
|
|
41 parser.add_argument(
|
|
42 "pattern",
|
|
43 help="Pattern of barcode nucleotides starting at 5'-end. X positions will be moved to the header, N positions will be kept.")
|
|
44 # optional arguments
|
|
45 parser.add_argument(
|
|
46 "-o", "--outfile",
|
|
47 help="Write results to this file.")
|
|
48 parser.add_argument(
|
|
49 "-b", "--bcs",
|
|
50 dest="out_bc_fasta",
|
7
|
51 help="Write barcodes to this file in FASTQ format.")
|
|
52 parser.add_argument(
|
|
53 "--fasta-barcodes",
|
|
54 dest="save_bcs_as_fa",
|
|
55 action="store_true",
|
|
56 help="Save extracted barcodes in FASTA format.")
|
2
|
57 parser.add_argument(
|
5
|
58 "-a", "--add-bc-to-fastq",
|
|
59 dest="add_to_head",
|
7
|
60 help="Append extracted barcodes to the FASTQ headers.",
|
|
61 action="store_true")
|
5
|
62 parser.add_argument(
|
2
|
63 "-v", "--verbose",
|
|
64 help="Be verbose.",
|
|
65 action="store_true")
|
|
66 parser.add_argument(
|
|
67 "-d", "--debug",
|
|
68 help="Print lots of debugging information",
|
|
69 action="store_true")
|
|
70 parser.add_argument(
|
|
71 '--version',
|
|
72 action='version',
|
7
|
73 version='0.1.0')
|
2
|
74
|
|
75 args = parser.parse_args()
|
|
76 if args.debug:
|
|
77 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
|
|
78 elif args.verbose:
|
|
79 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
|
|
80 else:
|
|
81 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
|
|
82 logging.info("Parsed arguments:")
|
|
83 logging.info(" infile: '{}'".format(args.infile))
|
|
84 logging.info(" pattern: '{}'".format(args.pattern))
|
|
85 if args.outfile:
|
|
86 logging.info(" outfile: enabled writing to file")
|
|
87 logging.info(" outfile: '{}'".format(args.outfile))
|
|
88 if args.out_bc_fasta:
|
7
|
89 logging.info(" bcs: enabled writing barcodes to fastq file")
|
2
|
90 logging.info(" bcs: {}".format(args.out_bc_fasta))
|
7
|
91 if args.save_bcs_as_fa:
|
|
92 logging.info(" fasta-barcodes: write barcodes in fasta format instead of fastq")
|
2
|
93 logging.info("")
|
|
94
|
|
95 # check if supplied pattern is valid
|
|
96 valid_pattern = re.compile("^[XN]+$")
|
|
97 pattern_match = valid_pattern.match(args.pattern)
|
|
98 if pattern_match is None:
|
|
99 raise ValueError("Error: supplied pattern '{}' is not valid.".format(args.pattern))
|
|
100
|
|
101 # check if at least one barcode position is included in the pattern
|
|
102 has_bcpos_pattern = re.compile("X")
|
|
103 pattern_match = has_bcpos_pattern.search(args.pattern)
|
|
104 if pattern_match is None:
|
|
105 raise ValueError("Error: supplied pattern '{}' does not contain a barcode position 'X'.".format(args.pattern))
|
|
106
|
|
107 logging.info("Barcode pattern analysis:")
|
|
108 # get X positions of pattern string
|
|
109 barcode_nt_pattern = re.compile("X+")
|
|
110 barcode_positions = []
|
|
111 for m in re.finditer(barcode_nt_pattern, args.pattern):
|
|
112 logging.info(' found barcode positions in pattern: %02d-%02d: %s' % (m.start(), m.end(), m.group(0)))
|
|
113 barcode_positions.append((m.start(), m.end()))
|
|
114 logging.info(" barcode positions: {}".format(barcode_positions))
|
|
115 # get last position of a barcode nt in the pattern
|
|
116 # reads must be long enough for all
|
|
117 min_readlen = barcode_positions[-1][-1]
|
|
118 logging.info(" last position of a barcode nt in pattern: {}".format(min_readlen))
|
|
119 logging.info("")
|
|
120
|
|
121 # get coordinates of nucleotides to keep
|
|
122 # the tail after the last barcode nt is handled separately
|
|
123 seq_positions = []
|
|
124 last_seq_start = 0
|
|
125 for bcstart, bcstop in barcode_positions:
|
|
126 seq_positions.append((last_seq_start, bcstart))
|
|
127 last_seq_start = bcstop
|
|
128 logging.info(" sequence positions: {}".format(seq_positions))
|
|
129 logging.info(" start of sequence tail: {}".format(last_seq_start))
|
|
130
|
|
131 samout = (open(args.outfile, "w") if args.outfile is not None else stdout)
|
|
132 if args.out_bc_fasta is not None:
|
|
133 faout = open(args.out_bc_fasta, "w")
|
|
134 for header, seq, qual in FastqGeneralIterator(open(args.infile)):
|
|
135
|
|
136 # skip reads that are too short to extract the full requested barcode
|
|
137 if len(seq) < min_readlen:
|
|
138 logging.warning("skipping read '{}', is too short to extract the full requested barcode".format(header))
|
|
139 logging.debug("seq: {}".format(seq))
|
|
140 logging.debug("len(seq): {}".format(len(seq)))
|
|
141 continue
|
|
142
|
|
143 # extract barcode nucleotides
|
|
144 barcode_list = []
|
7
|
145 barcode_qual_list = []
|
2
|
146 for bcstart, bcstop in barcode_positions:
|
|
147 barcode_list.append(seq[bcstart:bcstop])
|
7
|
148 barcode_qual_list.append(qual[bcstart:bcstop])
|
2
|
149 barcode = "".join(barcode_list)
|
7
|
150 barcode_quals = "".join(barcode_qual_list)
|
2
|
151 logging.debug("extracted barcode: {}".format(barcode))
|
|
152
|
|
153 # create new sequence and quality string without barcode nucleotides
|
|
154 new_seq_list = []
|
|
155 new_qual_list = []
|
|
156 for seqstart, seqstop in seq_positions:
|
|
157 new_seq_list.append(seq[seqstart:seqstop])
|
|
158 new_qual_list.append(qual[seqstart:seqstop])
|
|
159 new_seq_list.append(seq[last_seq_start:])
|
|
160 new_qual_list.append(qual[last_seq_start:])
|
|
161 new_seq = "".join(new_seq_list)
|
|
162 new_qual = "".join(new_qual_list)
|
|
163 # check if at least one nucleotide is left. having none would break fastq
|
|
164 if len(new_seq) == 0:
|
|
165 logging.warning("skipping read '{}', no sequence remains after barcode extraction".format(header))
|
|
166 logging.debug("seq: {}".format(seq))
|
|
167 logging.debug("len(seq): {}".format(len(seq)))
|
|
168 continue
|
|
169
|
|
170 # write barcode nucleotides into header
|
5
|
171 if args.add_to_head:
|
|
172 annotated_header = " ".join([header, barcode])
|
|
173 else:
|
|
174 annotated_header = header
|
2
|
175 samout.write("@%s\n%s\n+\n%s\n" % (annotated_header, new_seq, new_qual))
|
|
176
|
|
177 # write barcode to fasta if requested
|
|
178 if args.out_bc_fasta is not None:
|
7
|
179 if args.save_bcs_as_fa:
|
|
180 faout.write(">{}\n{}\n".format(header, barcode))
|
|
181 else:
|
|
182 faout.write("@{}\n{}\n+\n{}\n".format(header, barcode, barcode_quals))
|
2
|
183
|
|
184 # close files
|
|
185 samout.close()
|
|
186 if args.out_bc_fasta is not None:
|
|
187 faout.close()
|