annotate extract_bcs.py @ 58:bbbae1ee87e0 draft default tip

fix for flexbar with small data issue
author rnateam
date Tue, 16 Feb 2016 10:08:58 -0500
parents 0b9aab6aaebf
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
1 #!/usr/bin/env python
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
2
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 7
diff changeset
3 import argparse
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 7
diff changeset
4 import logging
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 7
diff changeset
5 import re
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 7
diff changeset
6 from sys import stdout
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 7
diff changeset
7 from Bio.SeqIO.QualityIO import FastqGeneralIterator
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 7
diff changeset
8 # avoid ugly python IOError when stdout output is piped into another program
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 7
diff changeset
9 # and then truncated (such as piping to head)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 7
diff changeset
10 from signal import signal, SIGPIPE, SIG_DFL
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 7
diff changeset
11 signal(SIGPIPE, SIG_DFL)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 7
diff changeset
12
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
13 tool_description = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
14 Exract barcodes from a FASTQ file according to a user-specified pattern.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
15
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
16 By default output is written to stdout.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
17
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
18 Example usage:
5
e841de88235c Uploaded
rnateam
parents: 2
diff changeset
19 - remove barcode nucleotides at positions 1-3 and 6-7 from FASTQ; write modified
e841de88235c Uploaded
rnateam
parents: 2
diff changeset
20 FASTQ entries to output.fastq and barcode nucleotides to barcodes.fa:
7
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
21 fastq_extract_barcodes.py barcoded_input.fastq XXXNNXX --out output.fastq --bcs barcodes.fastq
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
22 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
23
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
24 epilog = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
25 Author: Daniel Maticzka
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
26 Copyright: 2015
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
27 License: Apache
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
28 Email: maticzkd@informatik.uni-freiburg.de
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
29 Status: Testing
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
30 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
31
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
32 # parse command line arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
33 parser = argparse.ArgumentParser(description=tool_description,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
34 epilog=epilog,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
35 formatter_class=argparse.RawDescriptionHelpFormatter)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
36 # positional arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
37 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
38 "infile",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
39 help="Path to fastq file.")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
40 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
41 "pattern",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
42 help="Pattern of barcode nucleotides starting at 5'-end. X positions will be moved to the header, N positions will be kept.")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
43 # optional arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
44 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
45 "-o", "--outfile",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
46 help="Write results to this file.")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
47 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
48 "-b", "--bcs",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
49 dest="out_bc_fasta",
7
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
50 help="Write barcodes to this file in FASTQ format.")
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
51 parser.add_argument(
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
52 "--fasta-barcodes",
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
53 dest="save_bcs_as_fa",
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
54 action="store_true",
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
55 help="Save extracted barcodes in FASTA format.")
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
56 parser.add_argument(
5
e841de88235c Uploaded
rnateam
parents: 2
diff changeset
57 "-a", "--add-bc-to-fastq",
e841de88235c Uploaded
rnateam
parents: 2
diff changeset
58 dest="add_to_head",
7
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
59 help="Append extracted barcodes to the FASTQ headers.",
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
60 action="store_true")
5
e841de88235c Uploaded
rnateam
parents: 2
diff changeset
61 parser.add_argument(
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
62 "-v", "--verbose",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
63 help="Be verbose.",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
64 action="store_true")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
65 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
66 "-d", "--debug",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
67 help="Print lots of debugging information",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
68 action="store_true")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
69 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
70 '--version',
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
71 action='version',
7
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
72 version='0.1.0')
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
73
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
74 args = parser.parse_args()
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
75 if args.debug:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
76 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
77 elif args.verbose:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
78 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
79 else:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
80 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
81 logging.info("Parsed arguments:")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
82 logging.info(" infile: '{}'".format(args.infile))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
83 logging.info(" pattern: '{}'".format(args.pattern))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
84 if args.outfile:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
85 logging.info(" outfile: enabled writing to file")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
86 logging.info(" outfile: '{}'".format(args.outfile))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
87 if args.out_bc_fasta:
7
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
88 logging.info(" bcs: enabled writing barcodes to fastq file")
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
89 logging.info(" bcs: {}".format(args.out_bc_fasta))
7
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
90 if args.save_bcs_as_fa:
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
91 logging.info(" fasta-barcodes: write barcodes in fasta format instead of fastq")
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
92 logging.info("")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
93
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
94 # check if supplied pattern is valid
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
95 valid_pattern = re.compile("^[XN]+$")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
96 pattern_match = valid_pattern.match(args.pattern)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
97 if pattern_match is None:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
98 raise ValueError("Error: supplied pattern '{}' is not valid.".format(args.pattern))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
99
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
100 # check if at least one barcode position is included in the pattern
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
101 has_bcpos_pattern = re.compile("X")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
102 pattern_match = has_bcpos_pattern.search(args.pattern)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
103 if pattern_match is None:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
104 raise ValueError("Error: supplied pattern '{}' does not contain a barcode position 'X'.".format(args.pattern))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
105
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
106 logging.info("Barcode pattern analysis:")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
107 # get X positions of pattern string
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
108 barcode_nt_pattern = re.compile("X+")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
109 barcode_positions = []
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
110 for m in re.finditer(barcode_nt_pattern, args.pattern):
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
111 logging.info(' found barcode positions in pattern: %02d-%02d: %s' % (m.start(), m.end(), m.group(0)))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
112 barcode_positions.append((m.start(), m.end()))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
113 logging.info(" barcode positions: {}".format(barcode_positions))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
114 # get last position of a barcode nt in the pattern
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
115 # reads must be long enough for all
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
116 min_readlen = barcode_positions[-1][-1]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
117 logging.info(" last position of a barcode nt in pattern: {}".format(min_readlen))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
118 logging.info("")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
119
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
120 # get coordinates of nucleotides to keep
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
121 # the tail after the last barcode nt is handled separately
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
122 seq_positions = []
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
123 last_seq_start = 0
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
124 for bcstart, bcstop in barcode_positions:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
125 seq_positions.append((last_seq_start, bcstart))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
126 last_seq_start = bcstop
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
127 logging.info(" sequence positions: {}".format(seq_positions))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
128 logging.info(" start of sequence tail: {}".format(last_seq_start))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
129
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
130 samout = (open(args.outfile, "w") if args.outfile is not None else stdout)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
131 if args.out_bc_fasta is not None:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
132 faout = open(args.out_bc_fasta, "w")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
133 for header, seq, qual in FastqGeneralIterator(open(args.infile)):
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
134
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
135 # skip reads that are too short to extract the full requested barcode
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
136 if len(seq) < min_readlen:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
137 logging.warning("skipping read '{}', is too short to extract the full requested barcode".format(header))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
138 logging.debug("seq: {}".format(seq))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
139 logging.debug("len(seq): {}".format(len(seq)))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
140 continue
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
141
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
142 # extract barcode nucleotides
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
143 barcode_list = []
7
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
144 barcode_qual_list = []
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
145 for bcstart, bcstop in barcode_positions:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
146 barcode_list.append(seq[bcstart:bcstop])
7
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
147 barcode_qual_list.append(qual[bcstart:bcstop])
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
148 barcode = "".join(barcode_list)
7
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
149 barcode_quals = "".join(barcode_qual_list)
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
150 logging.debug("extracted barcode: {}".format(barcode))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
151
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
152 # create new sequence and quality string without barcode nucleotides
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
153 new_seq_list = []
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
154 new_qual_list = []
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
155 for seqstart, seqstop in seq_positions:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
156 new_seq_list.append(seq[seqstart:seqstop])
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
157 new_qual_list.append(qual[seqstart:seqstop])
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
158 new_seq_list.append(seq[last_seq_start:])
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
159 new_qual_list.append(qual[last_seq_start:])
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
160 new_seq = "".join(new_seq_list)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
161 new_qual = "".join(new_qual_list)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
162 # check if at least one nucleotide is left. having none would break fastq
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
163 if len(new_seq) == 0:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
164 logging.warning("skipping read '{}', no sequence remains after barcode extraction".format(header))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
165 logging.debug("seq: {}".format(seq))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
166 logging.debug("len(seq): {}".format(len(seq)))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
167 continue
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
168
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
169 # write barcode nucleotides into header
5
e841de88235c Uploaded
rnateam
parents: 2
diff changeset
170 if args.add_to_head:
e841de88235c Uploaded
rnateam
parents: 2
diff changeset
171 annotated_header = " ".join([header, barcode])
e841de88235c Uploaded
rnateam
parents: 2
diff changeset
172 else:
e841de88235c Uploaded
rnateam
parents: 2
diff changeset
173 annotated_header = header
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
174 samout.write("@%s\n%s\n+\n%s\n" % (annotated_header, new_seq, new_qual))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
175
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
176 # write barcode to fasta if requested
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
177 if args.out_bc_fasta is not None:
7
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
178 if args.save_bcs_as_fa:
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
179 faout.write(">{}\n{}\n".format(header, barcode))
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
180 else:
bb59215dfd8f Uploaded
rnateam
parents: 5
diff changeset
181 faout.write("@{}\n{}\n+\n{}\n".format(header, barcode, barcode_quals))
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
182
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
183 # close files
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
184 samout.close()
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
185 if args.out_bc_fasta is not None:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
186 faout.close()