comparison extract_bcs.py @ 2:de4ea3aa1090 draft

Uploaded
author rnateam
date Thu, 22 Oct 2015 10:26:45 -0400
parents
children e841de88235c
comparison
equal deleted inserted replaced
1:ae0f58d3318f 2:de4ea3aa1090
1 #!/usr/bin/env python
2
3 tool_description = """
4 Exract barcodes from a FASTQ file according to a user-specified pattern.
5
6 By default output is written to stdout.
7
8 Example usage:
9 - move nucleotides at positions 1-3 and 6-7 to FASTQ header and write to file
10 output.fastq:
11 fastq_extract_barcodes.py barcoded_input.fastq XXXNNXX --out output.fastq
12 """
13
14 epilog = """
15 Author: Daniel Maticzka
16 Copyright: 2015
17 License: Apache
18 Email: maticzkd@informatik.uni-freiburg.de
19 Status: Testing
20 """
21
22 import argparse
23 import logging
24 import re
25 from sys import stdout
26 from Bio.SeqIO.QualityIO import FastqGeneralIterator
27
28 # avoid ugly python IOError when stdout output is piped into another program
29 # and then truncated (such as piping to head)
30 from signal import signal, SIGPIPE, SIG_DFL
31 signal(SIGPIPE, SIG_DFL)
32
33 # parse command line arguments
34 parser = argparse.ArgumentParser(description=tool_description,
35 epilog=epilog,
36 formatter_class=argparse.RawDescriptionHelpFormatter)
37 # positional arguments
38 parser.add_argument(
39 "infile",
40 help="Path to fastq file.")
41 parser.add_argument(
42 "pattern",
43 help="Pattern of barcode nucleotides starting at 5'-end. X positions will be moved to the header, N positions will be kept.")
44 # optional arguments
45 parser.add_argument(
46 "-o", "--outfile",
47 help="Write results to this file.")
48 parser.add_argument(
49 "-b", "--bcs",
50 dest="out_bc_fasta",
51 help="If set, barcodes are written to this file in FASTA format.")
52 parser.add_argument(
53 "-v", "--verbose",
54 help="Be verbose.",
55 action="store_true")
56 parser.add_argument(
57 "-d", "--debug",
58 help="Print lots of debugging information",
59 action="store_true")
60 parser.add_argument(
61 '--version',
62 action='version',
63 version='0.1.0')
64
65 args = parser.parse_args()
66 if args.debug:
67 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
68 elif args.verbose:
69 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
70 else:
71 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
72 logging.info("Parsed arguments:")
73 logging.info(" infile: '{}'".format(args.infile))
74 logging.info(" pattern: '{}'".format(args.pattern))
75 if args.outfile:
76 logging.info(" outfile: enabled writing to file")
77 logging.info(" outfile: '{}'".format(args.outfile))
78 if args.out_bc_fasta:
79 logging.info(" bcs: enabled writing barcodes to fasta file")
80 logging.info(" bcs: {}".format(args.out_bc_fasta))
81 logging.info("")
82
83 # check if supplied pattern is valid
84 valid_pattern = re.compile("^[XN]+$")
85 pattern_match = valid_pattern.match(args.pattern)
86 if pattern_match is None:
87 raise ValueError("Error: supplied pattern '{}' is not valid.".format(args.pattern))
88
89 # check if at least one barcode position is included in the pattern
90 has_bcpos_pattern = re.compile("X")
91 pattern_match = has_bcpos_pattern.search(args.pattern)
92 if pattern_match is None:
93 raise ValueError("Error: supplied pattern '{}' does not contain a barcode position 'X'.".format(args.pattern))
94
95 logging.info("Barcode pattern analysis:")
96 # get X positions of pattern string
97 barcode_nt_pattern = re.compile("X+")
98 barcode_positions = []
99 for m in re.finditer(barcode_nt_pattern, args.pattern):
100 logging.info(' found barcode positions in pattern: %02d-%02d: %s' % (m.start(), m.end(), m.group(0)))
101 barcode_positions.append((m.start(), m.end()))
102 logging.info(" barcode positions: {}".format(barcode_positions))
103 # get last position of a barcode nt in the pattern
104 # reads must be long enough for all
105 min_readlen = barcode_positions[-1][-1]
106 logging.info(" last position of a barcode nt in pattern: {}".format(min_readlen))
107 logging.info("")
108
109 # get coordinates of nucleotides to keep
110 # the tail after the last barcode nt is handled separately
111 seq_positions = []
112 last_seq_start = 0
113 for bcstart, bcstop in barcode_positions:
114 seq_positions.append((last_seq_start, bcstart))
115 last_seq_start = bcstop
116 logging.info(" sequence positions: {}".format(seq_positions))
117 logging.info(" start of sequence tail: {}".format(last_seq_start))
118
119 samout = (open(args.outfile, "w") if args.outfile is not None else stdout)
120 if args.out_bc_fasta is not None:
121 faout = open(args.out_bc_fasta, "w")
122 for header, seq, qual in FastqGeneralIterator(open(args.infile)):
123
124 # skip reads that are too short to extract the full requested barcode
125 if len(seq) < min_readlen:
126 logging.warning("skipping read '{}', is too short to extract the full requested barcode".format(header))
127 logging.debug("seq: {}".format(seq))
128 logging.debug("len(seq): {}".format(len(seq)))
129 continue
130
131 # extract barcode nucleotides
132 barcode_list = []
133 for bcstart, bcstop in barcode_positions:
134 barcode_list.append(seq[bcstart:bcstop])
135 barcode = "".join(barcode_list)
136 logging.debug("extracted barcode: {}".format(barcode))
137
138 # create new sequence and quality string without barcode nucleotides
139 new_seq_list = []
140 new_qual_list = []
141 for seqstart, seqstop in seq_positions:
142 new_seq_list.append(seq[seqstart:seqstop])
143 new_qual_list.append(qual[seqstart:seqstop])
144 new_seq_list.append(seq[last_seq_start:])
145 new_qual_list.append(qual[last_seq_start:])
146 new_seq = "".join(new_seq_list)
147 new_qual = "".join(new_qual_list)
148 # check if at least one nucleotide is left. having none would break fastq
149 if len(new_seq) == 0:
150 logging.warning("skipping read '{}', no sequence remains after barcode extraction".format(header))
151 logging.debug("seq: {}".format(seq))
152 logging.debug("len(seq): {}".format(len(seq)))
153 continue
154
155 # write barcode nucleotides into header
156 annotated_header = header + " " + barcode
157 samout.write("@%s\n%s\n+\n%s\n" % (annotated_header, new_seq, new_qual))
158
159 # write barcode to fasta if requested
160 if args.out_bc_fasta is not None:
161 faout.write(">{}\n{}\n".format(header, barcode))
162
163 # close files
164 samout.close()
165 if args.out_bc_fasta is not None:
166 faout.close()