Mercurial > repos > rnateam > bctools
comparison extract_bcs.py @ 2:de4ea3aa1090 draft
Uploaded
| author | rnateam |
|---|---|
| date | Thu, 22 Oct 2015 10:26:45 -0400 |
| parents | |
| children | e841de88235c |
comparison
equal
deleted
inserted
replaced
| 1:ae0f58d3318f | 2:de4ea3aa1090 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 tool_description = """ | |
| 4 Exract barcodes from a FASTQ file according to a user-specified pattern. | |
| 5 | |
| 6 By default output is written to stdout. | |
| 7 | |
| 8 Example usage: | |
| 9 - move nucleotides at positions 1-3 and 6-7 to FASTQ header and write to file | |
| 10 output.fastq: | |
| 11 fastq_extract_barcodes.py barcoded_input.fastq XXXNNXX --out output.fastq | |
| 12 """ | |
| 13 | |
| 14 epilog = """ | |
| 15 Author: Daniel Maticzka | |
| 16 Copyright: 2015 | |
| 17 License: Apache | |
| 18 Email: maticzkd@informatik.uni-freiburg.de | |
| 19 Status: Testing | |
| 20 """ | |
| 21 | |
| 22 import argparse | |
| 23 import logging | |
| 24 import re | |
| 25 from sys import stdout | |
| 26 from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
| 27 | |
| 28 # avoid ugly python IOError when stdout output is piped into another program | |
| 29 # and then truncated (such as piping to head) | |
| 30 from signal import signal, SIGPIPE, SIG_DFL | |
| 31 signal(SIGPIPE, SIG_DFL) | |
| 32 | |
| 33 # parse command line arguments | |
| 34 parser = argparse.ArgumentParser(description=tool_description, | |
| 35 epilog=epilog, | |
| 36 formatter_class=argparse.RawDescriptionHelpFormatter) | |
| 37 # positional arguments | |
| 38 parser.add_argument( | |
| 39 "infile", | |
| 40 help="Path to fastq file.") | |
| 41 parser.add_argument( | |
| 42 "pattern", | |
| 43 help="Pattern of barcode nucleotides starting at 5'-end. X positions will be moved to the header, N positions will be kept.") | |
| 44 # optional arguments | |
| 45 parser.add_argument( | |
| 46 "-o", "--outfile", | |
| 47 help="Write results to this file.") | |
| 48 parser.add_argument( | |
| 49 "-b", "--bcs", | |
| 50 dest="out_bc_fasta", | |
| 51 help="If set, barcodes are written to this file in FASTA format.") | |
| 52 parser.add_argument( | |
| 53 "-v", "--verbose", | |
| 54 help="Be verbose.", | |
| 55 action="store_true") | |
| 56 parser.add_argument( | |
| 57 "-d", "--debug", | |
| 58 help="Print lots of debugging information", | |
| 59 action="store_true") | |
| 60 parser.add_argument( | |
| 61 '--version', | |
| 62 action='version', | |
| 63 version='0.1.0') | |
| 64 | |
| 65 args = parser.parse_args() | |
| 66 if args.debug: | |
| 67 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") | |
| 68 elif args.verbose: | |
| 69 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") | |
| 70 else: | |
| 71 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") | |
| 72 logging.info("Parsed arguments:") | |
| 73 logging.info(" infile: '{}'".format(args.infile)) | |
| 74 logging.info(" pattern: '{}'".format(args.pattern)) | |
| 75 if args.outfile: | |
| 76 logging.info(" outfile: enabled writing to file") | |
| 77 logging.info(" outfile: '{}'".format(args.outfile)) | |
| 78 if args.out_bc_fasta: | |
| 79 logging.info(" bcs: enabled writing barcodes to fasta file") | |
| 80 logging.info(" bcs: {}".format(args.out_bc_fasta)) | |
| 81 logging.info("") | |
| 82 | |
| 83 # check if supplied pattern is valid | |
| 84 valid_pattern = re.compile("^[XN]+$") | |
| 85 pattern_match = valid_pattern.match(args.pattern) | |
| 86 if pattern_match is None: | |
| 87 raise ValueError("Error: supplied pattern '{}' is not valid.".format(args.pattern)) | |
| 88 | |
| 89 # check if at least one barcode position is included in the pattern | |
| 90 has_bcpos_pattern = re.compile("X") | |
| 91 pattern_match = has_bcpos_pattern.search(args.pattern) | |
| 92 if pattern_match is None: | |
| 93 raise ValueError("Error: supplied pattern '{}' does not contain a barcode position 'X'.".format(args.pattern)) | |
| 94 | |
| 95 logging.info("Barcode pattern analysis:") | |
| 96 # get X positions of pattern string | |
| 97 barcode_nt_pattern = re.compile("X+") | |
| 98 barcode_positions = [] | |
| 99 for m in re.finditer(barcode_nt_pattern, args.pattern): | |
| 100 logging.info(' found barcode positions in pattern: %02d-%02d: %s' % (m.start(), m.end(), m.group(0))) | |
| 101 barcode_positions.append((m.start(), m.end())) | |
| 102 logging.info(" barcode positions: {}".format(barcode_positions)) | |
| 103 # get last position of a barcode nt in the pattern | |
| 104 # reads must be long enough for all | |
| 105 min_readlen = barcode_positions[-1][-1] | |
| 106 logging.info(" last position of a barcode nt in pattern: {}".format(min_readlen)) | |
| 107 logging.info("") | |
| 108 | |
| 109 # get coordinates of nucleotides to keep | |
| 110 # the tail after the last barcode nt is handled separately | |
| 111 seq_positions = [] | |
| 112 last_seq_start = 0 | |
| 113 for bcstart, bcstop in barcode_positions: | |
| 114 seq_positions.append((last_seq_start, bcstart)) | |
| 115 last_seq_start = bcstop | |
| 116 logging.info(" sequence positions: {}".format(seq_positions)) | |
| 117 logging.info(" start of sequence tail: {}".format(last_seq_start)) | |
| 118 | |
| 119 samout = (open(args.outfile, "w") if args.outfile is not None else stdout) | |
| 120 if args.out_bc_fasta is not None: | |
| 121 faout = open(args.out_bc_fasta, "w") | |
| 122 for header, seq, qual in FastqGeneralIterator(open(args.infile)): | |
| 123 | |
| 124 # skip reads that are too short to extract the full requested barcode | |
| 125 if len(seq) < min_readlen: | |
| 126 logging.warning("skipping read '{}', is too short to extract the full requested barcode".format(header)) | |
| 127 logging.debug("seq: {}".format(seq)) | |
| 128 logging.debug("len(seq): {}".format(len(seq))) | |
| 129 continue | |
| 130 | |
| 131 # extract barcode nucleotides | |
| 132 barcode_list = [] | |
| 133 for bcstart, bcstop in barcode_positions: | |
| 134 barcode_list.append(seq[bcstart:bcstop]) | |
| 135 barcode = "".join(barcode_list) | |
| 136 logging.debug("extracted barcode: {}".format(barcode)) | |
| 137 | |
| 138 # create new sequence and quality string without barcode nucleotides | |
| 139 new_seq_list = [] | |
| 140 new_qual_list = [] | |
| 141 for seqstart, seqstop in seq_positions: | |
| 142 new_seq_list.append(seq[seqstart:seqstop]) | |
| 143 new_qual_list.append(qual[seqstart:seqstop]) | |
| 144 new_seq_list.append(seq[last_seq_start:]) | |
| 145 new_qual_list.append(qual[last_seq_start:]) | |
| 146 new_seq = "".join(new_seq_list) | |
| 147 new_qual = "".join(new_qual_list) | |
| 148 # check if at least one nucleotide is left. having none would break fastq | |
| 149 if len(new_seq) == 0: | |
| 150 logging.warning("skipping read '{}', no sequence remains after barcode extraction".format(header)) | |
| 151 logging.debug("seq: {}".format(seq)) | |
| 152 logging.debug("len(seq): {}".format(len(seq))) | |
| 153 continue | |
| 154 | |
| 155 # write barcode nucleotides into header | |
| 156 annotated_header = header + " " + barcode | |
| 157 samout.write("@%s\n%s\n+\n%s\n" % (annotated_header, new_seq, new_qual)) | |
| 158 | |
| 159 # write barcode to fasta if requested | |
| 160 if args.out_bc_fasta is not None: | |
| 161 faout.write(">{}\n{}\n".format(header, barcode)) | |
| 162 | |
| 163 # close files | |
| 164 samout.close() | |
| 165 if args.out_bc_fasta is not None: | |
| 166 faout.close() |
