| 2 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 tool_description = """ | 
|  | 4 Exract barcodes from a FASTQ file according to a user-specified pattern. | 
|  | 5 | 
|  | 6 By default output is written to stdout. | 
|  | 7 | 
|  | 8 Example usage: | 
| 5 | 9 - remove barcode nucleotides at positions 1-3 and 6-7 from FASTQ; write modified | 
|  | 10   FASTQ entries to output.fastq and barcode nucleotides to barcodes.fa: | 
|  | 11 fastq_extract_barcodes.py barcoded_input.fastq XXXNNXX --out output.fastq --bcs barcodes.fa | 
| 2 | 12 """ | 
|  | 13 | 
|  | 14 epilog = """ | 
|  | 15 Author: Daniel Maticzka | 
|  | 16 Copyright: 2015 | 
|  | 17 License: Apache | 
|  | 18 Email: maticzkd@informatik.uni-freiburg.de | 
|  | 19 Status: Testing | 
|  | 20 """ | 
|  | 21 | 
|  | 22 import argparse | 
|  | 23 import logging | 
|  | 24 import re | 
|  | 25 from sys import stdout | 
|  | 26 from Bio.SeqIO.QualityIO import FastqGeneralIterator | 
|  | 27 | 
|  | 28 # avoid ugly python IOError when stdout output is piped into another program | 
|  | 29 # and then truncated (such as piping to head) | 
|  | 30 from signal import signal, SIGPIPE, SIG_DFL | 
|  | 31 signal(SIGPIPE, SIG_DFL) | 
|  | 32 | 
|  | 33 # parse command line arguments | 
|  | 34 parser = argparse.ArgumentParser(description=tool_description, | 
|  | 35                                  epilog=epilog, | 
|  | 36                                  formatter_class=argparse.RawDescriptionHelpFormatter) | 
|  | 37 # positional arguments | 
|  | 38 parser.add_argument( | 
|  | 39     "infile", | 
|  | 40     help="Path to fastq file.") | 
|  | 41 parser.add_argument( | 
|  | 42     "pattern", | 
|  | 43     help="Pattern of barcode nucleotides starting at 5'-end. X positions will be moved to the header, N positions will be kept.") | 
|  | 44 # optional arguments | 
|  | 45 parser.add_argument( | 
|  | 46     "-o", "--outfile", | 
|  | 47     help="Write results to this file.") | 
|  | 48 parser.add_argument( | 
|  | 49     "-b", "--bcs", | 
|  | 50     dest="out_bc_fasta", | 
|  | 51     help="If set, barcodes are written to this file in FASTA format.") | 
|  | 52 parser.add_argument( | 
| 5 | 53     "-a", "--add-bc-to-fastq", | 
|  | 54     dest="add_to_head", | 
|  | 55     help="If set, append extracted barcodes to the FASTQ headers.", | 
|  | 56     action="store_true" | 
|  | 57 ) | 
|  | 58 parser.add_argument( | 
| 2 | 59     "-v", "--verbose", | 
|  | 60     help="Be verbose.", | 
|  | 61     action="store_true") | 
|  | 62 parser.add_argument( | 
|  | 63     "-d", "--debug", | 
|  | 64     help="Print lots of debugging information", | 
|  | 65     action="store_true") | 
|  | 66 parser.add_argument( | 
|  | 67     '--version', | 
|  | 68     action='version', | 
| 5 | 69     version='1.0.0') | 
| 2 | 70 | 
|  | 71 args = parser.parse_args() | 
|  | 72 if args.debug: | 
|  | 73     logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") | 
|  | 74 elif args.verbose: | 
|  | 75     logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") | 
|  | 76 else: | 
|  | 77     logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") | 
|  | 78 logging.info("Parsed arguments:") | 
|  | 79 logging.info("  infile: '{}'".format(args.infile)) | 
|  | 80 logging.info("  pattern: '{}'".format(args.pattern)) | 
|  | 81 if args.outfile: | 
|  | 82     logging.info("  outfile: enabled writing to file") | 
|  | 83     logging.info("  outfile: '{}'".format(args.outfile)) | 
|  | 84 if args.out_bc_fasta: | 
|  | 85     logging.info("  bcs: enabled writing barcodes to fasta file") | 
|  | 86     logging.info("  bcs: {}".format(args.out_bc_fasta)) | 
|  | 87 logging.info("") | 
|  | 88 | 
|  | 89 # check if supplied pattern is valid | 
|  | 90 valid_pattern = re.compile("^[XN]+$") | 
|  | 91 pattern_match = valid_pattern.match(args.pattern) | 
|  | 92 if pattern_match is None: | 
|  | 93     raise ValueError("Error: supplied pattern '{}' is not valid.".format(args.pattern)) | 
|  | 94 | 
|  | 95 # check if at least one barcode position is included in the pattern | 
|  | 96 has_bcpos_pattern = re.compile("X") | 
|  | 97 pattern_match = has_bcpos_pattern.search(args.pattern) | 
|  | 98 if pattern_match is None: | 
|  | 99     raise ValueError("Error: supplied pattern '{}' does not contain a barcode position 'X'.".format(args.pattern)) | 
|  | 100 | 
|  | 101 logging.info("Barcode pattern analysis:") | 
|  | 102 # get X positions of pattern string | 
|  | 103 barcode_nt_pattern = re.compile("X+") | 
|  | 104 barcode_positions = [] | 
|  | 105 for m in re.finditer(barcode_nt_pattern, args.pattern): | 
|  | 106     logging.info('  found barcode positions in pattern: %02d-%02d: %s' % (m.start(), m.end(), m.group(0))) | 
|  | 107     barcode_positions.append((m.start(), m.end())) | 
|  | 108 logging.info("  barcode positions: {}".format(barcode_positions)) | 
|  | 109 # get last position of a barcode nt in the pattern | 
|  | 110 # reads must be long enough for all | 
|  | 111 min_readlen = barcode_positions[-1][-1] | 
|  | 112 logging.info("  last position of a barcode nt in pattern: {}".format(min_readlen)) | 
|  | 113 logging.info("") | 
|  | 114 | 
|  | 115 # get coordinates of nucleotides to keep | 
|  | 116 # the tail after the last barcode nt is handled separately | 
|  | 117 seq_positions = [] | 
|  | 118 last_seq_start = 0 | 
|  | 119 for bcstart, bcstop in barcode_positions: | 
|  | 120     seq_positions.append((last_seq_start, bcstart)) | 
|  | 121     last_seq_start = bcstop | 
|  | 122 logging.info("  sequence positions: {}".format(seq_positions)) | 
|  | 123 logging.info("  start of sequence tail: {}".format(last_seq_start)) | 
|  | 124 | 
|  | 125 samout = (open(args.outfile, "w") if args.outfile is not None else stdout) | 
|  | 126 if args.out_bc_fasta is not None: | 
|  | 127     faout = open(args.out_bc_fasta, "w") | 
|  | 128 for header, seq, qual in FastqGeneralIterator(open(args.infile)): | 
|  | 129 | 
|  | 130     # skip reads that are too short to extract the full requested barcode | 
|  | 131     if len(seq) < min_readlen: | 
|  | 132         logging.warning("skipping read '{}', is too short to extract the full requested barcode".format(header)) | 
|  | 133         logging.debug("seq: {}".format(seq)) | 
|  | 134         logging.debug("len(seq): {}".format(len(seq))) | 
|  | 135         continue | 
|  | 136 | 
|  | 137     # extract barcode nucleotides | 
|  | 138     barcode_list = [] | 
|  | 139     for bcstart, bcstop in barcode_positions: | 
|  | 140         barcode_list.append(seq[bcstart:bcstop]) | 
|  | 141     barcode = "".join(barcode_list) | 
|  | 142     logging.debug("extracted barcode: {}".format(barcode)) | 
|  | 143 | 
|  | 144     # create new sequence and quality string without barcode nucleotides | 
|  | 145     new_seq_list = [] | 
|  | 146     new_qual_list = [] | 
|  | 147     for seqstart, seqstop in seq_positions: | 
|  | 148         new_seq_list.append(seq[seqstart:seqstop]) | 
|  | 149         new_qual_list.append(qual[seqstart:seqstop]) | 
|  | 150     new_seq_list.append(seq[last_seq_start:]) | 
|  | 151     new_qual_list.append(qual[last_seq_start:]) | 
|  | 152     new_seq = "".join(new_seq_list) | 
|  | 153     new_qual = "".join(new_qual_list) | 
|  | 154     # check if at least one nucleotide is left. having none would break fastq | 
|  | 155     if len(new_seq) == 0: | 
|  | 156         logging.warning("skipping read '{}', no sequence remains after barcode extraction".format(header)) | 
|  | 157         logging.debug("seq: {}".format(seq)) | 
|  | 158         logging.debug("len(seq): {}".format(len(seq))) | 
|  | 159         continue | 
|  | 160 | 
|  | 161     # write barcode nucleotides into header | 
| 5 | 162     if args.add_to_head: | 
|  | 163         annotated_header = " ".join([header, barcode]) | 
|  | 164     else: | 
|  | 165         annotated_header = header | 
| 2 | 166     samout.write("@%s\n%s\n+\n%s\n" % (annotated_header, new_seq, new_qual)) | 
|  | 167 | 
|  | 168     # write barcode to fasta if requested | 
|  | 169     if args.out_bc_fasta is not None: | 
|  | 170         faout.write(">{}\n{}\n".format(header, barcode)) | 
|  | 171 | 
|  | 172 # close files | 
|  | 173 samout.close() | 
|  | 174 if args.out_bc_fasta is not None: | 
|  | 175     faout.close() |