Mercurial > repos > rnateam > bctools
diff extract_bcs.py @ 2:de4ea3aa1090 draft
Uploaded
author | rnateam |
---|---|
date | Thu, 22 Oct 2015 10:26:45 -0400 |
parents | |
children | e841de88235c |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/extract_bcs.py Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,166 @@ +#!/usr/bin/env python + +tool_description = """ +Exract barcodes from a FASTQ file according to a user-specified pattern. + +By default output is written to stdout. + +Example usage: +- move nucleotides at positions 1-3 and 6-7 to FASTQ header and write to file + output.fastq: +fastq_extract_barcodes.py barcoded_input.fastq XXXNNXX --out output.fastq +""" + +epilog = """ +Author: Daniel Maticzka +Copyright: 2015 +License: Apache +Email: maticzkd@informatik.uni-freiburg.de +Status: Testing +""" + +import argparse +import logging +import re +from sys import stdout +from Bio.SeqIO.QualityIO import FastqGeneralIterator + +# avoid ugly python IOError when stdout output is piped into another program +# and then truncated (such as piping to head) +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) + +# parse command line arguments +parser = argparse.ArgumentParser(description=tool_description, + epilog=epilog, + formatter_class=argparse.RawDescriptionHelpFormatter) +# positional arguments +parser.add_argument( + "infile", + help="Path to fastq file.") +parser.add_argument( + "pattern", + help="Pattern of barcode nucleotides starting at 5'-end. X positions will be moved to the header, N positions will be kept.") +# optional arguments +parser.add_argument( + "-o", "--outfile", + help="Write results to this file.") +parser.add_argument( + "-b", "--bcs", + dest="out_bc_fasta", + help="If set, barcodes are written to this file in FASTA format.") +parser.add_argument( + "-v", "--verbose", + help="Be verbose.", + action="store_true") +parser.add_argument( + "-d", "--debug", + help="Print lots of debugging information", + action="store_true") +parser.add_argument( + '--version', + action='version', + version='0.1.0') + +args = parser.parse_args() +if args.debug: + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") +elif args.verbose: + logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") +else: + logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") +logging.info("Parsed arguments:") +logging.info(" infile: '{}'".format(args.infile)) +logging.info(" pattern: '{}'".format(args.pattern)) +if args.outfile: + logging.info(" outfile: enabled writing to file") + logging.info(" outfile: '{}'".format(args.outfile)) +if args.out_bc_fasta: + logging.info(" bcs: enabled writing barcodes to fasta file") + logging.info(" bcs: {}".format(args.out_bc_fasta)) +logging.info("") + +# check if supplied pattern is valid +valid_pattern = re.compile("^[XN]+$") +pattern_match = valid_pattern.match(args.pattern) +if pattern_match is None: + raise ValueError("Error: supplied pattern '{}' is not valid.".format(args.pattern)) + +# check if at least one barcode position is included in the pattern +has_bcpos_pattern = re.compile("X") +pattern_match = has_bcpos_pattern.search(args.pattern) +if pattern_match is None: + raise ValueError("Error: supplied pattern '{}' does not contain a barcode position 'X'.".format(args.pattern)) + +logging.info("Barcode pattern analysis:") +# get X positions of pattern string +barcode_nt_pattern = re.compile("X+") +barcode_positions = [] +for m in re.finditer(barcode_nt_pattern, args.pattern): + logging.info(' found barcode positions in pattern: %02d-%02d: %s' % (m.start(), m.end(), m.group(0))) + barcode_positions.append((m.start(), m.end())) +logging.info(" barcode positions: {}".format(barcode_positions)) +# get last position of a barcode nt in the pattern +# reads must be long enough for all +min_readlen = barcode_positions[-1][-1] +logging.info(" last position of a barcode nt in pattern: {}".format(min_readlen)) +logging.info("") + +# get coordinates of nucleotides to keep +# the tail after the last barcode nt is handled separately +seq_positions = [] +last_seq_start = 0 +for bcstart, bcstop in barcode_positions: + seq_positions.append((last_seq_start, bcstart)) + last_seq_start = bcstop +logging.info(" sequence positions: {}".format(seq_positions)) +logging.info(" start of sequence tail: {}".format(last_seq_start)) + +samout = (open(args.outfile, "w") if args.outfile is not None else stdout) +if args.out_bc_fasta is not None: + faout = open(args.out_bc_fasta, "w") +for header, seq, qual in FastqGeneralIterator(open(args.infile)): + + # skip reads that are too short to extract the full requested barcode + if len(seq) < min_readlen: + logging.warning("skipping read '{}', is too short to extract the full requested barcode".format(header)) + logging.debug("seq: {}".format(seq)) + logging.debug("len(seq): {}".format(len(seq))) + continue + + # extract barcode nucleotides + barcode_list = [] + for bcstart, bcstop in barcode_positions: + barcode_list.append(seq[bcstart:bcstop]) + barcode = "".join(barcode_list) + logging.debug("extracted barcode: {}".format(barcode)) + + # create new sequence and quality string without barcode nucleotides + new_seq_list = [] + new_qual_list = [] + for seqstart, seqstop in seq_positions: + new_seq_list.append(seq[seqstart:seqstop]) + new_qual_list.append(qual[seqstart:seqstop]) + new_seq_list.append(seq[last_seq_start:]) + new_qual_list.append(qual[last_seq_start:]) + new_seq = "".join(new_seq_list) + new_qual = "".join(new_qual_list) + # check if at least one nucleotide is left. having none would break fastq + if len(new_seq) == 0: + logging.warning("skipping read '{}', no sequence remains after barcode extraction".format(header)) + logging.debug("seq: {}".format(seq)) + logging.debug("len(seq): {}".format(len(seq))) + continue + + # write barcode nucleotides into header + annotated_header = header + " " + barcode + samout.write("@%s\n%s\n+\n%s\n" % (annotated_header, new_seq, new_qual)) + + # write barcode to fasta if requested + if args.out_bc_fasta is not None: + faout.write(">{}\n{}\n".format(header, barcode)) + +# close files +samout.close() +if args.out_bc_fasta is not None: + faout.close()