Mercurial > repos > nick > duplex
diff make-barcodes.awk @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
line wrap: on
line diff
# make-barcodes.awk
#
# Rewrites a one-line, tab-separated FASTQ read pair as a record that starts with the pair's
# barcode, so that the output can be sorted by barcode.
#
# Input fields (the 4 FASTQ lines of both reads in the pair, interleaved):
#   $1  read1 name          $2  read2 name
#   $3  read1 sequence      $4  read2 sequence
#   $5  read1 "+" line      $6  read2 "+" line
#   $7  read1 quality       $8  read2 quality
#
# Output fields:
#   1  the barcode, in canonical form
#   2  the order of the barcode halves ("ab" or "ba")
#   3  read1 name
#   4  read1 sequence with the leading tag and invariant bases removed
#   5  read1 quality scores with the same leading positions removed
#   6  read2 name
#   7  read2 sequence with the leading tag and invariant bases removed
#   8  read2 quality scores with the same leading positions removed
#
# Canonical barcode: one tag is taken from the start of each read (TAG_LEN bases, 12 by default).
# The read1 tag is the "alpha" and the read2 tag is the "beta". The two are concatenated with the
# lesser one (by string comparison) first. If the tags are equal the beta goes first and the order
# is reported as "ba".
#
# Settable variables (e.g. with -v):
#   TAG_LEN    bases at the start of each read that form one half of the barcode (default 12)
#   INVARIANT  bases between the tag and the start of the actual sample sequence -- the
#              restriction site in the Loeb 2014 protocol (default 5)

BEGIN {
  # Tab-delimited on the way in and on the way out.
  FS = OFS = "\t"
  # Fall back to the defaults only for values the caller left unset.
  if (TAG_LEN == "") TAG_LEN = 12
  if (INVARIANT == "") INVARIANT = 5
}

# Only handle records where both reads have a sequence.
$3 && $4 {
  # 1-based position of the first base after the tag and the invariant sequence.
  body_start = TAG_LEN + INVARIANT + 1

  # The tags are substr() results, so the comparison below is a string comparison.
  alpha = substr($3, 1, TAG_LEN)
  beta = substr($4, 1, TAG_LEN)
  alpha_first = (alpha < beta)
  order = alpha_first ? "ab" : "ba"
  barcode = alpha_first ? (alpha beta) : (beta alpha)

  # substr(x, 2) drops the first character of each name field.
  print barcode, order, \
        substr($1, 2), substr($3, body_start), substr($7, body_start), \
        substr($2, 2), substr($4, body_start), substr($8, body_start)
}