Mercurial > repos > nick > duplex
comparison make-barcodes.awk @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
| author | nick |
|---|---|
| date | Thu, 02 Feb 2017 18:44:31 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 17:836fa4fe9494 | 18:e4d75f9efb90 |
|---|---|
| 1 # The awk code that transforms the one-line fastq record pair into the output that can be sorted | |
| 2 # by barcode. | |
| 3 # Input columns (tab-delimited; the 4 FASTQ lines for both reads in a read pair): | |
| 4 # 1: read1 name | |
| 5 # 2: read2 name | |
| 6 # 3: read1 sequence | |
| 7 # 4: read2 sequence | |
| 8 # 5: read1 + line | |
| 9 # 6: read2 + line | |
| 10 # 7: read1 quality | |
| 11 # 8: read2 quality | |
| 12 # Output columns: | |
| 13 # 1: the barcode, put into a canonical form | |
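| 14 # 2: the order of the barcode halves ("ab" if the read1 tag is first, "ba" if the read2 tag is first) | |
| 15 # 3: read1 name, with its first character (presumably the FASTQ "@") removed | |
| 16 # 4: sequence of read 1, minus the barcode tag (TAG_LEN, 12bp by default) and invariant sequence (INVARIANT, 5bp by default) | |
| 17 # 5: read1 quality scores, minus the same first TAG_LEN + INVARIANT characters (17 by default) | |
| 18 # 6: read2 name, with its first character removed | |
| 19 # 7: sequence of read 2, minus the first TAG_LEN + INVARIANT bases (17bp by default) | |
| 20 # 8: read2 quality scores, minus the first TAG_LEN + INVARIANT characters (17 by default) | |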
| 21 # The canonical form of the barcode is composed of two concatenated tags, one from each read. | |
| 22 # By default, each tag is the first 12bp of the read. The tag from the first read is the "alpha" and | |
| 23 # the tag from the second is the "beta". The barcode is formed by concatenating them in an order | |
| 24 # determined by a string comparison of the two. The lesser tag is first (if they are equal, the | |
| 25 # beta is first, but then you have bigger problems). | |
| 26 | |
| 27 BEGIN { | |
| 28 FS = "\t" | |
| 29 OFS = "\t" | |
| 30 # TAG_LEN: the number of bases from the start of each read that form that read's half of the barcode | |
| 31 # (this should be half the size of the full, canonical barcode). Set to 12 here only if it is still empty/unset. | |
| 32 if (TAG_LEN == "") { | |
| 33 TAG_LEN = 12 | |
| 34 } | |
| 35 # INVARIANT: the number of bases in the read that are between the barcode and the start of the actual sample | |
| 36 # sequence (the restriction site in the Loeb 2014 protocol). Set to 5 here only if it is still empty/unset. | |
| 37 if (INVARIANT == "") { | |
| 38 INVARIANT = 5 | |
| 39 } | |
| 40 } | |
| 41 | |
| 42 $3 && $4 { | |
| 43 alpha = substr($3, 1, TAG_LEN) | |
| 44 beta = substr($4, 1, TAG_LEN) | |
| 45 if (alpha < beta) { | |
| 46 barcode = alpha beta | |
| 47 order = "ab" | |
| 48 } else { | |
| 49 barcode = beta alpha | |
| 50 order = "ba" | |
| 51 } | |
| 52 name1 = substr($1, 2) | |
| 53 name2 = substr($2, 2) | |
| 54 seq1 = substr($3, TAG_LEN + INVARIANT + 1) | |
| 55 seq2 = substr($4, TAG_LEN + INVARIANT + 1) | |
| 56 qual1 = substr($7, TAG_LEN + INVARIANT + 1) | |
| 57 qual2 = substr($8, TAG_LEN + INVARIANT + 1) | |
| 58 print barcode, order, name1, seq1, qual1, name2, seq2, qual2 | |
| 59 } |
