comparison make-barcodes.awk @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
comparison
equal deleted inserted replaced
17:836fa4fe9494 18:e4d75f9efb90
1 # The awk code that transforms the one-line fastq record pair into the output that can be sorted
2 # by barcode.
3 # Input columns (the 4 FASTQ lines for both reads in a read pair):
4 # 1: read1 name
5 # 2: read2 name
6 # 3: read1 sequence
7 # 4: read2 sequence
8 # 5: read1 + line
9 # 6: read2 + line
10 # 7: read1 quality
11 # 8: read2 quality
12 # Output columns:
13 # 1: the barcode, put into a canonical form
14 # 2: the order of the barcode halves ("ab" or "ba")
15 # 3: read1 name
16 # 4: sequence of read 1, minus the 12bp barcode and 5bp invariant sequence
17 # 5: read1 quality scores, minus the same first 17bp
18 # 6: read2 name
19 # 7: sequence of read 2, minus the first 17bp
20 # 8: read2 quality scores, minus the first 17bp
21 # The canonical form of the barcode is composed of two concatenated tags, one from each read.
22 # By default, each tag is the first 12bp of the read. The tag from the first read is the "alpha" and
23 # the tag from the second is the "beta". The barcode is formed by concatenating them in an order
24 # determined by a string comparison of the two. The lesser tag is first (if they are equal, the
25 # beta is first, but then you have bigger problems).
26
BEGIN {
    # Records are tab-separated on input and on output.
    FS = OFS = "\t"

    # TAG_LEN: how many leading bases of each read make up that read's half of
    # the barcode (i.e. half the length of the full, canonical barcode).
    # A value supplied by the caller (e.g. with -v) is kept; otherwise use 12.
    TAG_LEN = (TAG_LEN == "") ? 12 : TAG_LEN

    # INVARIANT: how many bases sit between the barcode and the first base of
    # actual sample sequence (the restriction site in the Loeb 2014 protocol).
    # A value supplied by the caller is kept; otherwise use 5.
    INVARIANT = (INVARIANT == "") ? 5 : INVARIANT
}
41
# Runs for every record whose two sequence fields ($3 = read1, $4 = read2) both
# evaluate as true, so records with an empty sequence in either read are skipped.
$3 && $4 {
    # 1-based position of the first base after the tag and the invariant region.
    start = TAG_LEN + INVARIANT + 1

    # The tag from read 1 is the "alpha", the tag from read 2 is the "beta".
    alpha = substr($3, 1, TAG_LEN)
    beta = substr($4, 1, TAG_LEN)

    # Canonical barcode: the lesser tag (by string comparison) goes first.
    # When the tags are equal the beta is placed first, i.e. the order is "ba".
    order = (alpha < beta) ? "ab" : "ba"
    barcode = (order == "ab") ? (alpha beta) : (beta alpha)

    # Output columns: barcode, order, then name / sequence / quality for read 1
    # followed by the same three for read 2. Names lose their first character
    # (presumably the leading "@" of a FASTQ name line); sequences and
    # qualities lose their first TAG_LEN + INVARIANT characters.
    print barcode, order, \
          substr($1, 2), substr($3, start), substr($7, start), \
          substr($2, 2), substr($4, start), substr($8, start)
}