|
10
|
1 #!/bin/bash
|
|
|
2
|
|
|
3 ####################################################################################################
|
|
|
4 #Copyright (C) 2014 Lukasz Kielpinski, Nikos Sidiropoulos
|
|
|
5
|
|
|
6 #This program is free software: you can redistribute it and/or modify it under the terms of the
|
|
|
7 #GNU General Public License as published by the Free Software Foundation, either version 3 of the
|
|
|
8 #License, or (at your option) any later version.
|
|
|
9
|
|
|
10 #This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
|
|
|
11 #even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
12 #GNU General Public License for more details (http://www.gnu.org/licenses/).
|
|
|
13 ####################################################################################################
|
|
|
14
|
|
|
15 function print_help {
|
|
|
16 cat <<End-of-message
|
|
|
17 Summarize Unique Barcodes.
|
|
|
18 Counts the number of unique random barcodes and reads associated with each sequenced fragment.
|
|
|
19 -------------------------------------
|
|
|
20 Input arguments:
|
|
|
21 -h: Help
|
|
|
22 -f: Aligned reads in BAM format.
|
|
|
23 -b: Barcode file (optional).
|
|
|
24 -p: Set priming position to a fixed value.
|
|
|
25 -t: Trim untemplated nucleotides.
|
|
|
26 -k: Produce k2n file. Warning: Can be sloooow!
|
|
|
27 -r: Rscript path
|
|
|
28 -o: Output folder (default: "output_dir")
|
|
|
29 -------------------------------------
|
|
|
30 Usage : summarize_unique_barcodes.sh -f <BAM_file> -b <BARCODES> -p <PRIMING_POSITION> -t -k -r <R_SCRIPT_PATH>
|
|
|
31 End-of-message
|
|
|
32 exit
|
|
|
33 }
|
|
|
34
|
|
|
35 #defaults
|
|
|
36 output_dir="output_dir"
|
|
|
37 trim_flag="False"
|
|
|
38
|
|
|
39 #parse input
|
|
|
40 while getopts hf:b::p:o:ktr: myarg
|
|
|
41 do case "$myarg" in
|
|
|
42 h) print_help
|
|
|
43 exit ;;
|
|
|
44 f) bamfile="$OPTARG" ;; #required
|
|
|
45 b) barcodes="$OPTARG" ;; #optional
|
|
|
46 t) trim_flag="True" ;;
|
|
|
47 k) k2n="True" ;;
|
|
|
48 p) priming_pos="$OPTARG" ;;
|
|
|
49 o) output_dir="$OPTARG" ;;
|
|
|
50 r) R_SCRIPT_PATH="$OPTARG" ;; #required
|
|
|
51 [?]) echo "ERROR: Unknown parameter"
|
|
|
52 print_help
|
|
|
53 exit 1 ;;
|
|
|
54 esac
|
|
|
55 done
|
|
|
56
|
|
|
57 ###### Sanity checks ######
|
|
|
58 if [ -z $bamfile ]; then
|
|
|
59 echo "Error: Aligned reads file is missing!"
|
|
|
60 print_help
|
|
|
61 exit 1
|
|
|
62 fi
|
|
|
63
|
|
|
64 if [ "$barcodes" == "None" ] && [ "$k2n" == "True" ]; then
|
|
|
65 echo "Error: k2n file cannot be produced without a barcode file!"
|
|
|
66 exit 1
|
|
|
67 fi
|
|
|
68
|
|
|
69
|
|
|
70 mkdir -p $output_dir
|
|
|
71
|
|
|
72 #Check if bamfile contains single or paired-end reads
|
|
|
73 samtools view -f 0x1 $bamfile | head -n 1 > paired
|
|
|
74
|
|
|
75 if [ -s paired ]; then
|
|
|
76 #paired-end
|
|
|
77 samtools view $bamfile | awk 'BEGIN{OFS="\t"}{if(substr($0,1,1)!="@"){print}}' - | awk -v out="${output_dir}/trimming_stats.txt" -v flag="${trim_flag}" 'BEGIN{OFS="\t";counter[0]=0;counter[1]=0;counter[2]=0;counter[3]=0;counter[4]=0}
|
|
|
78 function abs(value){return(value<0?-value:value)}
|
|
|
79 function return_offset(local_offset){print($1, $3, $4+local_offset, $4+abs($9)-1);counter[local_offset]++}
|
|
|
80 ($2 != 99) {next};
|
|
|
81 (flag == "False") {return_offset(0);next};
|
|
|
82 (/[\s\t]MD:Z:/ && !/MD:Z:([012][ACGT])/) {return_offset(0);next};
|
|
|
83 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/ && substr($10,1,1)=="N") {return_offset(0);next};
|
|
|
84 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/) {return_offset(1);next};
|
|
|
85
|
|
|
86 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(0);next};
|
|
|
87 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/) {return_offset(2);next};
|
|
|
88
|
|
|
89 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,2)=="NN") {return_offset(0);next};
|
|
|
90 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,1)=="N") {return_offset(2);next};
|
|
|
91 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(1);next};
|
|
|
92 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next};
|
|
|
93
|
|
|
94 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,2)=="NN") {return_offset(0);next};
|
|
|
95 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(3);next};
|
|
|
96 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,3,1)=="N") {return_offset(2);next};
|
|
|
97 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/) {return_offset(3);next};
|
|
|
98
|
|
|
99 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N" && substr($10,1,1)=="N") {return_offset(0);next};
|
|
|
100 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N") {return_offset(1);next};
|
|
|
101 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,1,1)=="N") {return_offset(3);next};
|
|
|
102 (/MD:Z:0[ACGT]1[ACGT]/) {return_offset(3);next};
|
|
|
103
|
|
|
104 (/MD:Z:2[ACGT]/ && substr($10,3,1)=="N") {return_offset(0);next};
|
|
|
105 (/MD:Z:2[ACGT]/) {return_offset(3);next};
|
|
|
106
|
|
|
107 (substr($10,3,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(3);next};
|
|
|
108 (substr($10,2,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next};
|
|
|
109 (substr($10,1,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(1);next};
|
|
|
110
|
|
|
111 {return_offset(0);counter[4]++}
|
|
|
112 END{print("No trimming:",counter[0],", out of which not recognized MD field for:",counter[4],"; 1 nt trimmed:", counter[1],"; 2 nt trimmed:", counter[2],"; 3 nt trimmed:",counter[3]) > out}' | sort -S1G -k1,1 | gzip > positions_temp_sorted.gz
|
|
|
113
|
|
|
114 else
|
|
|
115 #single-end
|
|
|
116 samtools view $bamfile | awk 'BEGIN{OFS="\t"}{if(substr($0,1,1)!="@"){print}}' - | awk -v out="${output_dir}/trimming_stats.txt" -v flag="${trim_flag}" 'BEGIN{OFS="\t";counter[0]=0;counter[1]=0;counter[2]=0;counter[3]=0;counter[4]=0}
|
|
|
117 function abs(value){return(value<0?-value:value)}
|
|
|
118 function return_offset(local_offset){print($1, $3, $4+local_offset, $4+abs($9)-1);counter[local_offset]++}
|
|
|
119 ($2 != 0) {next};
|
|
|
120 (flag == "False") {return_offset(0);next};
|
|
|
121 (/[\s\t]MD:Z:/ && !/MD:Z:([012][ACGT])/) {return_offset(0);next};
|
|
|
122 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/ && substr($10,1,1)=="N") {return_offset(0);next};
|
|
|
123 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/) {return_offset(1);next};
|
|
|
124
|
|
|
125 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(0);next};
|
|
|
126 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/) {return_offset(2);next};
|
|
|
127
|
|
|
128 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,2)=="NN") {return_offset(0);next};
|
|
|
129 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,1)=="N") {return_offset(2);next};
|
|
|
130 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(1);next};
|
|
|
131 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next};
|
|
|
132
|
|
|
133 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,2)=="NN") {return_offset(0);next};
|
|
|
134 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(3);next};
|
|
|
135 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,3,1)=="N") {return_offset(2);next};
|
|
|
136 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/) {return_offset(3);next};
|
|
|
137
|
|
|
138 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N" && substr($10,1,1)=="N") {return_offset(0);next};
|
|
|
139 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N") {return_offset(1);next};
|
|
|
140 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,1,1)=="N") {return_offset(3);next};
|
|
|
141 (/MD:Z:0[ACGT]1[ACGT]/) {return_offset(3);next};
|
|
|
142
|
|
|
143 (/MD:Z:2[ACGT]/ && substr($10,3,1)=="N") {return_offset(0);next};
|
|
|
144 (/MD:Z:2[ACGT]/) {return_offset(3);next};
|
|
|
145
|
|
|
146 (substr($10,3,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(3);next};
|
|
|
147 (substr($10,2,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next};
|
|
|
148 (substr($10,1,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(1);next};
|
|
|
149
|
|
|
150 {return_offset(0);counter[4]++}
|
|
|
151 END{print("No trimming:",counter[0],", out of which not recognized MD field for:",counter[4],"; 1 nt trimmed:", counter[1],"; 2 nt trimmed:", counter[2],"; 3 nt trimmed:",counter[3]) > out}' | sort -S1G -k1,1 | gzip > positions_temp_sorted.gz
|
|
|
152
|
|
|
153 fi
|
|
|
154
|
|
|
155 #Computing barcode length (Use the first line and compute the string length of the second column)
|
|
|
156 if [ "$barcodes" != "None" ]; then
|
|
|
157
|
|
|
158 TMP=`head -1 $barcodes | awk '{print $2}'`
|
|
|
159 BAR_LEN=`echo ${#TMP}`
|
|
|
160
|
|
|
161 #Remove "@" from barcodes and sort them
|
|
|
162 sed 's/^.//' $barcodes | sort -k1,1 -S1G | gzip > barcodes_temp_sorted.gz
|
|
|
163
|
|
|
164 #Merge poistions and barcodes
|
|
|
165 join -1 1 <(zcat positions_temp_sorted.gz) <(zcat barcodes_temp_sorted.gz) | cut -f 2,3,4,5 -d " " | awk '{if($4 !~ /N/){print}}' | awk -v bar_len="${BAR_LEN}" '{if(length($4)==bar_len){print}}' | gzip > merged_temp.gz
|
|
|
166
|
|
|
167 rm barcodes_temp_sorted.gz
|
|
|
168
|
|
|
169 else
|
|
|
170 zcat positions_temp_sorted.gz | cut -f 2,3,4 | gzip > merged_temp.gz
|
|
|
171 fi
|
|
|
172
|
|
|
173 #If the experiment is single-end set NA values to the priming column.
|
|
|
174 if [ ! -s paired ]; then
|
|
|
175 zcat merged_temp.gz | awk '{print $1, $2, "NA", $4, $5}' - > merged_temp2
|
|
|
176 cat merged_temp2 | gzip > merged_temp.gz
|
|
|
177 fi
|
|
|
178
|
|
|
179 #Fix priming position
|
|
|
180 if [ ! -z $priming_pos ]; then
|
|
|
181 zcat merged_temp.gz | awk -v pos="${priming_pos}" '{print $1, $2, pos, $4, $5}' - > merged_temp2
|
|
|
182 cat merged_temp2 | gzip > merged_temp.gz
|
|
|
183 fi
|
|
|
184
|
|
|
185 #File summary.txt columns: RNA_ID, Start, End, barcode sequence, sequenced_count[=number of sequenced fragments fulfilling previous requiremnts]
|
|
|
186
|
|
|
187 zcat merged_temp.gz | awk '{barcode[$1][$2][$3][$4]++}END{
|
|
|
188 for(RNA in barcode){
|
|
|
189 for(start_position in barcode[RNA]){
|
|
|
190 for(end_position in barcode[RNA][start_position]){
|
|
|
191 for(barseq in barcode[RNA][start_position][end_position]){print RNA,start_position,end_position,barseq,barcode[RNA][start_position][end_position][barseq]}}}}}' > $output_dir/summary.txt
|
|
|
192
|
|
|
193 #File unique_barcodes.txt columns: RNA_ID, Start, End, number of unique barcodes observed for this fragment.
|
|
|
194
|
|
|
195 awk '{barcode[$1][$2][$3]++}END{
|
|
|
196 for(RNA in barcode){
|
|
|
197 for(start_position in barcode[RNA]){
|
|
|
198 for(end_position in barcode[RNA][start_position]){print RNA "\t" start_position "\t" end_position "\t" barcode[RNA][start_position][end_position]}}}}' $output_dir/summary.txt > $output_dir/unique_barcodes.txt &
|
|
|
199
|
|
|
200 #File read_counts.txt colums: RNA_ID, Start, End, sequenced_count
|
|
|
201
|
|
|
202 zcat merged_temp.gz | awk '{barcode[$1][$2][$3]++}END{
|
|
|
203 for(RNA in barcode){
|
|
|
204 for(start_position in barcode[RNA]){
|
|
|
205 for(end_position in barcode[RNA][start_position]){print RNA "\t" start_position "\t" end_position "\t" barcode[RNA][start_position][end_position]}}}}' > $output_dir/read_counts.txt &
|
|
|
206
|
|
|
207 wait
|
|
|
208
|
|
|
209 #Print the maximum observed barcodes value. Usefull to assess the necessity of producing the k2n file.
|
|
|
210 if [ "$barcodes" != "None" ]; then
|
|
|
211
|
|
|
212 cut -f 4 $output_dir/unique_barcodes.txt | sort -S1G -rn > sorted_bars
|
|
|
213 max_observed_barcodes=`head -n 1 sorted_bars`
|
|
|
214
|
|
|
215 echo "Maximum observed Barcodes = ${max_observed_barcodes}"
|
|
|
216 rm sorted_bars
|
|
|
217 fi
|
|
|
218
|
|
|
219 #Produce k2n file
|
|
|
220 if [ "$k2n" == "True" ]; then
|
|
|
221 bar_length=$(head -n 1 $barcodes | cut -f 2 | xargs expr length)
|
|
|
222 Rscript $R_SCRIPT_PATH/k2n.R merged_temp.gz $output_dir/read_counts.txt ${max_observed_barcodes} $bar_length $output_dir/k2n.txt
|
|
|
223 fi
|
|
|
224
|
|
|
225 #Remove temp files
|
|
|
226 rm paired merged_temp.gz positions_temp_sorted.gz
|