Mercurial > repos > nikos > rna_probing
comparison summarize_unique_barcodes.sh @ 24:431aebd93843 draft default tip
Fixed a bug in k2n.R where the function k2n_calc() would result in an error for single-end read files.
| author | nikos |
|---|---|
| date | Wed, 05 Aug 2015 09:21:02 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 23:fb76303acf4f | 24:431aebd93843 |
|---|---|
| 1 #!/bin/bash | |
| 2 | |
| 3 #################################################################################################### | |
| 4 #Copyright (C) 2014 Lukasz Kielpinski, Nikos Sidiropoulos | |
| 5 | |
| 6 #This program is free software: you can redistribute it and/or modify it under the terms of the | |
| 7 #GNU General Public License as published by the Free Software Foundation, either version 3 of the | |
| 8 #License, or (at your option) any later version. | |
| 9 | |
| 10 #This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without | |
| 11 #even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 12 #GNU General Public License for more details (http://www.gnu.org/licenses/). | |
| 13 #################################################################################################### | |
| 14 | |
| 15 function print_help { | |
| 16 cat <<End-of-message | |
| 17 Summarize Unique Barcodes. | |
| 18 Counts the number of unique random barcodes and reads associated with each sequenced fragment. | |
| 19 ------------------------------------- | |
| 20 Input arguments: | |
| 21 -h: Help | |
| 22 -f: Aligned reads in BAM format. | |
| 23 -b: Barcode file (optional). | |
| 24 -p: Set priming position to a fixed value. | |
| 25 -t: Trim untemplated nucleotides. | |
| 26 -k: Produce k2n file. Warning: Can be sloooow! | |
| 27 -r: Rscript path | |
| 28 -o: Output folder (default: "output_dir") | |
| 29 ------------------------------------- | |
| 30 Usage : summarize_unique_barcodes.sh -f <BAM_file> -b <BARCODES> -p <PRIMING_POSITION> -t -k -r <R_SCRIPT_PATH> | |
| 31 End-of-message | |
| 32 exit | |
| 33 } | |
| 34 | |
| 35 #defaults | |
| 36 output_dir="output_dir" | |
| 37 trim_flag="False" | |
| 38 | |
| 39 #parse input | |
| 40 while getopts hf:b::p:o:ktr: myarg | |
| 41 do case "$myarg" in | |
| 42 h) print_help | |
| 43 exit ;; | |
| 44 f) bamfile="$OPTARG" ;; #required | |
| 45 b) barcodes="$OPTARG" ;; #optional | |
| 46 t) trim_flag="True" ;; | |
| 47 k) k2n="True" ;; | |
| 48 p) priming_pos="$OPTARG" ;; | |
| 49 o) output_dir="$OPTARG" ;; | |
| 50 r) R_SCRIPT_PATH="$OPTARG" ;; #required | |
| 51 [?]) echo "ERROR: Unknown parameter" | |
| 52 print_help | |
| 53 exit 1 ;; | |
| 54 esac | |
| 55 done | |
| 56 | |
| 57 ###### Sanity checks ###### | |
| 58 if [ -z $bamfile ]; then | |
| 59 echo "Error: Aligned reads file is missing!" | |
| 60 print_help | |
| 61 exit 1 | |
| 62 fi | |
| 63 | |
| 64 if [ "$barcodes" == "None" ] && [ "$k2n" == "True" ]; then | |
| 65 echo "Error: k2n file cannot be produced without a barcode file!" | |
| 66 exit 1 | |
| 67 fi | |
| 68 | |
| 69 | |
| 70 mkdir -p $output_dir | |
| 71 | |
| 72 #Check if bamfile contains single or paired-end reads | |
| 73 samtools view -f 0x1 $bamfile | head -n 1 > paired | |
| 74 | |
| 75 if [ -s paired ]; then | |
| 76 #paired-end | |
| 77 samtools view $bamfile | awk 'BEGIN{OFS="\t"}{if(substr($0,1,1)!="@"){print}}' - | awk -v out="${output_dir}/trimming_stats.txt" -v flag="${trim_flag}" 'BEGIN{OFS="\t";counter[0]=0;counter[1]=0;counter[2]=0;counter[3]=0;counter[4]=0} | |
| 78 function abs(value){return(value<0?-value:value)} | |
| 79 function return_offset(local_offset){print($1, $3, $4+local_offset, $4+abs($9)-1);counter[local_offset]++} | |
| 80 ($2 != 99) {next}; | |
| 81 (flag == "False") {return_offset(0);next}; | |
| 82 (/[\s\t]MD:Z:/ && !/MD:Z:([012][ACGT])/) {return_offset(0);next}; | |
| 83 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/ && substr($10,1,1)=="N") {return_offset(0);next}; | |
| 84 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/) {return_offset(1);next}; | |
| 85 | |
| 86 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(0);next}; | |
| 87 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/) {return_offset(2);next}; | |
| 88 | |
| 89 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,2)=="NN") {return_offset(0);next}; | |
| 90 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,1)=="N") {return_offset(2);next}; | |
| 91 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(1);next}; | |
| 92 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next}; | |
| 93 | |
| 94 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,2)=="NN") {return_offset(0);next}; | |
| 95 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(3);next}; | |
| 96 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,3,1)=="N") {return_offset(2);next}; | |
| 97 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/) {return_offset(3);next}; | |
| 98 | |
| 99 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N" && substr($10,1,1)=="N") {return_offset(0);next}; | |
| 100 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N") {return_offset(1);next}; | |
| 101 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,1,1)=="N") {return_offset(3);next}; | |
| 102 (/MD:Z:0[ACGT]1[ACGT]/) {return_offset(3);next}; | |
| 103 | |
| 104 (/MD:Z:2[ACGT]/ && substr($10,3,1)=="N") {return_offset(0);next}; | |
| 105 (/MD:Z:2[ACGT]/) {return_offset(3);next}; | |
| 106 | |
| 107 (substr($10,3,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(3);next}; | |
| 108 (substr($10,2,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next}; | |
| 109 (substr($10,1,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(1);next}; | |
| 110 | |
| 111 {return_offset(0);counter[4]++} | |
| 112 END{print("No trimming:",counter[0],", out of which not recognized MD field for:",counter[4],"; 1 nt trimmed:", counter[1],"; 2 nt trimmed:", counter[2],"; 3 nt trimmed:",counter[3]) > out}' | sort -S1G -k1,1 | gzip > positions_temp_sorted.gz | |
| 113 | |
| 114 else | |
| 115 #single-end | |
| 116 samtools view $bamfile | awk 'BEGIN{OFS="\t"}{if(substr($0,1,1)!="@"){print}}' - | awk -v out="${output_dir}/trimming_stats.txt" -v flag="${trim_flag}" 'BEGIN{OFS="\t";counter[0]=0;counter[1]=0;counter[2]=0;counter[3]=0;counter[4]=0} | |
| 117 function abs(value){return(value<0?-value:value)} | |
| 118 function return_offset(local_offset){print($1, $3, $4+local_offset, $4+abs($9)-1);counter[local_offset]++} | |
| 119 ($2 != 0) {next}; | |
| 120 (flag == "False") {return_offset(0);next}; | |
| 121 (/[\s\t]MD:Z:/ && !/MD:Z:([012][ACGT])/) {return_offset(0);next}; | |
| 122 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/ && substr($10,1,1)=="N") {return_offset(0);next}; | |
| 123 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/) {return_offset(1);next}; | |
| 124 | |
| 125 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(0);next}; | |
| 126 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/) {return_offset(2);next}; | |
| 127 | |
| 128 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,2)=="NN") {return_offset(0);next}; | |
| 129 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,1)=="N") {return_offset(2);next}; | |
| 130 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(1);next}; | |
| 131 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next}; | |
| 132 | |
| 133 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,2)=="NN") {return_offset(0);next}; | |
| 134 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(3);next}; | |
| 135 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,3,1)=="N") {return_offset(2);next}; | |
| 136 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/) {return_offset(3);next}; | |
| 137 | |
| 138 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N" && substr($10,1,1)=="N") {return_offset(0);next}; | |
| 139 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N") {return_offset(1);next}; | |
| 140 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,1,1)=="N") {return_offset(3);next}; | |
| 141 (/MD:Z:0[ACGT]1[ACGT]/) {return_offset(3);next}; | |
| 142 | |
| 143 (/MD:Z:2[ACGT]/ && substr($10,3,1)=="N") {return_offset(0);next}; | |
| 144 (/MD:Z:2[ACGT]/) {return_offset(3);next}; | |
| 145 | |
| 146 (substr($10,3,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(3);next}; | |
| 147 (substr($10,2,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next}; | |
| 148 (substr($10,1,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(1);next}; | |
| 149 | |
| 150 {return_offset(0);counter[4]++} | |
| 151 END{print("No trimming:",counter[0],", out of which not recognized MD field for:",counter[4],"; 1 nt trimmed:", counter[1],"; 2 nt trimmed:", counter[2],"; 3 nt trimmed:",counter[3]) > out}' | sort -S1G -k1,1 | gzip > positions_temp_sorted.gz | |
| 152 | |
| 153 fi | |
| 154 | |
| 155 #Computing barcode length (Use the first line and compute the string length of the second column) | |
| 156 if [ "$barcodes" != "None" ]; then | |
| 157 | |
| 158 TMP=$(for line in `cut -f 2 $barcodes`; do if [ ! -z "$line" ]; then echo $line; break; fi; done) | |
| 159 BAR_LEN=`echo ${#TMP}` | |
| 160 | |
| 161 #Remove "@" from barcodes and sort them | |
| 162 sed 's/^.//' $barcodes | sort -k1,1 -S1G | gzip > barcodes_temp_sorted.gz | |
| 163 | |
| 164 #Merge poistions and barcodes | |
| 165 join -1 1 <(zcat positions_temp_sorted.gz) <(zcat barcodes_temp_sorted.gz) | cut -f 2,3,4,5 -d " " | awk '{if($4 !~ /N/){print}}' | awk -v bar_len="${BAR_LEN}" '{if(length($4)==bar_len){print}}' | gzip > merged_temp.gz | |
| 166 | |
| 167 rm barcodes_temp_sorted.gz | |
| 168 | |
| 169 else | |
| 170 zcat positions_temp_sorted.gz | cut -f 2,3,4 | gzip > merged_temp.gz | |
| 171 fi | |
| 172 | |
| 173 #If the experiment is single-end set NA values to the priming column. | |
| 174 if [ ! -s paired ]; then | |
| 175 zcat merged_temp.gz | awk '{print $1, $2, "NA", $4, $5}' - > merged_temp2 | |
| 176 gzip -c merged_temp2 > merged_temp.gz | |
| 177 rm merged_temp2 | |
| 178 fi | |
| 179 | |
| 180 #Fix priming position | |
| 181 if [ ! -z $priming_pos ]; then | |
| 182 zcat merged_temp.gz | awk -v pos="${priming_pos}" '{print $1, $2, pos, $4, $5}' - > merged_temp2 | |
| 183 gzip -c merged_temp2 > merged_temp.gz | |
| 184 rm merged_temp2 | |
| 185 fi | |
| 186 | |
| 187 #File summary.txt columns: RNA_ID, Start, End, barcode sequence, sequenced_count[=number of sequenced fragments fulfilling previous requiremnts] | |
| 188 | |
| 189 zcat merged_temp.gz | awk '{barcode[$1][$2][$3][$4]++}END{ | |
| 190 for(RNA in barcode){ | |
| 191 for(start_position in barcode[RNA]){ | |
| 192 for(end_position in barcode[RNA][start_position]){ | |
| 193 for(barseq in barcode[RNA][start_position][end_position]){print RNA,start_position,end_position,barseq,barcode[RNA][start_position][end_position][barseq]}}}}}' > $output_dir/summary.txt | |
| 194 | |
| 195 #File unique_barcodes.txt columns: RNA_ID, Start, End, number of unique barcodes observed for this fragment. | |
| 196 | |
| 197 awk '{barcode[$1][$2][$3]++}END{ | |
| 198 for(RNA in barcode){ | |
| 199 for(start_position in barcode[RNA]){ | |
| 200 for(end_position in barcode[RNA][start_position]){print RNA "\t" start_position "\t" end_position "\t" barcode[RNA][start_position][end_position]}}}}' $output_dir/summary.txt > $output_dir/unique_barcodes.txt & | |
| 201 | |
| 202 #File read_counts.txt colums: RNA_ID, Start, End, sequenced_count | |
| 203 | |
| 204 zcat merged_temp.gz | awk '{barcode[$1][$2][$3]++}END{ | |
| 205 for(RNA in barcode){ | |
| 206 for(start_position in barcode[RNA]){ | |
| 207 for(end_position in barcode[RNA][start_position]){print RNA "\t" start_position "\t" end_position "\t" barcode[RNA][start_position][end_position]}}}}' > $output_dir/read_counts.txt & | |
| 208 | |
| 209 wait | |
| 210 | |
| 211 #Print the maximum observed barcodes value. Usefull to assess the necessity of producing the k2n file. | |
| 212 if [ "$barcodes" != "None" ]; then | |
| 213 | |
| 214 cut -f 4 $output_dir/unique_barcodes.txt | sort -S1G -rn > sorted_bars | |
| 215 max_observed_barcodes=`head -n 1 sorted_bars` | |
| 216 | |
| 217 echo "Maximum observed Barcodes = ${max_observed_barcodes}" | |
| 218 rm sorted_bars | |
| 219 fi | |
| 220 | |
| 221 #Produce k2n file | |
| 222 if [ "$k2n" == "True" ]; then | |
| 223 if [ ! -s paired ]; then | |
| 224 Rscript $R_SCRIPT_PATH/k2n.R merged_temp.gz $output_dir/unique_barcodes.txt $output_dir/k2n.txt FALSE | |
| 225 else | |
| 226 Rscript $R_SCRIPT_PATH/k2n.R merged_temp.gz $output_dir/unique_barcodes.txt $output_dir/k2n.txt TRUE | |
| 227 fi | |
| 228 fi | |
| 229 | |
| 230 #Remove temp files | |
| 231 rm merged_temp.gz positions_temp_sorted.gz paired |
