| 0 | 1 #!/bin/bash | 
|  | 2 | 
|  | 3 ############################################################ | 
|  | 4 #Create table of fragments containing coordinates[RNA molecule, start,end], number of mapped reads, number of unique barcodes and 3'most nucleotide of cDNA (on that was ligated to) | 
|  | 5 ############################################################ | 
|  | 6 #Remove untemplated nucleotides and create positions_temp file | 
|  | 7 | 
|  | 8 function print_help { | 
|  | 9 cat <<End-of-message | 
|  | 10 Summarize Unique Barcodes. | 
|  | 11 Counts the number of unique random barcodes and reads associated with each sequenced fragment. | 
|  | 12 ------------------------------------- | 
|  | 13 Input arguments: | 
|  | 14 -h: Help | 
|  | 15 -f: Aligned reads in BAM format. | 
|  | 16 -b: Barcode file (optional). | 
|  | 17 -p: Set priming position to a fixed value. | 
|  | 18 -t: Trim untemplated nucleotides. | 
|  | 19 -k: Produce k2n file. Warning: Can be sloooow! | 
|  | 20 -r: Rscript path | 
|  | 21 -o: Output folder (default: "output_dir") | 
|  | 22 ------------------------------------- | 
|  | 23 Usage : summarize_unique_barcodes.sh -f <BAM_file> -b <BARCODES> -p <PRIMING_POSITION> -t -k -r <R_SCRIPT_PATH> | 
|  | 24 End-of-message | 
|  | 25 exit | 
|  | 26 } | 
|  | 27 | 
|  | 28 | 
|  | 29 #defaults | 
|  | 30 output_dir="output_dir" | 
|  | 31 trim_flag="False" | 
|  | 32 | 
|  | 33 #parse input | 
|  | 34 while getopts hf:b::p:o:ktr: myarg | 
|  | 35 do  case "$myarg" in | 
|  | 36     h)  print_help | 
|  | 37         exit ;; | 
|  | 38     f)  bamfile="$OPTARG" ;; #required | 
|  | 39     b)  barcodes="$OPTARG" ;; #optional | 
|  | 40     t)  trim_flag="True" ;; | 
|  | 41     k)  k2n="True" ;; | 
|  | 42     p)  priming_pos="$OPTARG" ;; | 
|  | 43     o)  output_dir="$OPTARG" ;; | 
|  | 44     r)  R_SCRIPT_PATH="$OPTARG" ;; #required | 
|  | 45     [?])  echo "ERROR: Unknown parameter" | 
|  | 46         print_help | 
|  | 47         exit 1 ;; | 
|  | 48     esac | 
|  | 49 done | 
|  | 50 | 
|  | 51 ###### Sanity checks ###### | 
|  | 52 if [ -z $bamfile ]; then | 
|  | 53     echo "Error: Aligned reads file is missing!" | 
|  | 54     print_help | 
|  | 55     exit 1 | 
|  | 56 fi | 
|  | 57 | 
|  | 58 if [ "$barcodes" == "None" ] && [ "$k2n" == "True" ]; then | 
|  | 59     echo "Error: k2n file cannot be produced without a barcode file!" | 
|  | 60     exit 1 | 
|  | 61 fi | 
|  | 62 | 
|  | 63 | 
|  | 64 mkdir -p $output_dir | 
|  | 65 | 
|  | 66 #Check if bamfile contains single or paired-end data | 
|  | 67 samtools view -f 0x1 $bamfile | head -n 1 > paired | 
|  | 68 | 
|  | 69 if [ -s paired ]; then | 
|  | 70     #paired-end | 
|  | 71     samtools view $bamfile | awk 'BEGIN{OFS="\t"}{if(substr($0,1,1)!="@"){print}}' - | awk -v out="${output_dir}/trimming_stats.txt" -v flag="${trim_flag}" 'BEGIN{OFS="\t";counter[0]=0;counter[1]=0;counter[2]=0;counter[3]=0;counter[4]=0} | 
|  | 72     function abs(value){return(value<0?-value:value)} | 
|  | 73     function return_offset(local_offset){print($1, $3, $4+local_offset, $4+abs($9)-1);counter[local_offset]++} | 
|  | 74     ($2 != 99) {next}; | 
|  | 75     (flag == "False") {return_offset(0);next}; | 
|  | 76     (/[\s\t]MD:Z:/ && !/MD:Z:([012][ACGT])/)  {return_offset(0);next}; | 
|  | 77     (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/ && substr($10,1,1)=="N") {return_offset(0);next}; | 
|  | 78     (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/) {return_offset(1);next}; | 
|  | 79 | 
|  | 80     (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(0);next}; | 
|  | 81     (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/) {return_offset(2);next}; | 
|  | 82 | 
|  | 83     (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,2)=="NN") {return_offset(0);next}; | 
|  | 84     (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,1)=="N") {return_offset(2);next}; | 
|  | 85     (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(1);next}; | 
|  | 86     (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next}; | 
|  | 87 | 
|  | 88     (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,2)=="NN") {return_offset(0);next}; | 
|  | 89     (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(3);next}; | 
|  | 90     (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,3,1)=="N") {return_offset(2);next}; | 
|  | 91     (/[\s\t]MD:Z:1[ACGT]0[ACGT]/) {return_offset(3);next}; | 
|  | 92 | 
|  | 93     (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N" && substr($10,1,1)=="N") {return_offset(0);next}; | 
|  | 94     (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N") {return_offset(1);next}; | 
|  | 95     (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,1,1)=="N") {return_offset(3);next}; | 
|  | 96     (/MD:Z:0[ACGT]1[ACGT]/) {return_offset(3);next}; | 
|  | 97 | 
|  | 98     (/MD:Z:2[ACGT]/ && substr($10,3,1)=="N")  {return_offset(0);next}; | 
|  | 99     (/MD:Z:2[ACGT]/)  {return_offset(3);next}; | 
|  | 100 | 
|  | 101     (substr($10,3,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(3);next}; | 
|  | 102     (substr($10,2,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next}; | 
|  | 103     (substr($10,1,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(1);next}; | 
|  | 104 | 
|  | 105     {return_offset(0);counter[4]++} | 
|  | 106     END{print("No trimming:",counter[0],", out of which not recognized MD field for:",counter[4],"; 1 nt trimmed:", counter[1],"; 2 nt trimmed:", counter[2],"; 3 nt trimmed:",counter[3]) > out}' | sort -S1G -k1,1 | gzip > positions_temp_sorted.gz | 
|  | 107 else | 
|  | 108     #single-end | 
|  | 109     samtools view $bamfile | awk 'BEGIN{OFS="\t"}{if(substr($0,1,1)!="@"){print}}' - | awk -v out="${output_dir}/trimming_stats.txt" -v flag="${trim_flag}" 'BEGIN{OFS="\t";counter[0]=0;counter[1]=0;counter[2]=0;counter[3]=0;counter[4]=0} | 
|  | 110     function abs(value){return(value<0?-value:value)} | 
|  | 111     function return_offset(local_offset){print($1, $3, $4+local_offset, $4+abs($9)-1);counter[local_offset]++} | 
|  | 112     ($2 != 0) {next}; | 
|  | 113     (flag == "False") {return_offset(0);next}; | 
|  | 114     (/[\s\t]MD:Z:/ && !/MD:Z:([012][ACGT])/)  {return_offset(0);next}; | 
|  | 115     (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/ && substr($10,1,1)=="N") {return_offset(0);next}; | 
|  | 116     (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/) {return_offset(1);next}; | 
|  | 117 | 
|  | 118     (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(0);next}; | 
|  | 119     (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/) {return_offset(2);next}; | 
|  | 120 | 
|  | 121     (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,2)=="NN") {return_offset(0);next}; | 
|  | 122     (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,1)=="N") {return_offset(2);next}; | 
|  | 123     (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(1);next}; | 
|  | 124     (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next}; | 
|  | 125 | 
|  | 126     (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,2)=="NN") {return_offset(0);next}; | 
|  | 127     (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(3);next}; | 
|  | 128     (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,3,1)=="N") {return_offset(2);next}; | 
|  | 129     (/[\s\t]MD:Z:1[ACGT]0[ACGT]/) {return_offset(3);next}; | 
|  | 130 | 
|  | 131     (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N" && substr($10,1,1)=="N") {return_offset(0);next}; | 
|  | 132     (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N") {return_offset(1);next}; | 
|  | 133     (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,1,1)=="N") {return_offset(3);next}; | 
|  | 134     (/MD:Z:0[ACGT]1[ACGT]/) {return_offset(3);next}; | 
|  | 135 | 
|  | 136     (/MD:Z:2[ACGT]/ && substr($10,3,1)=="N")  {return_offset(0);next}; | 
|  | 137     (/MD:Z:2[ACGT]/)  {return_offset(3);next}; | 
|  | 138 | 
|  | 139     (substr($10,3,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(3);next}; | 
|  | 140     (substr($10,2,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next}; | 
|  | 141     (substr($10,1,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(1);next}; | 
|  | 142 | 
|  | 143     {return_offset(0);counter[4]++} | 
|  | 144     END{print("No trimming:",counter[0],", out of which not recognized MD field for:",counter[4],"; 1 nt trimmed:", counter[1],"; 2 nt trimmed:", counter[2],"; 3 nt trimmed:",counter[3]) > out}' | sort -S1G -k1,1 | gzip > positions_temp_sorted.gz | 
|  | 145 | 
|  | 146 fi | 
|  | 147 | 
|  | 148 # Computing barcode length (Use the first line and compute the string length of the second column | 
|  | 149 if [ "$barcodes" != "None" ]; then | 
|  | 150 | 
|  | 151     TMP=`head -1 $barcodes | awk '{print $2}'` | 
|  | 152     BAR_LEN=`echo ${#TMP}` | 
|  | 153 | 
|  | 154     # Remove "@" from barcodes and sort them | 
|  | 155     sed 's/^.//' $barcodes | sort -k1,1 -S1G | gzip > barcodes_temp_sorted.gz | 
|  | 156 | 
|  | 157     # Merge poistions and barcodes | 
|  | 158     join -1 1 <(zcat positions_temp_sorted.gz) <(zcat barcodes_temp_sorted.gz) | cut -f 2,3,4,5 -d " " | awk '{if($4 !~ /N/){print}}' | awk -v bar_len="${BAR_LEN}" '{if(length($4)==bar_len){print}}' | gzip > merged_temp.gz | 
|  | 159 | 
|  | 160     rm barcodes_temp_sorted.gz | 
|  | 161 | 
|  | 162 else | 
|  | 163     zcat positions_temp_sorted.gz | cut -f 2,3,4 | gzip > merged_temp.gz | 
|  | 164 fi | 
|  | 165 | 
|  | 166 # If the experiment is single-end | 
|  | 167 if [ ! -s paired ]; then | 
|  | 168     zcat merged_temp.gz | awk '{print $1, $2, "NA", $4, $5}' - > merged_temp2 | 
|  | 169     cat merged_temp2 | gzip > merged_temp.gz | 
|  | 170 fi | 
|  | 171 | 
|  | 172 #### If priming flag is set.... | 
|  | 173 if [ ! -z $priming_pos ]; then | 
|  | 174     zcat merged_temp.gz | awk -v pos="${priming_pos}" '{print $1, $2, pos, $4, $5}' - > merged_temp2 | 
|  | 175     cat merged_temp2 | gzip > merged_temp.gz | 
|  | 176 fi | 
|  | 177 | 
|  | 178 #File summary.gz columns: RNA_ID, Start, End, barcode sequence, sequenced_count[=number of sequenced fragments fulfilling previous requiremnts] | 
|  | 179 | 
|  | 180 zcat merged_temp.gz | awk '{barcode[$1][$2][$3][$4]++}END{ | 
|  | 181 for(RNA in barcode){ | 
|  | 182 for(start_position in barcode[RNA]){ | 
|  | 183 for(end_position in barcode[RNA][start_position]){ | 
|  | 184 for(barseq in barcode[RNA][start_position][end_position]){print RNA,start_position,end_position,barseq,barcode[RNA][start_position][end_position][barseq]}}}}}' > $output_dir/summary.txt | 
|  | 185 | 
|  | 186 #File unique_barcodes columns: RNA_ID, Start, End, number of unique barcodes observed for this fragment [PROBLEM: How to treat the different 3cdns for the same fragment? if the template was homogenous then it should be always the same] | 
|  | 187 | 
|  | 188 awk '{barcode[$1][$2][$3]++}END{ | 
|  | 189 for(RNA in barcode){ | 
|  | 190 for(start_position in barcode[RNA]){ | 
|  | 191 for(end_position in barcode[RNA][start_position]){print RNA "\t" start_position "\t" end_position "\t" barcode[RNA][start_position][end_position]}}}}' $output_dir/summary.txt > $output_dir/unique_barcodes.txt & | 
|  | 192 | 
|  | 193 #read_counts.gz colums: RNA_ID, Start, End, sequenced_count | 
|  | 194 | 
|  | 195 zcat merged_temp.gz | awk '{barcode[$1][$2][$3]++}END{ | 
|  | 196 for(RNA in barcode){ | 
|  | 197 for(start_position in barcode[RNA]){ | 
|  | 198 for(end_position in barcode[RNA][start_position]){print RNA "\t" start_position "\t" end_position "\t" barcode[RNA][start_position][end_position]}}}}' > $output_dir/read_counts.txt & | 
|  | 199 | 
|  | 200 wait | 
|  | 201 | 
|  | 202 if [ "$barcodes" != "None" ]; then | 
|  | 203 | 
|  | 204     cut -f 4 $output_dir/unique_barcodes.txt | sort -S1G -rn > sorted_bars | 
|  | 205     max_observed_barcodes=`head -n 1 sorted_bars` | 
|  | 206 | 
|  | 207     echo "Maximum observed Barcodes = ${max_observed_barcodes}" | 
|  | 208     rm sorted_bars | 
|  | 209 fi | 
|  | 210 | 
|  | 211 if [ "$k2n" == "True" ]; then | 
|  | 212     bar_length=$(head -n 1 $barcodes | cut -f 2 | xargs expr length) | 
|  | 213     Rscript $R_SCRIPT_PATH/k2n.R -m merged_temp.gz -c $output_dir/read_counts.txt -b ${max_observed_barcodes} -l $bar_length -o $output_dir/k2n.txt | 
|  | 214 fi | 
|  | 215 | 
|  | 216 #Remove temp files | 
|  | 217 rm paired merged_temp.gz positions_temp_sorted.gz |