rna_probing: summarize_unique_barcodes.sh annotate

annotate summarize_unique_barcodes.sh @ 11:f6265e05c55c draft

Uploaded

author	nikos
date	Wed, 05 Nov 2014 10:00:47 -0500
parents	33e625bef2b9
children	52ff42c85994

rev	line source
10 33e625bef2b9 Uploaded nikos parents: diff changeset	1 #!/bin/bash
33e625bef2b9 Uploaded nikos parents: diff changeset	2
33e625bef2b9 Uploaded nikos parents: diff changeset	3 ####################################################################################################
33e625bef2b9 Uploaded nikos parents: diff changeset	4 #Copyright (C) 2014 Lukasz Kielpinski, Nikos Sidiropoulos
33e625bef2b9 Uploaded nikos parents: diff changeset	5
33e625bef2b9 Uploaded nikos parents: diff changeset	6 #This program is free software: you can redistribute it and/or modify it under the terms of the
33e625bef2b9 Uploaded nikos parents: diff changeset	7 #GNU General Public License as published by the Free Software Foundation, either version 3 of the
33e625bef2b9 Uploaded nikos parents: diff changeset	8 #License, or (at your option) any later version.
33e625bef2b9 Uploaded nikos parents: diff changeset	9
33e625bef2b9 Uploaded nikos parents: diff changeset	10 #This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
33e625bef2b9 Uploaded nikos parents: diff changeset	11 #even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33e625bef2b9 Uploaded nikos parents: diff changeset	12 #GNU General Public License for more details (http://www.gnu.org/licenses/).
33e625bef2b9 Uploaded nikos parents: diff changeset	13 ####################################################################################################
33e625bef2b9 Uploaded nikos parents: diff changeset	14
33e625bef2b9 Uploaded nikos parents: diff changeset	15 function print_help {
33e625bef2b9 Uploaded nikos parents: diff changeset	16 cat <<End-of-message
33e625bef2b9 Uploaded nikos parents: diff changeset	17 Summarize Unique Barcodes.
33e625bef2b9 Uploaded nikos parents: diff changeset	18 Counts the number of unique random barcodes and reads associated with each sequenced fragment.
33e625bef2b9 Uploaded nikos parents: diff changeset	19 -------------------------------------
33e625bef2b9 Uploaded nikos parents: diff changeset	20 Input arguments:
33e625bef2b9 Uploaded nikos parents: diff changeset	21 -h: Help
33e625bef2b9 Uploaded nikos parents: diff changeset	22 -f: Aligned reads in BAM format.
33e625bef2b9 Uploaded nikos parents: diff changeset	23 -b: Barcode file (optional).
33e625bef2b9 Uploaded nikos parents: diff changeset	24 -p: Set priming position to a fixed value.
33e625bef2b9 Uploaded nikos parents: diff changeset	25 -t: Trim untemplated nucleotides.
33e625bef2b9 Uploaded nikos parents: diff changeset	26 -k: Produce k2n file. Warning: Can be sloooow!
33e625bef2b9 Uploaded nikos parents: diff changeset	27 -r: Rscript path
33e625bef2b9 Uploaded nikos parents: diff changeset	28 -o: Output folder (default: "output_dir")
33e625bef2b9 Uploaded nikos parents: diff changeset	29 -------------------------------------
33e625bef2b9 Uploaded nikos parents: diff changeset	30 Usage : summarize_unique_barcodes.sh -f <BAM_file> -b <BARCODES> -p <PRIMING_POSITION> -t -k -r <R_SCRIPT_PATH>
33e625bef2b9 Uploaded nikos parents: diff changeset	31 End-of-message
33e625bef2b9 Uploaded nikos parents: diff changeset	32 exit
33e625bef2b9 Uploaded nikos parents: diff changeset	33 }
33e625bef2b9 Uploaded nikos parents: diff changeset	34
33e625bef2b9 Uploaded nikos parents: diff changeset	35 #defaults
33e625bef2b9 Uploaded nikos parents: diff changeset	36 output_dir="output_dir"
33e625bef2b9 Uploaded nikos parents: diff changeset	37 trim_flag="False"
33e625bef2b9 Uploaded nikos parents: diff changeset	38
33e625bef2b9 Uploaded nikos parents: diff changeset	39 #parse input
33e625bef2b9 Uploaded nikos parents: diff changeset	40 while getopts hf:b::p:o:ktr: myarg
33e625bef2b9 Uploaded nikos parents: diff changeset	41 do case "$myarg" in
33e625bef2b9 Uploaded nikos parents: diff changeset	42 h) print_help
33e625bef2b9 Uploaded nikos parents: diff changeset	43 exit ;;
33e625bef2b9 Uploaded nikos parents: diff changeset	44 f) bamfile="$OPTARG" ;; #required
33e625bef2b9 Uploaded nikos parents: diff changeset	45 b) barcodes="$OPTARG" ;; #optional
33e625bef2b9 Uploaded nikos parents: diff changeset	46 t) trim_flag="True" ;;
33e625bef2b9 Uploaded nikos parents: diff changeset	47 k) k2n="True" ;;
33e625bef2b9 Uploaded nikos parents: diff changeset	48 p) priming_pos="$OPTARG" ;;
33e625bef2b9 Uploaded nikos parents: diff changeset	49 o) output_dir="$OPTARG" ;;
33e625bef2b9 Uploaded nikos parents: diff changeset	50 r) R_SCRIPT_PATH="$OPTARG" ;; #required
33e625bef2b9 Uploaded nikos parents: diff changeset	51 [?]) echo "ERROR: Unknown parameter"
33e625bef2b9 Uploaded nikos parents: diff changeset	52 print_help
33e625bef2b9 Uploaded nikos parents: diff changeset	53 exit 1 ;;
33e625bef2b9 Uploaded nikos parents: diff changeset	54 esac
33e625bef2b9 Uploaded nikos parents: diff changeset	55 done
33e625bef2b9 Uploaded nikos parents: diff changeset	56
33e625bef2b9 Uploaded nikos parents: diff changeset	57 ###### Sanity checks ######
33e625bef2b9 Uploaded nikos parents: diff changeset	58 if [ -z $bamfile ]; then
33e625bef2b9 Uploaded nikos parents: diff changeset	59 echo "Error: Aligned reads file is missing!"
33e625bef2b9 Uploaded nikos parents: diff changeset	60 print_help
33e625bef2b9 Uploaded nikos parents: diff changeset	61 exit 1
33e625bef2b9 Uploaded nikos parents: diff changeset	62 fi
33e625bef2b9 Uploaded nikos parents: diff changeset	63
33e625bef2b9 Uploaded nikos parents: diff changeset	64 if [ "$barcodes" == "None" ] && [ "$k2n" == "True" ]; then
33e625bef2b9 Uploaded nikos parents: diff changeset	65 echo "Error: k2n file cannot be produced without a barcode file!"
33e625bef2b9 Uploaded nikos parents: diff changeset	66 exit 1
33e625bef2b9 Uploaded nikos parents: diff changeset	67 fi
33e625bef2b9 Uploaded nikos parents: diff changeset	68
33e625bef2b9 Uploaded nikos parents: diff changeset	69
33e625bef2b9 Uploaded nikos parents: diff changeset	70 mkdir -p $output_dir
33e625bef2b9 Uploaded nikos parents: diff changeset	71
33e625bef2b9 Uploaded nikos parents: diff changeset	72 #Check if bamfile contains single or paired-end reads
33e625bef2b9 Uploaded nikos parents: diff changeset	73 samtools view -f 0x1 $bamfile \| head -n 1 > paired
33e625bef2b9 Uploaded nikos parents: diff changeset	74
33e625bef2b9 Uploaded nikos parents: diff changeset	75 if [ -s paired ]; then
33e625bef2b9 Uploaded nikos parents: diff changeset	76 #paired-end
33e625bef2b9 Uploaded nikos parents: diff changeset	77 samtools view $bamfile \| awk 'BEGIN{OFS="\t"}{if(substr($0,1,1)!="@"){print}}' - \| awk -v out="${output_dir}/trimming_stats.txt" -v flag="${trim_flag}" 'BEGIN{OFS="\t";counter[0]=0;counter[1]=0;counter[2]=0;counter[3]=0;counter[4]=0}
33e625bef2b9 Uploaded nikos parents: diff changeset	78 function abs(value){return(value<0?-value:value)}
33e625bef2b9 Uploaded nikos parents: diff changeset	79 function return_offset(local_offset){print($1, $3, $4+local_offset, $4+abs($9)-1);counter[local_offset]++}
33e625bef2b9 Uploaded nikos parents: diff changeset	80 ($2 != 99) {next};
33e625bef2b9 Uploaded nikos parents: diff changeset	81 (flag == "False") {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	82 (/[\s\t]MD:Z:/ && !/MD:Z:([012][ACGT])/) {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	83 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/ && substr($10,1,1)=="N") {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	84 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/) {return_offset(1);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	85
33e625bef2b9 Uploaded nikos parents: diff changeset	86 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	87 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/) {return_offset(2);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	88
33e625bef2b9 Uploaded nikos parents: diff changeset	89 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,2)=="NN") {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	90 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,1)=="N") {return_offset(2);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	91 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(1);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	92 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	93
33e625bef2b9 Uploaded nikos parents: diff changeset	94 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,2)=="NN") {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	95 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(3);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	96 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,3,1)=="N") {return_offset(2);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	97 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/) {return_offset(3);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	98
33e625bef2b9 Uploaded nikos parents: diff changeset	99 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N" && substr($10,1,1)=="N") {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	100 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N") {return_offset(1);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	101 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,1,1)=="N") {return_offset(3);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	102 (/MD:Z:0[ACGT]1[ACGT]/) {return_offset(3);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	103
33e625bef2b9 Uploaded nikos parents: diff changeset	104 (/MD:Z:2[ACGT]/ && substr($10,3,1)=="N") {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	105 (/MD:Z:2[ACGT]/) {return_offset(3);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	106
33e625bef2b9 Uploaded nikos parents: diff changeset	107 (substr($10,3,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(3);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	108 (substr($10,2,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	109 (substr($10,1,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(1);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	110
33e625bef2b9 Uploaded nikos parents: diff changeset	111 {return_offset(0);counter[4]++}
33e625bef2b9 Uploaded nikos parents: diff changeset	112 END{print("No trimming:",counter[0],", out of which not recognized MD field for:",counter[4],"; 1 nt trimmed:", counter[1],"; 2 nt trimmed:", counter[2],"; 3 nt trimmed:",counter[3]) > out}' \| sort -S1G -k1,1 \| gzip > positions_temp_sorted.gz
33e625bef2b9 Uploaded nikos parents: diff changeset	113
33e625bef2b9 Uploaded nikos parents: diff changeset	114 else
33e625bef2b9 Uploaded nikos parents: diff changeset	115 #single-end
33e625bef2b9 Uploaded nikos parents: diff changeset	116 samtools view $bamfile \| awk 'BEGIN{OFS="\t"}{if(substr($0,1,1)!="@"){print}}' - \| awk -v out="${output_dir}/trimming_stats.txt" -v flag="${trim_flag}" 'BEGIN{OFS="\t";counter[0]=0;counter[1]=0;counter[2]=0;counter[3]=0;counter[4]=0}
33e625bef2b9 Uploaded nikos parents: diff changeset	117 function abs(value){return(value<0?-value:value)}
33e625bef2b9 Uploaded nikos parents: diff changeset	118 function return_offset(local_offset){print($1, $3, $4+local_offset, $4+abs($9)-1);counter[local_offset]++}
33e625bef2b9 Uploaded nikos parents: diff changeset	119 ($2 != 0) {next};
33e625bef2b9 Uploaded nikos parents: diff changeset	120 (flag == "False") {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	121 (/[\s\t]MD:Z:/ && !/MD:Z:([012][ACGT])/) {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	122 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/ && substr($10,1,1)=="N") {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	123 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/) {return_offset(1);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	124
33e625bef2b9 Uploaded nikos parents: diff changeset	125 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	126 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/) {return_offset(2);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	127
33e625bef2b9 Uploaded nikos parents: diff changeset	128 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,2)=="NN") {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	129 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,1)=="N") {return_offset(2);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	130 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(1);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	131 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	132
33e625bef2b9 Uploaded nikos parents: diff changeset	133 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,2)=="NN") {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	134 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(3);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	135 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,3,1)=="N") {return_offset(2);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	136 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/) {return_offset(3);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	137
33e625bef2b9 Uploaded nikos parents: diff changeset	138 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N" && substr($10,1,1)=="N") {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	139 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N") {return_offset(1);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	140 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,1,1)=="N") {return_offset(3);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	141 (/MD:Z:0[ACGT]1[ACGT]/) {return_offset(3);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	142
33e625bef2b9 Uploaded nikos parents: diff changeset	143 (/MD:Z:2[ACGT]/ && substr($10,3,1)=="N") {return_offset(0);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	144 (/MD:Z:2[ACGT]/) {return_offset(3);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	145
33e625bef2b9 Uploaded nikos parents: diff changeset	146 (substr($10,3,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(3);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	147 (substr($10,2,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	148 (substr($10,1,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(1);next};
33e625bef2b9 Uploaded nikos parents: diff changeset	149
33e625bef2b9 Uploaded nikos parents: diff changeset	150 {return_offset(0);counter[4]++}
33e625bef2b9 Uploaded nikos parents: diff changeset	151 END{print("No trimming:",counter[0],", out of which not recognized MD field for:",counter[4],"; 1 nt trimmed:", counter[1],"; 2 nt trimmed:", counter[2],"; 3 nt trimmed:",counter[3]) > out}' \| sort -S1G -k1,1 \| gzip > positions_temp_sorted.gz
33e625bef2b9 Uploaded nikos parents: diff changeset	152
33e625bef2b9 Uploaded nikos parents: diff changeset	153 fi
33e625bef2b9 Uploaded nikos parents: diff changeset	154
33e625bef2b9 Uploaded nikos parents: diff changeset	155 #Computing barcode length (Use the first line and compute the string length of the second column)
33e625bef2b9 Uploaded nikos parents: diff changeset	156 if [ "$barcodes" != "None" ]; then
33e625bef2b9 Uploaded nikos parents: diff changeset	157
33e625bef2b9 Uploaded nikos parents: diff changeset	158 TMP=`head -1 $barcodes \| awk '{print $2}'`
33e625bef2b9 Uploaded nikos parents: diff changeset	159 BAR_LEN=`echo ${#TMP}`
33e625bef2b9 Uploaded nikos parents: diff changeset	160
33e625bef2b9 Uploaded nikos parents: diff changeset	161 #Remove the leading character ("@") from each barcode line and sort the barcodes
33e625bef2b9 Uploaded nikos parents: diff changeset	162 sed 's/^.//' $barcodes \| sort -k1,1 -S1G \| gzip > barcodes_temp_sorted.gz
33e625bef2b9 Uploaded nikos parents: diff changeset	163
33e625bef2b9 Uploaded nikos parents: diff changeset	164 #Merge positions and barcodes
33e625bef2b9 Uploaded nikos parents: diff changeset	165 join -1 1 <(zcat positions_temp_sorted.gz) <(zcat barcodes_temp_sorted.gz) \| cut -f 2,3,4,5 -d " " \| awk '{if($4 !~ /N/){print}}' \| awk -v bar_len="${BAR_LEN}" '{if(length($4)==bar_len){print}}' \| gzip > merged_temp.gz
33e625bef2b9 Uploaded nikos parents: diff changeset	166
33e625bef2b9 Uploaded nikos parents: diff changeset	167 rm barcodes_temp_sorted.gz
33e625bef2b9 Uploaded nikos parents: diff changeset	168
33e625bef2b9 Uploaded nikos parents: diff changeset	169 else
33e625bef2b9 Uploaded nikos parents: diff changeset	170 zcat positions_temp_sorted.gz \| cut -f 2,3,4 \| gzip > merged_temp.gz
33e625bef2b9 Uploaded nikos parents: diff changeset	171 fi
33e625bef2b9 Uploaded nikos parents: diff changeset	172
33e625bef2b9 Uploaded nikos parents: diff changeset	173 #If the experiment is single-end, set the priming column (column 3) to NA.
33e625bef2b9 Uploaded nikos parents: diff changeset	174 if [ ! -s paired ]; then
33e625bef2b9 Uploaded nikos parents: diff changeset	175 zcat merged_temp.gz \| awk '{print $1, $2, "NA", $4, $5}' - > merged_temp2
33e625bef2b9 Uploaded nikos parents: diff changeset	176 cat merged_temp2 \| gzip > merged_temp.gz
33e625bef2b9 Uploaded nikos parents: diff changeset	177 fi
33e625bef2b9 Uploaded nikos parents: diff changeset	178
33e625bef2b9 Uploaded nikos parents: diff changeset	179 #Fix priming position
33e625bef2b9 Uploaded nikos parents: diff changeset	180 if [ ! -z $priming_pos ]; then
33e625bef2b9 Uploaded nikos parents: diff changeset	181 zcat merged_temp.gz \| awk -v pos="${priming_pos}" '{print $1, $2, pos, $4, $5}' - > merged_temp2
33e625bef2b9 Uploaded nikos parents: diff changeset	182 cat merged_temp2 \| gzip > merged_temp.gz
33e625bef2b9 Uploaded nikos parents: diff changeset	183 fi
33e625bef2b9 Uploaded nikos parents: diff changeset	184
33e625bef2b9 Uploaded nikos parents: diff changeset	185 #File summary.txt columns: RNA_ID, Start, End, barcode sequence, sequenced_count[=number of sequenced fragments fulfilling previous requirements]
33e625bef2b9 Uploaded nikos parents: diff changeset	186
33e625bef2b9 Uploaded nikos parents: diff changeset	187 zcat merged_temp.gz \| awk '{barcode[$1][$2][$3][$4]++}END{
33e625bef2b9 Uploaded nikos parents: diff changeset	188 for(RNA in barcode){
33e625bef2b9 Uploaded nikos parents: diff changeset	189 for(start_position in barcode[RNA]){
33e625bef2b9 Uploaded nikos parents: diff changeset	190 for(end_position in barcode[RNA][start_position]){
33e625bef2b9 Uploaded nikos parents: diff changeset	191 for(barseq in barcode[RNA][start_position][end_position]){print RNA,start_position,end_position,barseq,barcode[RNA][start_position][end_position][barseq]}}}}}' > $output_dir/summary.txt
33e625bef2b9 Uploaded nikos parents: diff changeset	192
33e625bef2b9 Uploaded nikos parents: diff changeset	193 #File unique_barcodes.txt columns: RNA_ID, Start, End, number of unique barcodes observed for this fragment.
33e625bef2b9 Uploaded nikos parents: diff changeset	194
33e625bef2b9 Uploaded nikos parents: diff changeset	195 awk '{barcode[$1][$2][$3]++}END{
33e625bef2b9 Uploaded nikos parents: diff changeset	196 for(RNA in barcode){
33e625bef2b9 Uploaded nikos parents: diff changeset	197 for(start_position in barcode[RNA]){
33e625bef2b9 Uploaded nikos parents: diff changeset	198 for(end_position in barcode[RNA][start_position]){print RNA "\t" start_position "\t" end_position "\t" barcode[RNA][start_position][end_position]}}}}' $output_dir/summary.txt > $output_dir/unique_barcodes.txt &
33e625bef2b9 Uploaded nikos parents: diff changeset	199
33e625bef2b9 Uploaded nikos parents: diff changeset	200 #File read_counts.txt columns: RNA_ID, Start, End, sequenced_count
33e625bef2b9 Uploaded nikos parents: diff changeset	201
33e625bef2b9 Uploaded nikos parents: diff changeset	202 zcat merged_temp.gz \| awk '{barcode[$1][$2][$3]++}END{
33e625bef2b9 Uploaded nikos parents: diff changeset	203 for(RNA in barcode){
33e625bef2b9 Uploaded nikos parents: diff changeset	204 for(start_position in barcode[RNA]){
33e625bef2b9 Uploaded nikos parents: diff changeset	205 for(end_position in barcode[RNA][start_position]){print RNA "\t" start_position "\t" end_position "\t" barcode[RNA][start_position][end_position]}}}}' > $output_dir/read_counts.txt &
33e625bef2b9 Uploaded nikos parents: diff changeset	206
33e625bef2b9 Uploaded nikos parents: diff changeset	207 wait
33e625bef2b9 Uploaded nikos parents: diff changeset	208
33e625bef2b9 Uploaded nikos parents: diff changeset	209 #Print the maximum observed barcodes value. Useful to assess the necessity of producing the k2n file.
33e625bef2b9 Uploaded nikos parents: diff changeset	210 if [ "$barcodes" != "None" ]; then
33e625bef2b9 Uploaded nikos parents: diff changeset	211
33e625bef2b9 Uploaded nikos parents: diff changeset	212 cut -f 4 $output_dir/unique_barcodes.txt \| sort -S1G -rn > sorted_bars
33e625bef2b9 Uploaded nikos parents: diff changeset	213 max_observed_barcodes=`head -n 1 sorted_bars`
33e625bef2b9 Uploaded nikos parents: diff changeset	214
33e625bef2b9 Uploaded nikos parents: diff changeset	215 echo "Maximum observed Barcodes = ${max_observed_barcodes}"
33e625bef2b9 Uploaded nikos parents: diff changeset	216 rm sorted_bars
33e625bef2b9 Uploaded nikos parents: diff changeset	217 fi
33e625bef2b9 Uploaded nikos parents: diff changeset	218
33e625bef2b9 Uploaded nikos parents: diff changeset	219 #Produce k2n file
33e625bef2b9 Uploaded nikos parents: diff changeset	220 if [ "$k2n" == "True" ]; then
33e625bef2b9 Uploaded nikos parents: diff changeset	221 bar_length=$(head -n 1 $barcodes \| cut -f 2 \| xargs expr length)
33e625bef2b9 Uploaded nikos parents: diff changeset	222 Rscript $R_SCRIPT_PATH/k2n.R merged_temp.gz $output_dir/read_counts.txt ${max_observed_barcodes} $bar_length $output_dir/k2n.txt
33e625bef2b9 Uploaded nikos parents: diff changeset	223 fi
33e625bef2b9 Uploaded nikos parents: diff changeset	224
33e625bef2b9 Uploaded nikos parents: diff changeset	225 #Remove temp files
33e625bef2b9 Uploaded nikos parents: diff changeset	226 rm paired merged_temp.gz positions_temp_sorted.gz

Mercurial > repos > nikos > rna_probing

annotate summarize_unique_barcodes.sh @ 11:f6265e05c55c draft