annotate summarize_unique_barcodes.sh @ 5:1bf54dbd95e1 draft

Deleted selected files
author nikos
date Tue, 04 Nov 2014 15:10:27 -0500
parents 83dfe38f6a09
children 2ae336f19de0
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
1 #!/bin/bash
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
2
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
3 ############################################################
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
4 #Create table of fragments containing coordinates[RNA molecule, start,end], number of mapped reads, number of unique barcodes and 3'most nucleotide of cDNA (on that was ligated to)
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
5 ############################################################
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
6 #Remove untemplated nucleotides and create positions_temp file
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
7
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
8 function print_help {
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
9 cat <<End-of-message
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
10 Summarize Unique Barcodes.
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
11 Counts the number of unique random barcodes and reads associated with each sequenced fragment.
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
12 -------------------------------------
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
13 Input arguments:
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
14 -h: Help
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
15 -f: Aligned reads in BAM format.
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
16 -b: Barcode file (optional).
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
17 -p: Set priming position to a fixed value.
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
18 -t: Trim untemplated nucleotides.
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
19 -k: Produce k2n file. Warning: Can be sloooow!
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
20 -r: Rscript path
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
21 -o: Output folder (default: "output_dir")
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
22 -------------------------------------
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
23 Usage : summarize_unique_barcodes.sh -f <BAM_file> -b <BARCODES> -p <PRIMING_POSITION> -t -k -r <R_SCRIPT_PATH>
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
24 End-of-message
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
25 exit
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
26 }
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
27
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
28
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
29 #defaults
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
30 output_dir="output_dir"
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
31 trim_flag="False"
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
32
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
33 #parse input
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
34 while getopts hf:b::p:o:ktr: myarg
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
35 do case "$myarg" in
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
36 h) print_help
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
37 exit ;;
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
38 f) bamfile="$OPTARG" ;; #required
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
39 b) barcodes="$OPTARG" ;; #optional
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
40 t) trim_flag="True" ;;
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
41 k) k2n="True" ;;
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
42 p) priming_pos="$OPTARG" ;;
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
43 o) output_dir="$OPTARG" ;;
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
44 r) R_SCRIPT_PATH="$OPTARG" ;; #required
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
45 [?]) echo "ERROR: Unknown parameter"
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
46 print_help
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
47 exit 1 ;;
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
48 esac
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
49 done
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
50
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
51 ###### Sanity checks ######
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
52 if [ -z $bamfile ]; then
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
53 echo "Error: Aligned reads file is missing!"
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
54 print_help
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
55 exit 1
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
56 fi
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
57
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
58 if [ "$barcodes" == "None" ] && [ "$k2n" == "True" ]; then
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
59 echo "Error: k2n file cannot be produced without a barcode file!"
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
60 exit 1
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
61 fi
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
62
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
63
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
64 mkdir -p $output_dir
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
65
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
66 #Check if bamfile contains single or paired-end data
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
67 samtools view -f 0x1 $bamfile | head -n 1 > paired
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
68
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
69 if [ -s paired ]; then
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
70 #paired-end
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
71 samtools view $bamfile | awk 'BEGIN{OFS="\t"}{if(substr($0,1,1)!="@"){print}}' - | awk -v out="${output_dir}/trimming_stats.txt" -v flag="${trim_flag}" 'BEGIN{OFS="\t";counter[0]=0;counter[1]=0;counter[2]=0;counter[3]=0;counter[4]=0}
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
72 function abs(value){return(value<0?-value:value)}
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
73 function return_offset(local_offset){print($1, $3, $4+local_offset, $4+abs($9)-1);counter[local_offset]++}
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
74 ($2 != 99) {next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
75 (flag == "False") {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
76 (/[\s\t]MD:Z:/ && !/MD:Z:([012][ACGT])/) {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
77 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/ && substr($10,1,1)=="N") {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
78 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/) {return_offset(1);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
79
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
80 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
81 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/) {return_offset(2);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
82
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
83 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,2)=="NN") {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
84 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,1)=="N") {return_offset(2);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
85 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(1);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
86 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
87
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
88 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,2)=="NN") {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
89 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(3);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
90 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,3,1)=="N") {return_offset(2);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
91 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/) {return_offset(3);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
92
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
93 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N" && substr($10,1,1)=="N") {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
94 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N") {return_offset(1);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
95 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,1,1)=="N") {return_offset(3);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
96 (/MD:Z:0[ACGT]1[ACGT]/) {return_offset(3);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
97
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
98 (/MD:Z:2[ACGT]/ && substr($10,3,1)=="N") {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
99 (/MD:Z:2[ACGT]/) {return_offset(3);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
100
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
101 (substr($10,3,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(3);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
102 (substr($10,2,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
103 (substr($10,1,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(1);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
104
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
105 {return_offset(0);counter[4]++}
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
106 END{print("No trimming:",counter[0],", out of which not recognized MD field for:",counter[4],"; 1 nt trimmed:", counter[1],"; 2 nt trimmed:", counter[2],"; 3 nt trimmed:",counter[3]) > out}' | sort -S1G -k1,1 | gzip > positions_temp_sorted.gz
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
107 else
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
108 #single-end
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
109 samtools view $bamfile | awk 'BEGIN{OFS="\t"}{if(substr($0,1,1)!="@"){print}}' - | awk -v out="${output_dir}/trimming_stats.txt" -v flag="${trim_flag}" 'BEGIN{OFS="\t";counter[0]=0;counter[1]=0;counter[2]=0;counter[3]=0;counter[4]=0}
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
110 function abs(value){return(value<0?-value:value)}
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
111 function return_offset(local_offset){print($1, $3, $4+local_offset, $4+abs($9)-1);counter[local_offset]++}
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
112 ($2 != 0) {next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
113 (flag == "False") {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
114 (/[\s\t]MD:Z:/ && !/MD:Z:([012][ACGT])/) {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
115 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/ && substr($10,1,1)=="N") {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
116 (/[\s\t]MD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/) {return_offset(1);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
117
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
118 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
119 (/[\s\t]MD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/) {return_offset(2);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
120
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
121 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,2)=="NN") {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
122 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,1,1)=="N") {return_offset(2);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
123 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(1);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
124 (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
125
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
126 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,2)=="NN") {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
127 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,2,1)=="N") {return_offset(3);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
128 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/ && substr($10,3,1)=="N") {return_offset(2);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
129 (/[\s\t]MD:Z:1[ACGT]0[ACGT]/) {return_offset(3);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
130
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
131 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N" && substr($10,1,1)=="N") {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
132 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,3,1)=="N") {return_offset(1);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
133 (/MD:Z:0[ACGT]1[ACGT]/ && substr($10,1,1)=="N") {return_offset(3);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
134 (/MD:Z:0[ACGT]1[ACGT]/) {return_offset(3);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
135
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
136 (/MD:Z:2[ACGT]/ && substr($10,3,1)=="N") {return_offset(0);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
137 (/MD:Z:2[ACGT]/) {return_offset(3);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
138
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
139 (substr($10,3,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(3);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
140 (substr($10,2,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(2);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
141 (substr($10,1,1)!="N" && /MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {return_offset(1);next};
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
142
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
143 {return_offset(0);counter[4]++}
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
144 END{print("No trimming:",counter[0],", out of which not recognized MD field for:",counter[4],"; 1 nt trimmed:", counter[1],"; 2 nt trimmed:", counter[2],"; 3 nt trimmed:",counter[3]) > out}' | sort -S1G -k1,1 | gzip > positions_temp_sorted.gz
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
145
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
146 fi
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
147
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
148 # Computing barcode length (Use the first line and compute the string length of the second column
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
149 if [ "$barcodes" != "None" ]; then
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
150
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
151 TMP=`head -1 $barcodes | awk '{print $2}'`
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
152 BAR_LEN=`echo ${#TMP}`
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
153
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
154 # Remove "@" from barcodes and sort them
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
155 sed 's/^.//' $barcodes | sort -k1,1 -S1G | gzip > barcodes_temp_sorted.gz
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
156
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
157 # Merge poistions and barcodes
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
158 join -1 1 <(zcat positions_temp_sorted.gz) <(zcat barcodes_temp_sorted.gz) | cut -f 2,3,4,5 -d " " | awk '{if($4 !~ /N/){print}}' | awk -v bar_len="${BAR_LEN}" '{if(length($4)==bar_len){print}}' | gzip > merged_temp.gz
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
159
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
160 rm barcodes_temp_sorted.gz
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
161
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
162 else
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
163 zcat positions_temp_sorted.gz | cut -f 2,3,4 | gzip > merged_temp.gz
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
164 fi
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
165
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
166 # If the experiment is single-end
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
167 if [ ! -s paired ]; then
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
168 zcat merged_temp.gz | awk '{print $1, $2, "NA", $4, $5}' - > merged_temp2
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
169 cat merged_temp2 | gzip > merged_temp.gz
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
170 fi
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
171
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
172 #### If priming flag is set....
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
173 if [ ! -z $priming_pos ]; then
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
174 zcat merged_temp.gz | awk -v pos="${priming_pos}" '{print $1, $2, pos, $4, $5}' - > merged_temp2
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
175 cat merged_temp2 | gzip > merged_temp.gz
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
176 fi
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
177
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
178 #File summary.gz columns: RNA_ID, Start, End, barcode sequence, sequenced_count[=number of sequenced fragments fulfilling previous requiremnts]
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
179
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
180 zcat merged_temp.gz | awk '{barcode[$1][$2][$3][$4]++}END{
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
181 for(RNA in barcode){
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
182 for(start_position in barcode[RNA]){
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
183 for(end_position in barcode[RNA][start_position]){
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
184 for(barseq in barcode[RNA][start_position][end_position]){print RNA,start_position,end_position,barseq,barcode[RNA][start_position][end_position][barseq]}}}}}' > $output_dir/summary.txt
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
185
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
186 #File unique_barcodes columns: RNA_ID, Start, End, number of unique barcodes observed for this fragment [PROBLEM: How to treat the different 3cdns for the same fragment? if the template was homogenous then it should be always the same]
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
187
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
188 awk '{barcode[$1][$2][$3]++}END{
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
189 for(RNA in barcode){
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
190 for(start_position in barcode[RNA]){
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
191 for(end_position in barcode[RNA][start_position]){print RNA "\t" start_position "\t" end_position "\t" barcode[RNA][start_position][end_position]}}}}' $output_dir/summary.txt > $output_dir/unique_barcodes.txt &
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
192
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
193 #read_counts.gz colums: RNA_ID, Start, End, sequenced_count
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
194
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
195 zcat merged_temp.gz | awk '{barcode[$1][$2][$3]++}END{
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
196 for(RNA in barcode){
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
197 for(start_position in barcode[RNA]){
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
198 for(end_position in barcode[RNA][start_position]){print RNA "\t" start_position "\t" end_position "\t" barcode[RNA][start_position][end_position]}}}}' > $output_dir/read_counts.txt &
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
199
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
200 wait
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
201
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
202 if [ "$barcodes" != "None" ]; then
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
203
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
204 cut -f 4 $output_dir/unique_barcodes.txt | sort -S1G -rn > sorted_bars
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
205 max_observed_barcodes=`head -n 1 sorted_bars`
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
206
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
207 echo "Maximum observed Barcodes = ${max_observed_barcodes}"
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
208 rm sorted_bars
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
209 fi
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
210
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
211 if [ "$k2n" == "True" ]; then
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
212 bar_length=$(head -n 1 $barcodes | cut -f 2 | xargs expr length)
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
213 Rscript $R_SCRIPT_PATH/k2n.R -m merged_temp.gz -c $output_dir/read_counts.txt -b ${max_observed_barcodes} -l $bar_length -o $output_dir/k2n.txt
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
214 fi
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
215
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
216 #Remove temp files
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
217 rm paired merged_temp.gz positions_temp_sorted.gz