#!/bin/bash

############################################################
#Create table of fragments containing coordinates [RNA molecule, start, end], number of mapped reads, number of unique barcodes and 3'-most nucleotide of cDNA (the one that was ligated to)
############################################################
#Remove untemplated nucleotides and create positions_temp file

# Defaults
output_dir="output_dir"   # -o: directory that receives the result tables
priming_pos=-1            # -p: -1 means "keep each fragment's own end coordinate"

# Print the command-line synopsis on stdout.
usage() {
  echo "Usage: estimate_unique_counts.sh -f <bam_file> -b <barcodes_file> [-p <priming_pos>] [-o <output_dir>]"
}

# Parse input.
#
# Sets the globals:
#   bamfile      (-f, required)
#   barcodes     (-b, required)
#   priming_pos  (-p, optional)
#   output_dir   (-o, optional)
# -h prints the usage and exits 0. An unknown option or a missing option
# argument prints the usage on stderr and exits 1.
parse_args() {
  local myarg
  local OPTIND=1   # restart option scanning on every call
  while getopts hf:b:p:o: myarg; do
    case "$myarg" in
      h)
        usage
        exit 0
        ;;
      f) bamfile="$OPTARG" ;;
      b) barcodes="$OPTARG" ;;
      p) priming_pos="$OPTARG" ;;
      o) output_dir="$OPTARG" ;;
      *)
        # getopts has already reported the offending option on stderr.
        usage >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# Pre-flight checks: stop before anything is written when a required input
# is missing. Without this, an empty barcodes path makes a later `head`
# read from stdin and the script appears to hang.
if [[ -z "${bamfile:-}" ]]; then
  echo "Error: no BAM file given (-f <bam_file>)" >&2
  exit 1
fi
if [[ ! -r "${barcodes:-}" ]]; then
  echo "Error: barcodes file missing or not readable (-b <barcodes_file>): '${barcodes:-}'" >&2
  exit 1
fi

# -p: re-running into an already existing output directory is not an error.
mkdir -p -- "$output_dir" || exit 1

# Extract fragment coordinates and trim untemplated nucleotides from the
# fragment start.
#
# trim_untemplated_nucleotides STATS_FILE
#   stdin  : SAM records (tab-separated); "@" header lines are skipped
#   stdout : read name, RNA ID, start, end (tab-separated), one line per
#            kept read
#   STATS_FILE is overwritten with one tab-separated line counting the reads
#   that had 0, 1, 2 or 3 nucleotides trimmed.
#
# Only reads with FLAG 99 that carry an MD:Z tag are kept. Mismatches that
# the MD:Z tag reports within the first three aligned bases are treated as
# untemplated: the start is shifted past the last of them (by 1, 2 or 3).
# The end is always start + |TLEN| - 1, computed from the untrimmed start.
trim_untemplated_nucleotides() {
  awk -v out="$1" '
    function abs(value) { return (value < 0 ? -value : value) }

    BEGIN {
      OFS = "\t"
      counter[0] = 0; counter[1] = 0; counter[2] = 0; counter[3] = 0
    }

    # Header lines and reads whose FLAG is not 99 are ignored.
    /^@/ || $2 != 99 { next }

    # MD does not start with 0X, 1X or 2X (X in ACGT): nothing to trim.
    # The tag is anchored on the preceding tab; SAM fields are tab-separated.
    /\tMD:Z:/ && !/MD:Z:([012][ACGT])/ {
      print $1, $3, $4 + 0, $4 + abs($9) - 1
      counter[0]++
      next
    }

    # MD starts with 0X and the next mismatch is not adjacent: trim 1.
    /\tMD:Z:0[ACGT]/ && !/MD:Z:0[ACGT][01][ACGT]/ {
      print $1, $3, $4 + 1, $4 + abs($9) - 1
      counter[1]++
      next
    }

    # MD starts with 1X (not 1X0X) or with 0X0X (not 0X0X0X): trim 2.
    (/\tMD:Z:1[ACGT]/ && !/MD:Z:1[ACGT]0[ACGT]/) || (/MD:Z:0[ACGT]0[ACGT]/ && !/MD:Z:0[ACGT]0[ACGT]0[ACGT]/) {
      print $1, $3, $4 + 2, $4 + abs($9) - 1
      counter[2]++
      next
    }

    # MD starts with 1X0X, 0X1X, 0X0X0X or 2X: trim 3.
    /\tMD:Z:1[ACGT]0[ACGT]/ || /MD:Z:0[ACGT]1[ACGT]/ || /MD:Z:0[ACGT]0[ACGT]0[ACGT]/ || /MD:Z:2[ACGT]/ {
      print $1, $3, $4 + 3, $4 + abs($9) - 1
      counter[3]++
      next
    }

    END {
      print "No trimming:", counter[0], "1 nt trimmed:", counter[1], "2 nt trimmed:", counter[2], "3 nt trimmed:", counter[3] > out
    }
  '
}

# Runs in the background; the `wait` further down in this script collects it
# before positions_temp_sorted.gz is read.
samtools view "$bamfile" \
  | trim_untemplated_nucleotides "${output_dir}/trimming_stats.txt" \
  | sort -k1,1 \
  | gzip > positions_temp_sorted.gz &

# Print the barcode, i.e. the second whitespace-separated column, found on
# the first line of the barcodes file $1.
first_barcode() {
  head -n 1 -- "$1" | awk '{print $2}'
}

# Print the barcodes file $1 with the first character of every line removed
# (the leading "@" of the read name), sorted on the read-name column.
sorted_barcodes() {
  sed 's/^.//' -- "$1" | sort -k1,1
}

# Computing barcode length (use the first line and compute the string length
# of the second column).
TMP=$(first_barcode "$barcodes")
BAR_LEN=${#TMP}

# Remove "@" from barcodes and sort them.
sorted_barcodes "$barcodes" | gzip > barcodes_temp_sorted.gz &

# Collect every background job started so far before their outputs are read.
wait

# Merge positions and barcodes.
#
# merge_positions_with_barcodes POSITIONS BARCODES BAR_LEN
#   POSITIONS : text sorted on column 1: read name, RNA ID, start, end
#   BARCODES  : text sorted on column 1: read name, barcode
#   BAR_LEN   : required barcode length
#   stdout    : "RNA_ID start end barcode" (space-separated) for every read
#               present in both inputs whose barcode contains no "N" and is
#               exactly BAR_LEN characters long
merge_positions_with_barcodes() {
  join -1 1 "$1" "$2" \
    | cut -d " " -f 2,3,4,5 \
    | awk -v bar_len="$3" '$4 !~ /N/ && length($4) == bar_len'
}

# gzip -dc instead of zcat: some zcat implementations only accept ".Z" files.
merge_positions_with_barcodes \
  <(gzip -dc positions_temp_sorted.gz) \
  <(gzip -dc barcodes_temp_sorted.gz) \
  "${BAR_LEN}" \
  | gzip > merged_temp.gz

# set_fragment_end POS
#   stdin  : "RNA_ID start end barcode" lines
#   stdout : the same lines with the end column replaced by POS
set_fragment_end() {
  awk -v pos="$1" '{print $1, $2, pos, $4}'
}

#### If priming flag is set, every fragment end is replaced by the priming
#### position. An unset or empty priming_pos is treated like the default -1.
if [[ "${priming_pos:--1}" != -1 ]]; then
  # Rewrite through a compressed temporary copy; merged_temp.gz is only
  # replaced when the new copy was written successfully.
  gzip -dc merged_temp.gz \
    | set_fragment_end "$priming_pos" \
    | gzip > merged_temp2.gz \
    && mv -- merged_temp2.gz merged_temp.gz
fi

# File summary.txt columns: RNA_ID, Start, End, barcode sequence,
# sequenced_count [= number of sequenced fragments fulfilling the previous
# requirements]
#
# count_barcoded_fragments
#   stdin  : "RNA_ID start end barcode" lines
#   stdout : one "RNA_ID start end barcode count" line per distinct input
#            combination, in unspecified order
# A single flat key is used instead of nested arrays so that the program
# runs on any POSIX awk, not only on gawk.
count_barcoded_fragments() {
  awk '
    { seen[$1 " " $2 " " $3 " " $4]++ }
    END { for (key in seen) print key, seen[key] }
  '
}

gzip -dc merged_temp.gz | count_barcoded_fragments > "${output_dir}/summary.txt"

# count_fragments
#   stdin  : lines whose first three columns are RNA_ID, start, end
#   stdout : one "RNA_ID start end count" line per distinct fragment, where
#            count is the number of input lines for that fragment, in
#            unspecified order
# A single flat key is used instead of nested arrays so that the program
# runs on any POSIX awk, not only on gawk.
count_fragments() {
  awk '
    { seen[$1 " " $2 " " $3]++ }
    END { for (key in seen) print key, seen[key] }
  '
}

# File unique_barcodes.txt columns: RNA_ID, Start, End, number of unique
# barcodes observed for this fragment (summary.txt holds one line per
# fragment/barcode pair, so counting its lines counts barcodes).
# [PROBLEM: How to treat the different 3cdns for the same fragment? If the
# template was homogeneous then it should always be the same]
count_fragments < "${output_dir}/summary.txt" > "${output_dir}/unique_barcodes.txt" &

# File read_counts.txt columns: RNA_ID, Start, End, sequenced_count
gzip -dc merged_temp.gz | count_fragments > "${output_dir}/read_counts.txt" &

# Both tables are produced in parallel; wait for them to finish.
wait

## Remove temporary files. merged_temp.gz is deliberately kept in case
## debugging is needed.
# -f: a temporary file that was never created is not an error here.
rm -f -- positions_temp_sorted.gz barcodes_temp_sorted.gz
#rm -f -- merged_temp.gz

## End of remove temp files