annotate preprocessing.sh @ 10:33e625bef2b9 draft

Uploaded
author nikos
date Wed, 05 Nov 2014 09:09:03 -0500
parents
children 68af3d2260aa
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
1 #!/bin/bash
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
2
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
3 ####################################################################################################
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
4 #Copyright (C) 2014 Lukasz Kielpinski, Nikos Sidiropoulos
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
5
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
6 #This program is free software: you can redistribute it and/or modify it under the terms of the
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
7 #GNU General Public License as published by the Free Software Foundation, either version 3 of the
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
8 #License, or (at your option) any later version.
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
9
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
10 #This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
11 #even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
12 #GNU General Public License for more details (http://www.gnu.org/licenses/).
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
13 ####################################################################################################
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
14
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
15 #Preprocessing workflow - Read debarcoding and trimming.
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
16
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
17 function print_help {
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
18 cat <<End-of-message
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
19 RNA probing data preprocessing.
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
20 Read trimming and debarcoding.
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
21 -------------------------------------
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
22 Input arguments:
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
23 -h: Help
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
24 -1: Read1 (FASTQ) - Required
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
25 -2: Read2 (FASTQ) - Optional
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
26 -b: Barcode signature
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
27 -t: Trimming length
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
28 -o: Output folder (default: "output_dir")
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
29 -------------------------------------
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
30 Usage : preprocessing.sh -f <READ1> -r <READ2> -b <BARCODE_SEQ> -t <TRIM_LENGTH> -o <output_dir>
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
31 End-of-message
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
32 exit
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
33 }
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
34
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
35 #defaults
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
36 output_dir="./output_dir"
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
37 read2=""
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
38 trim_length=0
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
39
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
40 #parse input
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
41 while getopts h1:2:b:t:o: myarg
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
42 do case "$myarg" in
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
43 h) print_help
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
44 exit ;;
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
45 1) read1="$OPTARG" ;; #required
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
46 2) read2="$OPTARG" ;; #optional
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
47 b) barcode="$OPTARG" ;; #optional
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
48 t) trim_length="$OPTARG" ;; #optional
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
49 o) output_dir="$OPTARG" ;; #optional
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
50 [?]) echo "ERROR: Unknown parameter"
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
51 print_help
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
52 exit 1 ;;
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
53 esac
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
54 done
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
55
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
56 if [[ -z $read1 ]]; then
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
57 echo "ERROR: Read1 file required!"
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
58 print_help
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
59 exit 1
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
60 fi
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
61
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
62 # Create the output folder
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
63 mkdir -p $output_dir
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
64
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
65 if [ -z "$barcode" ]; then
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
66 barcode=''
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
67 fi
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
68 BAR_LENGTH=`eval echo ${#barcode}`
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
69
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
70 ###################################################################################################
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
71 # Convert the Barcode signature to a regular exression.
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
72
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
73 # Example:
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
74 #
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
75 # Barcode in the oligo used: NWTRYSNNNN
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
76 # Which means that each proper read must begin with: NNNN(C|G)(A|G)(C|T)A(A|T)N
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
77 # As regex:
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
78 # ^[ACGT][ACGT][ACGT][ACGT][CG][AG][CT][A][AT][ACGT]
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
79 # Reverse complement the barcode sequence and create a regular expression.
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
80
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
81 REV_COMPLEMENT=$(perl -0777ne's/\n //g; tr/ATGCatgcNnYyRrKkMmBbVvDdHh/TACGtacgNnRrYyMmKkVvBbHhDd/; print scalar reverse $_;' <(echo $barcode))
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
82
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
83 REG_EXP=$(sed 's/A/[A]/g;s/C/[C]/g;s/G/[G]/g;s/T/[T]/g;s/R/[AG]/g;s/Y/[CT]/g;s/S/[GC]/g;s/W/[AT]/g;s/K/[GT]/g;s/M/[AC]/g;s/B/[CGT]/g;s/D/[AGT]/g;s/H/[ACT]/g;s/V/[ACG]/g;s/N/[AGCTN]/g;' <<< $REV_COMPLEMENT)
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
84
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
85 ###################################################################################################
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
86 #Remove reads that do not start with the signature (Read1)
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
87
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
88 #First awk removes them, second awk removes corresponding quality strings.
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
89 #Remove last N nt - may be derived from the random primers.
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
90
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
91 awk '{if(NR%4==2){if(/^'"$REG_EXP"'/){print}else{print ""}}else{print}}' $read1 |
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
92
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
93 awk 'BEGIN{trim_flag=0; trimming_stats=0; all_processed=0}
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
94 {
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
95 if(NR%4==1){print; all_processed++}
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
96 if(NR%4==2){if(length($1)==0){trim_flag=1;trimming_stats++}else{trim_flag=0};print}
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
97 if(NR%4==3){print}
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
98 if(NR%4==0){if(trim_flag==1){print ""}else{print $0}}
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
99 }END{print(trimming_stats, all_processed) > "trimming_stats.error"}' |
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
100
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
101 awk -v len="${trim_length}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' |
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
102
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
103 awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{print}}' | gzip > R1.fastq.gz &
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
104
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
105 wait
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
106
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
107 mv trimming_stats.error $output_dir/trimming_stats.txt
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
108
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
109 ###################################################################################################
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
110 #Trim primers and possible random barcode
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
111
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
112 if [ -z "$read2" ]; then
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
113 #single-end
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
114
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
115 #Extract the barcode sequence from the first read:
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
116 zcat R1.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' | awk '{print($1)}' > $output_dir/read1.fastq &
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
117
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
118 zcat R1.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > $output_dir/barcodes.txt &
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
119
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
120 wait
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
121
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
122 #Remove temp files
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
123 rm R1.fastq.gz
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
124
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
125 else
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
126 #paired-end
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
127
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
128 #Trim primers (Read2)
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
129 awk -v len1="${trim_length}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' $read2 |
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
130 awk '{if(NR%2==0 && length($1)<20){printf("\n")}else{print}}' | gzip > R2.fastq.gz &
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
131
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
132 wait
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
133
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
134 #Remove empty reads - remove each pair from for which at least one read of the pair got removed (they are problematic when mapping)
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
135
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
136 #First define which lines to keep from both fastq files (k for keep, d for discard in the lines_to_keep file)
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
137 paste <(zcat R1.fastq.gz) <(zcat R2.fastq.gz) | awk 'BEGIN{OFS="\n"}{if(NR%4==2 && NF==2){print("k","k","k","k")}else{if(NR%4==2 && NF<2){print("d","d","d","d")}}}' > lines_to_keep
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
138
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
139 paste lines_to_keep <(zcat R1.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | gzip > R1_readsANDbarcodes.fastq.gz &
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
140
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
141 paste lines_to_keep <(zcat R2.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | awk '{print($1)}' > $output_dir/read2.fastq &
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
142
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
143 wait
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
144
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
145 ########################################################################
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
146 #Extract the barcode sequence from the first read:
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
147 zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' | awk '{print($1)}' > $output_dir/read1.fastq &
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
148
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
149 zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > $output_dir/barcodes.txt &
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
150
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
151 wait
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
152
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
153 #Remove temp files
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
154 rm R1_readsANDbarcodes.fastq.gz R1.fastq.gz R2.fastq.gz lines_to_keep
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
155
33e625bef2b9 Uploaded
nikos
parents:
diff changeset
156 fi