annotate preprocessing.sh @ 0:83dfe38f6a09 draft

Uploaded
author nikos
date Tue, 04 Nov 2014 14:28:45 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
1 #!/bin/bash
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
2
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
3 #Preprocessing workflow - trimming adapters, barcodes etc
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
4
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
5 # Barcode in the oligo used: NWTRYSNNNN
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
6 # Which means that each proper read must begin with: NNNN(C|G)(A|G)(C|T)A(A|T)N
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
7 # As regex:
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
8 # ^[ACGT][ACGT][ACGT][ACGT][CG][AG][CT][A][AT][ACGT]
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
9
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
10 function print_help {
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
11 cat <<End-of-message
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
12 RNA probing data preprocessing.
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
13 Read trimming and debarcoding.
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
14 -------------------------------------
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
15 Input arguments:
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
16 -h: Help
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
17 -1: Read1 (FASTQ) - Required
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
18 -2: Read2 (FASTQ) - Optional
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
19 -b: Barcode signature
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
20 -t: Trimming length
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
21 -o: Output folder (default: "output_dir")
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
22 -------------------------------------
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
23 Usage : preprocessing.sh -f <READ1> -r <READ2> -b <BARCODE_SEQ> -t <TRIM_LENGTH> -o <output_dir>
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
24 End-of-message
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
25 exit
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
26 }
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
27
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
28 #defaults
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
29 output_dir="./output_dir"
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
30 read2=""
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
31 trim_length=0
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
32
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
33 #parse input
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
34 while getopts h1:2:b:t:o: myarg
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
35 do case "$myarg" in
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
36 h) print_help
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
37 exit ;;
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
38 1) read1="$OPTARG" ;; #required
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
39 2) read2="$OPTARG" ;; #optional
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
40 b) barcode="$OPTARG" ;; #optional
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
41 t) trim_length="$OPTARG" ;; #optional
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
42 o) output_dir="$OPTARG" ;; #optional
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
43 [?]) echo "ERROR: Unknown parameter"
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
44 print_help
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
45 exit 1 ;;
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
46 esac
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
47 done
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
48
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
49 if [[ -z $read1 ]]; then
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
50 echo "ERROR: Read1 file required!"
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
51 print_help
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
52 exit 1
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
53 fi
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
54
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
55 # Create the output folder
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
56 mkdir -p $output_dir
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
57
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
58 if [ -z "$barcode" ]; then
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
59 barcode=''
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
60 fi
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
61 BAR_LENGTH=`eval echo ${#barcode}`
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
62
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
63 #########################################################################################
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
64
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
65 # Reverse complement the barcode sequence and create a regular expression.
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
66
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
67 REV_COMPLEMENT=$(perl -0777ne's/\n //g; tr/ATGCatgcNnYyRrKkMmBbVvDdHh/TACGtacgNnRrYyMmKkVvBbHhDd/; print scalar reverse $_;' <(echo $barcode))
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
68
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
69 REG_EXP=$(sed 's/A/[A]/g;s/C/[C]/g;s/G/[G]/g;s/T/[T]/g;s/R/[AG]/g;s/Y/[CT]/g;s/S/[GC]/g;s/W/[AT]/g;s/K/[GT]/g;s/M/[AC]/g;s/B/[CGT]/g;s/D/[AGT]/g;s/H/[ACT]/g;s/V/[ACG]/g;s/N/[AGCTN]/g;' <<< $REV_COMPLEMENT)
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
70
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
71 ########################################################################
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
72 #1. Remove all the reads that do not start with the signature
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
73
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
74 # (first awk removes them, second awk removes corresponding quality strings) followed by cutadapt.
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
75
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
76 # After cutadapt, remove last 15 nt - may be derived from the random primer [should be optional, as the user may: 1) use different random primer length, 2) have short reasd and would lose too much info] and remove all the reads that are shorter than 30 nt
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
77
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
78 # (10 nt barcode + 20 nt for mapping)
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
79
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
80 awk '{if(NR%4==2){if(/^'"$REG_EXP"'/){print}else{print ""}}else{print}}' $read1 |
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
81
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
82 awk 'BEGIN{trim_flag=0; trimming_stats=0; all_processed=0}
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
83 {
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
84 if(NR%4==1){print; all_processed++}
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
85 if(NR%4==2){if(length($1)==0){trim_flag=1;trimming_stats++}else{trim_flag=0};print}
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
86 if(NR%4==3){print}
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
87 if(NR%4==0){if(trim_flag==1){print ""}else{print $0}}
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
88 }END{print(trimming_stats, all_processed) > "trimming_stats.error"}' |
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
89
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
90 awk -v len="${trim_length}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' |
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
91
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
92 awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{print}}' | gzip > R1.fastq.gz &
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
93
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
94 wait
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
95
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
96 mv trimming_stats.error $output_dir/trimming_stats.txt
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
97
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
98 ########################################################################
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
99 #2. Trim the adapter, primer and possible random barcode from the second read
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
100
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
101 if [ -z "$read2" ]; then
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
102 #single-end
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
103
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
104 #3. Remove empty reads - Unecessary
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
105 #4. Extract the barcode sequence from the first read:
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
106 zcat R1.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' | awk '{print($1)}' > $output_dir/read1.fastq &
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
107
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
108 zcat R1.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > $output_dir/barcodes.txt &
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
109
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
110 wait
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
111
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
112 ########################################################################
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
113 #5. Remove temp files
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
114
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
115 rm R1.fastq.gz
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
116
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
117 else
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
118 #paired-end
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
119
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
120 awk -v len1="${trim_length}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' $read2 |
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
121
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
122 awk '{if(NR%2==0 && length($1)<20){printf("\n")}else{print}}' | gzip > R2.fastq.gz &
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
123
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
124 wait
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
125
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
126 ########################################################################
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
127 #3. Remove empty reads - remove each pair from for which at least one read of the pair got removed (they are problematic for tophat mapping)
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
128 #first define which lines to keep from both fastq files (k for keep, d for discard in the lines_to_keep file)
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
129
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
130 paste <(zcat R1.fastq.gz) <(zcat R2.fastq.gz) | awk 'BEGIN{OFS="\n"}{if(NR%4==2 && NF==2){print("k","k","k","k")}else{if(NR%4==2 && NF<2){print("d","d","d","d")}}}' > lines_to_keep
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
131
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
132 paste lines_to_keep <(zcat R1.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | gzip > R1_readsANDbarcodes.fastq.gz &
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
133
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
134 paste lines_to_keep <(zcat R2.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | awk '{print($1)}' > $output_dir/read2.fastq &
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
135
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
136 wait
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
137
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
138 rm lines_to_keep
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
139
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
140 ########################################################################
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
141 #4. Extract the barcode sequence from the first read:
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
142 zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' | awk '{print($1)}' > $output_dir/read1.fastq &
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
143
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
144 zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > $output_dir/barcodes.txt &
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
145
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
146 wait
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
147
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
148 ########################################################################
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
149 #5. Remove temp files
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
150
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
151 rm R1_readsANDbarcodes.fastq.gz R1.fastq.gz R2.fastq.gz
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
152
83dfe38f6a09 Uploaded
nikos
parents:
diff changeset
153 fi