comparison preprocessing.sh @ 24:431aebd93843 draft default tip

Fixed a bug in k2n.R where the function k2n_calc() would result in an error for single-end read files.
author nikos
date Wed, 05 Aug 2015 09:21:02 -0400
parents 31f25b37187b
children
comparison
equal deleted inserted replaced
23:fb76303acf4f 24:431aebd93843
1 #!/bin/bash
2
3 ####################################################################################################
4 #Copyright (C) 2014 Lukasz Kielpinski, Nikos Sidiropoulos
5
6 #This program is free software: you can redistribute it and/or modify it under the terms of the
7 #GNU General Public License as published by the Free Software Foundation, either version 3 of the
8 #License, or (at your option) any later version.
9
10 #This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
11 #even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 #GNU General Public License for more details (http://www.gnu.org/licenses/).
13 ####################################################################################################
14
15 #Preprocessing workflow - Read debarcoding and trimming.
16
17 function print_help {
18 cat <<End-of-message
19 RNA probing data preprocessing.
20 Read trimming and debarcoding.
21 -------------------------------------
22 Input arguments:
23 -h: Help
24 -1: Read1 (FASTQ) - Required
25 -2: Read2 (FASTQ) - Optional
26 -b: Barcode signature
27 -t: Trimming length
28 -o: Output folder (Default: "output_dir")
29 -------------------------------------
30 Usage : preprocessing.sh -f <READ1> -r <READ2> -b <BARCODE_SEQ> -t <TRIM_LENGTH> -o <output_dir>
31 End-of-message
32 exit
33 }
34
35 #defaults
36 output_dir="./output_dir"
37 read2=""
38 trim_length=0
39
40 #parse input
41 while getopts h1:2:b:t:o: myarg
42 do case "$myarg" in
43 h) print_help
44 exit ;;
45 1) read1="$OPTARG" ;; #required
46 2) read2="$OPTARG" ;; #optional
47 b) barcode="$OPTARG" ;; #optional
48 t) trim_length="$OPTARG" ;; #optional
49 o) output_dir="$OPTARG" ;; #optional
50 [?]) echo "ERROR: Unknown parameter"
51 print_help
52 exit 1 ;;
53 esac
54 done
55
56 if [[ -z $read1 ]]; then
57 echo "ERROR: Read1 file required!"
58 print_help
59 exit 1
60 fi
61
62 # Create the output folder
63 mkdir -p $output_dir
64
65 if [ -z "$barcode" ]; then
66 barcode=''
67 fi
68 BAR_LENGTH=`eval echo ${#barcode}`
69
70 ###################################################################################################
71 # Convert the Barcode signature to a regular exression.
72
73 # Example:
74 #
75 # Barcode in the oligo used: NWTRYSNNNN
76 # Which means that each proper read must begin with: NNNN(C|G)(A|G)(C|T)A(A|T)N
77 # As regex:
78 # ^[ACGT][ACGT][ACGT][ACGT][CG][AG][CT][A][AT][ACGT]
79 # Reverse complement the barcode sequence and create a regular expression.
80
81 REV_COMPLEMENT=$(perl -0777ne's/\n //g; tr/ATGCatgcNnYyRrKkMmBbVvDdHh/TACGtacgNnRrYyMmKkVvBbHhDd/; print scalar reverse $_;' <(echo $barcode))
82
83 REG_EXP=$(sed 's/A/[A]/g;s/C/[C]/g;s/G/[G]/g;s/T/[T]/g;s/R/[AG]/g;s/Y/[CT]/g;s/S/[GC]/g;s/W/[AT]/g;s/K/[GT]/g;s/M/[AC]/g;s/B/[CGT]/g;s/D/[AGT]/g;s/H/[ACT]/g;s/V/[ACG]/g;s/N/[AGCTN]/g;' <<< $REV_COMPLEMENT)
84
85 ###################################################################################################
86 #Remove reads that do not start with the signature (Read1)
87
88 #First awk removes them, second awk removes corresponding quality strings.
89 #Remove last N nt - may be derived from the random primers.
90
91 awk '{if(NR%4==2){if(/^'"$REG_EXP"'/){print}else{print ""}}else{print}}' $read1 |
92
93 awk 'BEGIN{trim_flag=0; trimming_stats=0; all_processed=0}
94 {
95 if(NR%4==1){print; all_processed++}
96 if(NR%4==2){if(length($1)==0){trim_flag=1;trimming_stats++}else{trim_flag=0};print}
97 if(NR%4==3){print}
98 if(NR%4==0){if(trim_flag==1){print ""}else{print $0}}
99 }END{print(trimming_stats, all_processed) > "trimming_stats.error"}' |
100
101 awk -v len="${trim_length}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' > R1.fastq &
102
103 wait
104
105 mv trimming_stats.error $output_dir/trimming_stats.txt
106
107 ###################################################################################################
108 #Trim primers and possible random barcode
109
110 if [ -z "$read2" ]; then
111 #single-end
112
113 #Extract the barcode sequence from the first read:
114 awk -v len="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}' R1.fastq | awk '{print($1)}' > $output_dir/read1.fastq &
115
116 awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' R1.fastq | paste - - > $output_dir/barcodes.txt &
117
118 wait
119
120 #Remove temp files
121 rm R1.fastq
122
123 else
124 #paired-end
125
126 #Trim primers (Read2)
127 awk -v len1="${trim_length}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' $read2 > R2.fastq
128
129 ## Fix pairs in Read1 and Read2 fastq files
130 #Sort
131 awk '{print($1)}' R1.fastq | paste - - - - | awk '(NF==4){print $0}' | sort -S1G -k1,1 > R1_collapsed.fastq &
132 awk '{print($1)}' R2.fastq | paste - - - - | awk '(NF==4){print $0}' | sort -S1G -k1,1 > R2_collapsed.fastq &
133
134 wait
135
136 join -1 1 -2 1 R1_collapsed.fastq R2_collapsed.fastq > joined.fastq
137
138 awk 'BEGIN{OFS=""}{print($1"\n"$2"\n"$3"\n"$4)}' joined.fastq > R1_sorted.fastq &
139 awk 'BEGIN{OFS=""}{print($1"\n"$5"\n"$6"\n"$7)}' joined.fastq > $output_dir/read2.fastq &
140
141 wait
142
143 rm joined.fastq R1_collapsed.fastq R2_collapsed.fastq R1.fastq R2.fastq &
144
145 ########################################################################
146 #Extract the barcode sequence from the first read:
147 awk -v len="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}' R1_sorted.fastq | awk '{print($1)}' > $output_dir/read1.fastq &
148
149 awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' R1_sorted.fastq | paste - - > $output_dir/barcodes.txt &
150
151 wait
152
153 #Remove temp files
154 rm R1_sorted.fastq
155
156 fi