Mercurial > repos > nikos > rna_probing
comparison preprocessing.sh @ 0:83dfe38f6a09 draft
Uploaded
author | nikos |
---|---|
date | Tue, 04 Nov 2014 14:28:45 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:83dfe38f6a09 |
---|---|
1 #!/bin/bash | |
2 | |
3 #Preprocessing workflow - trimming adapters, barcodes etc | |
4 | |
5 # Barcode in the oligo used: NWTRYSNNNN | |
6 # Which means that each proper read must begin with: NNNN(C|G)(A|G)(C|T)A(A|T)N | |
7 # As regex: | |
8 # ^[ACGT][ACGT][ACGT][ACGT][CG][AG][CT][A][AT][ACGT] | |
9 | |
10 function print_help { | |
11 cat <<End-of-message | |
12 RNA probing data preprocessing. | |
13 Read trimming and debarcoding. | |
14 ------------------------------------- | |
15 Input arguments: | |
16 -h: Help | |
17 -1: Read1 (FASTQ) - Required | |
18 -2: Read2 (FASTQ) - Optional | |
19 -b: Barcode signature | |
20 -t: Trimming length | |
21 -o: Output folder (default: "output_dir") | |
22 ------------------------------------- | |
23 Usage : preprocessing.sh -f <READ1> -r <READ2> -b <BARCODE_SEQ> -t <TRIM_LENGTH> -o <output_dir> | |
24 End-of-message | |
25 exit | |
26 } | |
27 | |
28 #defaults | |
29 output_dir="./output_dir" | |
30 read2="" | |
31 trim_length=0 | |
32 | |
33 #parse input | |
34 while getopts h1:2:b:t:o: myarg | |
35 do case "$myarg" in | |
36 h) print_help | |
37 exit ;; | |
38 1) read1="$OPTARG" ;; #required | |
39 2) read2="$OPTARG" ;; #optional | |
40 b) barcode="$OPTARG" ;; #optional | |
41 t) trim_length="$OPTARG" ;; #optional | |
42 o) output_dir="$OPTARG" ;; #optional | |
43 [?]) echo "ERROR: Unknown parameter" | |
44 print_help | |
45 exit 1 ;; | |
46 esac | |
47 done | |
48 | |
49 if [[ -z $read1 ]]; then | |
50 echo "ERROR: Read1 file required!" | |
51 print_help | |
52 exit 1 | |
53 fi | |
54 | |
55 # Create the output folder | |
56 mkdir -p $output_dir | |
57 | |
58 if [ -z "$barcode" ]; then | |
59 barcode='' | |
60 fi | |
61 BAR_LENGTH=`eval echo ${#barcode}` | |
62 | |
63 ######################################################################################### | |
64 | |
65 # Reverse complement the barcode sequence and create a regular expression. | |
66 | |
67 REV_COMPLEMENT=$(perl -0777ne's/\n //g; tr/ATGCatgcNnYyRrKkMmBbVvDdHh/TACGtacgNnRrYyMmKkVvBbHhDd/; print scalar reverse $_;' <(echo $barcode)) | |
68 | |
69 REG_EXP=$(sed 's/A/[A]/g;s/C/[C]/g;s/G/[G]/g;s/T/[T]/g;s/R/[AG]/g;s/Y/[CT]/g;s/S/[GC]/g;s/W/[AT]/g;s/K/[GT]/g;s/M/[AC]/g;s/B/[CGT]/g;s/D/[AGT]/g;s/H/[ACT]/g;s/V/[ACG]/g;s/N/[AGCTN]/g;' <<< $REV_COMPLEMENT) | |
70 | |
71 ######################################################################## | |
72 #1. Remove all the reads that do not start with the signature | |
73 | |
74 # (first awk removes them, second awk removes corresponding quality strings) followed by cutadapt. | |
75 | |
76 # After cutadapt, remove last 15 nt - may be derived from the random primer [should be optional, as the user may: 1) use different random primer length, 2) have short reasd and would lose too much info] and remove all the reads that are shorter than 30 nt | |
77 | |
78 # (10 nt barcode + 20 nt for mapping) | |
79 | |
80 awk '{if(NR%4==2){if(/^'"$REG_EXP"'/){print}else{print ""}}else{print}}' $read1 | | |
81 | |
82 awk 'BEGIN{trim_flag=0; trimming_stats=0; all_processed=0} | |
83 { | |
84 if(NR%4==1){print; all_processed++} | |
85 if(NR%4==2){if(length($1)==0){trim_flag=1;trimming_stats++}else{trim_flag=0};print} | |
86 if(NR%4==3){print} | |
87 if(NR%4==0){if(trim_flag==1){print ""}else{print $0}} | |
88 }END{print(trimming_stats, all_processed) > "trimming_stats.error"}' | | |
89 | |
90 awk -v len="${trim_length}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' | | |
91 | |
92 awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{print}}' | gzip > R1.fastq.gz & | |
93 | |
94 wait | |
95 | |
96 mv trimming_stats.error $output_dir/trimming_stats.txt | |
97 | |
98 ######################################################################## | |
99 #2. Trim the adapter, primer and possible random barcode from the second read | |
100 | |
101 if [ -z "$read2" ]; then | |
102 #single-end | |
103 | |
104 #3. Remove empty reads - Unecessary | |
105 #4. Extract the barcode sequence from the first read: | |
106 zcat R1.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' | awk '{print($1)}' > $output_dir/read1.fastq & | |
107 | |
108 zcat R1.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > $output_dir/barcodes.txt & | |
109 | |
110 wait | |
111 | |
112 ######################################################################## | |
113 #5. Remove temp files | |
114 | |
115 rm R1.fastq.gz | |
116 | |
117 else | |
118 #paired-end | |
119 | |
120 awk -v len1="${trim_length}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' $read2 | | |
121 | |
122 awk '{if(NR%2==0 && length($1)<20){printf("\n")}else{print}}' | gzip > R2.fastq.gz & | |
123 | |
124 wait | |
125 | |
126 ######################################################################## | |
127 #3. Remove empty reads - remove each pair from for which at least one read of the pair got removed (they are problematic for tophat mapping) | |
128 #first define which lines to keep from both fastq files (k for keep, d for discard in the lines_to_keep file) | |
129 | |
130 paste <(zcat R1.fastq.gz) <(zcat R2.fastq.gz) | awk 'BEGIN{OFS="\n"}{if(NR%4==2 && NF==2){print("k","k","k","k")}else{if(NR%4==2 && NF<2){print("d","d","d","d")}}}' > lines_to_keep | |
131 | |
132 paste lines_to_keep <(zcat R1.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | gzip > R1_readsANDbarcodes.fastq.gz & | |
133 | |
134 paste lines_to_keep <(zcat R2.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | awk '{print($1)}' > $output_dir/read2.fastq & | |
135 | |
136 wait | |
137 | |
138 rm lines_to_keep | |
139 | |
140 ######################################################################## | |
141 #4. Extract the barcode sequence from the first read: | |
142 zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' | awk '{print($1)}' > $output_dir/read1.fastq & | |
143 | |
144 zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > $output_dir/barcodes.txt & | |
145 | |
146 wait | |
147 | |
148 ######################################################################## | |
149 #5. Remove temp files | |
150 | |
151 rm R1_readsANDbarcodes.fastq.gz R1.fastq.gz R2.fastq.gz | |
152 | |
153 fi |