Mercurial > repos > nikos > rna_probing
comparison preprocessing.sh @ 24:431aebd93843 draft default tip
Fixed a bug in k2n.R where the function k2n_calc() would result in an error for single-end read files.
| author | nikos |
|---|---|
| date | Wed, 05 Aug 2015 09:21:02 -0400 |
| parents | 31f25b37187b |
| children |
comparison
equal
deleted
inserted
replaced
| 23:fb76303acf4f | 24:431aebd93843 |
|---|---|
| 1 #!/bin/bash | |
| 2 | |
| 3 #################################################################################################### | |
| 4 #Copyright (C) 2014 Lukasz Kielpinski, Nikos Sidiropoulos | |
| 5 | |
| 6 #This program is free software: you can redistribute it and/or modify it under the terms of the | |
| 7 #GNU General Public License as published by the Free Software Foundation, either version 3 of the | |
| 8 #License, or (at your option) any later version. | |
| 9 | |
| 10 #This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without | |
| 11 #even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 12 #GNU General Public License for more details (http://www.gnu.org/licenses/). | |
| 13 #################################################################################################### | |
| 14 | |
| 15 #Preprocessing workflow - Read debarcoding and trimming. | |
| 16 | |
| 17 function print_help { | |
| 18 cat <<End-of-message | |
| 19 RNA probing data preprocessing. | |
| 20 Read trimming and debarcoding. | |
| 21 ------------------------------------- | |
| 22 Input arguments: | |
| 23 -h: Help | |
| 24 -1: Read1 (FASTQ) - Required | |
| 25 -2: Read2 (FASTQ) - Optional | |
| 26 -b: Barcode signature | |
| 27 -t: Trimming length | |
| 28 -o: Output folder (Default: "output_dir") | |
| 29 ------------------------------------- | |
| 30 Usage : preprocessing.sh -f <READ1> -r <READ2> -b <BARCODE_SEQ> -t <TRIM_LENGTH> -o <output_dir> | |
| 31 End-of-message | |
| 32 exit | |
| 33 } | |
| 34 | |
| 35 #defaults | |
| 36 output_dir="./output_dir" | |
| 37 read2="" | |
| 38 trim_length=0 | |
| 39 | |
| 40 #parse input | |
| 41 while getopts h1:2:b:t:o: myarg | |
| 42 do case "$myarg" in | |
| 43 h) print_help | |
| 44 exit ;; | |
| 45 1) read1="$OPTARG" ;; #required | |
| 46 2) read2="$OPTARG" ;; #optional | |
| 47 b) barcode="$OPTARG" ;; #optional | |
| 48 t) trim_length="$OPTARG" ;; #optional | |
| 49 o) output_dir="$OPTARG" ;; #optional | |
| 50 [?]) echo "ERROR: Unknown parameter" | |
| 51 print_help | |
| 52 exit 1 ;; | |
| 53 esac | |
| 54 done | |
| 55 | |
| 56 if [[ -z $read1 ]]; then | |
| 57 echo "ERROR: Read1 file required!" | |
| 58 print_help | |
| 59 exit 1 | |
| 60 fi | |
| 61 | |
| 62 # Create the output folder | |
| 63 mkdir -p $output_dir | |
| 64 | |
| 65 if [ -z "$barcode" ]; then | |
| 66 barcode='' | |
| 67 fi | |
| 68 BAR_LENGTH=`eval echo ${#barcode}` | |
| 69 | |
| 70 ################################################################################################### | |
| 71 # Convert the Barcode signature to a regular exression. | |
| 72 | |
| 73 # Example: | |
| 74 # | |
| 75 # Barcode in the oligo used: NWTRYSNNNN | |
| 76 # Which means that each proper read must begin with: NNNN(C|G)(A|G)(C|T)A(A|T)N | |
| 77 # As regex: | |
| 78 # ^[ACGT][ACGT][ACGT][ACGT][CG][AG][CT][A][AT][ACGT] | |
| 79 # Reverse complement the barcode sequence and create a regular expression. | |
| 80 | |
| 81 REV_COMPLEMENT=$(perl -0777ne's/\n //g; tr/ATGCatgcNnYyRrKkMmBbVvDdHh/TACGtacgNnRrYyMmKkVvBbHhDd/; print scalar reverse $_;' <(echo $barcode)) | |
| 82 | |
| 83 REG_EXP=$(sed 's/A/[A]/g;s/C/[C]/g;s/G/[G]/g;s/T/[T]/g;s/R/[AG]/g;s/Y/[CT]/g;s/S/[GC]/g;s/W/[AT]/g;s/K/[GT]/g;s/M/[AC]/g;s/B/[CGT]/g;s/D/[AGT]/g;s/H/[ACT]/g;s/V/[ACG]/g;s/N/[AGCTN]/g;' <<< $REV_COMPLEMENT) | |
| 84 | |
| 85 ################################################################################################### | |
| 86 #Remove reads that do not start with the signature (Read1) | |
| 87 | |
| 88 #First awk removes them, second awk removes corresponding quality strings. | |
| 89 #Remove last N nt - may be derived from the random primers. | |
| 90 | |
| 91 awk '{if(NR%4==2){if(/^'"$REG_EXP"'/){print}else{print ""}}else{print}}' $read1 | | |
| 92 | |
| 93 awk 'BEGIN{trim_flag=0; trimming_stats=0; all_processed=0} | |
| 94 { | |
| 95 if(NR%4==1){print; all_processed++} | |
| 96 if(NR%4==2){if(length($1)==0){trim_flag=1;trimming_stats++}else{trim_flag=0};print} | |
| 97 if(NR%4==3){print} | |
| 98 if(NR%4==0){if(trim_flag==1){print ""}else{print $0}} | |
| 99 }END{print(trimming_stats, all_processed) > "trimming_stats.error"}' | | |
| 100 | |
| 101 awk -v len="${trim_length}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' > R1.fastq & | |
| 102 | |
| 103 wait | |
| 104 | |
| 105 mv trimming_stats.error $output_dir/trimming_stats.txt | |
| 106 | |
| 107 ################################################################################################### | |
| 108 #Trim primers and possible random barcode | |
| 109 | |
| 110 if [ -z "$read2" ]; then | |
| 111 #single-end | |
| 112 | |
| 113 #Extract the barcode sequence from the first read: | |
| 114 awk -v len="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}' R1.fastq | awk '{print($1)}' > $output_dir/read1.fastq & | |
| 115 | |
| 116 awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' R1.fastq | paste - - > $output_dir/barcodes.txt & | |
| 117 | |
| 118 wait | |
| 119 | |
| 120 #Remove temp files | |
| 121 rm R1.fastq | |
| 122 | |
| 123 else | |
| 124 #paired-end | |
| 125 | |
| 126 #Trim primers (Read2) | |
| 127 awk -v len1="${trim_length}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' $read2 > R2.fastq | |
| 128 | |
| 129 ## Fix pairs in Read1 and Read2 fastq files | |
| 130 #Sort | |
| 131 awk '{print($1)}' R1.fastq | paste - - - - | awk '(NF==4){print $0}' | sort -S1G -k1,1 > R1_collapsed.fastq & | |
| 132 awk '{print($1)}' R2.fastq | paste - - - - | awk '(NF==4){print $0}' | sort -S1G -k1,1 > R2_collapsed.fastq & | |
| 133 | |
| 134 wait | |
| 135 | |
| 136 join -1 1 -2 1 R1_collapsed.fastq R2_collapsed.fastq > joined.fastq | |
| 137 | |
| 138 awk 'BEGIN{OFS=""}{print($1"\n"$2"\n"$3"\n"$4)}' joined.fastq > R1_sorted.fastq & | |
| 139 awk 'BEGIN{OFS=""}{print($1"\n"$5"\n"$6"\n"$7)}' joined.fastq > $output_dir/read2.fastq & | |
| 140 | |
| 141 wait | |
| 142 | |
| 143 rm joined.fastq R1_collapsed.fastq R2_collapsed.fastq R1.fastq R2.fastq & | |
| 144 | |
| 145 ######################################################################## | |
| 146 #Extract the barcode sequence from the first read: | |
| 147 awk -v len="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}' R1_sorted.fastq | awk '{print($1)}' > $output_dir/read1.fastq & | |
| 148 | |
| 149 awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' R1_sorted.fastq | paste - - > $output_dir/barcodes.txt & | |
| 150 | |
| 151 wait | |
| 152 | |
| 153 #Remove temp files | |
| 154 rm R1_sorted.fastq | |
| 155 | |
| 156 fi |
