| 10 | 1 #!/bin/bash | 
|  | 2 | 
|  | 3 #################################################################################################### | 
|  | 4 #Copyright (C) 2014 Lukasz Kielpinski, Nikos Sidiropoulos | 
|  | 5 | 
|  | 6 #This program is free software: you can redistribute it and/or modify it under the terms of the | 
|  | 7 #GNU General Public License as published by the Free Software Foundation, either version 3 of the | 
|  | 8 #License, or (at your option) any later version. | 
|  | 9 | 
|  | 10 #This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without | 
|  | 11 #even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 
|  | 12 #GNU General Public License for more details (http://www.gnu.org/licenses/). | 
|  | 13 #################################################################################################### | 
|  | 14 | 
|  | 15 #Preprocessing workflow - Read debarcoding and trimming. | 
|  | 16 | 
|  | 17 function print_help { | 
|  | 18 cat <<End-of-message | 
|  | 19 RNA probing data preprocessing. | 
|  | 20 Read trimming and debarcoding. | 
|  | 21 ------------------------------------- | 
|  | 22 Input arguments: | 
|  | 23 -h: Help | 
|  | 24 -1: Read1 (FASTQ) - Required | 
|  | 25 -2: Read2 (FASTQ) - Optional | 
|  | 26 -b: Barcode signature | 
|  | 27 -t: Trimming length | 
|  | 28 -o: Output folder (default: "output_dir") | 
|  | 29 ------------------------------------- | 
|  | 30 Usage : preprocessing.sh -f <READ1> -r <READ2> -b <BARCODE_SEQ> -t <TRIM_LENGTH> -o <output_dir> | 
|  | 31 End-of-message | 
|  | 32 exit | 
|  | 33 } | 
|  | 34 | 
|  | 35 #defaults | 
|  | 36 output_dir="./output_dir" | 
|  | 37 read2="" | 
|  | 38 trim_length=0 | 
|  | 39 | 
|  | 40 #parse input | 
|  | 41 while getopts h1:2:b:t:o: myarg | 
|  | 42 do      case "$myarg" in | 
|  | 43         h)      print_help | 
|  | 44                 exit ;; | 
|  | 45         1)      read1="$OPTARG" ;; #required | 
|  | 46         2)      read2="$OPTARG" ;; #optional | 
|  | 47         b)      barcode="$OPTARG" ;; #optional | 
|  | 48         t)      trim_length="$OPTARG" ;; #optional | 
|  | 49         o)      output_dir="$OPTARG" ;; #optional | 
|  | 50         [?])    echo "ERROR: Unknown parameter" | 
|  | 51                 print_help | 
|  | 52                 exit 1 ;; | 
|  | 53         esac | 
|  | 54 done | 
|  | 55 | 
|  | 56 if [[ -z $read1 ]]; then | 
|  | 57   echo "ERROR: Read1 file required!" | 
|  | 58   print_help | 
|  | 59   exit 1 | 
|  | 60 fi | 
|  | 61 | 
|  | 62 # Create the output folder | 
|  | 63 mkdir -p $output_dir | 
|  | 64 | 
|  | 65 if [ -z "$barcode" ]; then | 
|  | 66     barcode='' | 
|  | 67 fi | 
|  | 68 BAR_LENGTH=`eval echo ${#barcode}` | 
|  | 69 | 
|  | 70 ################################################################################################### | 
|  | 71 # Convert the Barcode signature to a regular exression. | 
|  | 72 | 
|  | 73 # Example: | 
|  | 74 # | 
|  | 75 # Barcode in the oligo used: NWTRYSNNNN | 
|  | 76 # Which means that each proper read must begin with: NNNN(C|G)(A|G)(C|T)A(A|T)N | 
|  | 77 # As regex: | 
|  | 78 # ^[ACGT][ACGT][ACGT][ACGT][CG][AG][CT][A][AT][ACGT] | 
|  | 79 # Reverse complement the barcode sequence and create a regular expression. | 
|  | 80 | 
|  | 81 REV_COMPLEMENT=$(perl -0777ne's/\n //g; tr/ATGCatgcNnYyRrKkMmBbVvDdHh/TACGtacgNnRrYyMmKkVvBbHhDd/; print scalar reverse $_;' <(echo $barcode)) | 
|  | 82 | 
|  | 83 REG_EXP=$(sed 's/A/[A]/g;s/C/[C]/g;s/G/[G]/g;s/T/[T]/g;s/R/[AG]/g;s/Y/[CT]/g;s/S/[GC]/g;s/W/[AT]/g;s/K/[GT]/g;s/M/[AC]/g;s/B/[CGT]/g;s/D/[AGT]/g;s/H/[ACT]/g;s/V/[ACG]/g;s/N/[AGCTN]/g;' <<< $REV_COMPLEMENT) | 
|  | 84 | 
|  | 85 ################################################################################################### | 
|  | 86 #Remove reads that do not start with the signature (Read1) | 
|  | 87 | 
|  | 88 #First awk removes them, second awk removes corresponding quality strings. | 
|  | 89 #Remove last N nt - may be derived from the random primers. | 
|  | 90 | 
|  | 91 awk '{if(NR%4==2){if(/^'"$REG_EXP"'/){print}else{print ""}}else{print}}' $read1 | | 
|  | 92 | 
|  | 93 awk 'BEGIN{trim_flag=0; trimming_stats=0; all_processed=0} | 
|  | 94 { | 
|  | 95     if(NR%4==1){print; all_processed++} | 
|  | 96     if(NR%4==2){if(length($1)==0){trim_flag=1;trimming_stats++}else{trim_flag=0};print} | 
|  | 97     if(NR%4==3){print} | 
|  | 98     if(NR%4==0){if(trim_flag==1){print ""}else{print $0}} | 
|  | 99 }END{print(trimming_stats, all_processed) > "trimming_stats.error"}' | | 
|  | 100 | 
|  | 101 awk -v len="${trim_length}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' | | 
|  | 102 | 
|  | 103 awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{print}}' | gzip  > R1.fastq.gz & | 
|  | 104 | 
|  | 105 wait | 
|  | 106 | 
|  | 107 mv trimming_stats.error $output_dir/trimming_stats.txt | 
|  | 108 | 
|  | 109 ################################################################################################### | 
|  | 110 #Trim primers and possible random barcode | 
|  | 111 | 
|  | 112 if [ -z "$read2" ]; then | 
|  | 113     #single-end | 
|  | 114 | 
|  | 115     #Extract the barcode sequence from the first read: | 
|  | 116     zcat R1.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' | awk '{print($1)}' > $output_dir/read1.fastq & | 
|  | 117 | 
|  | 118     zcat R1.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > $output_dir/barcodes.txt & | 
|  | 119 | 
|  | 120     wait | 
|  | 121 | 
|  | 122     #Remove temp files | 
|  | 123     rm R1.fastq.gz | 
|  | 124 | 
|  | 125 else | 
|  | 126     #paired-end | 
|  | 127 | 
|  | 128     #Trim primers (Read2) | 
|  | 129     awk -v len1="${trim_length}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' $read2 | | 
|  | 130     awk '{if(NR%2==0 && length($1)<20){printf("\n")}else{print}}' | gzip > R2.fastq.gz & | 
|  | 131 | 
|  | 132     wait | 
|  | 133 | 
|  | 134     #Remove empty reads - remove each pair from for which at least one read of the pair got removed (they are problematic when mapping) | 
|  | 135 | 
|  | 136     #First define which lines to keep from both fastq files (k for keep, d for discard in the lines_to_keep file) | 
|  | 137     paste <(zcat R1.fastq.gz) <(zcat R2.fastq.gz) | awk 'BEGIN{OFS="\n"}{if(NR%4==2 && NF==2){print("k","k","k","k")}else{if(NR%4==2 && NF<2){print("d","d","d","d")}}}' > lines_to_keep | 
|  | 138 | 
|  | 139     paste lines_to_keep <(zcat R1.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | gzip > R1_readsANDbarcodes.fastq.gz & | 
|  | 140 | 
|  | 141     paste lines_to_keep <(zcat R2.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | awk '{print($1)}' > $output_dir/read2.fastq & | 
|  | 142 | 
|  | 143     wait | 
|  | 144 | 
|  | 145     ######################################################################## | 
|  | 146     #Extract the barcode sequence from the first read: | 
|  | 147     zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' | awk '{print($1)}' > $output_dir/read1.fastq & | 
|  | 148 | 
|  | 149     zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > $output_dir/barcodes.txt & | 
|  | 150 | 
|  | 151     wait | 
|  | 152 | 
|  | 153     #Remove temp files | 
|  | 154     rm R1_readsANDbarcodes.fastq.gz R1.fastq.gz R2.fastq.gz lines_to_keep | 
|  | 155 | 
|  | 156 fi |