Mercurial > repos > nikos > hrf_seq
comparison preprocessing.sh @ 2:14d6929f8aa1 draft default tip
Tool tests or/and test-data are missing.
| author | nikos |
|---|---|
| date | Thu, 11 Sep 2014 08:30:05 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:fd7e52483bbc | 2:14d6929f8aa1 |
|---|---|
| 1 #!/bin/bash | |
| 2 | |
| 3 #Preprocessing workflow - trimming adapters, barcodes etc | |
| 4 | |
| 5 # Barcode in the oligo used: NWTRYSNNNN | |
| 6 | |
| 7 # Which means that each proper read must begin with: NNNN(C|G)(A|G)(C|T)A(A|T)N | |
| 8 | |
| 9 # As regex: | |
| 10 | |
| 11 # ^[ACGT][ACGT][ACGT][ACGT][CG][AG][CT][A][AT][ACGT] | |
| 12 | |
| 13 | |
| 14 READ1=${1} | |
| 15 READ2=${2} | |
| 16 BARCODE=$3 | |
| 17 ADAPTER1=$4 | |
| 18 ADAPTER2=$5 | |
| 19 CUTOFF=$6 | |
| 20 CUTADAPT_LOG=${7} | |
| 21 TRIM_LENGTH=$8 | |
| 22 OVERLAP_LENGTH=$9 | |
| 23 OUT_R1=${10} | |
| 24 OUT_R2=${11} | |
| 25 OUT_BARCODES=${12} | |
| 26 | |
| 27 BAR_LENGTH=`eval echo ${#BARCODE}` | |
| 28 | |
| 29 ######################################################################################### | |
| 30 | |
| 31 # Reverse complement the barcode sequence and create a regular expression. | |
| 32 | |
| 33 REV_COMPLEMENT=$(perl -0777ne's/\n //g; tr/ATGCatgcNnYyRrKkMmBbVvDdHh/TACGtacgNnRrYyMmKkVvBbHhDd/; print scalar reverse $_;' <(echo $BARCODE)) | |
| 34 | |
| 35 REG_EXP=$(sed 's/A/[A]/g;s/C/[C]/g;s/G/[G]/g;s/T/[T]/g;s/R/[AG]/g;s/Y/[CT]/g;s/S/[GC]/g;s/W/[AT]/g;s/K/[GT]/g;s/M/[AC]/g;s/B/[CGT]/g;s/D/[AGT]/g;s/H/[ACT]/g;s/V/[ACG]/g;s/N/[AGCT]/g;' <<< $REV_COMPLEMENT) | |
| 36 | |
| 37 ######################################################################## | |
| 38 #1. Remove all the reads that do not start with the signature | |
| 39 | |
| 40 # (first awk removes them, second awk removes corresponding quality strings) followed by cutadapt. | |
| 41 | |
| 42 # After cutadapt, remove last 15 nt - may be derived from the random primer [should be optional, as the user may: 1) use different random primer length, 2) have short reasd and would lose too much info] and remove all the reads that are shorter than 30 nt | |
| 43 | |
| 44 # (10 nt barcode + 20 nt for mapping) | |
| 45 | |
| 46 echo -e '------------------------Read1------------------------\n' > $CUTADAPT_LOG | |
| 47 | |
| 48 awk '{if(NR%4==2){if(/^'"$REG_EXP"'/){print}else{print ""}}else{print}}' $READ1 | | |
| 49 | |
| 50 awk 'BEGIN{trim_flag=0; trimming_stats=0; all_processed=0} | |
| 51 { | |
| 52 if(NR%4==1){print; all_processed++} | |
| 53 if(NR%4==2){if(length($1)==0){trim_flag=1;trimming_stats++}else{trim_flag=0};print} | |
| 54 if(NR%4==3){print} | |
| 55 if(NR%4==0){if(trim_flag==1){print ""}else{print $0}} | |
| 56 }END{print(trimming_stats, all_processed) > "trimming_stats.error"}' | | |
| 57 | |
| 58 cutadapt -a $ADAPTER1 -q $CUTOFF --format=fastq -O $OVERLAP_LENGTH - 2>>$CUTADAPT_LOG | | |
| 59 | |
| 60 awk -v len="${TRIM_LENGTH}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' | | |
| 61 | |
| 62 awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{print}}' | gzip > R1.fastq.gz & | |
| 63 | |
| 64 wait | |
| 65 | |
| 66 ######################################################################## | |
| 67 #2. Trim the adapter, primer and possible random barcode from the second read | |
| 68 | |
| 69 echo -e '------------------------Read2------------------------\n' >> $CUTADAPT_LOG | |
| 70 | |
| 71 cutadapt -a $ADAPTER2 -q $CUTOFF --format=fastq -O $OVERLAP_LENGTH $READ2 2>>$CUTADAPT_LOG | | |
| 72 | |
| 73 awk -v len1="${TRIM_LENGTH}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' - | | |
| 74 | |
| 75 awk '{if(NR%2==0 && length($1)<20){printf("\n")}else{print}}' | gzip > R2.fastq.gz & | |
| 76 | |
| 77 wait | |
| 78 | |
| 79 ######################################################################## | |
| 80 #3. Remove empty reads - remove each pair from for which at least one read of the pair got removed (they are problematic for tophat mapping) | |
| 81 | |
| 82 #first define which lines to keep from both fastq files (k for keep, d for discard in the lines_to_keep file) | |
| 83 | |
| 84 paste <(zcat R1.fastq.gz) <(zcat R2.fastq.gz) | awk 'BEGIN{OFS="\n"}{if(NR%4==2 && NF==2){print("k","k","k","k")}else{if(NR%4==2 && NF<2){print("d","d","d","d")}}}' > lines_to_keep | |
| 85 | |
| 86 paste lines_to_keep <(zcat R1.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | gzip > R1_readsANDbarcodes.fastq.gz & | |
| 87 | |
| 88 paste lines_to_keep <(zcat R2.fastq.gz) | awk '{if($1=="k")print($2,$3)}' > $OUT_R2 & | |
| 89 | |
| 90 wait | |
| 91 | |
| 92 ######################################################################## | |
| 93 #4. Extract the barcode sequence from the first read: | |
| 94 zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' > $OUT_R1 & | |
| 95 | |
| 96 zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > ${OUT_BARCODES} & | |
| 97 | |
| 98 wait | |
| 99 | |
| 100 ######################################################################## | |
| 101 #6. Remove temporary fastq files" | |
| 102 | |
| 103 rm R1.fastq.gz R2.fastq.gz R1_readsANDbarcodes.fastq.gz | |
| 104 rm lines_to_keep | |
| 105 | |
| 106 ######################################################################## | |
| 107 #7. problem! Spaces added at the end of the strings in fastq files (AWK induced). I will remove them: | |
| 108 | |
| 109 mv $OUT_R1 R1.temp.fastq | |
| 110 mv $OUT_R2 R2.temp.fastq | |
| 111 | |
| 112 awk '{print($1)}' R1.temp.fastq > $OUT_R1 & | |
| 113 awk '{print($1)}' R2.temp.fastq > $OUT_R2 & | |
| 114 | |
| 115 wait | |
| 116 rm R?.temp.fastq |
