# HG changeset patch # User nikos # Date 1422274383 18000 # Node ID 68af3d2260aa26c11175644fda7a13e56d295a9c # Parent 37d00e22959b864115a688a7cfe4fb8e84dad5da Uploaded diff -r 37d00e22959b -r 68af3d2260aa preprocessing.sh --- a/preprocessing.sh Wed Nov 05 12:34:49 2014 -0500 +++ b/preprocessing.sh Mon Jan 26 07:13:03 2015 -0500 @@ -25,7 +25,7 @@ -2: Read2 (FASTQ) - Optional -b: Barcode signature -t: Trimming length --o: Output folder (default: "output_dir") +-o: Output folder (Default: "output_dir") ------------------------------------- Usage : preprocessing.sh -f -r -b -t -o End-of-message @@ -98,9 +98,7 @@ if(NR%4==0){if(trim_flag==1){print ""}else{print $0}} }END{print(trimming_stats, all_processed) > "trimming_stats.error"}' | -awk -v len="${trim_length}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' | - -awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{print}}' | gzip > R1.fastq.gz & +awk -v len="${trim_length}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' > R1.fastq & wait @@ -113,44 +111,46 @@ #single-end #Extract the barcode sequence from the first read: - zcat R1.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' | awk '{print($1)}' > $output_dir/read1.fastq & + awk -v len="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}' R1.fastq | awk '{print($1)}' > $output_dir/read1.fastq & - zcat R1.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > $output_dir/barcodes.txt & + awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' R1.fastq | paste - - > $output_dir/barcodes.txt & wait #Remove temp files - rm R1.fastq.gz + rm R1.fastq else #paired-end #Trim primers (Read2) - awk -v len1="${trim_length}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' $read2 | - awk '{if(NR%2==0 && length($1)<20){printf("\n")}else{print}}' | gzip > R2.fastq.gz & + awk -v len1="${trim_length}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' $read2 > R2.fastq + + ## Fix pairs in Read1 and Read2 fastq files + #Sort + awk '{print($1)}' R1.fastq | paste - - - - | awk '(NF==4){print $0}' | sort -S1G -k1,1 > R1_collapsed.fastq & + awk '{print($1)}' R2.fastq | paste - - - - | awk '(NF==4){print $0}' | sort -S1G -k1,1 > R2_collapsed.fastq & wait - #Remove empty reads - remove each pair from for which at least one read of the pair got removed (they are problematic when mapping) + join -1 1 -2 1 R1_collapsed.fastq R2_collapsed.fastq > joined.fastq - #First define which lines to keep from both fastq files (k for keep, d for discard in the lines_to_keep file) - paste <(zcat R1.fastq.gz) <(zcat R2.fastq.gz) | awk 'BEGIN{OFS="\n"}{if(NR%4==2 && NF==2){print("k","k","k","k")}else{if(NR%4==2 && NF<2){print("d","d","d","d")}}}' > lines_to_keep - - paste lines_to_keep <(zcat R1.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | gzip > R1_readsANDbarcodes.fastq.gz & - - paste lines_to_keep <(zcat R2.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | awk '{print($1)}' > $output_dir/read2.fastq & + awk 'BEGIN{OFS=""}{print($1"\n"$2"\n"$3"\n"$4)}' joined.fastq > R1_sorted.fastq & + awk 'BEGIN{OFS=""}{print($1"\n"$5"\n"$6"\n"$7)}' joined.fastq > $output_dir/read2.fastq & wait + rm joined.fastq R1_collapsed.fastq R2_collapsed.fastq R1.fastq R2.fastq & + ######################################################################## #Extract the barcode sequence from the first read: - zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' | awk '{print($1)}' > $output_dir/read1.fastq & + awk -v len="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}' R1_sorted.fastq | awk '{print($1)}' > $output_dir/read1.fastq & - zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > $output_dir/barcodes.txt & + awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' R1_sorted.fastq | paste - - > $output_dir/barcodes.txt & wait #Remove temp files - rm R1_readsANDbarcodes.fastq.gz R1.fastq.gz R2.fastq.gz lines_to_keep + rm R1_sorted.fastq fi