changeset 14:68af3d2260aa draft

Uploaded
author nikos
date Mon, 26 Jan 2015 07:13:03 -0500
parents 37d00e22959b
children 7a5780dc7b71
files preprocessing.sh
diffstat 1 files changed, 19 insertions(+), 19 deletions(-) [+]
line wrap: on
line diff
--- a/preprocessing.sh	Wed Nov 05 12:34:49 2014 -0500
+++ b/preprocessing.sh	Mon Jan 26 07:13:03 2015 -0500
@@ -25,7 +25,7 @@
 -2: Read2 (FASTQ) - Optional
 -b: Barcode signature
 -t: Trimming length
--o: Output folder (default: "output_dir")
+-o: Output folder (Default: "output_dir")
 -------------------------------------
 Usage : preprocessing.sh -f <READ1> -r <READ2> -b <BARCODE_SEQ> -t <TRIM_LENGTH> -o <output_dir>
 End-of-message
@@ -98,9 +98,7 @@
     if(NR%4==0){if(trim_flag==1){print ""}else{print $0}}
 }END{print(trimming_stats, all_processed) > "trimming_stats.error"}' |
 
-awk -v len="${trim_length}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' |
-
-awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{print}}' | gzip  > R1.fastq.gz &
+awk -v len="${trim_length}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' > R1.fastq &
 
 wait
 
@@ -113,44 +111,46 @@
     #single-end
 
     #Extract the barcode sequence from the first read:
-    zcat R1.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' | awk '{print($1)}' > $output_dir/read1.fastq &
+    awk -v len="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}' R1.fastq | awk '{print($1)}' > $output_dir/read1.fastq &
 
-    zcat R1.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > $output_dir/barcodes.txt &
+    awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' R1.fastq | paste - - > $output_dir/barcodes.txt &
 
     wait
 
     #Remove temp files
-    rm R1.fastq.gz
+    rm R1.fastq
 
 else
     #paired-end
 
     #Trim primers (Read2)
-    awk -v len1="${trim_length}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' $read2 |
-    awk '{if(NR%2==0 && length($1)<20){printf("\n")}else{print}}' | gzip > R2.fastq.gz &
+    awk -v len1="${trim_length}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' $read2 > R2.fastq
+
+    ## Fix pairs in Read1 and Read2 fastq files
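+    #Collapse each 4-line record onto one line, keep only complete (4-field) records, and sort on the first field (the read header) so the two files can be joined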
+    awk '{print($1)}' R1.fastq | paste - - - - | awk '(NF==4){print $0}' | sort -S1G -k1,1 > R1_collapsed.fastq &
+    awk '{print($1)}' R2.fastq | paste - - - - | awk '(NF==4){print $0}' | sort -S1G -k1,1 > R2_collapsed.fastq &
 
     wait
 
-    #Remove empty reads - remove each pair from for which at least one read of the pair got removed (they are problematic when mapping)
+    join -1 1 -2 1 R1_collapsed.fastq R2_collapsed.fastq > joined.fastq
 
-    #First define which lines to keep from both fastq files (k for keep, d for discard in the lines_to_keep file)
-    paste <(zcat R1.fastq.gz) <(zcat R2.fastq.gz) | awk 'BEGIN{OFS="\n"}{if(NR%4==2 && NF==2){print("k","k","k","k")}else{if(NR%4==2 && NF<2){print("d","d","d","d")}}}' > lines_to_keep
-
-    paste lines_to_keep <(zcat R1.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | gzip > R1_readsANDbarcodes.fastq.gz &
-
-    paste lines_to_keep <(zcat R2.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | awk '{print($1)}' > $output_dir/read2.fastq &
+    awk 'BEGIN{OFS=""}{print($1"\n"$2"\n"$3"\n"$4)}' joined.fastq > R1_sorted.fastq &
+    awk 'BEGIN{OFS=""}{print($1"\n"$5"\n"$6"\n"$7)}' joined.fastq > $output_dir/read2.fastq &
 
     wait
 
+    rm joined.fastq R1_collapsed.fastq R2_collapsed.fastq R1.fastq R2.fastq &
+
     ########################################################################
     #Extract the barcode sequence from the first read:
-    zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' | awk '{print($1)}' > $output_dir/read1.fastq &
+    awk -v len="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}' R1_sorted.fastq | awk '{print($1)}' > $output_dir/read1.fastq &
 
-    zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > $output_dir/barcodes.txt &
+    awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' R1_sorted.fastq | paste - - > $output_dir/barcodes.txt &
 
     wait
 
     #Remove temp files
-    rm R1_readsANDbarcodes.fastq.gz R1.fastq.gz R2.fastq.gz lines_to_keep
+    rm R1_sorted.fastq
 
 fi