Mercurial > repos > nikos > rna_probing
changeset 14:68af3d2260aa draft
Uploaded
author | nikos |
---|---|
date | Mon, 26 Jan 2015 07:13:03 -0500 |
parents | 37d00e22959b |
children | 7a5780dc7b71 |
files | preprocessing.sh |
diffstat | 1 files changed, 19 insertions(+), 19 deletions(-) [+] |
line wrap: on
line diff
--- a/preprocessing.sh Wed Nov 05 12:34:49 2014 -0500 +++ b/preprocessing.sh Mon Jan 26 07:13:03 2015 -0500 @@ -25,7 +25,7 @@ -2: Read2 (FASTQ) - Optional -b: Barcode signature -t: Trimming length --o: Output folder (default: "output_dir") +-o: Output folder (Default: "output_dir") ------------------------------------- Usage : preprocessing.sh -f <READ1> -r <READ2> -b <BARCODE_SEQ> -t <TRIM_LENGTH> -o <output_dir> End-of-message @@ -98,9 +98,7 @@ if(NR%4==0){if(trim_flag==1){print ""}else{print $0}} }END{print(trimming_stats, all_processed) > "trimming_stats.error"}' | -awk -v len="${trim_length}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' | - -awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{print}}' | gzip > R1.fastq.gz & +awk -v len="${trim_length}" '{if(NR%2==0){print(substr($1,0,length($1)-len))}else{print}}' > R1.fastq & wait @@ -113,44 +111,46 @@ #single-end #Extract the barcode sequence from the first read: - zcat R1.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' | awk '{print($1)}' > $output_dir/read1.fastq & + awk -v len="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}' R1.fastq | awk '{print($1)}' > $output_dir/read1.fastq & - zcat R1.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > $output_dir/barcodes.txt & + awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' R1.fastq | paste - - > $output_dir/barcodes.txt & wait #Remove temp files - rm R1.fastq.gz + rm R1.fastq else #paired-end #Trim primers (Read2) - awk -v len1="${trim_length}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' $read2 | - awk '{if(NR%2==0 && length($1)<20){printf("\n")}else{print}}' | gzip > R2.fastq.gz & + awk -v len1="${trim_length}" -v len2="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len1+1,(length($0)-len1-len2)))}else{print($0)}}' $read2 > R2.fastq + + ## Fix pairs in Read1 and Read2 fastq files + #Sort + awk '{print($1)}' R1.fastq | paste - - - - | awk '(NF==4){print $0}' | sort -S1G -k1,1 > R1_collapsed.fastq & + awk '{print($1)}' R2.fastq | paste - - - - | awk '(NF==4){print $0}' | sort -S1G -k1,1 > R2_collapsed.fastq & wait - #Remove empty reads - remove each pair from for which at least one read of the pair got removed (they are problematic when mapping) + join -1 1 -2 1 R1_collapsed.fastq R2_collapsed.fastq > joined.fastq - #First define which lines to keep from both fastq files (k for keep, d for discard in the lines_to_keep file) - paste <(zcat R1.fastq.gz) <(zcat R2.fastq.gz) | awk 'BEGIN{OFS="\n"}{if(NR%4==2 && NF==2){print("k","k","k","k")}else{if(NR%4==2 && NF<2){print("d","d","d","d")}}}' > lines_to_keep - - paste lines_to_keep <(zcat R1.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | gzip > R1_readsANDbarcodes.fastq.gz & - - paste lines_to_keep <(zcat R2.fastq.gz) | awk '{if($1=="k")print($2,$3)}' | awk '{print($1)}' > $output_dir/read2.fastq & + awk 'BEGIN{OFS=""}{print($1"\n"$2"\n"$3"\n"$4)}' joined.fastq > R1_sorted.fastq & + awk 'BEGIN{OFS=""}{print($1"\n"$5"\n"$6"\n"$7)}' joined.fastq > $output_dir/read2.fastq & wait + rm joined.fastq R1_collapsed.fastq R2_collapsed.fastq R1.fastq R2.fastq & + ######################################################################## #Extract the barcode sequence from the first read: - zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%2==0 && length($1)<20+len){printf("\n")}else{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}}' | awk '{print($1)}' > $output_dir/read1.fastq & + awk -v len="${BAR_LENGTH}" '{if(NR%2==0){print(substr($0,len+1,length($0)))}else{print($0)}}' R1_sorted.fastq | awk '{print($1)}' > $output_dir/read1.fastq & - zcat R1_readsANDbarcodes.fastq.gz | awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' | paste - - > $output_dir/barcodes.txt & + awk -v len="${BAR_LENGTH}" '{if(NR%4==1){print($1)}else{if(NR%4==2){print(substr($0,0,len))}}}' R1_sorted.fastq | paste - - > $output_dir/barcodes.txt & wait #Remove temp files - rm R1_readsANDbarcodes.fastq.gz R1.fastq.gz R2.fastq.gz lines_to_keep + rm R1_sorted.fastq fi