annotate barcode_sort.pl @ 7:2c843c8df821 draft

planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
author tiagoantao
date Thu, 31 Mar 2016 10:32:59 -0400
parents 5cd3cc89bf59
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
1 #!/usr/bin/perl -w
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
2 use strict; use warnings;
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
3
7
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
4 my $cut_site = $ARGV[7]; #let the user specify the cut site keyword (e.g. "TGCA" for the SbfI 6 bp cutter)
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
5
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
6 my $b = $ARGV[6]; #let the user specify barcode length
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
7 my $n = 2; # set number of nucleotides to expect before the barcode (bestRAD libraries run at U. Oregon have 2bp here)
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
8 my $c = $b + $n; # set position for start of enzyme cutsite, which occurs after the initial nucleotides plus the barcode
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
9 my $e = 6; # set length of remaining enzyme cutsite sequence (e.g. 6 for SbfI) -- for other than SbfI, need to change the actual sequence below!!!
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
10 my $e = length($cut_site); #calculate the length of the cut site
0
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
11 # note this is the length of what's left after enzyme digestion, NOT the full length of the enzyme recognition site
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
12 # the program expects all correct forward reads to follow the pattern: $n initial nucleotides, then $b nucleotides of barcode, then $e nucleotides of the cutsite
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
13
6
5cd3cc89bf59 planemo upload commit 80e74122bff9cbc1d172f920b164d6db9c028e7b
tiagoantao
parents: 0
diff changeset
14 my $is_resilient;
5cd3cc89bf59 planemo upload commit 80e74122bff9cbc1d172f920b164d6db9c028e7b
tiagoantao
parents: 0
diff changeset
15
7
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
16 open (IN, $ARGV[0]); # read file with barcodes
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
17 my %counts = (); # make a hash of barcodes that will be searched
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
18 while(<IN>) { # counts of each barcode can be tracked with this hash, with a few more lines of code below
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
19 chomp($_);
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
20 $counts{$_} = 0;
0
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
21 }
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
22 close IN;
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
23
7
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
24 open (IN1, $ARGV[1]); # read fastq file of raw forward reads
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
25 open (IN2, $ARGV[2]); # read fastq file of raw reverse reads -- these must have pairs in identical order
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
26 open (OUT1, ">$ARGV[3]"); # create fastq outfile for flipped forward reads (cutsite end)
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
27 open (OUT2, ">$ARGV[4]"); # create fastq outfile for flipped reverse reads (randomly sheared end)
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
28 my $forward; my $reverse; my $barcode; # establish string variables for all parts of fastq files
0
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
29 my $name1; my $name2; my $third1; my $third2; my $qual1; my $qual2;
7
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
30
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
31 $is_resilient = $ARGV[5]; # true if is resilient to errors - Brian's additions
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
32
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
33
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
34 sub strDiffMaxDelta {
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
35
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
36 my ( $s1, $s2, $maxDelta ) = @_;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
37 my $diffCount = () = ( $s1 ^ $s2 ) =~ /[^\x00]/g;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
38 return $diffCount <= $maxDelta;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
39
0
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
40 }
7
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
41
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
42 if ($is_resilient eq "true") {
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
43
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
44 my $max_errors = $ARGV[8];
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
45
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
46 while($name1 = <IN1>) { # start reading through the paired fastq input files
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
47 $name2 = <IN2>; # read all parts of a single read pair (4 lines in each of the 2 fastq files)
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
48 $forward = <IN1>;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
49 $reverse = <IN2>;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
50 $third1 = <IN1>; $third2 = <IN2>; $qual1 = <IN1>; $qual2 = <IN2>;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
51 my $which = 0; my $for; my $rev; # establish variables used below
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
52
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
53 if(strDiffMaxDelta(substr($forward, $c, $e), $cut_site, $max_errors)) { # check for SbfI cutsite in the correct place in forward read
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
54 if(strDiffMaxDelta(substr($reverse, $c, $e), $cut_site, $max_errors)) { # check for SbfI cutsite in the correct place in reverse read
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
55 $for = substr($forward, $n, $b); # this is where a barcode should be if it's in the forward read
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
56 $rev = substr($reverse, $n, $b); # this is where a barcode should be if it's in the reverse read
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
57 if(exists $counts{$for} && (exists $counts{$rev}) == 0) { # if a correct barcode and cutsite are in forward but not reverse read...
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
58 $which = 1; $barcode = $for; $counts{$for}++; # which = 1 means the pair is correctly oriented
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
59 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
60 elsif((exists $counts{$for}) == 0 && exists $counts{$rev}) { # if a correct barcode and cutsite are only in the reverse read...
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
61 $which = 2; $barcode = $rev; $counts{$rev}++; # which = 2 means the pair needs to be flipped
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
62 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
63 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
64 else { # the cutsite is only found in the forward read
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
65 #$barcode = substr($forward, $n, $b);
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
66 #if(exists $counts{$barcode}) {$which = 1; $counts{$barcode}++; } # if a correct barcode is in the forward read, the pair is correctly oriented
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
67 $which = 1;
0
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
68
7
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
69 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
70 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
71 elsif(strDiffMaxDelta(substr($reverse, $c, $e), $cut_site, $max_errors)){ # cutsite not in forward read but is in reverse read
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
72 #$barcode = substr($reverse, $n, $b);
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
73 #if(exists $counts{$barcode}) {$which = 2; $counts{$barcode}++; } # pair needs to be flipped
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
74 $which = 2;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
75 } # if a cutsite and correct barcode has not been found in either read, which = 0 at this point
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
76 if($which == 1) { # if the pair is correctly oriented, print out fastq format for the pair
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
77 my $temp1 = substr($forward, $n); # trim initial nucleotides off read and quality scores...
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
78 my $temp2 = substr($qual1, $n); # so that output keeps barcode and cutsite but not other nucleotides...
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
79 print OUT1 "$name1$temp1$third1$temp2"; # and is ready to go into process_radtags
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
80 print OUT2 "$name2$reverse$third2$qual2";
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
81 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
82 elsif($which == 2) { # if the pair needs to be flipped, print out fastq format for the flipped pair
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
83 my $temp1 = substr($reverse, $n);
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
84 my $temp2 = substr($qual2, $n);
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
85 print OUT1 "$name2$temp1$third2$temp2";
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
86 print OUT2 "$name1$forward$third1$qual1";
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
87 } # if which == 0, nothing is printed out for the pair
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
88 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
89 close IN1;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
90 close IN2;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
91 close OUT1;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
92 close OUT2;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
93
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
94 foreach my $key (sort keys %counts) {
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
95 print "$key" . "\t" . "$counts{$key}\n";
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
96 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
97
0
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
98 }
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
99
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
100
7
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
101 else{ #Paul's code
0
d838c6a2ec0f planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff changeset
102
7
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
103 while($name1 = <IN1>) { # start reading through the paired fastq input files
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
104 $name2 = <IN2>; # read all parts of a single read pair (4 lines in each of the 2 fastq files)
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
105 $forward = <IN1>;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
106 $reverse = <IN2>;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
107 $third1 = <IN1>; $third2 = <IN2>; $qual1 = <IN1>; $qual2 = <IN2>;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
108
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
109 my $which = 0; my $for; my $rev; # establish variables used below
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
110 if(substr($forward, $c, $e) eq "TGCAGG") { # check for SbfI cutsite in the correct place in forward read
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
111 if(substr($reverse, $c, $e) eq "TGCAGG") { # check for SbfI cutsite in the correct place in reverse read
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
112 $for = substr($forward, $n, $b); # this is where a barcode should be if it's in the forward read
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
113 $rev = substr($reverse, $n, $b); # this is where a barcode should be if it's in the reverse read
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
114 if(exists $counts{$for} && (exists $counts{$rev}) == 0) { # if a correct barcode and cutsite are in forward but not reverse read...
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
115 $which = 1; $barcode = $for; $counts{$for}++; # which = 1 means the pair is correctly oriented
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
116 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
117 elsif((exists $counts{$for}) == 0 && exists $counts{$rev}) { # if a correct barcode and cutsite are only in the reverse read...
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
118 $which = 2; $barcode = $rev; $counts{$rev}++; # which = 2 means the pair needs to be flipped
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
119 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
120 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
121 else { # the cutsite is only found in the forward read
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
122 $barcode = substr($forward, $n, $b);
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
123 if(exists $counts{$barcode}) {$which = 1; $counts{$barcode}++; } # if a correct barcode is in the forward read, the pair is correctly oriented
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
124 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
125 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
126 elsif(substr($reverse, $c, $e) eq "TGCAGG") { # cutsite not in forward read but is in reverse read
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
127 $barcode = substr($reverse, $n, $b);
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
128 if(exists $counts{$barcode}) {$which = 2; $counts{$barcode}++; } # pair needs to be flipped
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
129 } # if a cutsite and correct barcode has not been found in either read, which = 0 at this point
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
130 if($which == 1) { # if the pair is correctly oriented, print out fastq format for the pair
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
131 my $temp1 = substr($forward, $n); # trim initial nucleotides off read and quality scores...
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
132 my $temp2 = substr($qual1, $n); # so that output keeps barcode and cutsite but not other nucleotides...
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
133 print OUT1 "$name1$temp1$third1$temp2"; # and is ready to go into process_radtags
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
134 print OUT2 "$name2$reverse$third2$qual2";
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
135 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
136 elsif($which == 2) { # if the pair needs to be flipped, print out fastq format for the flipped pair
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
137 my $temp1 = substr($reverse, $n);
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
138 my $temp2 = substr($qual2, $n);
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
139 print OUT1 "$name2$temp1$third2$temp2";
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
140 print OUT2 "$name1$forward$third1$qual1";
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
141 } # if which == 0, nothing is printed out for the pair
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
142
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
143
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
144 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
145 close IN1;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
146 close IN2;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
147 close OUT1;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
148 close OUT2;
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
149
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
150 foreach my $key (sort keys %counts) {
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
151 print "$key" . "\t" . "$counts{$key}\n";
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
152 }
2c843c8df821 planemo upload commit 1af8d398aa035883718d02bb1b9051edb5d10c48
tiagoantao
parents: 6
diff changeset
153 }