Mercurial > repos > tiagoantao > barcode_stacks
annotate barcode_sort.pl @ 3:2d1df0e14208 draft
planemo upload commit 48a9cb9e296a9afe54276d75b464da0513e95ddc
| author | tiagoantao |
|---|---|
| date | Mon, 28 Mar 2016 19:39:35 -0400 |
| parents | d838c6a2ec0f |
| children | 5cd3cc89bf59 |
| rev | line source |
|---|---|
|
0
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
1 #!/usr/bin/perl -w |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
2 use strict; use warnings; |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
3 |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
4 my $b = 8; # set barcode length (Hohenlohe lab 2015 bestRAD barcodes are 8bp) |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
5 my $n = 2; # set number of nucleotides to expect before the barcode (bestRAD libraries run at U. Oregon have 2bp here) |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
6 my $c = $b + $n; # set position for start of enzyme cutsite, which occurs after the initial nucleotides plus the barcode |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
7 my $e = 6; # set length of remaining enzyme cutsite sequence (e.g. 6 for SbfI) -- for other than SbfI, need to change the actual sequence below!!! |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
8 # note this is the length of what's left after enzyme digestion, NOT the full length of the enzyme recognition site |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
9 # the program expects all correct forward reads to follow the pattern: $n initial nucleotides, then $b nucleotides of barcode, then $e nucleotides of the cutsite |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
10 |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
11 open (IN, $ARGV[0]); # read file with barcodes |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
12 my %counts = (); # make a hash of barcodes that will be searched |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
13 while(<IN>) { # counts of each barcode can be tracked with this hash, with a few more lines of code below |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
14 chomp($_); |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
15 $counts{$_} = 0; |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
16 } |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
17 close IN; |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
18 |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
19 open (IN1, $ARGV[1]); # read fastq file of raw forward reads |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
20 open (IN2, $ARGV[2]); # read fastq file of raw reverse reads -- these must have pairs in identical order |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
21 open (OUT1, ">$ARGV[3]"); # create fastq outfile for flipped forward reads (cutsite end) |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
22 open (OUT2, ">$ARGV[4]"); # create fastq outfile for flipped reverse reads (randomly sheared end) |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
23 my $forward; my $reverse; my $barcode; # establish string variables for all parts of fastq files |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
24 my $name1; my $name2; my $third1; my $third2; my $qual1; my $qual2; |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
25 while($name1 = <IN1>) { # start reading through the paired fastq input files |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
26 $name2 = <IN2>; # read all parts of a single read pair (4 lines in each of the 2 fastq files) |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
27 $forward = <IN1>; |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
28 $reverse = <IN2>; |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
29 $third1 = <IN1>; $third2 = <IN2>; $qual1 = <IN1>; $qual2 = <IN2>; |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
30 my $which = 0; my $for; my $rev; # establish variables used below |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
31 if(substr($forward, $c, $e) eq "TGCAGG") { # check for SbfI cutsite in the correct place in forward read |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
32 if(substr($reverse, $c, $e) eq "TGCAGG") { # check for SbfI cutsite in the correct place in reverse read |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
33 $for = substr($forward, $n, $b); # this is where a barcode should be if it's in the forward read |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
34 $rev = substr($reverse, $n, $b); # this is where a barcode should be if it's in the reverse read |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
35 if(exists $counts{$for} && (exists $counts{$rev}) == 0) { # if a correct barcode and cutsite are in forward but not reverse read... |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
36 $which = 1; $barcode = $for; $counts{$for}++; # which = 1 means the pair is correctly oriented |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
37 } |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
38 elsif((exists $counts{$for}) == 0 && exists $counts{$rev}) { # if a correct barcode and cutsite are only in the reverse read... |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
39 $which = 2; $barcode = $rev; $counts{$rev}++; # which = 2 means the pair needs to be flipped |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
40 } |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
41 } |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
42 else { # the cutsite is only found in the forward read |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
43 $barcode = substr($forward, $n, $b); |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
44 if(exists $counts{$barcode}) {$which = 1; $counts{$barcode}++; } # if a correct barcode is in the forward read, the pair is correctly oriented |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
45 } |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
46 } |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
47 elsif(substr($reverse, $c, $e) eq "TGCAGG") { # cutsite not in forward read but is in reverse read |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
48 $barcode = substr($reverse, $n, $b); |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
49 if(exists $counts{$barcode}) {$which = 2; $counts{$barcode}++; } # pair needs to be flipped |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
50 } # if a cutsite and correct barcode has not been found in either read, which = 0 at this point |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
51 if($which == 1) { # if the pair is correctly oriented, print out fastq format for the pair |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
52 my $temp1 = substr($forward, $n); # trim initial nucleotides off read and quality scores... |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
53 my $temp2 = substr($qual1, $n); # so that output keeps barcode and cutsite but not other nucleotides... |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
54 print OUT1 "$name1$temp1$third1$temp2"; # and is ready to go into process_radtags |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
55 print OUT2 "$name2$reverse$third2$qual2"; |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
56 } |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
57 elsif($which == 2) { # if the pair needs to be flipped, print out fastq format for the flipped pair |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
58 my $temp1 = substr($reverse, $n); |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
59 my $temp2 = substr($qual2, $n); |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
60 print OUT1 "$name2$temp1$third2$temp2"; |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
61 print OUT2 "$name1$forward$third1$qual1"; |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
62 } # if which == 0, nothing is printed out for the pair |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
63 } |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
64 close IN1; |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
65 close IN2; |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
66 close OUT1; |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
67 close OUT2; |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
68 |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
69 foreach my $key (sort keys %counts) { |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
70 print "$key" . "\t" . "$counts{$key}\n"; |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
71 } |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
72 |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
73 |
|
d838c6a2ec0f
planemo upload commit 75d88a8a93e8cd3acb00103bf6a7a07649d17c70
tiagoantao
parents:
diff
changeset
|
74 |
