2
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #filter out dulpicates from SAMPLE (optional) and create a control dataset w/o duplicates with the same number of reads as in the SAMPLE
|
|
4
|
|
5 use strict;
|
|
6 use warnings;
|
|
7 use diagnostics;
|
|
8
|
|
9 my $usage = qq{
|
|
10 $0
|
|
11
|
|
12 -----------------------------
|
|
13 mandatory parameters:
|
|
14
|
|
15 -f CHiP_file
|
|
16 -c control_file
|
|
17 -t type [bam, sam, eland]
|
|
18 -o output file
|
|
19 -----------------------------
|
|
20 optional parameters:
|
|
21
|
|
22 none
|
|
23 };
|
|
24
|
|
25 if(scalar(@ARGV) == 0){
|
|
26 print $usage;
|
|
27 exit(0);
|
|
28 }
|
|
29
|
|
30 ## mandatory arguments
|
|
31
|
|
32 my $filename = "";
|
|
33 my $output_fname = "";
|
|
34
|
|
35 my $controlFilename = "";
|
|
36 my $type = "";
|
|
37 my $sampleOutput = "";
|
|
38
|
|
39
|
|
40 ## optional arguments
|
|
41
|
|
42 ## parse command line arguments
|
|
43
|
|
44 while(scalar(@ARGV) > 0){
|
|
45 my $this_arg = shift @ARGV;
|
|
46 if ( $this_arg eq '-h') {print "$usage\n"; exit; }
|
|
47
|
|
48 elsif ( $this_arg eq '-f') {$filename = shift @ARGV;}
|
|
49 elsif ( $this_arg eq '-c') {$controlFilename = shift @ARGV;}
|
|
50 elsif ( $this_arg eq '-t') {$type = shift @ARGV;}
|
|
51 elsif ( $this_arg eq '-o') {$output_fname = shift @ARGV;}
|
|
52 elsif ( $this_arg eq '-s') {$sampleOutput = shift @ARGV;}
|
|
53
|
|
54
|
|
55 elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";}
|
|
56 }
|
|
57
|
|
58 if ( $filename eq ""){
|
|
59 die "you should specify chip file\n";
|
|
60 }
|
|
61 if( $controlFilename eq ""){
|
|
62 die "you should specify control file\n";
|
|
63 }
|
|
64 if( $type eq ""){
|
|
65 die "you should specify file type (bam, sam or eland)\n";
|
|
66 }
|
|
67 if( $output_fname eq ""){
|
|
68 die "you should specify output filename\n";
|
|
69 }
|
|
70
|
|
71
|
|
72 print "\n-----------------\n\n";
|
|
73
|
|
74 my %hash;
|
|
75 my $chipCount = 0;
|
|
76 my @header;
|
|
77
|
|
78
|
|
79 if ($type eq "eland") {
|
|
80 open FILE, "< $filename " || die "$filename : $!\n";
|
|
81 while(<FILE>){
|
|
82 my @fields = split(/\t/,$_);
|
|
83 my $entry = $fields[6].":".$fields[7]."-".$fields[8];
|
|
84 unless (exists($hash{$entry})) {
|
|
85 $hash{$entry} = $_;
|
|
86 $chipCount++;
|
|
87 }
|
|
88 }
|
|
89 } elsif ($type eq "sam") {
|
|
90 open FILE, "< $filename " || die "$filename : $!\n";
|
|
91 while(<FILE>){
|
|
92 if (m/^@/) {
|
|
93 push(@header,$_);
|
|
94 next;
|
|
95 }
|
|
96 my @fields = split(/\t/,$_);
|
|
97 next if (scalar(@fields)<10);
|
|
98 my $entry = $fields[2].":".$fields[3]."-".$fields[1];
|
|
99 unless (exists($hash{$entry})) {
|
|
100 $hash{$entry} = $_;
|
|
101 $chipCount++;
|
|
102 }
|
|
103 }
|
|
104 } elsif ($type eq "bam") {
|
|
105 open(FILE, "samtools view -h $filename |") or die "$0: can't open ".$filename.":$!\n";
|
|
106 while(<FILE>){
|
|
107 if (m/^@/) {
|
|
108 push(@header,$_);
|
|
109 next;
|
|
110 }
|
|
111 my @fields = split(/\t/,$_);
|
|
112 next if (scalar(@fields)<10);
|
|
113 my $entry = $fields[2].":".$fields[3]."-".$fields[1];
|
|
114 unless (exists($hash{$entry})) {
|
|
115 $hash{$entry} = $_;
|
|
116 $chipCount++;
|
|
117 }
|
|
118 }
|
|
119 }
|
|
120 close FILE;
|
|
121 print "ChIP: $chipCount\n";
|
|
122
|
|
123 if ($sampleOutput ne "") {
|
|
124
|
|
125 open OUT, "> $sampleOutput" || die "$sampleOutput: $!\n";
|
|
126
|
|
127 if ($type eq "bam" || $type eq "sam") { #print header
|
|
128 for my $headerLine (@header) {
|
|
129 print OUT $headerLine;
|
|
130 }
|
|
131 }
|
|
132 for my $line (values %hash) {
|
|
133 print OUT $line;
|
|
134 }
|
|
135 close OUT;
|
|
136 }
|
|
137
|
|
138 delete @hash{keys %hash};
|
|
139 @header = ();
|
|
140
|
|
141 my $controlCount = 0;
|
|
142 if ($type eq "eland") {
|
|
143 open FILE, "< $controlFilename " || die "$controlFilename : $!\n";
|
|
144 while(<FILE>){
|
|
145 my @fields = split(/\t/,$_);
|
|
146 my $entry = $fields[6].":".$fields[7]."-".$fields[8];
|
|
147 unless (exists($hash{$entry})) {
|
|
148 $hash{$entry} = $_;
|
|
149 $controlCount++;
|
|
150 }
|
|
151 }
|
|
152 } elsif ($type eq "sam") {
|
|
153 open FILE, "< $controlFilename " || die "$controlFilename : $!\n";
|
|
154 while(<FILE>){
|
|
155 if (m/^@/) {
|
|
156 push(@header,$_);
|
|
157 next;
|
|
158 }
|
|
159 my @fields = split(/\t/,$_);
|
|
160 my $entry = $fields[2].":".$fields[3]."-".$fields[1];
|
|
161 unless (exists($hash{$entry})) {
|
|
162 $hash{$entry} = $_;
|
|
163 $controlCount++;
|
|
164 }
|
|
165 }
|
|
166 } elsif ($type eq "bam") {
|
|
167 open(FILE, "samtools view -h $controlFilename |") or die "$0: can't open ".$controlFilename.":$!\n";
|
|
168 while(<FILE>){
|
|
169 if (m/^@/) {
|
|
170 push(@header,$_);
|
|
171 next;
|
|
172 }
|
|
173 my @fields = split(/\t/,$_);
|
|
174 my $entry = $fields[2].":".$fields[3]."-".$fields[1];
|
|
175 unless (exists($hash{$entry})) {
|
|
176 $hash{$entry} = $_;
|
|
177 $controlCount++;
|
|
178 }
|
|
179 }
|
|
180 }
|
|
181 close FILE;
|
|
182 print "Control: $controlCount\n";
|
|
183 my $prob = $chipCount/$controlCount;
|
|
184
|
|
185 open OUT, "> $output_fname" || die "$output_fname: $!\n";
|
|
186
|
|
187 if ($type eq "bam" || $type eq "sam") { #print header
|
|
188 for my $headerLine (@header) {
|
|
189 print OUT $headerLine;
|
|
190 }
|
|
191 }
|
|
192 my $count = 0;
|
|
193
|
|
194 for my $line (values %hash) {
|
|
195 my $rand = rand();
|
|
196
|
|
197 if ($rand < $prob) {
|
|
198 print OUT $line;
|
|
199 $count ++;
|
|
200 }
|
|
201 last if ($count == $chipCount);
|
|
202 }
|
|
203
|
|
204
|
|
205 if ($count < $chipCount) {
|
|
206
|
|
207 $prob = ($chipCount-$count)/$controlCount*1.1;
|
|
208
|
|
209 for my $line (values %hash) {
|
|
210 my $rand = rand();
|
|
211
|
|
212 if ($rand < $prob) {
|
|
213 print OUT $line;
|
|
214 $count ++;
|
|
215 }
|
|
216 last if ($count == $chipCount);
|
|
217 }
|
|
218 }
|
|
219
|
|
220 print "count = $count\n";
|
|
221 close OUT;
|
|
222
|