1
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #create a control dataset with the same number of reads as in the SAMPLE (highest peaks)
|
|
4
|
|
5 use strict;
|
|
6 use warnings;
|
|
7 use diagnostics;
|
|
8
|
|
9 my $usage = qq{
|
|
10 $0
|
|
11
|
|
12 -----------------------------
|
|
13 mandatory parameters:
|
|
14
|
|
15 -f CHiP_file
|
|
16 -c control_file
|
|
17 -o output file
|
|
18 -----------------------------
|
|
19 optional parameters:
|
|
20
|
|
21 -n number of files to create
|
|
22
|
|
23 none
|
|
24 };
|
|
25
|
|
26 if(scalar(@ARGV) == 0){
|
|
27 print $usage;
|
|
28 exit(0);
|
|
29 }
|
|
30
|
|
31 ## mandatory arguments
|
|
32
|
|
33 my $filename = "";
|
|
34 my $output_fname = "";
|
|
35
|
|
36 my $controlFilename = "";
|
|
37
|
|
38 my $nBootstrap = 1;
|
|
39
|
|
40
|
|
41 ## optional arguments
|
|
42
|
|
43 ## parse command line arguments
|
|
44
|
|
45 while(scalar(@ARGV) > 0){
|
|
46 my $this_arg = shift @ARGV;
|
|
47 if ( $this_arg eq '-h') {print "$usage\n"; exit; }
|
|
48
|
|
49 elsif ( $this_arg eq '-f') {$filename = shift @ARGV;}
|
|
50 elsif ( $this_arg eq '-c') {$controlFilename = shift @ARGV;}
|
|
51 elsif ( $this_arg eq '-o') {$output_fname = shift @ARGV;}
|
|
52 elsif ( $this_arg eq '-n') {$nBootstrap = shift @ARGV;}
|
|
53
|
|
54 elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";}
|
|
55 }
|
|
56
|
|
57 if ( $filename eq ""){
|
|
58 die "you should specify chip file\n";
|
|
59 }
|
|
60 if( $controlFilename eq ""){
|
|
61 die "you should specify control file\n";
|
|
62 }
|
|
63
|
|
64 if( $output_fname eq ""){
|
|
65 die "you should specify output filename\n";
|
|
66 }
|
|
67
|
|
68
|
|
69 print "\n-----------------\n\n";
|
|
70
|
|
71 my %hash;
|
|
72 my $chipCount = 0;
|
|
73 my @header;
|
|
74 open FILE, "< $filename " || die "$filename : $!\n";
|
|
75 while(<FILE>){
|
|
76 $chipCount++;
|
|
77 }
|
|
78 close FILE;
|
|
79 #print "ChIP: $chipCount\n";
|
|
80
|
|
81
|
|
82 my $controlCount = 0;
|
|
83
|
|
84 open FILE, "< $controlFilename " || die "$controlFilename : $!\n";
|
|
85 while(<FILE>){
|
|
86 next if (/track/);
|
|
87 my $entry = $_;
|
|
88 my @fields = split(/\t/,$_);
|
|
89 $hash{$entry} = $fields[4];
|
|
90 $controlCount++;
|
|
91 }
|
|
92 #print "controlCount : $controlCount\n";
|
|
93
|
|
94 close FILE;
|
|
95 open OUT, "> $output_fname" || die "$output_fname: $!\n";
|
|
96 my $count = 0;
|
|
97 if ($controlCount>$chipCount) {
|
|
98 my $prob = $chipCount/$controlCount*1.1;
|
|
99 for my $entry (sort {$hash{$b}<=>$hash{$a}} keys %hash) {
|
|
100 my $yes = rand();
|
|
101 if ($yes<=$prob) {$yes=1;}else {$yes=0;}
|
|
102 if ($yes) {
|
|
103 print OUT $entry ;
|
|
104 $count++;
|
|
105 }
|
|
106 if ($count >=$chipCount) {
|
|
107 last;
|
|
108 }
|
|
109 }
|
|
110
|
|
111 } else {
|
|
112 for my $entry (keys %hash) {
|
|
113 print OUT $entry;
|
|
114 }
|
|
115 }
|
|
116
|
|
117 close OUT;
|
|
118
|
|
119
|
|
120 for my $try (2..$nBootstrap) {
|
|
121
|
|
122 open OUT, "> $output_fname$try" || die "$output_fname$try: $!\n";
|
|
123 my $count = 0;
|
|
124 if ($controlCount>$chipCount) {
|
|
125 my $prob = $chipCount/$controlCount*1.1;
|
|
126 for my $entry (sort {$hash{$b}<=>$hash{$a}} keys %hash) {
|
|
127 my $yes = rand();
|
|
128 if ($yes<=$prob) {$yes=1;}else {$yes=0;}
|
|
129 if ($yes) {
|
|
130 print OUT $entry ;
|
|
131 $count++;
|
|
132 }
|
|
133
|
|
134 if ($count >=$chipCount ) {
|
|
135 last;
|
|
136 }
|
|
137 }
|
|
138 if ($count <$chipCount) { #do it again!
|
|
139
|
|
140 for my $entry (sort {$hash{$b}<=>$hash{$a}} keys %hash) {
|
|
141 my $yes = rand();
|
|
142 if ($yes<=$prob) {$yes=1;}else {$yes=0;}
|
|
143 if ($yes) {
|
|
144 print OUT $entry ;
|
|
145 $count++;
|
|
146 }
|
|
147
|
|
148 if ($count >=$chipCount ) {
|
|
149 last;
|
|
150 }
|
|
151 }
|
|
152
|
|
153 }
|
|
154
|
|
155 } else {
|
|
156 for my $entry (keys %hash) {
|
|
157 print OUT $entry;
|
|
158 }
|
|
159 }
|
|
160
|
|
161 close OUT;
|
|
162
|
|
163 }
|