comparison createControlSubSet.pl @ 2:106d0a5c2b7f draft

Uploaded
author jbrayet
date Tue, 02 Feb 2016 09:02:40 -0500
parents
children
comparison
equal deleted inserted replaced
1:d9fb544723cb 2:106d0a5c2b7f
1 #!/usr/bin/perl
2
3 #filter out dulpicates from SAMPLE (optional) and create a control dataset w/o duplicates with the same number of reads as in the SAMPLE
4
5 use strict;
6 use warnings;
7 use diagnostics;
8
9 my $usage = qq{
10 $0
11
12 -----------------------------
13 mandatory parameters:
14
15 -f CHiP_file
16 -c control_file
17 -t type [bam, sam, eland]
18 -o output file
19 -----------------------------
20 optional parameters:
21
22 none
23 };
24
25 if(scalar(@ARGV) == 0){
26 print $usage;
27 exit(0);
28 }
29
30 ## mandatory arguments
31
32 my $filename = "";
33 my $output_fname = "";
34
35 my $controlFilename = "";
36 my $type = "";
37 my $sampleOutput = "";
38
39
40 ## optional arguments
41
42 ## parse command line arguments
43
44 while(scalar(@ARGV) > 0){
45 my $this_arg = shift @ARGV;
46 if ( $this_arg eq '-h') {print "$usage\n"; exit; }
47
48 elsif ( $this_arg eq '-f') {$filename = shift @ARGV;}
49 elsif ( $this_arg eq '-c') {$controlFilename = shift @ARGV;}
50 elsif ( $this_arg eq '-t') {$type = shift @ARGV;}
51 elsif ( $this_arg eq '-o') {$output_fname = shift @ARGV;}
52 elsif ( $this_arg eq '-s') {$sampleOutput = shift @ARGV;}
53
54
55 elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";}
56 }
57
58 if ( $filename eq ""){
59 die "you should specify chip file\n";
60 }
61 if( $controlFilename eq ""){
62 die "you should specify control file\n";
63 }
64 if( $type eq ""){
65 die "you should specify file type (bam, sam or eland)\n";
66 }
67 if( $output_fname eq ""){
68 die "you should specify output filename\n";
69 }
70
71
72 print "\n-----------------\n\n";
73
74 my %hash;
75 my $chipCount = 0;
76 my @header;
77
78
79 if ($type eq "eland") {
80 open FILE, "< $filename " || die "$filename : $!\n";
81 while(<FILE>){
82 my @fields = split(/\t/,$_);
83 my $entry = $fields[6].":".$fields[7]."-".$fields[8];
84 unless (exists($hash{$entry})) {
85 $hash{$entry} = $_;
86 $chipCount++;
87 }
88 }
89 } elsif ($type eq "sam") {
90 open FILE, "< $filename " || die "$filename : $!\n";
91 while(<FILE>){
92 if (m/^@/) {
93 push(@header,$_);
94 next;
95 }
96 my @fields = split(/\t/,$_);
97 next if (scalar(@fields)<10);
98 my $entry = $fields[2].":".$fields[3]."-".$fields[1];
99 unless (exists($hash{$entry})) {
100 $hash{$entry} = $_;
101 $chipCount++;
102 }
103 }
104 } elsif ($type eq "bam") {
105 open(FILE, "samtools view -h $filename |") or die "$0: can't open ".$filename.":$!\n";
106 while(<FILE>){
107 if (m/^@/) {
108 push(@header,$_);
109 next;
110 }
111 my @fields = split(/\t/,$_);
112 next if (scalar(@fields)<10);
113 my $entry = $fields[2].":".$fields[3]."-".$fields[1];
114 unless (exists($hash{$entry})) {
115 $hash{$entry} = $_;
116 $chipCount++;
117 }
118 }
119 }
120 close FILE;
121 print "ChIP: $chipCount\n";
122
123 if ($sampleOutput ne "") {
124
125 open OUT, "> $sampleOutput" || die "$sampleOutput: $!\n";
126
127 if ($type eq "bam" || $type eq "sam") { #print header
128 for my $headerLine (@header) {
129 print OUT $headerLine;
130 }
131 }
132 for my $line (values %hash) {
133 print OUT $line;
134 }
135 close OUT;
136 }
137
138 delete @hash{keys %hash};
139 @header = ();
140
141 my $controlCount = 0;
142 if ($type eq "eland") {
143 open FILE, "< $controlFilename " || die "$controlFilename : $!\n";
144 while(<FILE>){
145 my @fields = split(/\t/,$_);
146 my $entry = $fields[6].":".$fields[7]."-".$fields[8];
147 unless (exists($hash{$entry})) {
148 $hash{$entry} = $_;
149 $controlCount++;
150 }
151 }
152 } elsif ($type eq "sam") {
153 open FILE, "< $controlFilename " || die "$controlFilename : $!\n";
154 while(<FILE>){
155 if (m/^@/) {
156 push(@header,$_);
157 next;
158 }
159 my @fields = split(/\t/,$_);
160 my $entry = $fields[2].":".$fields[3]."-".$fields[1];
161 unless (exists($hash{$entry})) {
162 $hash{$entry} = $_;
163 $controlCount++;
164 }
165 }
166 } elsif ($type eq "bam") {
167 open(FILE, "samtools view -h $controlFilename |") or die "$0: can't open ".$controlFilename.":$!\n";
168 while(<FILE>){
169 if (m/^@/) {
170 push(@header,$_);
171 next;
172 }
173 my @fields = split(/\t/,$_);
174 my $entry = $fields[2].":".$fields[3]."-".$fields[1];
175 unless (exists($hash{$entry})) {
176 $hash{$entry} = $_;
177 $controlCount++;
178 }
179 }
180 }
181 close FILE;
182 print "Control: $controlCount\n";
183 my $prob = $chipCount/$controlCount;
184
185 open OUT, "> $output_fname" || die "$output_fname: $!\n";
186
187 if ($type eq "bam" || $type eq "sam") { #print header
188 for my $headerLine (@header) {
189 print OUT $headerLine;
190 }
191 }
192 my $count = 0;
193
194 for my $line (values %hash) {
195 my $rand = rand();
196
197 if ($rand < $prob) {
198 print OUT $line;
199 $count ++;
200 }
201 last if ($count == $chipCount);
202 }
203
204
205 if ($count < $chipCount) {
206
207 $prob = ($chipCount-$count)/$controlCount*1.1;
208
209 for my $line (values %hash) {
210 my $rand = rand();
211
212 if ($rand < $prob) {
213 print OUT $line;
214 $count ++;
215 }
216 last if ($count == $chipCount);
217 }
218 }
219
220 print "count = $count\n";
221 close OUT;
222