Mercurial > repos > jbrayet > controlsubset_1_0_docker
comparison createControlSubSet.pl @ 2:106d0a5c2b7f draft
Uploaded
author | jbrayet |
---|---|
date | Tue, 02 Feb 2016 09:02:40 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1:d9fb544723cb | 2:106d0a5c2b7f |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 #filter out dulpicates from SAMPLE (optional) and create a control dataset w/o duplicates with the same number of reads as in the SAMPLE | |
4 | |
5 use strict; | |
6 use warnings; | |
7 use diagnostics; | |
8 | |
9 my $usage = qq{ | |
10 $0 | |
11 | |
12 ----------------------------- | |
13 mandatory parameters: | |
14 | |
15 -f CHiP_file | |
16 -c control_file | |
17 -t type [bam, sam, eland] | |
18 -o output file | |
19 ----------------------------- | |
20 optional parameters: | |
21 | |
22 none | |
23 }; | |
24 | |
25 if(scalar(@ARGV) == 0){ | |
26 print $usage; | |
27 exit(0); | |
28 } | |
29 | |
30 ## mandatory arguments | |
31 | |
32 my $filename = ""; | |
33 my $output_fname = ""; | |
34 | |
35 my $controlFilename = ""; | |
36 my $type = ""; | |
37 my $sampleOutput = ""; | |
38 | |
39 | |
40 ## optional arguments | |
41 | |
42 ## parse command line arguments | |
43 | |
44 while(scalar(@ARGV) > 0){ | |
45 my $this_arg = shift @ARGV; | |
46 if ( $this_arg eq '-h') {print "$usage\n"; exit; } | |
47 | |
48 elsif ( $this_arg eq '-f') {$filename = shift @ARGV;} | |
49 elsif ( $this_arg eq '-c') {$controlFilename = shift @ARGV;} | |
50 elsif ( $this_arg eq '-t') {$type = shift @ARGV;} | |
51 elsif ( $this_arg eq '-o') {$output_fname = shift @ARGV;} | |
52 elsif ( $this_arg eq '-s') {$sampleOutput = shift @ARGV;} | |
53 | |
54 | |
55 elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";} | |
56 } | |
57 | |
58 if ( $filename eq ""){ | |
59 die "you should specify chip file\n"; | |
60 } | |
61 if( $controlFilename eq ""){ | |
62 die "you should specify control file\n"; | |
63 } | |
64 if( $type eq ""){ | |
65 die "you should specify file type (bam, sam or eland)\n"; | |
66 } | |
67 if( $output_fname eq ""){ | |
68 die "you should specify output filename\n"; | |
69 } | |
70 | |
71 | |
72 print "\n-----------------\n\n"; | |
73 | |
74 my %hash; | |
75 my $chipCount = 0; | |
76 my @header; | |
77 | |
78 | |
79 if ($type eq "eland") { | |
80 open FILE, "< $filename " || die "$filename : $!\n"; | |
81 while(<FILE>){ | |
82 my @fields = split(/\t/,$_); | |
83 my $entry = $fields[6].":".$fields[7]."-".$fields[8]; | |
84 unless (exists($hash{$entry})) { | |
85 $hash{$entry} = $_; | |
86 $chipCount++; | |
87 } | |
88 } | |
89 } elsif ($type eq "sam") { | |
90 open FILE, "< $filename " || die "$filename : $!\n"; | |
91 while(<FILE>){ | |
92 if (m/^@/) { | |
93 push(@header,$_); | |
94 next; | |
95 } | |
96 my @fields = split(/\t/,$_); | |
97 next if (scalar(@fields)<10); | |
98 my $entry = $fields[2].":".$fields[3]."-".$fields[1]; | |
99 unless (exists($hash{$entry})) { | |
100 $hash{$entry} = $_; | |
101 $chipCount++; | |
102 } | |
103 } | |
104 } elsif ($type eq "bam") { | |
105 open(FILE, "samtools view -h $filename |") or die "$0: can't open ".$filename.":$!\n"; | |
106 while(<FILE>){ | |
107 if (m/^@/) { | |
108 push(@header,$_); | |
109 next; | |
110 } | |
111 my @fields = split(/\t/,$_); | |
112 next if (scalar(@fields)<10); | |
113 my $entry = $fields[2].":".$fields[3]."-".$fields[1]; | |
114 unless (exists($hash{$entry})) { | |
115 $hash{$entry} = $_; | |
116 $chipCount++; | |
117 } | |
118 } | |
119 } | |
120 close FILE; | |
121 print "ChIP: $chipCount\n"; | |
122 | |
123 if ($sampleOutput ne "") { | |
124 | |
125 open OUT, "> $sampleOutput" || die "$sampleOutput: $!\n"; | |
126 | |
127 if ($type eq "bam" || $type eq "sam") { #print header | |
128 for my $headerLine (@header) { | |
129 print OUT $headerLine; | |
130 } | |
131 } | |
132 for my $line (values %hash) { | |
133 print OUT $line; | |
134 } | |
135 close OUT; | |
136 } | |
137 | |
138 delete @hash{keys %hash}; | |
139 @header = (); | |
140 | |
141 my $controlCount = 0; | |
142 if ($type eq "eland") { | |
143 open FILE, "< $controlFilename " || die "$controlFilename : $!\n"; | |
144 while(<FILE>){ | |
145 my @fields = split(/\t/,$_); | |
146 my $entry = $fields[6].":".$fields[7]."-".$fields[8]; | |
147 unless (exists($hash{$entry})) { | |
148 $hash{$entry} = $_; | |
149 $controlCount++; | |
150 } | |
151 } | |
152 } elsif ($type eq "sam") { | |
153 open FILE, "< $controlFilename " || die "$controlFilename : $!\n"; | |
154 while(<FILE>){ | |
155 if (m/^@/) { | |
156 push(@header,$_); | |
157 next; | |
158 } | |
159 my @fields = split(/\t/,$_); | |
160 my $entry = $fields[2].":".$fields[3]."-".$fields[1]; | |
161 unless (exists($hash{$entry})) { | |
162 $hash{$entry} = $_; | |
163 $controlCount++; | |
164 } | |
165 } | |
166 } elsif ($type eq "bam") { | |
167 open(FILE, "samtools view -h $controlFilename |") or die "$0: can't open ".$controlFilename.":$!\n"; | |
168 while(<FILE>){ | |
169 if (m/^@/) { | |
170 push(@header,$_); | |
171 next; | |
172 } | |
173 my @fields = split(/\t/,$_); | |
174 my $entry = $fields[2].":".$fields[3]."-".$fields[1]; | |
175 unless (exists($hash{$entry})) { | |
176 $hash{$entry} = $_; | |
177 $controlCount++; | |
178 } | |
179 } | |
180 } | |
181 close FILE; | |
182 print "Control: $controlCount\n"; | |
183 my $prob = $chipCount/$controlCount; | |
184 | |
185 open OUT, "> $output_fname" || die "$output_fname: $!\n"; | |
186 | |
187 if ($type eq "bam" || $type eq "sam") { #print header | |
188 for my $headerLine (@header) { | |
189 print OUT $headerLine; | |
190 } | |
191 } | |
192 my $count = 0; | |
193 | |
194 for my $line (values %hash) { | |
195 my $rand = rand(); | |
196 | |
197 if ($rand < $prob) { | |
198 print OUT $line; | |
199 $count ++; | |
200 } | |
201 last if ($count == $chipCount); | |
202 } | |
203 | |
204 | |
205 if ($count < $chipCount) { | |
206 | |
207 $prob = ($chipCount-$count)/$controlCount*1.1; | |
208 | |
209 for my $line (values %hash) { | |
210 my $rand = rand(); | |
211 | |
212 if ($rand < $prob) { | |
213 print OUT $line; | |
214 $count ++; | |
215 } | |
216 last if ($count == $chipCount); | |
217 } | |
218 } | |
219 | |
220 print "count = $count\n"; | |
221 close OUT; | |
222 |