Mercurial > repos > jbrayet > controlsubset_1_0_docker
comparison createControlSubSet.pl @ 2:106d0a5c2b7f draft
Uploaded
| author | jbrayet |
|---|---|
| date | Tue, 02 Feb 2016 09:02:40 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:d9fb544723cb | 2:106d0a5c2b7f |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 | |
| 3 #filter out dulpicates from SAMPLE (optional) and create a control dataset w/o duplicates with the same number of reads as in the SAMPLE | |
| 4 | |
| 5 use strict; | |
| 6 use warnings; | |
| 7 use diagnostics; | |
| 8 | |
| 9 my $usage = qq{ | |
| 10 $0 | |
| 11 | |
| 12 ----------------------------- | |
| 13 mandatory parameters: | |
| 14 | |
| 15 -f CHiP_file | |
| 16 -c control_file | |
| 17 -t type [bam, sam, eland] | |
| 18 -o output file | |
| 19 ----------------------------- | |
| 20 optional parameters: | |
| 21 | |
| 22 none | |
| 23 }; | |
| 24 | |
| 25 if(scalar(@ARGV) == 0){ | |
| 26 print $usage; | |
| 27 exit(0); | |
| 28 } | |
| 29 | |
| 30 ## mandatory arguments | |
| 31 | |
| 32 my $filename = ""; | |
| 33 my $output_fname = ""; | |
| 34 | |
| 35 my $controlFilename = ""; | |
| 36 my $type = ""; | |
| 37 my $sampleOutput = ""; | |
| 38 | |
| 39 | |
| 40 ## optional arguments | |
| 41 | |
| 42 ## parse command line arguments | |
| 43 | |
| 44 while(scalar(@ARGV) > 0){ | |
| 45 my $this_arg = shift @ARGV; | |
| 46 if ( $this_arg eq '-h') {print "$usage\n"; exit; } | |
| 47 | |
| 48 elsif ( $this_arg eq '-f') {$filename = shift @ARGV;} | |
| 49 elsif ( $this_arg eq '-c') {$controlFilename = shift @ARGV;} | |
| 50 elsif ( $this_arg eq '-t') {$type = shift @ARGV;} | |
| 51 elsif ( $this_arg eq '-o') {$output_fname = shift @ARGV;} | |
| 52 elsif ( $this_arg eq '-s') {$sampleOutput = shift @ARGV;} | |
| 53 | |
| 54 | |
| 55 elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";} | |
| 56 } | |
| 57 | |
| 58 if ( $filename eq ""){ | |
| 59 die "you should specify chip file\n"; | |
| 60 } | |
| 61 if( $controlFilename eq ""){ | |
| 62 die "you should specify control file\n"; | |
| 63 } | |
| 64 if( $type eq ""){ | |
| 65 die "you should specify file type (bam, sam or eland)\n"; | |
| 66 } | |
| 67 if( $output_fname eq ""){ | |
| 68 die "you should specify output filename\n"; | |
| 69 } | |
| 70 | |
| 71 | |
| 72 print "\n-----------------\n\n"; | |
| 73 | |
| 74 my %hash; | |
| 75 my $chipCount = 0; | |
| 76 my @header; | |
| 77 | |
| 78 | |
| 79 if ($type eq "eland") { | |
| 80 open FILE, "< $filename " || die "$filename : $!\n"; | |
| 81 while(<FILE>){ | |
| 82 my @fields = split(/\t/,$_); | |
| 83 my $entry = $fields[6].":".$fields[7]."-".$fields[8]; | |
| 84 unless (exists($hash{$entry})) { | |
| 85 $hash{$entry} = $_; | |
| 86 $chipCount++; | |
| 87 } | |
| 88 } | |
| 89 } elsif ($type eq "sam") { | |
| 90 open FILE, "< $filename " || die "$filename : $!\n"; | |
| 91 while(<FILE>){ | |
| 92 if (m/^@/) { | |
| 93 push(@header,$_); | |
| 94 next; | |
| 95 } | |
| 96 my @fields = split(/\t/,$_); | |
| 97 next if (scalar(@fields)<10); | |
| 98 my $entry = $fields[2].":".$fields[3]."-".$fields[1]; | |
| 99 unless (exists($hash{$entry})) { | |
| 100 $hash{$entry} = $_; | |
| 101 $chipCount++; | |
| 102 } | |
| 103 } | |
| 104 } elsif ($type eq "bam") { | |
| 105 open(FILE, "samtools view -h $filename |") or die "$0: can't open ".$filename.":$!\n"; | |
| 106 while(<FILE>){ | |
| 107 if (m/^@/) { | |
| 108 push(@header,$_); | |
| 109 next; | |
| 110 } | |
| 111 my @fields = split(/\t/,$_); | |
| 112 next if (scalar(@fields)<10); | |
| 113 my $entry = $fields[2].":".$fields[3]."-".$fields[1]; | |
| 114 unless (exists($hash{$entry})) { | |
| 115 $hash{$entry} = $_; | |
| 116 $chipCount++; | |
| 117 } | |
| 118 } | |
| 119 } | |
| 120 close FILE; | |
| 121 print "ChIP: $chipCount\n"; | |
| 122 | |
| 123 if ($sampleOutput ne "") { | |
| 124 | |
| 125 open OUT, "> $sampleOutput" || die "$sampleOutput: $!\n"; | |
| 126 | |
| 127 if ($type eq "bam" || $type eq "sam") { #print header | |
| 128 for my $headerLine (@header) { | |
| 129 print OUT $headerLine; | |
| 130 } | |
| 131 } | |
| 132 for my $line (values %hash) { | |
| 133 print OUT $line; | |
| 134 } | |
| 135 close OUT; | |
| 136 } | |
| 137 | |
| 138 delete @hash{keys %hash}; | |
| 139 @header = (); | |
| 140 | |
| 141 my $controlCount = 0; | |
| 142 if ($type eq "eland") { | |
| 143 open FILE, "< $controlFilename " || die "$controlFilename : $!\n"; | |
| 144 while(<FILE>){ | |
| 145 my @fields = split(/\t/,$_); | |
| 146 my $entry = $fields[6].":".$fields[7]."-".$fields[8]; | |
| 147 unless (exists($hash{$entry})) { | |
| 148 $hash{$entry} = $_; | |
| 149 $controlCount++; | |
| 150 } | |
| 151 } | |
| 152 } elsif ($type eq "sam") { | |
| 153 open FILE, "< $controlFilename " || die "$controlFilename : $!\n"; | |
| 154 while(<FILE>){ | |
| 155 if (m/^@/) { | |
| 156 push(@header,$_); | |
| 157 next; | |
| 158 } | |
| 159 my @fields = split(/\t/,$_); | |
| 160 my $entry = $fields[2].":".$fields[3]."-".$fields[1]; | |
| 161 unless (exists($hash{$entry})) { | |
| 162 $hash{$entry} = $_; | |
| 163 $controlCount++; | |
| 164 } | |
| 165 } | |
| 166 } elsif ($type eq "bam") { | |
| 167 open(FILE, "samtools view -h $controlFilename |") or die "$0: can't open ".$controlFilename.":$!\n"; | |
| 168 while(<FILE>){ | |
| 169 if (m/^@/) { | |
| 170 push(@header,$_); | |
| 171 next; | |
| 172 } | |
| 173 my @fields = split(/\t/,$_); | |
| 174 my $entry = $fields[2].":".$fields[3]."-".$fields[1]; | |
| 175 unless (exists($hash{$entry})) { | |
| 176 $hash{$entry} = $_; | |
| 177 $controlCount++; | |
| 178 } | |
| 179 } | |
| 180 } | |
| 181 close FILE; | |
| 182 print "Control: $controlCount\n"; | |
| 183 my $prob = $chipCount/$controlCount; | |
| 184 | |
| 185 open OUT, "> $output_fname" || die "$output_fname: $!\n"; | |
| 186 | |
| 187 if ($type eq "bam" || $type eq "sam") { #print header | |
| 188 for my $headerLine (@header) { | |
| 189 print OUT $headerLine; | |
| 190 } | |
| 191 } | |
| 192 my $count = 0; | |
| 193 | |
| 194 for my $line (values %hash) { | |
| 195 my $rand = rand(); | |
| 196 | |
| 197 if ($rand < $prob) { | |
| 198 print OUT $line; | |
| 199 $count ++; | |
| 200 } | |
| 201 last if ($count == $chipCount); | |
| 202 } | |
| 203 | |
| 204 | |
| 205 if ($count < $chipCount) { | |
| 206 | |
| 207 $prob = ($chipCount-$count)/$controlCount*1.1; | |
| 208 | |
| 209 for my $line (values %hash) { | |
| 210 my $rand = rand(); | |
| 211 | |
| 212 if ($rand < $prob) { | |
| 213 print OUT $line; | |
| 214 $count ++; | |
| 215 } | |
| 216 last if ($count == $chipCount); | |
| 217 } | |
| 218 } | |
| 219 | |
| 220 print "count = $count\n"; | |
| 221 close OUT; | |
| 222 |
