| 1 | 1 #!/usr/bin/perl | 
|  | 2 | 
|  | 3 #create a control dataset with the same number of reads as in the SAMPLE (highest peaks) | 
|  | 4 | 
|  | 5 use strict; | 
|  | 6 use warnings; | 
|  | 7 use diagnostics; | 
|  | 8 | 
|  | 9 my $usage = qq{ | 
|  | 10     $0 | 
|  | 11 | 
|  | 12     ----------------------------- | 
|  | 13     mandatory parameters: | 
|  | 14 | 
|  | 15     -f CHiP_file | 
|  | 16     -c control_file | 
|  | 17     -o output file | 
|  | 18     ----------------------------- | 
|  | 19     optional parameters: | 
|  | 20 | 
|  | 21     -n number of files to create | 
|  | 22 | 
|  | 23     none | 
|  | 24 }; | 
|  | 25 | 
|  | 26 if(scalar(@ARGV) == 0){ | 
|  | 27     print $usage; | 
|  | 28     exit(0); | 
|  | 29 } | 
|  | 30 | 
|  | 31 ## mandatory arguments | 
|  | 32 | 
|  | 33 my $filename = ""; | 
|  | 34 my $output_fname = ""; | 
|  | 35 | 
|  | 36 my $controlFilename = ""; | 
|  | 37 | 
|  | 38 my $nBootstrap = 1; | 
|  | 39 | 
|  | 40 | 
|  | 41 ## optional arguments | 
|  | 42 | 
|  | 43 ## parse command line arguments | 
|  | 44 | 
|  | 45 while(scalar(@ARGV) > 0){ | 
|  | 46     my $this_arg = shift @ARGV; | 
|  | 47     if ( $this_arg eq '-h') {print "$usage\n"; exit; } | 
|  | 48 | 
|  | 49     elsif ( $this_arg eq '-f') {$filename = shift @ARGV;} | 
|  | 50     elsif ( $this_arg eq '-c') {$controlFilename = shift @ARGV;} | 
|  | 51     elsif ( $this_arg eq '-o') {$output_fname = shift @ARGV;} | 
|  | 52     elsif ( $this_arg eq '-n') {$nBootstrap = shift @ARGV;} | 
|  | 53 | 
|  | 54     elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";} | 
|  | 55 } | 
|  | 56 | 
|  | 57 if ( $filename eq ""){ | 
|  | 58     die "you should specify chip file\n"; | 
|  | 59 } | 
|  | 60 if( $controlFilename eq ""){ | 
|  | 61     die "you should specify control file\n"; | 
|  | 62 } | 
|  | 63 | 
|  | 64 if( $output_fname eq ""){ | 
|  | 65     die "you should specify output filename\n"; | 
|  | 66 } | 
|  | 67 | 
|  | 68 | 
|  | 69 print "\n-----------------\n\n"; | 
|  | 70 | 
|  | 71 my %hash; | 
|  | 72 my $chipCount = 0; | 
|  | 73 my @header; | 
|  | 74 open FILE, "< $filename " || die "$filename : $!\n"; | 
|  | 75 while(<FILE>){ | 
|  | 76     $chipCount++; | 
|  | 77 } | 
|  | 78 close FILE; | 
|  | 79 #print "ChIP: $chipCount\n"; | 
|  | 80 | 
|  | 81 | 
|  | 82 my $controlCount = 0; | 
|  | 83 | 
|  | 84 open FILE, "< $controlFilename " || die "$controlFilename : $!\n"; | 
|  | 85 while(<FILE>){ | 
|  | 86     next if (/track/); | 
|  | 87     my $entry = $_; | 
|  | 88     my @fields = split(/\t/,$_); | 
|  | 89     $hash{$entry} = $fields[4]; | 
|  | 90     $controlCount++; | 
|  | 91 } | 
|  | 92 #print "controlCount : $controlCount\n"; | 
|  | 93 | 
|  | 94 close FILE; | 
|  | 95 open OUT, "> $output_fname" || die "$output_fname: $!\n"; | 
|  | 96 my $count = 0; | 
|  | 97 if ($controlCount>$chipCount) { | 
|  | 98     my $prob = $chipCount/$controlCount*1.1; | 
|  | 99     for my $entry (sort {$hash{$b}<=>$hash{$a}} keys %hash) { | 
|  | 100 	my $yes = rand(); | 
|  | 101 	if ($yes<=$prob) {$yes=1;}else {$yes=0;} | 
|  | 102 	if ($yes) { | 
|  | 103 		print OUT $entry ; | 
|  | 104 		$count++; | 
|  | 105 	} | 
|  | 106 	if ($count >=$chipCount) { | 
|  | 107 	    last; | 
|  | 108 	} | 
|  | 109     } | 
|  | 110 | 
|  | 111 } else { | 
|  | 112     for my $entry (keys %hash) { | 
|  | 113 	print OUT $entry; | 
|  | 114     } | 
|  | 115 } | 
|  | 116 | 
|  | 117 close OUT; | 
|  | 118 | 
|  | 119 | 
|  | 120 for my $try (2..$nBootstrap) { | 
|  | 121 | 
|  | 122 	open OUT, "> $output_fname$try" || die "$output_fname$try: $!\n"; | 
|  | 123 	my $count = 0; | 
|  | 124 	if ($controlCount>$chipCount) { | 
|  | 125 	    my $prob = $chipCount/$controlCount*1.1; | 
|  | 126 	    for my $entry (sort {$hash{$b}<=>$hash{$a}} keys %hash) { | 
|  | 127 		my $yes = rand(); | 
|  | 128 		if ($yes<=$prob) {$yes=1;}else {$yes=0;} | 
|  | 129 		if ($yes) { | 
|  | 130 			print OUT $entry ; | 
|  | 131 			$count++; | 
|  | 132 		} | 
|  | 133 | 
|  | 134 		if ($count >=$chipCount ) { | 
|  | 135 		    last; | 
|  | 136 		} | 
|  | 137 	    } | 
|  | 138 	    if ($count <$chipCount) { #do it again! | 
|  | 139 | 
|  | 140 		for my $entry (sort {$hash{$b}<=>$hash{$a}} keys %hash) { | 
|  | 141 			my $yes = rand(); | 
|  | 142 			if ($yes<=$prob) {$yes=1;}else {$yes=0;} | 
|  | 143 			if ($yes) { | 
|  | 144 				print OUT $entry ; | 
|  | 145 				$count++; | 
|  | 146 			} | 
|  | 147 | 
|  | 148 			if ($count >=$chipCount ) { | 
|  | 149 			    last; | 
|  | 150 			} | 
|  | 151 	        } | 
|  | 152 | 
|  | 153 	    } | 
|  | 154 | 
|  | 155 	} else { | 
|  | 156 	    for my $entry (keys %hash) { | 
|  | 157 		print OUT $entry; | 
|  | 158 	    } | 
|  | 159 	} | 
|  | 160 | 
|  | 161 	close OUT; | 
|  | 162 | 
|  | 163 } |