Mercurial > repos > mmaiensc > ember
comparison GALAXY_FILES/tools/EMBER/PreProcess_Expression_Data.pl @ 1:e62b2ba92070 default tip
Uploaded
| author | mmaiensc |
|---|---|
| date | Thu, 22 Mar 2012 13:19:59 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:1ef24fd0c914 | 1:e62b2ba92070 |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 # prepare microarray data for use in EMBER w/ discretized behaviors | |
| 3 | |
| 4 use Getopt::Long; | |
| 5 $|++; | |
| 6 | |
| 7 # | |
| 8 # command line arguments | |
| 9 # | |
| 10 $options = "Usage: ./PreProcess_Expresssion_Data.pl <OPTIONS> | |
| 11 -i microarray data (required) | |
| 12 format: <id> <value 1> <value 2> ... <value N> | |
| 13 this requires a title line (starting with #): | |
| 14 #ID <condition 1><replicate 1> <condition 2><replicate 2> ... | |
| 15 where <condition> should match the conditions in input -c, | |
| 16 and the <condition><replicate> pair tells what the data in the column is from | |
| 17 -c list of comparisons (required) | |
| 18 format: <condition1> <condition2> | |
| 19 -a annotation file (required) | |
| 20 format: <id> <gene name> <chr> <start> <end> <strand> | |
| 21 -o output (required) | |
| 22 -p percentile threshold (number between 0 and 1, default 0.63) | |
| 23 this value is used as follows: all data are concatenated into one list, | |
| 24 and the pth percentile of that list is taken as the threshold. | |
| 25 Then, a probe is removed if its value is less than the threshold in ALL conditions. | |
| 26 p=1.0 means all probes are retained, p=0.0 means none are | |
| 27 -l log-transform the data (y or n, default n) | |
| 28 -v verbose (print progress to stdout: y or n, default y) | |
| 29 \n"; | |
| 30 | |
| 31 $in = ""; | |
| 32 $c = ""; | |
| 33 $a = ""; | |
| 34 $o = ""; | |
| 35 $p = 0.63; | |
| 36 $l = "n"; | |
| 37 $v = "y"; | |
| 38 | |
| 39 GetOptions('i=s' => \$in, | |
| 40 'c=s' => \$c, | |
| 41 'a=s' => \$a, | |
| 42 'o=s' => \$o, | |
| 43 'p=f' => \$p, | |
| 44 'l=s' => \$l, | |
| 45 'v=s' => \$v, | |
| 46 ) || die "\n$options\n"; | |
| 47 | |
| 48 if( $in eq "" ){ | |
| 49 print "\nError: set a value for -i\n\n$options"; | |
| 50 exit; | |
| 51 } | |
| 52 if( $c eq "" ){ | |
| 53 print "\nError: set a value for -c\n\n$options"; | |
| 54 exit; | |
| 55 } | |
| 56 if( $a eq "" ){ | |
| 57 print "\nError: set a value for -a\n\n$options"; | |
| 58 exit; | |
| 59 } | |
| 60 if( $o eq "" ){ | |
| 61 print "\nError: set a value for -o\n\n$options"; | |
| 62 exit; | |
| 63 } | |
| 64 | |
| 65 if( $p < 0 || $p > 1 ){ | |
| 66 print "\nError: choose -p to be between 0 and 1\n\n$options"; | |
| 67 exit; | |
| 68 } | |
| 69 if( $l ne "y" && $l ne "n" ){ | |
| 70 print "\nError: choose -l to be y or n\n\n$options"; | |
| 71 exit; | |
| 72 } | |
| 73 if( $v ne "y" && $v ne "n" ){ | |
| 74 print "\nError: choose -v to be y or n\n\n$options"; | |
| 75 exit; | |
| 76 } | |
| 77 | |
| 78 ################################################# | |
| 79 # SECTION 1 - read in file names and conditions # | |
| 80 ################################################# | |
| 81 | |
| 82 # | |
| 83 # read in data list (title line of $in) and list of comparisons | |
| 84 # arrays: @names, @comps | |
| 85 # | |
| 86 open(IN,"$in") || die "Error: can't open file $in\n"; | |
| 87 $line = <IN>; | |
| 88 close(IN); | |
| 89 @names = (); | |
| 90 @parts = split(' ',$line); | |
| 91 if( substr($parts[0], 0, 1) ne "#" ){ | |
| 92 print "Error: no title line detected in file $in\n"; | |
| 93 exit; | |
| 94 } | |
| 95 for($i=1; $i<= $#parts; $i++){ | |
| 96 if( $parts[$i] !~ /#/ ){ | |
| 97 printf("Error: column %i (%s) does not have a replicate number\n", $i, $parts[$i]); | |
| 98 exit; | |
| 99 } | |
| 100 ($condition, $replicate) = split('#', $parts[$i]); | |
| 101 push(@names, [($condition, $replicate)] ); | |
| 102 } | |
| 103 #while($line = <IN>){ | |
| 104 # chomp($line); | |
| 105 # push(@files, $line); | |
| 106 # @parts = split('/',$line); | |
| 107 # ($base, $suff) = split('\.txt', $parts[$#parts]); | |
| 108 # push(@names, $base); | |
| 109 #} | |
| 110 #close(IN); | |
| 111 open(IN,"$c") || die "Error: can't open file $c\n"; | |
| 112 @comps = (); | |
| 113 while($line = <IN>){ | |
| 114 chomp($line); | |
| 115 @parts = split(' ',$line); | |
| 116 if( $#parts != 1 ){ | |
| 117 print "Error: file $c does not have 2 columns\n"; | |
| 118 exit; | |
| 119 } | |
| 120 $info1 = {}; | |
| 121 $info1->{name} = $parts[0]; | |
| 122 $info1->{namei} = 0; | |
| 123 $info2 = {}; | |
| 124 $info2->{name} = $parts[1]; | |
| 125 $info2->{namei} = 0; | |
| 126 push(@comps, [($info1, $info2)] ); | |
| 127 } | |
| 128 close(IN); | |
| 129 | |
| 130 # | |
| 131 # count how many conditions we have, and how many replicates of each | |
| 132 # arrays: @conds | |
| 133 # | |
| 134 @conds = (); | |
| 135 for($i=0; $i<= $#comps; $i++){ | |
| 136 for($j=0; $j< 2; $j++){ | |
| 137 $ok = 1; | |
| 138 $k=-1; | |
| 139 $end = $#conds; | |
| 140 while($k< $end && $ok == 1){ | |
| 141 $k++; | |
| 142 if( $conds[$k]->{name} eq $comps[$i][$j]->{name} ){ | |
| 143 $ok = 0; | |
| 144 } | |
| 145 } | |
| 146 if( $ok == 1 ){ | |
| 147 @empty1 = (); | |
| 148 $info = {}; | |
| 149 $info->{name} = $comps[$i][$j]->{name}; | |
| 150 $info->{reps} = [@empty1]; | |
| 151 $info->{nreps} = 0; | |
| 152 push(@conds, $info); | |
| 153 $comps[$i][$j]->{namei} = $#conds; | |
| 154 } | |
| 155 else{ | |
| 156 $comps[$i][$j]->{namei} = $k; | |
| 157 } | |
| 158 } | |
| 159 } | |
| 160 for($i=0; $i<= $#conds; $i++){ | |
| 161 for($j=0; $j<= $#names; $j++){ | |
| 162 if( $names[$j][0] eq $conds[$i]->{name} ){ | |
| 163 push(@{$conds[$i]->{reps}}, $j); | |
| 164 $conds[$i]->{nreps}++; | |
| 165 } | |
| 166 } | |
| 167 } | |
| 168 | |
| 169 # | |
| 170 # print summary to stdout | |
| 171 # | |
| 172 if( $v eq "y" ){ | |
| 173 printf("\nReading in %i conditions, %i total data columns:\n", $#conds+1, $#names+1); | |
| 174 for($i=0; $i<= $#conds; $i++){ | |
| 175 printf(" %s, %i replicates\n", $conds[$i]->{name}, $conds[$i]->{nreps}); | |
| 176 } | |
| 177 printf("\nMaking %i comparisons:\n", $#comps+1); | |
| 178 for($i=0; $i<= $#comps; $i++){ | |
| 179 printf(" %s vs %s\n", $comps[$i][0]->{name}, $comps[$i][1]->{name}); | |
| 180 } | |
| 181 } | |
| 182 | |
| 183 ########################################################### | |
| 184 # SECTION 2 - read in annotation file and microarray data # | |
| 185 ########################################################### | |
| 186 | |
| 187 if( $v eq "y" ){ | |
| 188 print "\nReading in annotation file and data\n"; | |
| 189 } | |
| 190 | |
| 191 # | |
| 192 # read in probe annotations | |
| 193 # arrays: @probes | |
| 194 # | |
| 195 open(IN,"$a") || die "Error: can't open file $a\n"; | |
| 196 @vals = (); | |
| 197 while($line = <IN>){ | |
| 198 chomp($line); | |
| 199 @parts = split(' ',$line); | |
| 200 if( $#parts != 5 ){ | |
| 201 print "Error: file $a does not have 6 columns\n"; | |
| 202 exit; | |
| 203 } | |
| 204 @empty1 = (); | |
| 205 @empty2 = (); | |
| 206 $info = {}; | |
| 207 $info->{id} = $parts[0]; | |
| 208 $info->{name} = $parts[1]; | |
| 209 $info->{chr} = $parts[2]; | |
| 210 $info->{start} = $parts[3]; | |
| 211 $info->{end} = $parts[4]; | |
| 212 $info->{strand} = $parts[5]; | |
| 213 $info->{vals} = [@empty1]; | |
| 214 $info->{classes} = [@empty2]; | |
| 215 $info->{use} = 0; | |
| 216 push(@vals, $info); | |
| 217 } | |
| 218 close(IN); | |
| 219 @probes = sort{ $a->{id} cmp $b->{id} } @vals; | |
| 220 | |
| 221 # | |
| 222 # read in data, sort by ids, and add to @probes | |
| 223 # record average for each file | |
| 224 # arrays: @averages | |
| 225 # | |
| 226 @averages = (); | |
| 227 open(IN,"$in"); | |
| 228 $line = <IN>; | |
| 229 @data = (); | |
| 230 while( $line = <IN> ){ | |
| 231 chomp($line); | |
| 232 @parts = split(' ',$line); | |
| 233 push(@data, [@parts] ); | |
| 234 } | |
| 235 @sdata = sort{ $a->[0] cmp $b->[0] } @data; | |
| 236 close(IN); | |
| 237 | |
| 238 $j=0; | |
| 239 $k=0; | |
| 240 $endd = $#sdata; | |
| 241 $endp = $#probes; | |
| 242 while( $j <= $endd && $k<= $endp ){ | |
| 243 if( $probes[$k]->{id} gt $sdata[$j][0] ){$j++;} | |
| 244 elsif( $probes[$k]->{id} lt $sdata[$j][0] ){ | |
| 245 $probes[$k]->{use} = 0; | |
| 246 $k++; | |
| 247 } | |
| 248 elsif( $probes[$k]->{id} eq $sdata[$j][0] ){ | |
| 249 for( $i=0; $i<= $#names; $i++){ | |
| 250 if( $l eq "n" ){push(@{$probes[$k]->{vals}}, $sdata[$j][$i+1] );} | |
| 251 if( $l eq "y" ){push(@{$probes[$k]->{vals}}, log($sdata[$j][$i+1])/log(2) );} | |
| 252 } | |
| 253 $probes[$k]->{use} = 1; | |
| 254 $j++; | |
| 255 $k++; | |
| 256 } | |
| 257 else{ | |
| 258 printf("Unknown line comparison\n"); | |
| 259 exit; | |
| 260 } | |
| 261 } | |
| 262 for($i=0; $i<= $#names; $i++){ | |
| 263 $avg=0; | |
| 264 # | |
| 265 # compute average | |
| 266 # | |
| 267 $avg = 0; | |
| 268 $tot = 0; | |
| 269 for($k=0; $k<= $#probes; $k++){ | |
| 270 if( $probes[$k]->{use} == 1 ){ | |
| 271 $avg+= $probes[$k]->{vals}[$i]; | |
| 272 $tot++; | |
| 273 } | |
| 274 } | |
| 275 $avg/= $tot; | |
| 276 push(@averages, $avg); | |
| 277 } | |
| 278 | |
| 279 if( $v eq "y" ){ | |
| 280 printf(" using %i probes\n", $tot); | |
| 281 } | |
| 282 | |
| 283 ################################################################## | |
| 284 # SECTION 3 - determine threshold and label probes to be removed # | |
| 285 ################################################################## | |
| 286 | |
| 287 if( $v eq "y" ){ | |
| 288 printf("\nDetermining threshold\n"); | |
| 289 } | |
| 290 # | |
| 291 # find threshold | |
| 292 # variables: $thresh | |
| 293 # | |
| 294 @allvals = (); | |
| 295 for($i=0; $i<= $#probes; $i++){ | |
| 296 for($j=0; $j<= $#names; $j++){ | |
| 297 if( $probes[$i]->{use} == 1 ){ | |
| 298 push(@allvals, ($probes[$i]->{vals}[$j])/$averages[$j]); | |
| 299 } | |
| 300 } | |
| 301 } | |
| 302 | |
| 303 @sallvals = sort{ $a <=> $b } @allvals; | |
| 304 | |
| 305 $end = (1-$p)*$#sallvals; | |
| 306 $i = sprintf("%i", $end); | |
| 307 $thresh = $sallvals[$i]; | |
| 308 | |
| 309 # | |
| 310 # label probes as on or off | |
| 311 # | |
| 312 $totuse = 0; | |
| 313 for($i=0; $i<= $#probes; $i++){ | |
| 314 $j=0; | |
| 315 $probes[$i]->{use} = 0; | |
| 316 while($j<= $#names && $probes[$i]->{use} == 0 ){ | |
| 317 if( ($probes[$i]->{vals}[$j])/$averages[$j] > $thresh ){ | |
| 318 $probes[$i]->{use} = 1; | |
| 319 $totuse++; | |
| 320 } | |
| 321 $j++; | |
| 322 } | |
| 323 } | |
| 324 if( $v eq "y" ){ | |
| 325 printf(" %i probes retained for analysis\n", $totuse); | |
| 326 } | |
| 327 | |
| 328 ############################################# | |
| 329 # SECTION 4 - make discrete classifications # | |
| 330 ############################################# | |
| 331 | |
| 332 if( $v eq "y" ){ | |
| 333 printf("\nClassifying probes\n"); | |
| 334 } | |
| 335 # | |
| 336 # for each probe, | |
| 337 # for each comparison, | |
| 338 # compute avg and stdev of first and second conditions | |
| 339 # apply rules for classification | |
| 340 # | |
| 341 for($i=0; $i<= $#probes; $i++){ | |
| 342 for($j=0; $j<= $#comps; $j++){ | |
| 343 $avg = (0,0); | |
| 344 $sd = (0,0); | |
| 345 for($k=0; $k< 2; $k++){ | |
| 346 $condi = $comps[$j][$k]->{namei}; | |
| 347 $avg[$k] = 0; | |
| 348 $sd[$k] = 0; | |
| 349 for($l=0; $l< $conds[$condi]->{nreps}; $l++){ | |
| 350 $filei = $conds[$condi]->{reps}[$l]; | |
| 351 $avg[$k]+= $probes[$i]->{vals}[$filei]; | |
| 352 } | |
| 353 $avg[$k] /= $conds[$condi]->{nreps}; | |
| 354 for($l=0; $l< $conds[$condi]->{nreps}; $l++){ | |
| 355 $filei = $conds[$condi]->{reps}[$l]; | |
| 356 $sd[$k]+= ($avg[$k]-$probes[$i]->{vals}[$filei])**2; | |
| 357 } | |
| 358 $sd[$k] = sqrt($sd[$k]/($conds[$condi]->{nreps})); | |
| 359 } | |
| 360 | |
| 361 $s = $sd[0] + $sd[1]; | |
| 362 # ++ category | |
| 363 if( $avg[1]-$avg[0] > 3*$s ){ | |
| 364 push( @{$probes[$i]->{classes}}, 2); | |
| 365 } | |
| 366 # + category | |
| 367 elsif( $avg[1]-$avg[0] > $s ){ | |
| 368 push(@{$probes[$i]->{classes}}, 1); | |
| 369 } | |
| 370 # -- category | |
| 371 elsif( $avg[0]-$avg[1] > 3*$s ){ | |
| 372 push(@{$probes[$i]->{classes}}, -2); | |
| 373 } | |
| 374 # 0 category | |
| 375 elsif( $avg[0]-$avg[1] > $s ){ | |
| 376 push(@{$probes[$i]->{classes}}, -1); | |
| 377 } | |
| 378 else{ | |
| 379 push(@{$probes[$i]->{classes}}, 0); | |
| 380 } | |
| 381 } | |
| 382 } | |
| 383 | |
| 384 ############################ | |
| 385 # SECTION 5 - print result # | |
| 386 ############################ | |
| 387 | |
| 388 if( $v eq "y" ){ | |
| 389 printf("\nPrinting result\n\n"); | |
| 390 } | |
| 391 open(OUT,">$o"); | |
| 392 $delim = " "; | |
| 393 for($i=0; $i<= $#probes; $i++){ | |
| 394 printf OUT ("%s%s%s%s%s%s%s%s%s%s%s", $probes[$i]->{id}, $delim, $probes[$i]->{name}, $delim, $probes[$i]->{chr}, $delim, $probes[$i]->{start}, $delim, $probes[$i]->{end}, $delim, $probes[$i]->{strand} ); | |
| 395 if( $probes[$i]->{use} == 1 ){ | |
| 396 for($j=0; $j<= $#comps; $j++){ | |
| 397 printf OUT ("%s%i", $delim, $probes[$i]->{classes}[$j]); | |
| 398 } | |
| 399 print OUT "\n"; | |
| 400 } | |
| 401 else{ | |
| 402 print OUT "$delim-1\n"; | |
| 403 } | |
| 404 } | |
| 405 close(OUT); | |
| 406 | |
| 407 | |
| 408 | |
| 409 | |
| 410 | |
| 411 | |
| 412 | |
| 413 |
