Mercurial > repos > kellrott > matrix_manipulate
changeset 5:83f2acca2387 draft
Deleted selected files
author | kellrott |
---|---|
date | Thu, 13 Jun 2013 16:54:40 -0400 |
parents | d0e3b5778e17 |
children | efb356d09d6d |
files | matrix_manipulate/aggregate.pl~ matrix_manipulate/floatMatrix.pyc matrix_manipulate/quartile_norm.pl |
diffstat | 3 files changed, 0 insertions(+), 321 deletions(-) [+] |
line wrap: on
line diff
--- a/matrix_manipulate/aggregate.pl~ Thu Jun 13 16:54:15 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,219 +0,0 @@ -#!/usr/bin/perl - -############################################################################## -############################################################################## -## -## aggregate.pl -## -############################################################################## -############################################################################## -## -## Written by Josh Stuart in the lab of Stuart Kim, Stanford University. -## -## Email address: jstuart@stanford.edu -## Phone: (650) 725-7612 -## -## Postal address: Department of Developmental Biology -## Beckman Center Room B314 -## 279 Campus Dr. -## Stanford, CA 94305 -## -## Web site: http://www.smi.stanford.edu/people/stuart -## -############################################################################## -############################################################################## -## -## Written: 00/00/02 -## Updated: 00/00/02 -## -############################################################################## -############################################################################## - -## Support for MEDIAN calculation added by Alex Williams, 2007 - -require "$ENV{MYPERLDIR}/lib/libfile.pl"; -require "$ENV{MYPERLDIR}/lib/libstats.pl"; - -use strict; -use warnings; - -my @flags = ( - [ '-q', 'scalar', 0, 1] - , [ '-k', 'scalar', 1, undef] - , [ '-d', 'scalar', "\t", undef] - , [ '-h', 'scalar', 0, undef] - , [ '-sig', 'scalar', 3, undef] - , [ '-f', 'scalar','mean', undef] - , ['--file', 'scalar', '-', undef] - , ['--emptyval', 'scalar', 'NaN', undef] - , ['--test', 'scalar', 0, 1] - ); - -my %args = %{&parseArgs(\@ARGV, \@flags)}; - -if(exists($args{'--help'})) -{ - print STDOUT <DATA>; - exit(0); -} - -my $emptyVal = $args{'--emptyval'}; -my $runTest = $args{'--test'}; # <-- this doesn't do anything right now. Theoretically it should run a test to make sure the values actually work for a few known cases (sanity check for this program) -my $verbose = not($args{'-q'}); -my $col = int($args{'-k'}) - 1; -my $delim = $args{'-d'}; -my $function = lc($args{'-f'}); # lower-case whatever the function name was -my $headers = $args{'-h'}; -my $sigs = $args{'-sig'}; -my $file = $args{'--file'}; - -my $sprintf_ctrl = '%.' . $sigs . 'f'; - - -if ($function ne 'mean' && $function ne 'median') { - die "ERROR in aggregate.pl: You must specify a function ( -f FUNCTION_NAME ). The supported functions are mean and median.\n"; -} - -# my ($ids, $rows) = &readIds($file, $col, $delim); -# my $data = &readDataMatrix($file, $col, $delim, \$max_cols); -my ($data, $ids, $rows, $max_cols) = &readDataAndIds($file, $col, $delim); - -for(my $i = 0; $i < scalar(@{$rows}) and $i < $headers; $i++) { - print $$ids[$i], $delim, join($delim, @{$$data[$i]}), "\n"; -} - -for(my $i = $headers; $i < scalar(@{$rows}); $i++) -{ - my $id = $$ids[$i]; - - my $useMedian = ($function eq 'median'); - my $useMean = ($function eq 'mean'); - - my @sum; - my @count; - my @medianCalcArray; # this is actually just a list of all the items in the column for each key - # Note: medianCalcArray is an array of ARRAYS. - - for(my $j = 0; $j < $max_cols; $j++) { - $sum[$j] = 0; - $count[$j] = 0; - } - - #my @r = @{$$rows[$i]}; - - for(my $k = 0; $k < scalar(@{$$rows[$i]}); $k++) { - my $row = $$rows[$i][$k]; - - #print "Row is $row\n"; - for(my $j = 0; $j < $max_cols; $j++) { - my $thisEntry = $$data[$row][$j]; - if(defined($thisEntry)) { - if($thisEntry =~ /^\s*[\d+\.eE-]+\s*$/) { - $count[$j]++; - $sum[$j] += $thisEntry; - if ($useMedian) { - if (!defined($medianCalcArray[$j])) { - @{$medianCalcArray[$j]} = (); - } - push(@{$medianCalcArray[$j]}, $thisEntry); - #print "$j: $k: $row: "; - #print @{$medianCalcArray[$j]}; - #print "\n"; - } - } - } - } - } - - my @agg; - for(my $j = 0; $j < $max_cols; $j++) { - $agg[$j] = ${emptyVal}; - if($useMean) { - $agg[$j] = ($count[$j] > 0) ? sprintf($sprintf_ctrl, ($sum[$j] / $count[$j])) : ${emptyVal}; - } - if($useMedian) { - # Only calculate the median if we actually specifically want it... otherwise it slows us down - if (defined($medianCalcArray[$j]) && (scalar(@{$medianCalcArray[$j]}) > 0) ) { - $agg[$j] = vec_median(\@{$medianCalcArray[$j]}); # <-- vec_median is in libstats.pl - } - } - if ($useMean && $useMedian) { die "Error in arguments to aggregate.pl: You cannot specify both *mean* AND *median* at the same time! (We would be overwriting the storage variable!) You will have to run the program twice, once with each option.\n"; } - } - - print STDOUT $id, (($max_cols > 0) ? ($delim . join($delim, @agg)) : ""), "\n"; -} - -exit(0); - - -__DATA__ -syntax: aggregate.pl [OPTIONS] - -Combines the numeric data across rows with the same key. Useful if you have experiments -with replicates. See below for a complete example. - -Example usage: aggregate.pl -f median MYFILE - -OPTIONS are: - --q: Quiet mode (default is verbose) - --k COL: Use the column COL as the key column. The script uses the entries found - on each line of this column as keys. Duplicates are merged by applying - an aggregation function for each value in their records. - --d DELIM: Set the field delimiter to DELIM (default is tab). - --h NUM: The number of headers in the input file (default is 0). - --f FUNCTION: Set the aggregation function to FUNCTION (default is mean). The - possible values are: - - mean: The mean of the values (default) (-f mean) - - median: The median of the values. (-f median) - ---emptyval VALUE: Sets the "empty"/"no data" values to VALUE. (default is NaN) - If an output value has no input data, then this will be the output. - -EXAMPLE: - -Works like this: - -If the input file INPUTFILE has five columns like below (tab-delimited, although -spaces are shown here): (Note that Column 3 (C3) is blank for all rows except -for the last one) - -Column_1 Column_2 C3 C4 C5 -v v v v v --------------------------------- (Sample file is below) -Experiment_Alpha 1 0 77 -Experiment_Alpha 2 0 -Expr_Beta 10 -Expr_Beta 30 -Experiment_Alpha 3 6 -Expr_Beta 5 - - -And you type: aggregate.pl -f mean THE_FILE - -Then the output will be the *mean* values for each experiment, across all rows: - -Experiment_Alpha 2.0 NaN 3.0 77 -Expr_Beta 20 5 NaN NaN - -Note that the "77" case (the last item in the first row) is the corect mean, -because the other Experiment_Alpha items for that column do not have any data ta -all. Even though there are 3 rows labeled "Experiment_Alpha", only one of them -has data for the last column (column 4), so 77 is the mean. The output is always -a matrix (although it could be a single-column matrix). Empty values are padded -with NaN (although you can change this with --emptyval). - - -TO DO / FUTURE WORK: - -Future possibility (NOT IMPLEMENTED YET): smean: Standardized mean (mean/stddev). - -KNOWN BUGS / ISSUES: - -None so far.
--- a/matrix_manipulate/quartile_norm.pl Thu Jun 13 16:54:15 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,102 +0,0 @@ -#!/usr/bin/perl - -use strict; -use Getopt::Long; - -my $out = '-'; -my $q = 75; -my @col; -my @also; -my $names = 1; -my $target = 1000; -my $skip = 0; -my $min=1; -GetOptions("quant=i"=>\$q, "target=i"=>\$target, "col=i@"=>\@col, "out=s"=>\$out, "also=i@"=>\@also, "skip=i"=>\$skip, "min=i"=>\$min); - -my $in = shift @ARGV; - -die usage() unless $in && @col; - -open(OUT, ($out eq '-') ? '<&STDOUT' : ">$out") || die "Can't open $out\n"; -open(IN, ($in eq '-') ? '<&STDIN' : $in) || die "Can't open $in\n"; - -@also = (1) if !@also && !grep {$_ eq '1'} @col; - -map {$_--} @col; -map {$_--} @also; - -my @d; -my $cnt = 0; -my $head =''; -while(<IN>) { - if ($skip) { - --$skip; - $head .= $_; - next; - } - chomp; - my @f = split /\t/; - if ($col[0] eq '-2') { - @col = (1..$#f); - } - for (@col) { - push @{$d[$_]}, $f[$_]; - } - for (@also) { - push @{$d[$_]}, $f[$_]; - } - ++$cnt; -} -for (@col) { - my @t = grep {$_>=$min} @{$d[$_]}; - @t = sort {$a <=> $b} @t; - my $t=quantile(\@t, $q/100); - for (@{$d[$_]}) { - $_= sprintf "%.4f", $target*$_/$t; - } -} - -my @out = (sort {$a <=> $b} (@col, @also)); - -print OUT $head; - -for (my $i=0;$i<$cnt;++$i) { - for my $j (@out) { - print OUT "\t" unless $j == $out[0]; - print OUT $d[$j][$i]; - } - print OUT "\n"; -} - - -sub usage { -<<EOF; -Usage: $0 -c COL [opts] FILE - -Returns an upper quartile normalization of data in column(s) COL -of file FILE. - -Col is 1-based, zeroes are ignores when calculating upper quartile - -Options: - -c|col COL normalize this column of data (can specify more than once, or -1 for all but first col) - -q|quant INT quantile to use (75) - -t|target INT target to use (1000) - -a|also COL output these columns also - -o|out FILE output to this file instead of stdout - -m|min INT minimum value (1) - -s|skip INT skip header rows -EOF -} - -sub quantile { - my ($a,$p) = @_; - my $l = scalar(@{$a}); - my $t = ($l-1)*$p; - my $v=$a->[int($t)]; - if ($t > int($t)) { - return $v + $p * ($a->[int($t)+1] - $v); - } else { - return $v; - } -}