view docs/scripts/man1/AnalyzeTextFilesData.1 @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
line wrap: on
line source

.\" Automatically generated by Pod::Man 2.25 (Pod::Simple 3.22)
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
.\" nothing in troff, for use with C<>.
.tr \(*W-
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
.ie \n(.g .ds Aq \(aq
.el       .ds Aq '
.\"
.\" If the F register is turned on, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.ie \nF \{\
.    de IX
.    tm Index:\\$1\t\\n%\t"\\$2"
..
.    nr % 0
.    rr F
.\}
.el \{\
.    de IX
..
.\}
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "ANALYZETEXTFILESDATA 1"
.TH ANALYZETEXTFILESDATA 1 "2015-03-29" "perl v5.14.2" "MayaChemTools"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "NAME"
AnalyzeTextFilesData.pl \- Analyze numerical column data in TextFile(s)
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
AnalyzeTextFilesData.pl TextFile(s)...
.PP
AnalyzeTextFilesData.pl [\fB\-c, \-\-colmode\fR colnum | collabel] [\fB\-\-columns\fR \*(L"colnum,[colnum,...]\*(R" | \*(L"collabel,[collabel,...]\*(R" | All]
[\fB\-\-columnpairs\fR \*(L"colnum,colnum,[colnum,colnum]...\*(R" | \*(L"collabel,collabel,[collabel,collabel]...\*(R" | AllPairs]
[\fB\-d, \-\-detail\fR infolevel] [\fB\-f, \-\-fast\fR] [\fB\-\-frequencybins\fR number | \*(L"number,number,[number,...]\*(R"] [\fB\-h, \-\-help\fR]
[\fB\-\-indelim\fR comma | semicolon] [\fB\-\-klargest\fR number] [\fB\-\-ksmallest\fR number]
[\fB\-m, \-\-mode\fR DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | \*(L"function1, [function2,...]\*(R"]
[\fB\-o, \-\-overwrite\fR] [\fB\-\-outdelim\fR comma | tab | semicolon] [\fB\-p, \-\-precision\fR number]
[\fB\-q, \-\-quote\fR yes | no] [\fB\-r, \-\-root\fR rootname] [\fB\-\-trimfraction\fR number] [\fB\-w, \-\-workingdir\fR dirname] TextFile(s)...
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
Analyze numerical column data in \fITextFile(s)\fR using a combination of various statistical
functions; non-numerical values are simply ignored. For \fICorrelation, RSquare, and Covariance\fR
analysis, the count of valid values in the specified column pair must be the same; otherwise, the column
pair is ignored. The file names are separated by space. The valid file extensions are \fI.csv\fR
and \fI.tsv\fR for comma/semicolon and tab delimited text files respectively. All other
file names are ignored. All the text files in the current directory can be specified by
\&\fI*.csv\fR, \fI*.tsv\fR, or the current directory name. The \fB\-\-indelim\fR option determines
the format of \fITextFile(s)\fR. Any file which doesn't correspond to the format indicated
by \fB\-\-indelim\fR option is ignored.
.SH "OPTIONS"
.IX Header "OPTIONS"
.IP "\fB\-c, \-\-colmode\fR \fIcolnum | collabel\fR" 4
.IX Item "-c, --colmode colnum | collabel"
Specify how columns are identified in TextFile(s): using column number or column
label. Possible values: \fIcolnum or collabel\fR. Default value: \fIcolnum\fR.
.ie n .IP "\fB\-\-columns\fR \fI""colnum,[colnum,...]"" | ""collabel,[collabel]..."" | All\fR" 4
.el .IP "\fB\-\-columns\fR \fI``colnum,[colnum,...]'' | ``collabel,[collabel]...'' | All\fR" 4
.IX Item "--columns colnum,[colnum,...] | collabel,[collabel]... | All"
This value is mode specific. It's a list of comma delimited columns to use
for data analysis. Default value: \fIFirst column\fR.
.Sp
This value is ignored during \fICorrelation/Pearson Correlation\fR and \fICovariance\fR
data analysis; the \fB\-\-columnpairs\fR option is used instead.
.Sp
For \fIcolnum\fR value of \fB\-c, \-\-colmode\fR option, input values format is:
\&\fIcolnum,colnum,...\fR. Example:
.Sp
.Vb 1
\&   1,3,5
.Ve
.Sp
For \fIcollabel\fR value of \fB\-c, \-\-colmode\fR option, input values format is:
\&\fIcollabel,collabel,...\fR. Example:
.Sp
.Vb 1
\&    ALogP,MolWeight,EC50
.Ve
.ie n .IP "\fB\-\-columnpairs\fR \fI""colnum,colnum,[colnum,colnum,...]"" | ""collabel,collabel,[collabel,collabel,...]"" | AllPairs\fR" 4
.el .IP "\fB\-\-columnpairs\fR \fI``colnum,colnum,[colnum,colnum,...]'' | ``collabel,collabel,[collabel,collabel,...]'' | AllPairs\fR" 4
.IX Item "--columnpairs colnum,colnum,[colnum,colnum,...] | collabel,collabel,[collabel,collabel,...] | AllPairs"
This value is mode specific and is only used for \fICorrelation, PearsonCorrelation, or
Covariance\fR value of \fB\-m, \-\-mode\fR option. It is a comma delimited list of column pairs
to use for data analysis during \fICorrelation\fR and \fICovariance\fR calculations. Default value:
\&\fIFirst column, Second column\fR.
.Sp
For \fIcolnum\fR value of \fB\-c, \-\-colmode\fR option, input values format is:
\&\fIcolnum,colnum,[colnum,colnum]...\fR. Example:
.Sp
.Vb 1
\&    1,3,5,6,1,6
.Ve
.Sp
For \fIcollabel\fR value of \fB\-c, \-\-colmode\fR option, input values format is:
\&\fIcollabel,collabel,[collabel,collabel]...\fR. Example:
.Sp
.Vb 1
\&    MolWeight,EC50,NumN+O,PSA
.Ve
.Sp
For \fIAllPairs\fR value of \fB\-\-columnpairs\fR option, all column pairs are used for \fICorrelation\fR
and \fICovariance\fR calculations.
.IP "\fB\-d, \-\-detail\fR \fIinfolevel\fR" 4
.IX Item "-d, --detail infolevel"
Level of information to print about column values being ignored. Default: \fI1\fR. Possible values:
1, 2, 3, or 4.
.IP "\fB\-f, \-\-fast\fR" 4
.IX Item "-f, --fast"
In this mode, all the columns specified for analysis are assumed to contain numerical
data and no checking is performed before analysis. By default, only numerical data is
used for analysis; other types of column data are ignored.
.ie n .IP "\fB\-\-frequencybins\fR \fInumber | ""number,number,[number,...]""\fR" 4
.el .IP "\fB\-\-frequencybins\fR \fInumber | ``number,number,[number,...]''\fR" 4
.IX Item "--frequencybins number | number,number,[number,...]"
Specify number of bins or bin range to use for frequency analysis. Default value: \fI10\fR.
.Sp
Number of bins value along with the smallest and largest value for a column is used to
group the column values into different groups.
.Sp
The bin range list is used to group values for a column into different groups; It must contain
values in ascending order. Examples:
.Sp
.Vb 2
\&    10,20,30
\&    0.1,0.2,0.3,0.4,0.5
.Ve
.Sp
The frequency value calculated for a specific bin corresponds to all the column values
which are greater than the previous bin value and less than or equal to the current bin value.
.IP "\fB\-h, \-\-help\fR" 4
.IX Item "-h, --help"
Print this help message.
.IP "\fB\-\-indelim\fR \fIcomma | semicolon\fR" 4
.IX Item "--indelim comma | semicolon"
Input delimiter for \s-1CSV\s0 \fITextFile(s)\fR. Possible values: \fIcomma or semicolon\fR.
Default value: \fIcomma\fR. For \s-1TSV\s0 files, this option is ignored and \fItab\fR is used as a
delimiter.
.IP "\fB\-\-klargest\fR \fInumber\fR" 4
.IX Item "--klargest number"
Kth largest value to find by \fIKLargest\fR function. Default value: \fI2\fR. Valid values: positive
integers.
.IP "\fB\-\-ksmallest\fR \fInumber\fR" 4
.IX Item "--ksmallest number"
Kth smallest value to find by \fIKSmallest\fR function. Default value: \fI2\fR. Valid values: positive
integers.
.ie n .IP "\fB\-m, \-\-mode\fR \fIDescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | ""function1, [function2,...]""\fR" 4
.el .IP "\fB\-m, \-\-mode\fR \fIDescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | ``function1, [function2,...]''\fR" 4
.IX Item "-m, --mode DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | function1, [function2,...]"
Specify how to analyze data in TextFile(s): calculate basic or all descriptive statistics; or
use a comma delimited list of supported statistical functions. Possible values:
\&\fIDescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | \*(L"function1,[function2]...\*(R"\fR. Default
value: \fIDescriptiveStatisticsBasic\fR.
.Sp
\&\fIDescriptiveStatisticsBasic\fR includes these functions: \fICount, Maximum, Minimum, Mean,
Median, Sum, StandardDeviation, StandardError, Variance\fR.
.Sp
\&\fIDescriptiveStatisticsAll\fR, in addition to  \fIDescriptiveStatisticsBasic\fR functions, includes:
\&\fIGeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis, Mode, RSquare,
Skewness, TrimMean\fR.
.Sp
\&\fIAll\fR uses complete list of supported functions: \fIAverage, AverageDeviation, Correlation,
Count, Covariance, GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis,
Maximum, Minimum, Mean, Median, Mode, RSquare, Skewness, Sum,
SumOfSquares, StandardDeviation, StandardDeviationN, StandardError, StandardScores,
StandardScoresN, TrimMean, Variance, VarianceN\fR. The function names ending with N
calculate corresponding values assuming an entire population instead of a population sample.
.Sp
Here are the formulas for these functions:
.Sp
Average: See Mean
.Sp
AverageDeviation: \s-1SUM\s0( \s-1ABS\s0(x[i] \- Xmean) ) / n
.Sp
Correlation: See Pearson Correlation
.Sp
Covariance: \s-1SUM\s0( (x[i] \- Xmean)(y[i] \- Ymean) ) / n
.Sp
GeometricMean: NthROOT( \s-1PRODUCT\s0(x[i]) )
.Sp
HarmonicMean: 1 / ( \s-1SUM\s0(1/x[i]) / n )
.Sp
Mean: \s-1SUM\s0( x[i] ) / n
.Sp
Median: Xsorted[(n \- 1)/2 + 1] for odd values of n; (Xsorted[n/2] + Xsorted[n/2 + 1])/2
for even values of n.
.Sp
Kurtosis: [ {n(n + 1)/(n \- 1)(n \- 2)(n \- 3)}  SUM{ ((x[i] \- Xmean)/STDDEV)^4 } ] \-
{3((n \- 1)^2)}/{(n \- 2)(n\-3)}
.Sp
PearsonCorrelation: \s-1SUM\s0( (x[i] \- Xmean)(y[i] \- Ymean) ) / \s-1SQRT\s0( \s-1SUM\s0( (x[i] \- Xmean)^2 )
(\s-1SUM\s0( (y[i] \- Ymean)^2 ))   )
.Sp
RSquare: PearsonCorrelation^2
.Sp
Skewness: {n/(n \- 1)(n \- 2)} SUM{ ((x[i] \- Xmean)/STDDEV)^3 }
.Sp
StandardDeviation: \s-1SQRT\s0 ( \s-1SUM\s0( (x[i] \- Mean)^2 ) / (n \- 1) )
.Sp
StandardDeviationN: \s-1SQRT\s0 ( \s-1SUM\s0( (x[i] \- Mean)^2 ) / n )
.Sp
StandardError: StandardDeviation / \s-1SQRT\s0( n )
.Sp
StandardScore: (x[i] \- Mean) / StandardDeviation
.Sp
StandardScoreN: (x[i] \- Mean) / StandardDeviationN
.Sp
Variance: \s-1SUM\s0( (x[i] \- Xmean)^2  / (n \- 1) )
.Sp
VarianceN: \s-1SUM\s0( (x[i] \- Xmean)^2  / n )
.IP "\fB\-o, \-\-overwrite\fR" 4
.IX Item "-o, --overwrite"
Overwrite existing files.
.IP "\fB\-\-outdelim\fR \fIcomma | tab | semicolon\fR" 4
.IX Item "--outdelim comma | tab | semicolon"
Output text file delimiter. Possible values: \fIcomma, tab, or semicolon\fR.
Default value: \fIcomma\fR.
.IP "\fB\-p, \-\-precision\fR \fInumber\fR" 4
.IX Item "-p, --precision number"
Precision of calculated values in the output file. Default: up to \fI2\fR decimal places.
Valid values: positive integers.
.IP "\fB\-q, \-\-quote\fR \fIyes | no\fR" 4
.IX Item "-q, --quote yes | no"
Put quotes around column values in output text file. Possible values: \fIyes or
no\fR. Default value: \fIyes\fR.
.IP "\fB\-r, \-\-root\fR \fIrootname\fR" 4
.IX Item "-r, --root rootname"
New text file name is generated using the root: <Root>.<Ext>. Default new file
name: <InitialTextFileName><Mode>.<Ext>. Based on the specified analysis,
<Mode> corresponds to one of these values: DescriptiveStatisticsBasic,
DescriptiveStatisticsAll, AllStatistics, SpecifiedStatistics, Covariance, Correlation,
Frequency, or StandardScores. The csv and tsv <Ext> values are used for
comma/semicolon and tab delimited text files, respectively. This option is ignored for
multiple input files.
.IP "\fB\-\-trimfraction\fR \fInumber\fR" 4
.IX Item "--trimfraction number"
Fraction of data to exclude from the top and bottom of the data set during
\&\fITrimMean\fR calculation. Default value: \fI0.1\fR. Valid values: > 0 and < 1.
.IP "\fB\-w, \-\-workingdir\fR \fIdirname\fR" 4
.IX Item "-w, --workingdir dirname"
Location of working directory. Default: current directory.
.SH "EXAMPLES"
.IX Header "EXAMPLES"
To calculate basic statistics for data in first column and generate a
NewSample1DescriptiveStatisticsBasic.csv file, type:
.PP
.Vb 1
\&    % AnalyzeTextFilesData.pl \-o \-r NewSample1 Sample1.csv
.Ve
.PP
To calculate basic statistics for data in third column and generate a
NewSample1DescriptiveStatisticsBasic.csv file, type:
.PP
.Vb 1
\&    % AnalyzeTextFilesData.pl \-\-columns 3 \-o \-r NewSample1 Sample1.csv
.Ve
.PP
To calculate basic statistics for data in MolWeight column and generate a
NewSample1DescriptiveStatisticsBasic.csv file, type:
.PP
.Vb 2
\&    % AnalyzeTextFilesData.pl \-\-colmode collabel \-\-columns MolWeight \-o
\&    \-r NewSample1 Sample1.csv
.Ve
.PP
To calculate all available statistics for data in third column and all column pairs,
and generate NewSample1DescriptiveStatisticsAll.csv, NewSample1CorrelationMatrix.csv,
NewSample1CorrelationMatrix.csv, and NewSample1MolWeightFrequencyAnalysis.csv files,
type:
.PP
.Vb 2
\&    % AnalyzeTextFilesData.pl \-m DescriptiveStatisticsAll \-\-columns 3 \-o
\&    \-\-columnpairs AllPairs \-r NewSample1 Sample1.csv
.Ve
.PP
To compute frequency distribution of data in third column into five bins and
generate NewSample1MolWeightFrequencyAnalysis.csv, type:
.PP
.Vb 2
\&    % AnalyzeTextFilesData.pl \-m Frequency \-\-frequencybins 5 \-\-columns 3
\&    \-o \-r NewSample1 Sample1.csv
.Ve
.PP
To compute frequency distribution of data in third column into specified bin range
values, and generate NewSample1MolWeightFrequencyAnalysis.csv, type:
.PP
.Vb 2
\&    % AnalyzeTextFilesData.pl \-m Frequency \-\-frequencybins "100,200,400"
\&    \-\-columns 3 \-o \-r NewSample1 Sample1.csv
.Ve
.PP
To calculate all available statistics for data in all columns and column pairs, type:
.PP
.Vb 2
\&    % AnalyzeTextFilesData.pl \-m All \-\-columns  All \-\-columnpairs
\&    AllPairs \-o \-r NewSample1 Sample1.csv
.Ve
.SH "AUTHOR"
.IX Header "AUTHOR"
Manish Sud <msud@san.rr.com>
.SH "SEE ALSO"
.IX Header "SEE ALSO"
JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl, TextFilesToHTML.pl
.SH "COPYRIGHT"
.IX Header "COPYRIGHT"
Copyright (C) 2015 Manish Sud. All rights reserved.
.PP
This file is part of MayaChemTools.
.PP
MayaChemTools is free software; you can redistribute it and/or modify it under
the terms of the \s-1GNU\s0 Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.