comparison docs/scripts/man1/AnalyzeSDFilesData.1 @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 .\" Automatically generated by Pod::Man 2.25 (Pod::Simple 3.22)
2 .\"
3 .\" Standard preamble:
4 .\" ========================================================================
5 .de Sp \" Vertical space (when we can't use .PP)
6 .if t .sp .5v
7 .if n .sp
8 ..
9 .de Vb \" Begin verbatim text
10 .ft CW
11 .nf
12 .ne \\$1
13 ..
14 .de Ve \" End verbatim text
15 .ft R
16 .fi
17 ..
18 .\" Set up some character translations and predefined strings. \*(-- will
19 .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
20 .\" double quote, and \*(R" will give a right double quote. \*(C+ will
21 .\" give a nicer C++. Capital omega is used to do unbreakable dashes and
22 .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff,
23 .\" nothing in troff, for use with C<>.
24 .tr \(*W-
25 .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
26 .ie n \{\
27 . ds -- \(*W-
28 . ds PI pi
29 . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
30 . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
31 . ds L" ""
32 . ds R" ""
33 . ds C` ""
34 . ds C' ""
35 'br\}
36 .el\{\
37 . ds -- \|\(em\|
38 . ds PI \(*p
39 . ds L" ``
40 . ds R" ''
41 'br\}
42 .\"
43 .\" Escape single quotes in literal strings from groff's Unicode transform.
44 .ie \n(.g .ds Aq \(aq
45 .el .ds Aq '
46 .\"
47 .\" If the F register is turned on, we'll generate index entries on stderr for
48 .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
49 .\" entries marked with X<> in POD. Of course, you'll have to process the
50 .\" output yourself in some meaningful fashion.
51 .ie \nF \{\
52 . de IX
53 . tm Index:\\$1\t\\n%\t"\\$2"
54 ..
55 . nr % 0
56 . rr F
57 .\}
58 .el \{\
59 . de IX
60 ..
61 .\}
62 .\"
63 .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
64 .\" Fear. Run. Save yourself. No user-serviceable parts.
65 . \" fudge factors for nroff and troff
66 .if n \{\
67 . ds #H 0
68 . ds #V .8m
69 . ds #F .3m
70 . ds #[ \f1
71 . ds #] \fP
72 .\}
73 .if t \{\
74 . ds #H ((1u-(\\\\n(.fu%2u))*.13m)
75 . ds #V .6m
76 . ds #F 0
77 . ds #[ \&
78 . ds #] \&
79 .\}
80 . \" simple accents for nroff and troff
81 .if n \{\
82 . ds ' \&
83 . ds ` \&
84 . ds ^ \&
85 . ds , \&
86 . ds ~ ~
87 . ds /
88 .\}
89 .if t \{\
90 . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
91 . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
92 . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
93 . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
94 . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
95 . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
96 .\}
97 . \" troff and (daisy-wheel) nroff accents
98 .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
99 .ds 8 \h'\*(#H'\(*b\h'-\*(#H'
100 .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
101 .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
102 .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
103 .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
104 .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
105 .ds ae a\h'-(\w'a'u*4/10)'e
106 .ds Ae A\h'-(\w'A'u*4/10)'E
107 . \" corrections for vroff
108 .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
109 .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
110 . \" for low resolution devices (crt and lpr)
111 .if \n(.H>23 .if \n(.V>19 \
112 \{\
113 . ds : e
114 . ds 8 ss
115 . ds o a
116 . ds d- d\h'-1'\(ga
117 . ds D- D\h'-1'\(hy
118 . ds th \o'bp'
119 . ds Th \o'LP'
120 . ds ae ae
121 . ds Ae AE
122 .\}
123 .rm #[ #] #H #V #F C
124 .\" ========================================================================
125 .\"
126 .IX Title "ANALYZESDFILESDATA 1"
127 .TH ANALYZESDFILESDATA 1 "2015-03-29" "perl v5.14.2" "MayaChemTools"
128 .\" For nroff, turn off justification. Always turn off hyphenation; it makes
129 .\" way too many mistakes in technical documents.
130 .if n .ad l
131 .nh
132 .SH "NAME"
133 AnalyzeSDFilesData.pl \- Analyze numerical data field values in SDFile(s)
134 .SH "SYNOPSIS"
135 .IX Header "SYNOPSIS"
136 AnalyzeSDFilesData.pl SDFile(s)...
137 .PP
138 AnalyzeSDFilesData.pl [\fB\-\-datafields\fR \*(L"fieldlabel,[fieldlabel,...]\*(R" | All]
139 [\fB\-\-datafieldpairs\fR \*(L"fieldlabel,fieldlabel,[fieldlabel,fieldlabel,...]\*(R" | AllPairs] [\fB\-d, \-\-detail\fR infolevel]
140 [\fB\-f, \-\-fast\fR] [\fB\-\-frequencybins\fR number | \*(L"number,number,[number,...]\*(R"]
141 [\fB\-h, \-\-help\fR] [\fB\-\-klargest\fR number] [\fB\-\-ksmallest\fR number]
142 [\fB\-m, \-\-mode\fR DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | \*(L"function1, [function2,...]\*(R"]
143 [\fB\-\-trimfraction\fR number] [\fB\-w, \-\-workingdir\fR dirname] SDFile(s)...
144 .SH "DESCRIPTION"
145 .IX Header "DESCRIPTION"
146 Analyze numerical data field values in \fISDFile(s)\fR using a combination of various statistical
147 functions; non-numerical values are simply ignored. For \fICorrelation, RSquare, and
148 Covariance\fR analysis, the count of valid values in specified data field pairs must be the same;
149 otherwise, the data field pair is ignored. The file names are separated by spaces. The valid file
150 extensions are \fI.sdf\fR and \fI.sd\fR. All other file names are ignored. All the \s-1SD\s0 files in the
151 current directory can be specified either by \fI*.sdf\fR or the current directory name.
152 .SH "OPTIONS"
153 .IX Header "OPTIONS"
154 .ie n .IP "\fB\-\-datafields\fR \fI""fieldlabel,[fieldlabel,...]"" | Common | All\fR" 4
155 .el .IP "\fB\-\-datafields\fR \fI``fieldlabel,[fieldlabel,...]'' | Common | All\fR" 4
156 .IX Item "--datafields fieldlabel,[fieldlabel,...] | Common | All"
157 Data fields to use for analysis. Possible values: list of comma separated data field
158 labels, data fields common to all records, or all data fields. Default value: \fICommon\fR.
159 Examples:
160 .Sp
161 .Vb 2
162 \& ALogP,MolWeight,EC50
163 \& "MolWeight,PSA"
164 .Ve
165 .ie n .IP "\fB\-\-datafieldpairs\fR \fI""fieldlabel,fieldlabel,[fieldlabel,fieldlabel,...]"" | CommonPairs | AllPairs\fR" 4
166 .el .IP "\fB\-\-datafieldpairs\fR \fI``fieldlabel,fieldlabel,[fieldlabel,fieldlabel,...]'' | CommonPairs | AllPairs\fR" 4
167 .IX Item "--datafieldpairs fieldlabel,fieldlabel,[fieldlabel,fieldlabel,...] | CommonPairs | AllPairs"
168 This value is mode specific and is only used for \fICorrelation, PearsonCorrelation, or
169 Covariance\fR value of \fB\-m, \-\-mode\fR option. It specifies data field label pairs to use
170 for data analysis during \fICorrelation\fR and \fICovariance\fR calculations. Possible values:
171 comma delimited list of data field label pairs, data field label pairs common to all records,
172 or all data field pairs. Default value: \fICommonPairs\fR. Example:
173 .Sp
174 .Vb 1
175 \& MolWeight,EC50,NumN+O,PSA
176 .Ve
177 .Sp
178 For \fIAllPairs\fR value of \fB\-\-datafieldpairs\fR option, all data field label pairs are used for
179 \&\fICorrelation\fR and \fICovariance\fR calculations.
180 .IP "\fB\-d, \-\-detail\fR \fIinfolevel\fR" 4
181 .IX Item "-d, --detail infolevel"
182 Level of information to print about column values being ignored. Default: \fI0\fR. Possible values:
183 0, 1, 2, 3, or 4.
184 .IP "\fB\-f, \-\-fast\fR" 4
185 .IX Item "-f, --fast"
186 In this mode, all the data field values specified for analysis are assumed to contain numerical
187 data and no checking is performed before analysis. By default, only numerical data is
188 used for analysis; other types of column data are ignored.
189 .ie n .IP "\fB\-\-frequencybins\fR \fInumber | ""number,number,[number,...]""\fR" 4
190 .el .IP "\fB\-\-frequencybins\fR \fInumber | ``number,number,[number,...]''\fR" 4
191 .IX Item "--frequencybins number | number,number,[number,...]"
192 Specify number of bins or bin range to use for frequency analysis. Default value: \fI10\fR.
193 .Sp
194 Number of bins value along with the smallest and largest value for a column is used to
195 group the column values into different groups.
196 .Sp
197 The bin range list is used to group values for a column into different groups; it must contain
198 values in ascending order. Examples:
199 .Sp
200 .Vb 2
201 \& 10,20,30
202 \& 0.1,0.2,0.3,0.4,0.5
203 .Ve
204 .Sp
205 The frequency value calculated for a specific bin corresponds to all the column values
206 which are greater than the previous bin value and less than or equal to the current bin value.
207 .IP "\fB\-h, \-\-help\fR" 4
208 .IX Item "-h, --help"
209 Print this help message.
210 .IP "\fB\-\-klargest\fR \fInumber\fR" 4
211 .IX Item "--klargest number"
212 Kth largest value to find by \fIKLargest\fR function. Default value: \fI2\fR. Valid values: positive
213 integers.
214 .IP "\fB\-\-ksmallest\fR \fInumber\fR" 4
215 .IX Item "--ksmallest number"
216 Kth smallest value to find by \fIKSmallest\fR function. Default value: \fI2\fR. Valid values: positive
217 integers.
218 .ie n .IP "\fB\-m, \-\-mode\fR \fIDescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | ""function1, [function2,...]""\fR" 4
219 .el .IP "\fB\-m, \-\-mode\fR \fIDescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | ``function1, [function2,...]''\fR" 4
220 .IX Item "-m, --mode DescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | function1, [function2,...]"
221 Specify how to analyze data in SDFile(s): calculate basic or all descriptive statistics; or
222 use a comma delimited list of supported statistical functions. Possible values:
223 \&\fIDescriptiveStatisticsBasic | DescriptiveStatisticsAll | All | \*(L"function1,[function2]...\*(R"\fR. Default
224 value: \fIDescriptiveStatisticsBasic\fR.
225 .Sp
226 \&\fIDescriptiveStatisticsBasic\fR includes these functions: \fICount, Maximum, Minimum, Mean,
227 Median, Sum, StandardDeviation, StandardError, Variance\fR.
228 .Sp
229 \&\fIDescriptiveStatisticsAll\fR, in addition to \fIDescriptiveStatisticsBasic\fR functions, includes:
230 \&\fIGeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis, Mode, RSquare,
231 Skewness, TrimMean\fR.
232 .Sp
233 \&\fIAll\fR uses complete list of supported functions: \fIAverage, AverageDeviation, Correlation,
234 Count, Covariance, GeometricMean, Frequency, HarmonicMean, KLargest, KSmallest, Kurtosis,
235 Maximum, Minimum, Mean, Median, Mode, RSquare, Skewness, Sum,
236 SumOfSquares, StandardDeviation, StandardDeviationN, StandardError, StandardScores,
237 StandardScoresN, TrimMean, Variance, VarianceN\fR. The function names ending with N
238 calculate corresponding values assuming an entire population instead of a population sample.
239 Here are the formulas for these functions:
240 .Sp
241 Average: See Mean
242 .Sp
243 AverageDeviation: \s-1SUM\s0( \s-1ABS\s0(x[i] \- Xmean) ) / n
244 .Sp
245 Correlation: See Pearson Correlation
246 .Sp
247 Covariance: \s-1SUM\s0( (x[i] \- Xmean)(y[i] \- Ymean) ) / n
248 .Sp
249 GeometricMean: NthROOT( \s-1PRODUCT\s0(x[i]) )
250 .Sp
251 HarmonicMean: 1 / ( \s-1SUM\s0(1/x[i]) / n )
252 .Sp
253 Mean: \s-1SUM\s0( x[i] ) / n
254 .Sp
255 Median: Xsorted[(n \- 1)/2 + 1] for odd values of n; (Xsorted[n/2] + Xsorted[n/2 + 1])/2
256 for even values of n.
257 .Sp
258 Kurtosis: [ {n(n + 1)/(n \- 1)(n \- 2)(n \- 3)} SUM{ ((x[i] \- Xmean)/STDDEV)^4 } ] \-
259 {3((n \- 1)^2)}/{(n \- 2)(n\-3)}
260 .Sp
261 PearsonCorrelation: \s-1SUM\s0( (x[i] \- Xmean)(y[i] \- Ymean) ) / \s-1SQRT\s0( \s-1SUM\s0( (x[i] \- Xmean)^2 )
262 (\s-1SUM\s0( (y[i] \- Ymean)^2 )) )
263 .Sp
264 RSquare: PearsonCorrelation^2
265 .Sp
266 Skewness: {n/(n \- 1)(n \- 2)} SUM{ ((x[i] \- Xmean)/STDDEV)^3 }
267 .Sp
268 StandardDeviation: \s-1SQRT\s0 ( \s-1SUM\s0( (x[i] \- Mean)^2 ) / (n \- 1) )
269 .Sp
270 StandardDeviationN: \s-1SQRT\s0 ( \s-1SUM\s0( (x[i] \- Mean)^2 ) / n )
271 .Sp
272 StandardError: StandardDeviation / \s-1SQRT\s0( n )
273 .Sp
274 StandardScores: (x[i] \- Mean) / StandardDeviation
275 .Sp
276 StandardScoresN: (x[i] \- Mean) / StandardDeviationN
277 .Sp
278 Variance: \s-1SUM\s0( (x[i] \- Xmean)^2 / (n \- 1) )
279 .Sp
280 VarianceN: \s-1SUM\s0( (x[i] \- Xmean)^2 / n )
281 .IP "\fB\-o, \-\-overwrite\fR" 4
282 .IX Item "-o, --overwrite"
283 Overwrite existing files.
284 .IP "\fB\-\-outdelim\fR \fIcomma | tab | semicolon\fR" 4
285 .IX Item "--outdelim comma | tab | semicolon"
286 Output text file delimiter. Possible values: \fIcomma, tab, or semicolon\fR.
287 Default value: \fIcomma\fR.
288 .IP "\fB\-p, \-\-precision\fR \fInumber\fR" 4
289 .IX Item "-p, --precision number"
290 Precision of calculated values in the output file. Default: up to \fI2\fR decimal places.
291 Valid values: positive integers.
292 .IP "\fB\-q, \-\-quote\fR \fIyes | no\fR" 4
293 .IX Item "-q, --quote yes | no"
294 Put quotes around column values in output text file. Possible values: \fIyes or
295 no\fR. Default value: \fIyes\fR.
296 .IP "\fB\-r, \-\-root\fR \fIrootname\fR" 4
297 .IX Item "-r, --root rootname"
298 New text file name is generated using the root: <Root>.<Ext>. Default new file
299 name: <InitialSDFileName><Mode>.<Ext>. Based on the specified analysis,
300 <Mode> corresponds to one of these values: DescriptiveStatisticsBasic,
301 DescriptiveStatisticsAll, AllStatistics, SpecifiedStatistics, Covariance, Correlation,
302 Frequency, or StandardScores. The csv, and tsv <Ext> values are used for
303 comma/semicolon, and tab delimited text files respectively. This option is ignored for
304 multiple input files.
305 .IP "\fB\-\-trimfraction\fR \fInumber\fR" 4
306 .IX Item "--trimfraction number"
307 Fraction of data to exclude from the top and bottom of the data set during
308 \&\fITrimMean\fR calculation. Default value: \fI0.1\fR. Valid values: > 0 and < 1.
309 .IP "\fB\-w, \-\-workingdir\fR \fItext\fR" 4
310 .IX Item "-w, --workingdir text"
311 Location of working directory. Default: current directory.
312 .SH "EXAMPLES"
313 .IX Header "EXAMPLES"
314 To calculate basic statistics for data in all common data fields and generate a
315 NewSample1DescriptiveStatisticsBasic.csv file, type:
316 .PP
317 .Vb 1
318 \& % AnalyzeSDFilesData.pl \-o \-r NewSample1 Sample1.sdf
319 .Ve
320 .PP
321 To calculate basic statistics for MolWeight data field and generate a
322 NewSample1DescriptiveStatisticsBasic.csv file, type:
323 .PP
324 .Vb 2
325 \& % AnalyzeSDFilesData.pl \-\-datafields MolWeight \-o \-r NewSample1
326 \& Sample1.sdf
327 .Ve
328 .PP
329 To calculate all available statistics for MolWeight data field and all data field pairs,
330 and generate NewSample1DescriptiveStatisticsAll.csv, NewSample1CorrelationMatrix.csv,
331 NewSample1CorrelationMatrix.csv, and NewSample1MolWeightFrequencyAnalysis.csv
332 files, type:
333 .PP
334 .Vb 2
335 \& % AnalyzeSDFilesData.pl \-m DescriptiveStatisticsAll \-\-datafields
336 \& MolWeight \-o \-\-datafieldpairs AllPairs \-r NewSample1 Sample1.sdf
337 .Ve
338 .PP
339 To compute frequency distribution of MolWeight data field into five bins and
340 generate NewSample1MolWeightFrequencyAnalysis.csv, type:
341 .PP
342 .Vb 2
343 \& % AnalyzeSDFilesData.pl \-m Frequency \-\-frequencybins 5 \-\-datafields
344 \& MolWeight \-o \-r NewSample1 Sample1.sdf
345 .Ve
346 .PP
347 To compute frequency distribution of data in MolWeight data field into specified bin range
348 values, and generate NewSample1MolWeightFrequencyAnalysis.csv, type:
349 .PP
350 .Vb 2
351 \& % AnalyzeSDFilesData.pl \-m Frequency \-\-frequencybins "100,200,400"
352 \& \-\-datafields MolWeight \-o \-r NewSample1 Sample1.sdf
353 .Ve
354 .PP
355 To calculate all available statistics for data in all data fields and pairs, type:
356 .PP
357 .Vb 2
358 \& % AnalyzeSDFilesData.pl \-m All \-\-datafields All \-\-datafieldpairs
359 \& AllPairs \-o \-r NewSample1 Sample1.sdf
360 .Ve
361 .SH "AUTHOR"
362 .IX Header "AUTHOR"
363 Manish Sud <msud@san.rr.com>
364 .SH "SEE ALSO"
365 .IX Header "SEE ALSO"
366 FilterSDFiles.pl, InfoSDFiles.pl, SplitSDFiles.pl, MergeTextFilesWithSD.pl
367 .SH "COPYRIGHT"
368 .IX Header "COPYRIGHT"
369 Copyright (C) 2015 Manish Sud. All rights reserved.
370 .PP
371 This file is part of MayaChemTools.
372 .PP
373 MayaChemTools is free software; you can redistribute it and/or modify it under
374 the terms of the \s-1GNU\s0 Lesser General Public License as published by the Free
375 Software Foundation; either version 3 of the License, or (at your option)
376 any later version.