view docs/scripts/man1/ExtractFromTextFiles.1 @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
line wrap: on
line source

.\" Automatically generated by Pod::Man 2.25 (Pod::Simple 3.22)
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
.\" nothing in troff, for use with C<>.
.tr \(*W-
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
.ie \n(.g .ds Aq \(aq
.el       .ds Aq '
.\"
.\" If the F register is turned on, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.ie \nF \{\
.    de IX
.    tm Index:\\$1\t\\n%\t"\\$2"
..
.    nr % 0
.    rr F
.\}
.el \{\
.    de IX
..
.\}
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "EXTRACTFROMTEXTFILES 1"
.TH EXTRACTFROMTEXTFILES 1 "2015-03-29" "perl v5.14.2" "MayaChemTools"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "NAME"
ExtractFromTextFiles.pl \- Extract specific data from TextFile(s)
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
ExtractFromTextFiles.pl TextFile(s)...
.PP
ExtractFromTextFiles.pl [\fB\-c, \-\-colmode\fR colnum | collabel] [\fB\-\-categorycol \fR number | string]
[\fB\-\-columns\fR \*(L"colnum,[colnum]...\*(R" | \*(L"collabel,[collabel]...\*(R"] [\fB\-h, \-\-help\fR]
[\fB\-\-indelim\fR \fIcomma | semicolon\fR] [\fB\-m, \-\-mode \fR \fIcolumns | rows | categories\fR]
[\fB\-o, \-\-overwrite\fR] [\fB\-\-outdelim\fR \fIcomma | tab | semicolon\fR] [\fB\-q, \-\-quote\fR \fIyes | no\fR]
[\fB\-\-rows\fR \*(L"colid,value,criteria...\*(R" | \*(L"colid,value...\*(R" | \*(L"colid,mincolvalue,maxcolvalue\*(R" | \*(L"rownum,rownum,...\*(R" | colid | \*(L"minrownum,maxrownum\*(R"]
[ \fB\-\-rowsmode\fR rowsbycolvalue | rowsbycolvaluelist | rowsbycolvaluerange | rowbymincolvalue | rowbymaxcolvalue | rownums | rownumrange]
[\fB\-r, \-\-root\fR \fIrootname\fR] [\fB\-w, \-\-workingdir\fR \fIdirname\fR] TextFile(s)...
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
Extract column(s)/row(s) data from \fITextFile(s)\fR identified by column numbers or labels. Or categorize
data using a specified column category. During categorization, a summary text file is
generated containing category name and count; an additional text file, containing data
for each category, is also generated. The file names are separated by spaces. The
valid file extensions are \fI.csv\fR and \fI.tsv\fR for comma/semicolon and tab delimited
text files respectively. All other file names are ignored. All the text files in the
current directory can be specified by \fI*.csv\fR, \fI*.tsv\fR, or the current directory
name. The \fB\-\-indelim\fR option determines the format of \fITextFile(s)\fR. Any file
which doesn't correspond to the format indicated by \fB\-\-indelim\fR option is ignored.
.SH "OPTIONS"
.IX Header "OPTIONS"
.IP "\fB\-c, \-\-colmode\fR \fIcolnum | collabel\fR" 4
.IX Item "-c, --colmode colnum | collabel"
Specify how columns are identified in \fITextFile(s)\fR: using column number or column
label. Possible values: \fIcolnum or collabel\fR. Default value: \fIcolnum\fR.
.IP "\fB\-\-categorycol \fR \fInumber | string\fR" 4
.IX Item "--categorycol  number | string"
Column used to categorize data. Default value: First column.
.Sp
For \fIcolnum\fR value of \fB\-c, \-\-colmode\fR option, input value is a column number.
Example: \fI1\fR.
.Sp
For \fIcollabel\fR value of \fB\-c, \-\-colmode\fR option, input value is a column label.
Example: \fIMol_ID\fR.
.ie n .IP "\fB\-\-columns\fR \fI""colnum,[colnum]..."" | ""collabel,[collabel]...""\fR" 4
.el .IP "\fB\-\-columns\fR \fI``colnum,[colnum]...'' | ``collabel,[collabel]...''\fR" 4
.IX Item "--columns colnum,[colnum]... | collabel,[collabel]..."
List of comma delimited columns to extract. Default value: First column.
.Sp
For \fIcolnum\fR value of \fB\-c, \-\-colmode\fR option, input values format is:
\&\fIcolnum,colnum,...\fR. Example: \fI1,3,5\fR.
.Sp
For \fIcollabel\fR value of \fB\-c, \-\-colmode\fR option, input values format is:
\&\fIcollabel,collabel,...\fR. Example: \fIMol_ID,MolWeight\fR.
.IP "\fB\-h, \-\-help\fR" 4
.IX Item "-h, --help"
Print this help message.
.IP "\fB\-\-indelim\fR \fIcomma | semicolon\fR" 4
.IX Item "--indelim comma | semicolon"
Input delimiter for \s-1CSV\s0 \fITextFile(s)\fR. Possible values: \fIcomma or semicolon\fR.
Default value: \fIcomma\fR. For \s-1TSV\s0 files, this option is ignored and \fItab\fR is used as a
delimiter.
.IP "\fB\-m, \-\-mode \fR \fIcolumns | rows | categories\fR" 4
.IX Item "-m, --mode  columns | rows | categories"
Specify what to extract from \fITextFile(s)\fR. Possible values: \fIcolumns, rows,
or categories\fR. Default value: \fIcolumns\fR.
.Sp
For \fIcolumns\fR mode, data for appropriate columns specified by \fB\-\-columns\fR option
is extracted from \fITextFile(s)\fR and placed into new text files.
.Sp
For \fIrows\fR mode, appropriate rows specified in conjunction with \fB\-\-rowsmode\fR and
\&\fB\-\-rows\fR options are extracted from \fITextFile(s)\fR and placed into new text files.
.Sp
For \fIcategories\fR mode, the column specified by \fB\-\-categorycol\fR is
used to categorize data, and a summary text file is generated
containing category name and count; an additional text file, containing data
for each category, is also generated.
.IP "\fB\-o, \-\-overwrite\fR" 4
.IX Item "-o, --overwrite"
Overwrite existing files.
.IP "\fB\-\-outdelim\fR \fIcomma | tab | semicolon\fR" 4
.IX Item "--outdelim comma | tab | semicolon"
Output text file delimiter. Possible values: \fIcomma, tab, or semicolon\fR.
Default value: \fIcomma\fR.
.IP "\fB\-q, \-\-quote\fR \fIyes | no\fR" 4
.IX Item "-q, --quote yes | no"
Put quotes around column values in output text file. Possible values: \fIyes or
no\fR. Default value: \fIyes\fR.
.IP "\fB\-r, \-\-root\fR \fIrootname\fR" 4
.IX Item "-r, --root rootname"
New file name is generated using the root: <Root>.<Ext>. Default for new file
names: <TextFile>CategoriesSummary.<Ext>, <TextFile>ExtractedColumns.<Ext>, and
<TextFile>ExtractedRows.<Ext> for \fIcategories\fR, \fIcolumns\fR, and \fIrows\fR mode
respectively. And <TextFile>Category<CategoryName>.<Ext>
for each category retrieved from each text file. The output file type determines <Ext>
value: csv and tsv for \s-1CSV\s0, and \s-1TSV\s0 files respectively.
.Sp
This option is ignored for multiple input files.
.ie n .IP "\fB\-\-rows\fR \fI""colid,value,criteria..."" | ""colid,value..."" | ""colid,mincolvalue,maxcolvalue"" | ""rownum,rownum,..."" | colid | ""minrownum,maxrownum""\fR" 4
.el .IP "\fB\-\-rows\fR \fI``colid,value,criteria...'' | ``colid,value...'' | ``colid,mincolvalue,maxcolvalue'' | ``rownum,rownum,...'' | colid | ``minrownum,maxrownum''\fR" 4
.IX Item "--rows colid,value,criteria... | colid,value... | colid,mincolvalue,maxcolvalue | rownum,rownum,... | colid | minrownum,maxrownum"
This value is \fB\-\-rowsmode\fR specific. In general, it's a list of comma separated column ids and
associated mode specific values. Whether column ids are specified as column labels or column
numbers is controlled by \fB\-c, \-\-colmode\fR option.
.Sp
First line containing column labels is always written out. And value comparisons assume
numerical column data.
.Sp
For \fIrowsbycolvalue\fR mode, input value format contains these triplets:
\&\fIcolid,value, criteria...\fR. Possible values for criteria: \fIle, ge or eq\fR.
Examples:
.Sp
.Vb 2
\&    MolWt,450,le
\&    MolWt,450,le,LogP,5,le,SumNumNO,10,le,SumNHOH,5,le
.Ve
.Sp
For \fIrowsbycolvaluelist\fR mode, input value format is: \fIcolid,value...\fR. Examples:
.Sp
.Vb 2
\&    Mol_ID,20
\&    Mol_ID,20,1002,1115
.Ve
.Sp
For \fIrowsbycolvaluerange\fR mode, input value format is: \fIcolid,mincolvalue,maxcolvalue\fR. Examples:
.Sp
.Vb 1
\&    MolWt,100,450
.Ve
.Sp
For \fIrowbymincolvalue, rowbymaxcolvalue\fR modes, input value format is: \fIcolid\fR.
.Sp
For \fIrownums\fR mode, input value format is: \fIrownum,rownum,...\fR. Default value: \fI2\fR.
.Sp
For \fIrownumrange\fR mode, input value format is: \fIminrownum, maxrownum\fR. Examples:
.Sp
.Vb 1
\&    10,40
.Ve
.IP "\fB\-\-rowsmode\fR \fIrowsbycolvalue | rowsbycolvaluelist | rowsbycolvaluerange | rowbymincolvalue | rowbymaxcolvalue | rownums | rownumrange\fR" 4
.IX Item "--rowsmode rowsbycolvalue | rowsbycolvaluelist | rowsbycolvaluerange | rowbymincolvalue | rowbymaxcolvalue | rownums | rownumrange"
Specify how to extract rows from \fITextFile(s)\fR. Possible values: \fIrowsbycolvalue, rowsbycolvaluelist, rowsbycolvaluerange,
rowbymincolvalue, rowbymaxcolvalue, rownums, rownumrange\fR. Default value: \fIrownums\fR.
.Sp
Use \fB\-\-rows\fR option to list the row criteria used for extraction of rows from
\&\fITextFile(s)\fR.
.IP "\fB\-w, \-\-workingdir\fR \fIdirname\fR" 4
.IX Item "-w, --workingdir dirname"
Location of working directory. Default: current directory.
.SH "EXAMPLES"
.IX Header "EXAMPLES"
To extract first column from a text file and generate a new \s-1CSV\s0 text file NewSample1.csv,
type:
.PP
.Vb 1
\&    % ExtractFromTextFiles.pl \-r NewSample1 \-o Sample1.csv
.Ve
.PP
To extract columns Mol_ID, MolWeight, and \s-1NAME\s0 from Sample1.csv and generate a new
textfile NewSample1.tsv with no quotes, type:
.PP
.Vb 3
\&    % ExtractFromTextFiles.pl \-m columns \-c collabel \-\-columns "Mol_ID,
\&      MolWeight,NAME" \-\-outdelim tab \-\-quote no \-r NewSample1
\&      \-o Sample1.csv
.Ve
.PP
To extract rows containing values for MolWeight column of less than or equal to 450 from
Sample1.csv and generate a new textfile NewSample1.csv, type:
.PP
.Vb 3
\&    % ExtractFromTextFiles.pl \-m rows \-\-rowsmode rowsbycolvalue
\&      \-c collabel \-\-rows MolWeight,450,le \-r NewSample1
\&      \-o Sample1.csv
.Ve
.PP
To extract rows containing values for MolWeight column between 450 and 500 from
Sample1.csv and generate a new textfile NewSample1.csv, type:
.PP
.Vb 3
\&    % ExtractFromTextFiles.pl \-m rows \-\-rowsmode rowsbycolvaluerange
\&      \-c collabel \-\-rows MolWeight,450,500 \-r NewSample1
\&      \-o Sample1.csv
.Ve
.PP
To extract a row containing minimum value for column MolWeight from Sample1.csv and generate
a new textfile NewSample1.csv, type:
.PP
.Vb 3
\&    % ExtractFromTextFiles.pl \-m rows \-\-rowsmode rowbymincolvalue
\&      \-c collabel \-\-rows MolWeight \-r NewSample1
\&      \-o Sample1.csv
.Ve
.SH "AUTHOR"
.IX Header "AUTHOR"
Manish Sud <msud@san.rr.com>
.SH "SEE ALSO"
.IX Header "SEE ALSO"
JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl
.SH "COPYRIGHT"
.IX Header "COPYRIGHT"
Copyright (C) 2015 Manish Sud. All rights reserved.
.PP
This file is part of MayaChemTools.
.PP
MayaChemTools is free software; you can redistribute it and/or modify it under
the terms of the \s-1GNU\s0 Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.