Mercurial > repos > deepakjadmin > mayatool3_test2
diff bin/ExtractFromTextFiles.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/ExtractFromTextFiles.pl Wed Jan 20 09:23:18 2016 -0500 @@ -0,0 +1,1157 @@ +#!/usr/bin/perl -w +# +# $RCSfile: ExtractFromTextFiles.pl,v $ +# $Date: 2015/02/28 20:46:19 $ +# $Revision: 1.42 $ +# +# Author: Manish Sud <msud@san.rr.com> +# +# Copyright (C) 2015 Manish Sud. All rights reserved. +# +# This file is part of MayaChemTools. +# +# MayaChemTools is free software; you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) any +# later version. +# +# MayaChemTools is distributed in the hope that it will be useful, but without +# any warranty; without even the implied warranty of merchantability of fitness +# for a particular purpose. See the GNU Lesser General Public License for more +# details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or +# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, +# Boston, MA, 02111-1307, USA. +# + +use strict; +use FindBin; use lib "$FindBin::Bin/../lib"; +use Getopt::Long; +use File::Basename; +use Text::ParseWords; +use FileHandle; +use Benchmark; +use FileUtil; +use TextUtil; + +my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); + +# Autoflush STDOUT +$| = 1; + +$StartTime = new Benchmark; + +# Starting message... +$ScriptName = basename $0; +print "\n$ScriptName:Starting...\n\n"; + +# Get the options and setup script... +SetupScriptUsage(); +if ($Options{help} || @ARGV < 1) { + die GetUsageFromPod("$FindBin::Bin/$ScriptName"); +} + +my(@TextFilesList); +@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv"); + +# Process options... +print "Processing options...\n"; +my(%OptionsInfo); +ProcessOptions(); + +# Collect column information for all the text files... +print "Checking input text file(s)...\n"; +my(%TextFilesInfo); +RetrieveTextFilesInfo(); +RetrieveColumnsAndRowsInfo(); + +# Generate output files... +my($FileIndex); +if (@TextFilesList > 1) { + print "\nProcessing text files...\n"; +} +for $FileIndex (0 .. $#TextFilesList) { + if ($TextFilesInfo{FileOkay}[$FileIndex]) { + print "\nProcessing file $TextFilesList[$FileIndex]...\n"; + ExtractFromTextFile($FileIndex); + } +} +print "\n$ScriptName:Done...\n\n"; + +$EndTime = new Benchmark; +$TotalTime = timediff ($EndTime, $StartTime); +print "Total time: ", timestr($TotalTime), "\n"; + +############################################################################### + +# Extract appropriate data from text file... +sub ExtractFromTextFile { + my($Index) = @_; + + if ($OptionsInfo{Mode} =~ /^categories$/i) { + ExtractCategoryData($Index); + } + elsif ($OptionsInfo{Mode} =~ /^rows$/i){ + ExtractRowsData($Index); + } + else { + ExtractColumnData($Index); + } +} + +# Geneate category files... +sub ExtractCategoryData { + my($Index) = @_; + my($TextFile, $CategoryCol, $NewTextFile, $InDelim, @ColLabels); + + $TextFile = $TextFilesList[$Index]; + + $NewTextFile = $TextFilesInfo{OutFile}[$Index]; + $CategoryCol = $TextFilesInfo{CategoryColNum}[$Index]; + $InDelim = $TextFilesInfo{InDelim}[$Index]; + @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]}; + + my($Line, @LineWords, $CategoryName, $CategoryCount, %CategoriesNameToCountMap, %CategoriesNameToLinesMap); + # Collect category data... + open TEXTFILE, "$TextFile" or die "Couldn't open $TextFile: $! \n"; + # Skip label line... + $_ = <TEXTFILE>; + + %CategoriesNameToCountMap = (); + %CategoriesNameToLinesMap = (); + + while ($Line = GetTextLine(\*TEXTFILE)) { + @LineWords = quotewords($InDelim, 0, $Line); + $CategoryName = ($CategoryCol <= @LineWords) ? $LineWords[$CategoryCol] : ""; + if (exists($CategoriesNameToCountMap{$CategoryName})) { + $CategoriesNameToCountMap{$CategoryName} += 1; + push @{$CategoriesNameToLinesMap{$CategoryName}}, $Line; + } + else { + $CategoriesNameToCountMap{$CategoryName} = 1; + @{$CategoriesNameToLinesMap{$CategoryName}} = (); + push @{$CategoriesNameToLinesMap{$CategoryName}}, $Line; + } + } + close TEXTFILE; + + # Setup file names for individual category files... + my(%CategoriesNameToFileHandleMap, %CategoriesNameToFileNameMap, $CategoryFile, $CategoryFileHandle); + + %CategoriesNameToFileHandleMap = (); + %CategoriesNameToFileNameMap = (); + + for $CategoryName (keys %CategoriesNameToCountMap) { + $CategoryFile = $TextFilesInfo{CategoryOutFileRoot}[$Index] . "$CategoryName" . ".$TextFilesInfo{OutFileExt}[$Index]";; + $CategoryFile =~ s/ //g; + $CategoryFileHandle = new FileHandle; + open $CategoryFileHandle, ">$CategoryFile" or die "Couldn't open $CategoryFile: $! \n"; + $CategoriesNameToFileNameMap{$CategoryName} = $CategoryFile; + $CategoriesNameToFileHandleMap{$CategoryName} = $CategoryFileHandle; + } + + # Write out summary file... + print "Generating file $NewTextFile...\n"; + open NEWTEXTFILE, ">$NewTextFile" or die "Couldn't open $NewTextFile: $! \n"; + + # Write out column labels... + @LineWords = ("Category","Count"); + $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + print NEWTEXTFILE "$Line\n"; + + # Write out the category names and count... + for $CategoryName (sort { lc($a) cmp lc($b) } keys %CategoriesNameToCountMap) { + $CategoryCount = $CategoriesNameToCountMap{$CategoryName}; + @LineWords = ("$CategoryName","$CategoryCount"); + $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + print NEWTEXTFILE "$Line\n"; + } + close NEWTEXTFILE; + + # Write out a file for each category... + my($ColLabelLine, $LineIndex); + + $ColLabelLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + print "\nGenerating text files for each category...\n"; + + for $CategoryName (sort { lc($a) cmp lc($b) } keys %CategoriesNameToCountMap) { + print "Generating file $CategoriesNameToFileNameMap{$CategoryName}...\n"; + $CategoryFileHandle = $CategoriesNameToFileHandleMap{$CategoryName}; + print $CategoryFileHandle "$ColLabelLine\n"; + for $LineIndex (0 .. $#{$CategoriesNameToLinesMap{$CategoryName}}) { + $Line = ${$CategoriesNameToLinesMap{$CategoryName}}[$LineIndex]; + @LineWords = quotewords($InDelim, 0, $Line); + $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + print $CategoryFileHandle "$Line\n"; + } + close $CategoryFileHandle; + } +} + +# Extract data for specific columns... +sub ExtractColumnData { + my($Index) = @_; + my($TextFile, @ColNumsToExtract, $NewTextFile, $InDelim); + + $TextFile = $TextFilesList[$Index]; + $NewTextFile =$TextFilesInfo{OutFile}[$Index]; + $InDelim = $TextFilesInfo{InDelim}[$Index]; + @ColNumsToExtract = @{$TextFilesInfo{ColNumsToExtract}[$Index]}; + + print "Generating file $NewTextFile...\n"; + open TEXTFILE, "$TextFile" or die "Couldn't open $TextFile: $! \n"; + open NEWTEXTFILE, ">$NewTextFile" or die "Couldn't open $NewTextFile: $! \n"; + + $_ = <TEXTFILE>; + # Write out column labels... + my($Line, @LineWords, @ColLabels, $ColLabelLine, @ColValues, $ColValuesLine, $ColNum, $ColValue); + @ColLabels = (); $ColLabelLine = ""; + for $ColNum (@ColNumsToExtract) { + push @ColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum]; + } + $ColLabelLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + print NEWTEXTFILE "$ColLabelLine\n"; + + while ($Line = GetTextLine(\*TEXTFILE)) { + @LineWords = quotewords($InDelim, 0, $Line); + @ColValues = (); $ColValuesLine = ""; + for $ColNum (@ColNumsToExtract) { + $ColValue = ""; + if ($ColNum < @LineWords) { + $ColValue = (defined $LineWords[$ColNum]) ? $LineWords[$ColNum] : ""; + } + push @ColValues, $ColValue; + } + $ColValuesLine = JoinWords(\@ColValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + print NEWTEXTFILE "$ColValuesLine\n"; + } + close NEWTEXTFILE; + close TEXTFILE; +} + +# Extract data for specific rows... +sub ExtractRowsData { + my($Index) = @_; + my($TextFile, $NewTextFile, $InDelim, $SpecifiedRowsMode); + + $TextFile = $TextFilesList[$Index]; + $NewTextFile =$TextFilesInfo{OutFile}[$Index]; + $InDelim = $TextFilesInfo{InDelim}[$Index]; + + $SpecifiedRowsMode = $OptionsInfo{SpecifiedRowsMode}; + + print "Generating file $NewTextFile...\n"; + open TEXTFILE, "$TextFile" or die "Couldn't open $TextFile: $! \n"; + open NEWTEXTFILE, ">$NewTextFile" or die "Couldn't open $NewTextFile: $! \n"; + + my($Line, $RowCount, @LineWords, @ColLabels); + + # Write out column labels... + $Line = <TEXTFILE>; + push @ColLabels, @{$TextFilesInfo{ColLabels}[$Index]}; + $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + print NEWTEXTFILE "$Line\n"; + + if ($SpecifiedRowsMode =~ /^rowsbycolvalue$/i) { + ExtractRowsByColValue($Index, \*TEXTFILE, \*NEWTEXTFILE); + } + elsif ($SpecifiedRowsMode =~ /^rowsbycolvaluelist$/i) { + ExtractRowsByColValueList($Index, \*TEXTFILE, \*NEWTEXTFILE); + } + elsif ($SpecifiedRowsMode =~ /^rowsbycolvaluerange$/i) { + ExtractRowsByColValueRange($Index, \*TEXTFILE, \*NEWTEXTFILE); + } + elsif ($SpecifiedRowsMode =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) { + ExtractRowByMinOrMaxColValue($Index, \*TEXTFILE, \*NEWTEXTFILE); + } + elsif ($SpecifiedRowsMode =~ /^rownums$/i) { + ExtractRowsByRowNums($Index, \*TEXTFILE, \*NEWTEXTFILE); + } + elsif ($SpecifiedRowsMode =~ /^rownumrange$/i) { + ExtractRowsByRowNumRange($Index, \*TEXTFILE, \*NEWTEXTFILE); + } + + close NEWTEXTFILE; + close TEXTFILE; +} + +# Extract rows by column value... +sub ExtractRowsByColValue { + my($Index, $TextFileRef, $NewTextFileRef) = @_; + my($Line, $ColNum, $ColValue, $Criterion, $Value, $ValueIndex, $InDelim, @LineWords); + + $InDelim = $TextFilesInfo{InDelim}[$Index]; + + LINE: while ($Line = GetTextLine($TextFileRef)) { + @LineWords = quotewords($InDelim, 0, $Line); + for ($ValueIndex = 0; $ValueIndex < @{$TextFilesInfo{RowValues}[$Index]}; $ValueIndex = $ValueIndex + 3) { + $ColNum = $TextFilesInfo{RowValues}[$Index][$ValueIndex]; + $ColValue = $TextFilesInfo{RowValues}[$Index][$ValueIndex + 1]; + $Criterion = $TextFilesInfo{RowValues}[$Index][$ValueIndex + 2]; + if ($ColNum > $#LineWords) { + next LINE; + } + $Value = $LineWords[$ColNum]; + if ($Criterion =~ /^le$/i) { + if ($Value > $ColValue) { + next LINE; + } + } + elsif ($Criterion =~ /^ge$/i) { + if ($Value < $ColValue) { + next LINE; + } + } + elsif ($Criterion =~ /^eq$/i) { + if ($Value ne $ColValue) { + next LINE; + } + } + } + # Write it out... + $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + print $NewTextFileRef "$Line\n"; + } +} +# Extract rows by column value list... +sub ExtractRowsByColValueList { + my($Index, $TextFileRef, $NewTextFileRef) = @_; + my($Line, $ColNum, $ColValue, $ValueIndex, $Value, $InDelim, %ColValueMap, @LineWords); + + $InDelim = $TextFilesInfo{InDelim}[$Index]; + $ColNum = $TextFilesInfo{RowValues}[$Index][0]; + + # Setup a col value map... + %ColValueMap = (); + for $ValueIndex (1 .. $#{$TextFilesInfo{RowValues}[$Index]}) { + $Value = $TextFilesInfo{RowValues}[$Index][$ValueIndex]; + $ColValueMap{$Value} = $Value; + } + + LINE: while ($Line = GetTextLine($TextFileRef)) { + @LineWords = quotewords($InDelim, 0, $Line); + if ($ColNum > $#LineWords) { + next LINE; + } + $ColValue = $LineWords[$ColNum]; + if (exists $ColValueMap{$ColValue}) { + $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + print $NewTextFileRef "$Line\n"; + } + } +} + +# Extract row by minimum column value... +sub ExtractRowByMinOrMaxColValue { + my($Index, $TextFileRef, $NewTextFileRef) = @_; + my($Line, $ColNum, $ColValue, $FirstValue, $ValueLine, $InDelim, @LineWords); + + $InDelim = $TextFilesInfo{InDelim}[$Index]; + $ColNum = $TextFilesInfo{RowValues}[$Index][0]; + + $ValueLine = ''; $ColValue = ''; $FirstValue = 1; + LINE: while ($Line = GetTextLine($TextFileRef)) { + @LineWords = quotewords($InDelim, 0, $Line); + if ($ColNum > $#LineWords) { + next LINE; + } + if ($FirstValue) { + $FirstValue = 0; + $ColValue = $LineWords[$ColNum]; + $ValueLine = $Line; + next LINE; + } + if ($OptionsInfo{SpecifiedRowsMode} =~ /^rowbymaxcolvalue$/i) { + if ($LineWords[$ColNum] > $ColValue) { + $ColValue = $LineWords[$ColNum]; + $ValueLine = $Line; + } + } + else { + if ($LineWords[$ColNum] < $ColValue) { + $ColValue = $LineWords[$ColNum]; + $ValueLine = $Line; + } + } + } + if ($ValueLine) { + @LineWords = quotewords($InDelim, 0, $ValueLine); + $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + print $NewTextFileRef "$Line\n"; + } +} + +# Extract rows by column value range... +sub ExtractRowsByColValueRange { + my($Index, $TextFileRef, $NewTextFileRef) = @_; + my($Line, $ColNum, $ColValue, $MinValue, $MaxValue, $InDelim, @LineWords); + + $InDelim = $TextFilesInfo{InDelim}[$Index]; + $ColNum = $TextFilesInfo{RowValues}[$Index][0]; + $MinValue = $TextFilesInfo{RowValues}[$Index][1]; + $MaxValue = $TextFilesInfo{RowValues}[$Index][2]; + + LINE: while ($Line = GetTextLine($TextFileRef)) { + @LineWords = quotewords($InDelim, 0, $Line); + if ($ColNum > $#LineWords) { + next LINE; + } + $ColValue = $LineWords[$ColNum]; + if ($ColValue >= $MinValue && $ColValue <= $MaxValue) { + $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + print $NewTextFileRef "$Line\n"; + } + } +} + +# Extract rows by row number range... +sub ExtractRowsByRowNumRange { + my($Index, $TextFileRef, $NewTextFileRef) = @_; + + my($Line, $MinRowNum, $MaxRowNum, $RowCount, $InDelim, @LineWords); + $InDelim = $TextFilesInfo{InDelim}[$Index]; + $MinRowNum = $TextFilesInfo{RowValues}[$Index][0]; + $MaxRowNum = $TextFilesInfo{RowValues}[$Index][1]; + + $RowCount = 1; + LINE: while ($Line = GetTextLine($TextFileRef)) { + $RowCount++; + if ($RowCount >= $MinRowNum && $RowCount <= $MaxRowNum) { + @LineWords = quotewords($InDelim, 0, $Line); + $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + print $NewTextFileRef "$Line\n"; + } + elsif ($RowCount > $MaxRowNum) { + last LINE; + } + } +} + +# Extract rows by row numbers... +sub ExtractRowsByRowNums { + my($Index, $TextFileRef, $NewTextFileRef) = @_; + my($Line, $RowNum, $MaxRowNum, $RowCount, $InDelim, %RowNumMap, @LineWords); + + $InDelim = $TextFilesInfo{InDelim}[$Index]; + + # Setup a row nums map... + %RowNumMap = (); + $MaxRowNum = $TextFilesInfo{RowValues}[$Index][0]; + for $RowNum (@{$TextFilesInfo{RowValues}[$Index]}) { + if ($RowNum > $MaxRowNum) { + $MaxRowNum = $RowNum; + } + $RowNumMap{$RowNum} = $RowNum; + } + + $RowCount = 1; + LINE: while ($Line = GetTextLine($TextFileRef)) { + $RowCount++; + if (exists $RowNumMap{$RowCount}) { + @LineWords = quotewords($InDelim, 0, $Line); + $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + print $NewTextFileRef "$Line\n"; + } + elsif ($RowCount > $MaxRowNum) { + last LINE; + } + } +} + +# Retrieve text file columns and rows information for specified options... +sub RetrieveColumnsAndRowsInfo { + ProcessColumnsInfo(); + ProcessRowsInfo(); +} + +# Make sure the specified columns exists in text files... +sub ProcessColumnsInfo { + my($Index, $SpecifiedCategoryCol, $TextFile, @ColNumsToExtract); + + @{$TextFilesInfo{CategoryColNum}} = (); + @{$TextFilesInfo{ColNumsToExtract}} = (); + + $SpecifiedCategoryCol = $OptionsInfo{SpecifiedCategoryCol}; + + FILELIST: for $Index (0 .. $#TextFilesList) { + $TextFile = $TextFilesList[$Index]; + + $TextFilesInfo{CategoryColNum}[$Index] = 0; + @{$TextFilesInfo{ColNumsToExtract}[$Index]} = (); + + if ($TextFilesInfo{FileOkay}[$Index]) { + if ($OptionsInfo{Mode} =~ /^categories$/i) { + my($CategoryColNum, $CategoryColValid); + + $CategoryColNum = 0; + $CategoryColValid = 1; + if ($SpecifiedCategoryCol) { + if ($OptionsInfo{ColMode} =~ /^colnum$/i) { + if ($SpecifiedCategoryCol <= $TextFilesInfo{ColCount}[$Index]) { + $CategoryColNum = $SpecifiedCategoryCol - 1; + } + else { + $CategoryColValid = 0; + } + } + else { + if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCategoryCol})) { + $CategoryColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCategoryCol}; + } + else { + $CategoryColValid = 0; + } + } + } + if ($CategoryColValid) { + $TextFilesInfo{CategoryColNum}[$Index] = $CategoryColNum; + } + else { + warn "Warning: Ignoring file $TextFile: Category column specified, $SpecifiedCategoryCol, using \"--categorycol\" option doesn't exist\n"; + $TextFilesInfo{FileOkay}[$Index] = 0; + } + } + elsif ($OptionsInfo{Mode} =~ /^columns$/i) { + my($SpecifiedColNum, $ColNum); + + $ColNum = 0; + @ColNumsToExtract = (); + + if (@{$OptionsInfo{SpecifiedColumns}}) { + if ($OptionsInfo{ColMode} =~ /^colnum$/i) { + for $SpecifiedColNum (@{$OptionsInfo{SpecifiedColumns}}) { + if ($SpecifiedColNum >=1 && $SpecifiedColNum <= $TextFilesInfo{ColCount}[$Index]) { + $ColNum = $SpecifiedColNum - 1; + push @ColNumsToExtract, $ColNum; + } + } + } + else { + my($ColLabel); + for $ColLabel (@{$OptionsInfo{SpecifiedColumns}}) { + if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) { + push @ColNumsToExtract, $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel}; + } + } + } + } + else { + push @ColNumsToExtract, $ColNum; + } + if (@ColNumsToExtract) { + push @{$TextFilesInfo{ColNumsToExtract}[$Index]}, @ColNumsToExtract; + } + else { + warn "Warning: Ignoring file $TextFile: None of the columns specified, @{$OptionsInfo{SpecifiedColumns}}, using \"--columns\" option exist\n"; + $TextFilesInfo{FileOkay}[$Index] = 0; + } + } + } + } +} + +# Process specified rows info... +sub ProcessRowsInfo { + my($Index, $TextFile, $ColID, $ColIDOkay, $Value, $Criterion, $ColNum, @RowValues); + + @{$TextFilesInfo{RowValues}} = (); + + FILELIST: for $Index (0 .. $#TextFilesList) { + $TextFile = $TextFilesList[$Index]; + @{$TextFilesInfo{RowValues}[$Index]} = (); + + if ($OptionsInfo{Mode} !~ /^rows$/i) { + next FILELIST; + } + if (!$TextFilesInfo{FileOkay}[$Index]) { + next FILELIST; + } + + @RowValues = (); + + if ($OptionsInfo{RowsMode} =~ /^rowsbycolvalue$/i) { + my($ValueIndex); + for ($ValueIndex = 0; $ValueIndex < @{$OptionsInfo{SpecifiedRowValues}}; $ValueIndex = $ValueIndex + 3) { + $ColID = $OptionsInfo{SpecifiedRowValues}[$ValueIndex]; + $Value = $OptionsInfo{SpecifiedRowValues}[$ValueIndex + 1]; + $Criterion = $OptionsInfo{SpecifiedRowValues}[$ValueIndex + 2]; + + $ColIDOkay = 0; + if ($OptionsInfo{ColMode} =~ /^collabel$/i) { + if (exists $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID}) { + $ColIDOkay = 1; + $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID}; + } + } + else { + if ($ColID >=1 && $ColID <= $TextFilesInfo{ColCount}[$Index]) { + $ColNum = $ColID - 1; + $ColIDOkay = 1; + } + } + if ($ColIDOkay) { + push @RowValues, ($ColNum, $Value, $Criterion); + } + } + } + elsif ($OptionsInfo{RowsMode} =~ /^(rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue)$/i) { + # Process coulumn id... + $ColID = $OptionsInfo{SpecifiedRowValues}[0]; + $ColIDOkay = 0; + + if ($OptionsInfo{ColMode} =~ /^collabel$/i) { + if (exists $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID}) { + $ColIDOkay = 1; + $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID}; + } + } + else { + if ($ColID >=1 && $ColID <= $TextFilesInfo{ColCount}[$Index]) { + $ColIDOkay = 1; + $ColNum = $ColID - 1; + } + } + if ($ColIDOkay) { + push @RowValues, $ColNum; + # Get rest of the specified values... + if (@{$OptionsInfo{SpecifiedRowValues}} > 1) { + for $Index (1 .. $#{$OptionsInfo{SpecifiedRowValues}}) { + push @RowValues, $OptionsInfo{SpecifiedRowValues}[$Index]; + } + } + } + } + elsif ($OptionsInfo{RowsMode} =~ /^(rownums|rownumrange)$/i) { + push @RowValues, @{$OptionsInfo{SpecifiedRowValues}}; + } + + if (@RowValues) { + push @{$TextFilesInfo{RowValues}[$Index]}, @RowValues; + } + else { + warn "Warning: Ignoring file $TextFile: Column specified, $ColID, using \"--rows\" option doesn't exist\n"; + $TextFilesInfo{FileOkay}[$Index] = 0; + } + } +} + +# Retrieve information about input text files... +sub RetrieveTextFilesInfo { + my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $OutFileRoot, $CategoryOutFileRoot, $OutFile, $ColNum, $ColLabel); + + %TextFilesInfo = (); + + @{$TextFilesInfo{FileOkay}} = (); + @{$TextFilesInfo{ColCount}} = (); + @{$TextFilesInfo{ColLabels}} = (); + @{$TextFilesInfo{ColLabelToNumMap}} = (); + @{$TextFilesInfo{InDelim}} = (); + @{$TextFilesInfo{OutFile}} = (); + @{$TextFilesInfo{OutFileExt}} = (); + @{$TextFilesInfo{CategoryOutFileRoot}} = (); + + FILELIST: for $Index (0 .. $#TextFilesList) { + $TextFile = $TextFilesList[$Index]; + + $TextFilesInfo{FileOkay}[$Index] = 0; + $TextFilesInfo{ColCount}[$Index] = 0; + $TextFilesInfo{InDelim}[$Index] = ""; + $TextFilesInfo{OutFile}[$Index] = ""; + $TextFilesInfo{OutFileExt}[$Index] = ""; + $TextFilesInfo{CategoryOutFileRoot}[$Index] = ""; + + @{$TextFilesInfo{ColLabels}[$Index]} = (); + %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = (); + + if (!(-e $TextFile)) { + warn "Warning: Ignoring file $TextFile: It doesn't exist\n"; + next FILELIST; + } + if (!CheckFileType($TextFile, "csv tsv")) { + warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n"; + next FILELIST; + } + + ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile); + if ($FileExt =~ /^tsv$/i) { + $InDelim = "\t"; + } + else { + $InDelim = "\,"; + if (!($OptionsInfo{InDelim} =~ /^(comma|semicolon)$/i)) { + warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n"; + next FILELIST; + } + if ($OptionsInfo{InDelim} =~ /^semicolon$/i) { + $InDelim = "\;"; + } + } + + if (!open TEXTFILE, "$TextFile") { + warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n"; + next FILELIST; + } + + $Line = GetTextLine(\*TEXTFILE); + @ColLabels = quotewords($InDelim, 0, $Line); + close TEXTFILE; + + $FileDir = ""; $FileName = ""; $FileExt = ""; + ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile); + $FileExt = "csv"; + if ($OptionsInfo{OutDelim} =~ /^tab$/i) { + $FileExt = "tsv"; + } + + if ($OptionsInfo{OutFileRoot} && (@TextFilesList == 1)) { + my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot}); + if ($RootFileName && $RootFileExt) { + $FileName = $RootFileName; + } + else { + $FileName = $OptionsInfo{OutFileRoot}; + } + $OutFileRoot .= $FileName; + } + else { + $OutFileRoot = $FileName; + $OutFileRoot .= ($OptionsInfo{Mode} =~ /^categories$/i) ? "CategoriesSummary" : (($OptionsInfo{Mode} =~ /^rows$/i) ? "ExtractedRows" : "ExtractedColumns"); + } + $CategoryOutFileRoot = "$FileName" . "Category"; + + $OutFile = $OutFileRoot . ".$FileExt"; + if (lc($OutFile) eq lc($TextFile)) { + warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n"; + next FILELIST; + } + + if (!$OptionsInfo{Overwrite}) { + if (-e $OutFile) { + warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n"; + next FILELIST; + } + } + + $TextFilesInfo{FileOkay}[$Index] = 1; + $TextFilesInfo{InDelim}[$Index] = $InDelim; + $TextFilesInfo{CategoryOutFileRoot}[$Index] = $CategoryOutFileRoot; + $TextFilesInfo{OutFile}[$Index] = "$OutFile"; + $TextFilesInfo{OutFileExt}[$Index] = "$FileExt"; + + $TextFilesInfo{ColCount}[$Index] = @ColLabels; + push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels; + + for $ColNum (0 .. $#ColLabels) { + $ColLabel = $ColLabels[$ColNum]; + $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum; + } + } +} + +# Process option values... +sub ProcessOptions { + my(@SpecifiedColumns, @SpecifiedRowValues); + + %OptionsInfo = (); + + $OptionsInfo{Mode} = $Options{mode}; + + $OptionsInfo{ColMode} = $Options{colmode}; + + $OptionsInfo{CategoryCol} = defined $Options{categorycol} ? $Options{categorycol} : undef; + $OptionsInfo{SpecifiedCategoryCol} = ""; + + if (defined $Options{categorycol}) { + my(@SpecifiedValues) = split ",", $Options{categorycol}; + if (@SpecifiedValues != 1) { + die "Error: Invalid number of values, ",scalar(@SpecifiedValues), " using \"--categorycol\" option: Only one value is allowed.\n"; + } + $OptionsInfo{SpecifiedCategoryCol} = $SpecifiedValues[0]; + if ($Options{colmode} =~ /^colnum$/i) { + if (!IsPositiveInteger($OptionsInfo{SpecifiedCategoryCol})) { + die "Error: Category column value, $OptionsInfo{SpecifiedCategoryCol}, specified using \"--categorycol\" is not valid. Allowed integer values: > 0.\n"; + } + } + } + + $OptionsInfo{Columns} = defined $Options{columns} ? $Options{columns} : undef; + @{$OptionsInfo{SpecifiedColumns}} = (); + @SpecifiedColumns = (); + + if (defined $Options{columns}) { + my(@SpecifiedValues) = split ",", $Options{columns}; + if ($Options{colmode} =~ /^colnum$/i) { + my($ColValue); + for $ColValue (@SpecifiedValues) { + if (!IsPositiveInteger($ColValue)) { + die "Error: Column value, $ColValue, specified using \"--columns\" is not valid: Allowed integer values: > 0.\n"; + } + } + } + push @SpecifiedColumns, @SpecifiedValues; + } + @{$OptionsInfo{SpecifiedColumns}} = @SpecifiedColumns; + + $OptionsInfo{InDelim} = $Options{indelim}; + + $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,"); + $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0; + $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef; + + $OptionsInfo{OutFileRoot} = defined $Options{root} ? $Options{root} : undef; + + # Process any specified rows values... + @SpecifiedRowValues = (); + @{$OptionsInfo{SpecifiedRowValues}} = (); + + $OptionsInfo{RowsMode} = $Options{rowsmode}; + $OptionsInfo{Rows} = defined $Options{rows} ? $Options{rows} : undef; + + $OptionsInfo{SpecifiedRowsMode} = $Options{rowsmode}; + + if (defined $Options{rows}) { + (@SpecifiedRowValues) = split ",", $Options{rows}; + } + else { + if ($Options{rowsmode} !~ /^rownums$/i) { + die "Error: Specify value for \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\n"; + } + push @SpecifiedRowValues, "1"; + } + @{$OptionsInfo{SpecifiedRowValues}} = @SpecifiedRowValues; + + my($SpecifiedColID, $SpecifiedRowID); + # Make sure specified values are okay... + if ($Options{rowsmode} =~ /^rowsbycolvalue$/i) { + if (@SpecifiedRowValues % 3) { + die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain triplets.\n"; + } + # Triplet format: colid,value,criteria. Criterion: le,ge,eq + my($Index, $ColID, $Criterion, $Value); + for ($Index = 0; $Index < @SpecifiedRowValues; $Index = $Index + 3) { + $ColID = $SpecifiedRowValues[$Index]; + $Value = $SpecifiedRowValues[$Index + 1]; + $Criterion = $SpecifiedRowValues[$Index + 2]; + if ($Options{colmode} =~ /^colnum$/i) { + if (!IsPositiveInteger($ColID)) { + die "Error: Invalid column id, $ColID, specified in triplet, \"$ColID,$Criterion,$Value\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n"; + } + } + if ($Criterion !~ /^(eq|le|ge)$/i) { + die "Error: Invalid criterion value, $Criterion, specified in triplet, \"$ColID,$Criterion,$Value\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed values: le, ge, or eq.\n"; + } + } + } + elsif ($Options{rowsmode} =~ /^rowsbycolvaluelist$/i) { + ($SpecifiedColID) = $SpecifiedRowValues[0]; + if ($Options{colmode} =~ /^colnum$/i) { + if (!IsPositiveInteger($SpecifiedColID)) { + die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n"; + } + } + if (@SpecifiedRowValues == 1) { + die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain more than one value\n"; + } + } + elsif ($Options{rowsmode} =~ /^rowsbycolvaluerange$/i) { + if (@SpecifiedRowValues != 3) { + die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain three values\n"; + } + ($SpecifiedColID) = $SpecifiedRowValues[0]; + if ($Options{colmode} =~ /^colnum$/i) { + if (!IsPositiveInteger($SpecifiedColID)) { + die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n"; + } + } + if ($SpecifiedRowValues[1] >= $SpecifiedRowValues[2]) { + die "Error: Invalid value triplet - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: second value < third value\n"; + } + } + elsif ($Options{rowsmode} =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) { + if (@SpecifiedRowValues != 1) { + die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nOnly one value is allowed.\n"; + } + ($SpecifiedColID) = $SpecifiedRowValues[0]; + if ($Options{colmode} =~ /^colnum$/i) { + if (!IsPositiveInteger($SpecifiedColID)) { + die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n"; + } + } + } + elsif ($Options{rowsmode} =~ /^rownums$/i) { + for $SpecifiedRowID (@SpecifiedRowValues) { + if (!IsPositiveInteger($SpecifiedRowID)) { + die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n"; + } + } + } + elsif ($Options{rowsmode} =~ /^rownumrange$/i) { + if (@SpecifiedRowValues != 2) { + die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain only two values.\n"; + } + for $SpecifiedRowID (@SpecifiedRowValues) { + if (!IsPositiveInteger($SpecifiedRowID)) { + die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n"; + } + } + if ($SpecifiedRowValues[0] >= $SpecifiedRowValues[1]) { + die "Error: Invalid value pair - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: First value < second value\n"; + } + } +} + +# Setup script usage and retrieve command line arguments specified using various options... +sub SetupScriptUsage { + + # Setup default and retrieve all the options... + %Options = (); + $Options{colmode} = "colnum"; + $Options{indelim} = "comma"; + $Options{mode} = "columns"; + $Options{outdelim} = "comma"; + $Options{quote} = "yes"; + $Options{rowsmode} = "rownums"; + + if (!GetOptions(\%Options, "categorycol=s", "columns=s", "colmode|c=s", "help|h", "indelim=s", "mode|m=s", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "rows=s", "rowsmode=s", "workingdir|w=s")) { + die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; + } + if ($Options{workingdir}) { + if (! -d $Options{workingdir}) { + die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; + } + chdir $Options{workingdir} || die "Error: Couldn't chdir $Options{workingdir}: $! \n"; + } + if ($Options{mode} !~ /^(columns|rows|categories)$/i) { + die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: columns, rows or categories \n"; + } + if ($Options{colmode} !~ /^(colnum|collabel)$/i) { + die "Error: The value specified, $Options{colmode}, for option \"--colmode\" is not valid. Allowed values: colnum or collabel \n"; + } + if ($Options{indelim} !~ /^(comma|semicolon)$/i) { + die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n"; + } + if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) { + die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n"; + } + if ($Options{quote} !~ /^(yes|no)$/i) { + die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n"; + } + if ($Options{rowsmode} !~ /^(rowsbycolvalue|rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue|rownums|rownumrange)$/i) { + die "Error: The value specified, $Options{rowsmode}, for option \"--rowsmode\" is not valid. Allowed values: rowsbycolvalue, rowsbycolvaluelist, rowsbycolvaluerange, rowbymincolvalue, rowbymaxcolvalue, rownum, rownumrange\n"; + } +} +__END__ + + +=head1 NAME + +ExtractFromTextFiles.pl - Extract specific data from TextFile(s) + +=head1 SYNOPSIS + +ExtractFromTextFiles.pl TextFile(s)... + +ExtractFromTextFiles.pl [B<-c, --colmode> colnum | collabel] [B<--categorycol > number | string] +[B<--columns> "colnum,[colnum]..." | "collabel,[collabel]..."] [B<-h, --help>] +[B<--indelim> I<comma | semicolon>] [B<-m, --mode > I<columns | rows | categories>] +[B<-o, --overwrite>] [B<--outdelim> I<comma | tab | semicolon>] [B<-q, --quote> I<yes | no>] +[B<--rows> "colid,value,criteria..." | "colid,value..." | "colid,mincolvalue,maxcolvalue" | "rownum,rownum,..." | colid | "minrownum,maxrownum"] +[ B<--rowsmode> rowsbycolvalue | rowsbycolvaluelist | rowsbycolvaluerange | rowbymincolvalue | rowbymaxcolvalue | rownums | rownumrange] +[B<-r, --root> I<rootname>] [B<-w, --workingdir> I<dirname>] TextFile(s)... + +=head1 DESCRIPTION + +Extract column(s)/row(s) data from I<TextFile(s)> identified by column numbers or labels. Or categorize +data using a specified column category. During categorization, a summary text file is +generated containing category name and count; an additional text file, containing data for +for each category, is also generated. The file names are separated by space. The +valid file extensions are I<.csv> and I<.tsv> for comma/semicolon and tab delimited +text files respectively. All other file names are ignored. All the text files in a +current directory can be specified by I<*.csv>, I<*.tsv>, or the current directory +name. The B<--indelim> option determines the format of I<TextFile(s)>. Any file +which doesn't correspond to the format indicated by B<--indelim> option is ignored. + +=head1 OPTIONS + +=over 4 + +=item B<-c, --colmode> I<colnum | collabel> + +Specify how columns are identified in I<TextFile(s)>: using column number or column +label. Possible values: I<colnum or collabel>. Default value: I<colnum>. + +=item B<--categorycol > I<number | string> + +Column used to categorize data. Default value: First column. + +For I<colnum> value of B<-c, --colmode> option, input value is a column number. +Example: I<1>. + +For I<collabel> value of B<-c, --colmode> option, input value is a column label. +Example: I<Mol_ID>. + +=item B<--columns> I<"colnum,[colnum]..." | "collabel,[collabel]..."> + +List of comma delimited columns to extract. Default value: First column. + +For I<colnum> value of B<-c, --colmode> option, input values format is: +I<colnum,colnum,...>. Example: I<1,3,5> + +For I<collabel> value of B<-c, --colmode> option, input values format is: +I<collabel,collabel,..>. Example: I<Mol_ID,MolWeight> + +=item B<-h, --help> + +Print this help message. + +=item B<--indelim> I<comma | semicolon> + +Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>. +Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a +delimiter. + +=item B<-m, --mode > I<columns | rows | categories> + +Specify what to extract from I<TextFile(s)>. Possible values: I<columns, rows, +or categories>. Default value: I<columns>. + +For I<columns> mode, data for appropriate columns specified by B<--columns> option +is extracted from I<TextFile(s)> and placed into new text files. + +For I<rows> mode, appropriate rows specified in conjuction with B<--rowsmode> and +B<rows> options are extracted from I<TextFile(s)> and placed into new text files. + +For I<categories> mode, coulmn specified by B<--categorycol> is +used to categorize data, and a summary text file is generated +containing category name and count; an additional text file, containing data for +for each category, is also generated. + +=item B<-o, --overwrite> + +Overwrite existing files. + +=item B<--outdelim> I<comma | tab | semicolon>. + +Output text file delimiter. Possible values: I<comma, tab, or semicolon>. +Default value: I<comma> + +=item B<-q, --quote> I<yes | no> + +Put quotes around column values in output text file. Possible values: I<yes or +no>. Default value: I<yes>. + +=item B<-r, --root> I<rootname> + +New file name is generated using the root: <Root>.<Ext>. Default for new file +names: <TextFile>CategoriesSummary.<Ext>, <TextFile>ExtractedColumns.<Ext>, and +<TextFile>ExtractedRows.<Ext> for I<categories>, I<columns>, and I<rows> mode +respectively. And <TextFile>Category<CategoryName>.<Ext> +for each category retrieved from each text file. The output file type determines <Ext> +value: csv and tsv for CSV, and TSV files respectively. + +This option is ignored for multiple input files. + +=item B<--rows> I<"colid,value,criteria..." | "colid,value..." | "colid,mincolvalue,maxcolvalue" | "rownum,rownum,..." | colid | "minrownum,maxrownum"> + +This value is B<--rowsmode> specific. In general, it's a list of comma separated column ids and +associated mode specific value. Based on Column ids specification, column label or number, is +controlled by B<-c, --colmode> option. + +First line containing column labels is always written out. And value comparisons assume +numerical column data. + +For I<rowsbycolvalue> mode, input value format contains these triplets: +I<colid,value, criteria...>. Possible values for criteria: I<le, ge or eq>. +Examples: + + MolWt,450,le + MolWt,450,le,LogP,5,le,SumNumNO,10,le,SumNHOH,5,le + +For I<rowsbycolvaluelist> mode, input value format is: I<colid,value...>. Examples: + + Mol_ID,20 + Mol_ID,20,1002,1115 + +For I<rowsbycolvaluerange> mode, input value format is: I<colid,mincolvalue,maxcolvalue>. Examples: + + MolWt,100,450 + +For I<rowbymincolvalue, rowbymaxcolvalue> modes, input value format is: I<colid>. + +For I<rownum> mode, input value format is: I<rownum>. Default value: I<2>. + +For I<rownumrange> mode, input value format is: I<minrownum, maxrownum>. Examples: + + 10,40 + +=item B<--rowsmode> I<rowsbycolvalue | rowsbycolvaluelist | rowsbycolvaluerange | rowbymincolvalue | rowbymaxcolvalue | rownums | rownumrange> + +Specify how to extract rows from I<TextFile(s)>. Possible values: I<rowsbycolvalue, rowsbycolvaluelist, rowsbycolvaluerange, +rowbymincolvalue, rowbymaxcolvalue, rownum, rownumrange>. Default value: I<rownum>. + +Use B<--rows> option to list rows criterion used for extraction of rows from +I<TextFile(s)>. + +=item B<-w, --workingdir> I<dirname> + +Location of working directory. Default: current directory. + +=back + +=head1 EXAMPLES + +To extract first column from a text file and generate a new CSV text file NewSample1.csv, +type: + + % ExtractFromTextFiles.pl -r NewSample1 -o Sample1.csv + +To extract columns Mol_ID, MolWeight, and NAME from Sample1.csv and generate a new +textfile NewSample1.tsv with no quotes, type: + + % ExtractFromTextFiles.pl -m columns -c collabel --columns "Mol_ID, + MolWeight,NAME" --outdelim tab --quote no -r NewSample1 + -o Sample1.csv + +To extract rows containing values for MolWeight column of less than 450 from +Sample1.csv and generate a new textfile NewSample1.csv, type: + + % ExtractFromTextFiles.pl -m rows --rowsmode rowsbycolvalue + -c collabel --rows MolWeight,450,le -r NewSample1 + -o Sample1.csv + +To extract rows containing values for MolWeight column between 400 and 500 from +Sample1.csv and generate a new textfile NewSample1.csv, type: + + % ExtractFromTextFiles.pl -m rows --rowsmode rowsbycolvaluerange + -c collabel --rows MolWeight,450,500 -r NewSample1 + -o Sample1.csv + +To extract a row containing minimum value for column MolWeight from Sample1.csv and generate +a new textfile NewSample1.csv, type: + + % ExtractFromTextFiles.pl -m rows --rowsmode rowbymincolvalue + -c collabel --rows MolWeight -r NewSample1 + -o Sample1.csv + +=head1 AUTHOR + +Manish Sud <msud@san.rr.com> + +=head1 SEE ALSO + +JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl + +=head1 COPYRIGHT + +Copyright (C) 2015 Manish Sud. All rights reserved. + +This file is part of MayaChemTools. + +MayaChemTools is free software; you can redistribute it and/or modify it under +the terms of the GNU Lesser General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + +=cut