Mercurial > repos > deepakjadmin > mayatool3_test2
diff bin/ModifySDFilesDataFields.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/ModifySDFilesDataFields.pl Wed Jan 20 09:23:18 2016 -0500 @@ -0,0 +1,677 @@ +#!/usr/bin/perl -w +# +# $RCSfile: ModifySDFilesDataFields.pl,v $ +# $Date: 2015/02/28 20:46:20 $ +# $Revision: 1.27 $ +# +# Author: Manish Sud <msud@san.rr.com> +# +# Copyright (C) 2015 Manish Sud. All rights reserved. +# +# This file is part of MayaChemTools. +# +# MayaChemTools is free software; you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) any +# later version. +# +# MayaChemTools is distributed in the hope that it will be useful, but without +# any warranty; without even the implied warranty of merchantability of fitness +# for a particular purpose. See the GNU Lesser General Public License for more +# details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or +# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, +# Boston, MA, 02111-1307, USA. +# + +use strict; +use FindBin; use lib "$FindBin::Bin/../lib"; +use Getopt::Long; +use File::Basename; +use Text::ParseWords; +use Benchmark; +use FileUtil; +use SDFileUtil; +use TextUtil; + +my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); + +# Autoflush STDOUT +$| = 1; + +# Starting message... +$ScriptName = basename($0); +print "\n$ScriptName: Starting...\n\n"; +$StartTime = new Benchmark; + +# Get the options and setup script... +SetupScriptUsage(); +if ($Options{help} || @ARGV < 1) { + die GetUsageFromPod("$FindBin::Bin/$ScriptName"); +} + +my(@SDFilesList); +@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd"); + +# Process options... +print "Processing options...\n"; +my(%OptionsInfo); +ProcessOptions(); + +print "Checking input SD file(s)...\n"; +my(%SDFilesInfo); +RetrieveSDFilesInfo(); + +# Generate output files... +my($FileIndex); +if (@SDFilesList > 1) { + print "\nProcessing SD files...\n"; +} +for $FileIndex (0 .. $#SDFilesList) { + if ($SDFilesInfo{FileOkay}[$FileIndex]) { + print "\nProcessing file $SDFilesList[$FileIndex]...\n"; + ModifySDFile($FileIndex); + } +} +print "\n$ScriptName:Done...\n\n"; + +$EndTime = new Benchmark; +$TotalTime = timediff ($EndTime, $StartTime); +print "Total time: ", timestr($TotalTime), "\n"; + +############################################################################### + +# Modify SD file data fields.... +sub ModifySDFile { + my($Index) = @_; + my($SDFile, $NewSDFile); + + $SDFile = $SDFilesList[$Index]; + $NewSDFile = $SDFilesInfo{OutFile}[$Index]; + + print "Generating new SD file $NewSDFile...\n"; + open NEWSDFILE, ">$NewSDFile" or die "Error: Couldn't open $NewSDFile: $! \n"; + open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n"; + + my($CmpdCount, $CmpdString, $CmpdData, $MolName, $OldSDField, $NewSDField, $CommonSDField, $Label, $Value, $FieldValues, $MolNameDataField, $URLCmpdIdFieldName, @CmpdLines, %DataFieldAndValues, @DataFieldLabels); + $CmpdCount = 0; + + COMPOUND: while ($CmpdString = ReadCmpdString(\*SDFILE)) { + $CmpdCount++; + @CmpdLines = split "\n", $CmpdString; + if ($OptionsInfo{UseDataFieldForMolName} || $OptionsInfo{ModifyDataFields}) { + %DataFieldAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); + } + if ($OptionsInfo{ModifyMolName}) { + if ($OptionsInfo{AlwaysReplaceMolName} || !IsNotEmpty($CmpdLines[0])) { + $MolNameDataField = $OptionsInfo{MolNameDataField}; + if ($OptionsInfo{UseDataFieldForMolName} && exists($DataFieldAndValues{$MolNameDataField})) { + $MolName = $DataFieldAndValues{$MolNameDataField}; + if (length($MolName) > 80) { + $MolName = substr($MolName, 0, 80); + } + } + else { + $MolName = "$OptionsInfo{MolNamePrefix}${CmpdCount}"; + } + $CmpdLines[0] = $MolName; + $CmpdString = join "\n", @CmpdLines; + } + } + if (!$OptionsInfo{ModifyDataFields}) { + # Just write the data and get the next compound... + print NEWSDFILE "$CmpdString\n"; + next COMPOUND; + } + # Write out the structure data now and handle the old data fields later... + ($CmpdData) = split /\n>/, $CmpdString; + print NEWSDFILE "$CmpdData\n"; + + # Modify specified data fields... + for $NewSDField (sort keys %{$OptionsInfo{SpecifiedNewToOldSDFieldMap}}) { + $FieldValues = ""; + for $OldSDField (@{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}}) { + if (exists($DataFieldAndValues{$OldSDField}) && length($DataFieldAndValues{$OldSDField})) { + $Value = $DataFieldAndValues{$OldSDField}; + $FieldValues .= ($FieldValues) ? "\n$Value" : $Value; + } + } + print NEWSDFILE "> <$NewSDField>\n$FieldValues\n\n"; + } + # Add specified common fields... + for $CommonSDField (sort keys %{$OptionsInfo{SpecifiedCommonFieldMap}}) { + $Value = $OptionsInfo{SpecifiedCommonFieldMap}{$CommonSDField}; + print NEWSDFILE "> <$CommonSDField>\n$Value\n\n"; + } + if ($OptionsInfo{CreateDataFieldURL}) { + $Value = ""; + $URLCmpdIdFieldName = $OptionsInfo{URLCmpdIdFieldName}; + if (exists($DataFieldAndValues{$URLCmpdIdFieldName}) && length($DataFieldAndValues{$URLCmpdIdFieldName})) { + $Value = $DataFieldAndValues{$URLCmpdIdFieldName}; + $Value = "$OptionsInfo{URLCGIScriptName}?$OptionsInfo{URLParamName}=${Value}"; + } + print NEWSDFILE "> <$OptionsInfo{URLDataFieldLabel}>\n$Value\n\n"; + } + + # Handle old data fields and write 'em in the same order as they appear in the input + # files... + if ($OptionsInfo{KeepAllOldDataFields} || $OptionsInfo{KeepUnMappedOldDataFields}) { + my($KeepLabel); + @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines); + LABEL: for $Label (@DataFieldLabels) { + $KeepLabel = $OptionsInfo{KeepAllOldDataFields} ? 1 : ( exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$Label}) ? 0 : 1 ); + if (!$KeepLabel) { + next LABEL; + } + $Value = $DataFieldAndValues{$Label}; + print NEWSDFILE "> <$Label>\n$Value\n\n"; + } + } + + print NEWSDFILE "\$\$\$\$\n"; + } + close NEWSDFILE; + close SDFILE; +} + +# Process option values... +sub ProcessOptions { + %OptionsInfo = (); + + $OptionsInfo{Mode} = $Options{mode}; + + $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 0; + if ($Options{mode} =~ /^both$/i) { + $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 1; + } + elsif ($Options{mode} =~ /^datafields$/i) { + $OptionsInfo{ModifyMolName} = 0; $OptionsInfo{ModifyDataFields} = 1; + } + + $OptionsInfo{KeepOldDataFields} = $Options{keepolddatafields}; + $OptionsInfo{KeepAllOldDataFields} = ($Options{keepolddatafields} =~ /^all$/i) ? 1 : 0; + $OptionsInfo{KeepUnMappedOldDataFields} = ($Options{keepolddatafields} =~ /^unmappedonly$/i) ? 1 : 0; + + $OptionsInfo{MolNameMode} = $Options{molnamemode}; + $OptionsInfo{UseDataFieldForMolName} = ($Options{molnamemode} =~ /^datafield$/i) ? 1 : 0; + + $OptionsInfo{MolName} = $Options{molname}; + $OptionsInfo{MolNameDataField} = ""; $OptionsInfo{MolNamePrefix} = "Cmpd"; + if ($Options{molname}) { + if ($OptionsInfo{UseDataFieldForMolName}) { + $OptionsInfo{MolNameDataField} = $Options{molname}; + } + else { + $OptionsInfo{MolNamePrefix} = $Options{molname}; + } + } + + $OptionsInfo{MolNameReplace} = $Options{molnamereplace}; + $OptionsInfo{AlwaysReplaceMolName} = ($Options{molnamereplace} =~ /^always$/i) ? 1 : 0; + + if ($Options{datafieldsmap} && $Options{datafieldsmapfile}) { + die "Error: Both \"--datafieldsmap\" and \"--datafieldsmapfile\" options specified: only one is allowed at a time\n"; + } + + $OptionsInfo{DataFieldsMap} = $Options{datafieldsmap} ? $Options{datafieldsmap} : ''; + $OptionsInfo{DataFieldsMapFile} = $Options{datafieldsmapfile} ? $Options{datafieldsmapfile} : ''; + + my($SpecifiedDataFieldMap); + + %{$OptionsInfo{SpecifiedNewToOldSDFieldMap}} = (); + %{$OptionsInfo{SpecifiedOldToNewSDFieldMap}} = (); + + $SpecifiedDataFieldMap = ""; + if ($Options{datafieldsmap}) { + $SpecifiedDataFieldMap = $Options{datafieldsmap}; + } + elsif ($Options{datafieldsmapfile}) { + my($Line, @LineWords); + open DATAFIELDSFILE, "$Options{datafieldsmapfile}" or die "Couldn't open $Options{datafieldsmapfile}: $! \n"; + while ($Line = GetTextLine(\*DATAFIELDSFILE)) { + @LineWords = quotewords(";", 0, $Line); + $SpecifiedDataFieldMap .= JoinWords(\@LineWords, ";", 0); + } + close DATAFIELDSFILE; + } + + if ($SpecifiedDataFieldMap) { + my($DataFieldMap, $DataField, $NewSDField, @OldSDFields, @DataFieldMapSplit, @DataFieldsSplit, $FirstField); + @DataFieldMapSplit = split ";", $SpecifiedDataFieldMap; + for $DataFieldMap (@DataFieldMapSplit) { + @DataFieldsSplit = split ",", $DataFieldMap; + if (@DataFieldsSplit == 1) { + die "Error: Invalid number of comma delimited values, ", scalar(@DataFieldsSplit), ", specified, @DataFieldsSplit, using \"--datafieldsmap or --datafieldsmapfile\" option: it must contain more than one value.\n"; + } + $FirstField = 1; + @OldSDFields = (); + for $DataField (@DataFieldsSplit) { + if (!(defined($DataField) && length($DataField))) { + die "Error: One of the comma delimited values, \"", join(",", @DataFieldsSplit), "\", specified using \"--datafieldsmap or --datafieldsmapfile\" option is empty.\n"; + } + if ($FirstField) { + $FirstField = 0; + $NewSDField = $DataField; + } + else { + push @OldSDFields, $DataField; + } + } + # Make sure a datafield is only specified once... + if (exists $OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}) { + die "Error: New data field, $NewSDField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n"; + } + @{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}} = (); + push @{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}}, @OldSDFields; + for $DataField (@OldSDFields) { + if (exists $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataField} ) { + die "Error: SD field, $DataField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n"; + } + else { + $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataField} = $NewSDField; + } + } + + } + } + + $OptionsInfo{DataFieldsCommon} = $Options{datafieldscommon} ? $Options{datafieldscommon} : ''; + %{$OptionsInfo{SpecifiedCommonFieldMap}} = (); + + if ($Options{datafieldscommon}) { + my($DataFieldName, $DataFieldValue, $Index, @CommonDataFieldsSplit); + @CommonDataFieldsSplit = split ",", $Options{datafieldscommon}; + if (@CommonDataFieldsSplit % 2) { + die "Error: Invalid number of comma delimited values, ", scalar(@CommonDataFieldsSplit), ", specified \"", join(",", @CommonDataFieldsSplit), "\" using \"--datafieldscommon\" option: it must contain even number of values.\n"; + } + for ($Index = 0; $Index < @CommonDataFieldsSplit; $Index += 2) { + $DataFieldName = $CommonDataFieldsSplit[$Index]; + $DataFieldValue = $CommonDataFieldsSplit[$Index + 1]; + if (exists $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName}) { + die "Error: Common data field, $DataFieldName, specified more than once using \"--datafieldscommon\" option.\n"; + } + if (exists($OptionsInfo{SpecifiedNewToOldSDFieldMap}{$DataFieldName}) || exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataFieldName})) { + die "Error: Common data field, $DataFieldName, specified using \"--datafieldscommon\" option cannot be specified in \"--datafieldsmap or --datafieldsmapfile\" option.\n"; + } + $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName} = $DataFieldValue; + } + } + + $OptionsInfo{DataFieldURL} = $Options{datafieldurl} ? $Options{datafieldurl} : ''; + $OptionsInfo{CreateDataFieldURL} = (exists($Options{datafieldurl}) && length($Options{datafieldurl}) ) ? 1 : 0; + + $OptionsInfo{URLDataFieldLabel} = ""; $OptionsInfo{URLCGIScriptName} = ""; + $OptionsInfo{URLParamName} = ""; $OptionsInfo{URLCmpdIdFieldName} = ""; + + if ($OptionsInfo{CreateDataFieldURL}) { + my(@DataFieldURLSplit, $Value); + @DataFieldURLSplit = split ",", $Options{datafieldurl}; + if (@DataFieldURLSplit != 4) { + die "Error: Invalid number of values, ", scalar(@DataFieldURLSplit), ", specified using \"--datafieldURL\" option: it must contain 4 values.\n"; + } + for $Value (@DataFieldURLSplit) { + if (!IsNotEmpty($Value)) { + die "Error: One of the values, $Options{datafieldurl}, specified using \"--datafieldURL\" option is empty.\n"; + } + } + $OptionsInfo{URLDataFieldLabel} = $DataFieldURLSplit[0]; + $OptionsInfo{URLCGIScriptName} = $DataFieldURLSplit[1]; + $OptionsInfo{URLParamName} = $DataFieldURLSplit[2]; + $OptionsInfo{URLCmpdIdFieldName} = $DataFieldURLSplit[3]; + } + +} + +# Retrieve information about input SD files... +sub RetrieveSDFilesInfo { + my($Index, $SDFile, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFile, $DataFieldName); + + %SDFilesInfo = (); + @{$SDFilesInfo{FileOkay}} = (); + @{$SDFilesInfo{OutFile}} = (); + + FILELIST: for $Index (0 .. $#SDFilesList) { + $SDFile = $SDFilesList[$Index]; + + $SDFilesInfo{FileOkay}[$Index] = 0; + $SDFilesInfo{OutFile}[$Index] = ''; + + if (!(-e $SDFile)) { + warn "Warning: Ignoring file $SDFile: It doesn't exist\n"; + next FILELIST; + } + if (!CheckFileType($SDFile, "sd sdf")) { + warn "Warning: Ignoring file $SDFile: It's not a SD file\n"; + next FILELIST; + } + $FileDir = ""; $FileName = ""; $FileExt = ""; + ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile); + if ($Options{root} && (@SDFilesList == 1)) { + my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root}); + if ($RootFileName && $RootFileExt) { + $FileName = $RootFileName; + } + else { + $FileName = $Options{root}; + } + $OutFileRoot = $FileName; + } + else { + $OutFileRoot = $FileName . "ModifiedDataFields"; + } + + $OutFile = $OutFileRoot . ".$FileExt"; + if (lc($OutFile) eq lc($SDFile)) { + warn "Warning: Ignoring file $SDFile:Output file name, $OutFile, is same as input SD file name, $SDFile\n"; + next FILELIST; + } + if (!$Options{overwrite}) { + if (-e $OutFile) { + warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n"; + next FILELIST; + } + } + + $SDFilesInfo{FileOkay}[$Index] = 1; + $SDFilesInfo{OutFile}[$Index] = $OutFile; + } +} + +# Setup script usage and retrieve command line arguments specified using various options... +sub SetupScriptUsage { + + # Retrieve all the options... + %Options = (); + $Options{detail} = 1; + $Options{keepolddatafields} = "none"; + $Options{mode} = "molname"; + $Options{molnamemode} = "labelprefix"; + $Options{molnamereplace} = "empty"; + + if (!GetOptions(\%Options, "detail|d=i", "datafieldscommon=s", "datafieldsmap=s", "datafieldsmapfile=s", "datafieldurl=s", "help|h", "keepolddatafields|k=s", "mode|m=s", "molname=s", "molnamemode=s", "molnamereplace=s", "overwrite|o", "root|r=s", "workingdir|w=s")) { + die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; + } + if ($Options{workingdir}) { + if (! -d $Options{workingdir}) { + die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; + } + chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; + } + if ($Options{keepolddatafields} !~ /^(all|unmappedonly|none)$/i) { + die "Error: The value specified, $Options{keepolddatafields}, for option \"-k --keepolddatafields\" is not valid. Allowed values: all, unmappedonly, or none\n"; + } + if ($Options{mode} !~ /^(molname|datafields|both)$/i) { + die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molname, datafields, or both\n"; + } + if ($Options{molnamemode} !~ /^(datafield|labelprefix)$/i) { + die "Error: The value specified, $Options{molnamemode}, for option \"--molnamemode\" is not valid. Allowed values: datafield or labelprefix\n"; + } + if ($Options{molnamereplace} !~ /^(always|empty)$/i) { + die "Error: The value specified, $Options{molnamereplace}, for option \"--molnamereplace\" is not valid. Allowed values: always or empty\n"; + } + if (!IsPositiveInteger($Options{detail})) { + die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n"; + } +} + +__END__ + +=head1 NAME + +ModifySDFilesDataFields.pl - Modify data fields in SDFile(s) + +=head1 SYNOPSIS + +ModifySDFilesDataFields.pl SDFile(s)... + +ModifySDFilesDataFields.pl [B<-d, --detail> infolevel] +[B<--datafieldscommon> newfieldlabel, newfieldvalue, [newfieldlabel, newfieldvalue,...]] +[B<--datafieldsmap> newfieldlabel, oldfieldlabel, [oldfieldlabel,...]; [newfieldlabel, oldfieldlabel, [oldfieldlabel,...]]] +[B<--datafieldsmapfile> filename] [B<--datafieldURL> URLDataFieldLabel, CGIScriptPath, CGIParamName, CmpdIDFieldLabel] +[B<-h, --help>] [B<-k, --keepolddatafields> all | unmappedonly | none] [B<-m, --mode> molname | datafields | both] +[B<--molnamemode> datafield | labelprefix] [B<--molname> datafieldname or prefixstring] +[B<--molnamereplace> always | empty] [B<-o, --overwrite>] [B<-r, --root> rootname] +[B<-w, --workingdir> dirname] SDFile(s)... + +=head1 DESCRIPTION + +Modify molname line and data fields in I<SDFile(s)>. Molname line can be replaced by a +data field value or assigned a sequential ID prefixed with a specific string. For data +fields and modification of their values, these types of options are supported: replace +data field labels by another set of labels; combine values of multiple data fields and +assign a new label; add specific set of data field labels and values to all compound +records; and others. + +The file names are separated by space.The valid file extensions are I<.sdf> and I<.sd>. +All other file names are ignored. All the SD files in a current directory can be specified +either by I<*.sdf> or the current directory name. + +=head1 OPTIONS + +=over 4 + +=item B<-d, --detail> I<infolevel> + +Level of information to print about compound records being ignored. Default: I<1>. Possible +values: I<1, 2 or 3>. + +=item B<--datafieldscommon> I<newfieldlabel, newfieldvalue, [newfieldlabel, newfieldvalue,...]> + +Specify data field labels and values for addition to each compound record. It's a comma delimited +list of data field label and values pair. Default: I<none>. + +Examples: + + DepositionDate,YYYY-MM-DD + Source,www.domainname.org,ReleaseData,YYYY-MM-DD + +=item B<--datafieldsmap> I<newfieldlabel, oldfieldlabel, [oldfieldlabel,...]; [newfieldlabel, oldfieldlabel, [oldfieldlabel,...]]> + +Specify how various data field labels and values are combined to generate a new data field +labels and their values. All the comma delimited data fields, with in a semicolon delimited set, +are mapped to the first new data field label along with the data field values joined via new +line character. Default: I<none>. + +Examples: + + Synonym,Name,SystematicName,Synonym;CmpdID,Extreg + HBondDonors,SumNHOH + +=item B<--datafieldsmapfile> I<filename> + +Filename containing mapping of data fields. Format of data fields line in this file corresponds +to B<--datafieldsmap> option. Example: + + Line 1: Synonym,Name,SystematicName,Synonym;CmpdID,Extreg + Line 2: HBondDonors,SumNHOH + + +=item B<--datafieldURL> I<URLDataFieldLabel, CGIScriptPath, CGIParamName, CmpdIDFieldLabel> + +Specify how to generate a URL for retrieving compound data from a web server and add it +to each compound record. I<URLDataFieldLabel> is used as the data field label for URL value +which is created by combining I<CGIScriptPath,CGIParamName,CmpdIDFieldLabel> values: +CGIScriptPath?CGIParamName=CmpdIDFieldLabelValue. Default: I<none>. + +Example: + + Source,http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID + +=item B<-h, --help> + +Print this help message. + +=item B<-k, --keepolddatafields> I<all | unmappedonly | none> + +Specify how to transfer old data fields from input SDFile(s) to new SDFile(s) during +I<datafields | both> value of B<-m, --mode> option: keep all old data fields; write out the ones +not mapped to new fields as specified by B<--datafieldsmap> or <--datafieldsmapfile> options; +or ignore all old data field labels. For I<molname> B<-m --mode>, old datafields are always kept. +Possible values: I<all | unmappedonly | none>. Default: I<none>. + +=item B<-m, --mode> I<molname | datafields | both> + +Specify how to modify SDFile(s): I<molname> - change molname line by another datafield or value; +I<datafield> - modify data field labels and values by replacing one label by another, combining +multiple data field labels and values, adding specific set of data field labels and values to all compound, or +inserting an URL for compound retrieval to each record; I<both> - change molname line and datafields +simultaneously. Possible values: I<molname | datafields | both>. Default: I<molname> + +=item B<--molnamemode> I<datafield | labelprefix> + +Specify how to change molname line for B<-m --mode> option values of I<molname | both>: use +a datafield label value or assign a sequential ID prefixed with I<labelprefix>. Possible values: +I<datafield | labelprefix>. Default: I<labelprefix>. + +=item B<--molname> I<datafieldname or prefixstring> + +Molname generation method. For I<datafield> value of B<--molnamemode> option, it corresponds +to datafield label name whose value is used for molname; otherwise, it's a prefix string used for +generating compound IDs like labelprefixstring<Number>. Default value, I<Cmpd>, generates +compound IDs like Cmpd<Number> for molname. + +=item B<--molnamereplace> I<always | empty> + +Specify when to replace molname line for B<-m --mode> option values of I<molname | both>: +always replace the molname line using B<--molname> option or only when it's empty. Possible +values: I<always | empty>. Default: I<empty>. + +=item B<-o, --overwrite> + +Overwrite existing files. + +=item B<-r, --root> I<rootname> + +New SD file name is generated using the root: <Root>.<Ext>. Default new file +name: <InitialSDFileName>ModifiedDataFields.<Ext>. This option is ignored for multiple +input files. + +=item B<-w, --workingdir> I<dirname> + +Location of working directory. Default: current directory. + +=back + +=head1 EXAMPLES + +To replace empty molname lines by Cmpd<CmpdNumber> and generate a new SD file +NewSample1.sdf, type: + + % ModifySDFilesDataFields.pl -o -r NewSample1 Sample1.sdf + +To replace all molname lines by Mol_ID data field generate a new SD file +NewSample1.sdf, type: + + % ModifySDFilesDataFields.pl --molnamemode datafield + --molnamereplace always -r NewSample1 -o Sample1.sdf + +To replace all molname lines by Mol_ID data field, map Name and CompoundName to +a new datafield Synonym, and generate a new SD file NewSample1.sdf, type: + + % ModifySDFilesDataFields.pl --molnamemode datafield + --molnamereplace always --molname Mol_ID --mode both + --datafieldsmap "Synonym,Name,CompoundName" -r + NewSample1 -o Sample1.sdf + +To replace all molname lines by Mol_ID data field, map Name and CompoundName to +a new datafield Synonym, add common fields ReleaseDate and Source, and +generate a new SD file NewSample1.sdf without keeping any old SD data fields, type: + + % ModifySDFilesDataFields.pl --molnamemode datafield + --molnamereplace always --molname Mol_ID --mode both + --datafieldsmap "Synonym,Name,CompoundName" + --datafieldscommon "ReleaseDate,yyyy-mm-dd,Source, + www.mayachemtools.org" --keepolddatafields none -r + NewSample1 -o Sample1.sdf + +B<Preparing SD files PubChem deposition:> + +Consider a SD file with these fields: Mol_ID, Name, Synonyms and Systematic_Name. +And Mol_ID data field uniquely identifies your compound. + +To prepare a new SD file CmpdDataForPubChem.sdf containing only required +PUBCHEM_EXT_DATASOURCE_REGID field, type: + + % ModifySDFilesDataFields.pl --m datafields + --datafieldsmap + "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID" + -r CmpdDataForPubChem -o Sample1.sdf + +To prepare a new SD file CmpdDataForPubChem.sdf containing only required +PUBCHEM_EXT_DATASOURCE_REGID field and replace molname line with Mol_ID, type: + + % ModifySDFilesDataFields.pl --molnamemode datafield + --molnamereplace always --molname Mol_ID --mode both + --datafieldsmap + "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID" + -r CmpdDataForPubChem -o Sample1.sdf + +In addition to required PubChem data field, you can also add optional PubChem data +fields. + +To map your Name, Synonyms and Systematic_Name data fields to optional +PUBCHEM_SUBSTANCE_SYNONYM data field along with required ID field, type: + + % ModifySDFilesDataFields.pl --molnamemode datafield + --molnamereplace always --molname Mol_ID --mode both + --datafieldsmap + "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID; + PUBCHEM_SUBSTANCE_SYNONYM,Name,CompoundName" + -r CmpdDataForPubChem -o Sample1.sdf + +To add your <domain.org> as PUBCHEM_EXT_SUBSTANCE_URL and link substance +retrieval to your CGI script <http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID> +via PUBCHEM_EXT_DATASOURCE_REGID field along with optional and required +data fields, type: + + % ModifySDFilesDataFields.pl --molnamemode datafield + --molnamereplace always --molname Mol_ID --mode both + --datafieldsmap + "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID; + PUBCHEM_SUBSTANCE_SYNONYM,Name,CompoundName" + --datafieldscommon + "PUBCHEM_EXT_SUBSTANCE_URL,domain.org" + --datafieldURL "PUBCHEM_EXT_DATASOURCE_URL, + http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID" + -r CmpdDataForPubChem -o Sample1.sdf + +And to add a publication date and request a release data using +PUBCHEM_PUBLICATION_DATE and PUBCHEM_DEPOSITOR_RECORD_DATE data fields +along with all the data fields in earlier examples, type: +optional fields, type: + + % ModifySDFilesDataFields.pl --molnamemode datafield + --molnamereplace always --molname Mol_ID --mode both + --datafieldsmap + "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID; + PUBCHEM_SUBSTANCE_SYNONYM,Name,CompoundName" + --datafieldURL "PUBCHEM_EXT_DATASOURCE_URL, + http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID" + --datafieldscommon + "PUBCHEM_EXT_SUBSTANCE_URL,domain.org, + PUBCHEM_PUBLICATION_DATE,YYY-MM-DD, + PUBCHEM_DEPOSITOR_RECORD_DATE,YYYY-MM-DD" + -r CmpdDataForPubChem -o Sample1.sdf + +=head1 AUTHOR + +Manish Sud <msud@san.rr.com> + +=head1 SEE ALSO + +InfoSDFiles.pl, JoinSDFiles.pl, MergeTextFilesWithSD.pl, SplitSDFiles.pl, SDFilesToHTML.pl + +=head1 COPYRIGHT + +Copyright (C) 2015 Manish Sud. All rights reserved. + +This file is part of MayaChemTools. + +MayaChemTools is free software; you can redistribute it and/or modify it under +the terms of the GNU Lesser General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + +=cut