diff bin/ModifySDFilesDataFields.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/ModifySDFilesDataFields.pl	Wed Jan 20 09:23:18 2016 -0500
@@ -0,0 +1,677 @@
+#!/usr/bin/perl -w
+#
+# $RCSfile: ModifySDFilesDataFields.pl,v $
+# $Date: 2015/02/28 20:46:20 $
+# $Revision: 1.27 $
+#
+# Author: Manish Sud <msud@san.rr.com>
+#
+# Copyright (C) 2015 Manish Sud. All rights reserved.
+#
+# This file is part of MayaChemTools.
+#
+# MayaChemTools is free software; you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option) any
+# later version.
+#
+# MayaChemTools is distributed in the hope that it will be useful, but without
+# any warranty; without even the implied warranty of merchantability of fitness
+# for a particular purpose.  See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
+# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
+# Boston, MA, 02111-1307, USA.
+#
+
+use strict;
+use FindBin; use lib "$FindBin::Bin/../lib";
+use Getopt::Long;
+use File::Basename;
+use Text::ParseWords;
+use Benchmark;
+use FileUtil;
+use SDFileUtil;
+use TextUtil;
+
+# File-scope state shared with the subroutines below: %Options receives the raw
+# command line values in SetupScriptUsage(); the timing variables bracket the
+# whole run.
+my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
+
+# Autoflush STDOUT
+$| = 1;
+
+# Starting message...
+$ScriptName = basename($0);
+print "\n$ScriptName: Starting...\n\n";
+# Direct method call rather than the indirect object form "new Benchmark",
+# which perl has to guess how to parse.
+$StartTime = Benchmark->new;
+
+# Get the options and setup script...
+SetupScriptUsage();
+if ($Options{help} || @ARGV < 1) {
+  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
+}
+
+my(@SDFilesList);
+@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
+
+# Process options...
+print "Processing options...\n";
+my(%OptionsInfo);
+ProcessOptions();
+
+print "Checking input SD file(s)...\n";
+my(%SDFilesInfo);
+RetrieveSDFilesInfo();
+
+# Generate output files: only the files that passed the checks made in
+# RetrieveSDFilesInfo() are processed...
+if (@SDFilesList > 1) {
+  print "\nProcessing SD files...\n";
+}
+for my $FileIndex (0 .. $#SDFilesList) {
+  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
+    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
+    ModifySDFile($FileIndex);
+  }
+}
+print "\n$ScriptName:Done...\n\n";
+
+$EndTime = Benchmark->new;
+$TotalTime = timediff($EndTime, $StartTime);
+print "Total time: ", timestr($TotalTime), "\n";
+
+###############################################################################
+
+# Modify SD file data fields....
+#
+# Write a modified copy of one input SD file.
+#
+# Parameters:
+#   $Index - position of the input file in @SDFilesList; the output file name
+#            is read from $SDFilesInfo{OutFile}[$Index].
+#
+# Each compound record returned by ReadCmpdString() optionally gets a new
+# molname line (first line of the record). When data fields are being modified,
+# the data block is rewritten in this order: mapped new fields, common fields,
+# the optional URL field, and then any old fields selected by the
+# --keepolddatafields option, followed by the record terminator.
+#
+# Dies when either file can't be opened or the new file can't be closed.
+sub ModifySDFile {
+  my($Index) = @_;
+  my($SDFile, $NewSDFile, $SDFileHandle, $NewSDFileHandle);
+
+  $SDFile = $SDFilesList[$Index];
+  $NewSDFile = $SDFilesInfo{OutFile}[$Index];
+
+  print "Generating new SD file $NewSDFile...\n";
+  # Three-argument open with lexical handles, so that a file name is never
+  # interpreted as an open mode...
+  open($NewSDFileHandle, ">", $NewSDFile) or die "Error: Couldn't open $NewSDFile: $! \n";
+  open($SDFileHandle, "<", $SDFile) or die "Error: Can't open $SDFile: $! \n";
+
+  my($CmpdCount, $CmpdString, $CmpdData, $MolName, $OldSDField, $NewSDField, $CommonSDField, $Label, $Value, $FieldValues, $MolNameDataField, $URLCmpdIdFieldName, @CmpdLines, %DataFieldAndValues, @DataFieldLabels, @FieldValuesList);
+  $CmpdCount = 0;
+
+  COMPOUND: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
+    $CmpdCount++;
+    @CmpdLines = split "\n", $CmpdString;
+    if ($OptionsInfo{UseDataFieldForMolName} || $OptionsInfo{ModifyDataFields}) {
+      %DataFieldAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
+    }
+    if ($OptionsInfo{ModifyMolName}) {
+      if ($OptionsInfo{AlwaysReplaceMolName} || !IsNotEmpty($CmpdLines[0])) {
+        $MolNameDataField = $OptionsInfo{MolNameDataField};
+        if ($OptionsInfo{UseDataFieldForMolName} && exists($DataFieldAndValues{$MolNameDataField})) {
+          # Use the data field value, truncated to 80 characters...
+          $MolName = $DataFieldAndValues{$MolNameDataField};
+          if (length($MolName) > 80) {
+            $MolName = substr($MolName, 0, 80);
+          }
+        }
+        else {
+          # Fall back to a sequential ID: prefix followed by compound number...
+          $MolName = "$OptionsInfo{MolNamePrefix}${CmpdCount}";
+        }
+        $CmpdLines[0] = $MolName;
+        $CmpdString = join "\n", @CmpdLines;
+      }
+    }
+    if (!$OptionsInfo{ModifyDataFields}) {
+      # Just write the data and get the next compound...
+      print $NewSDFileHandle "$CmpdString\n";
+      next COMPOUND;
+    }
+    # Write out the structure data now and handle the old data fields later...
+    ($CmpdData) = split /\n>/, $CmpdString;
+    # A record without any data field is not cut by the split above and would
+    # still carry its record terminator; drop it here as it is written again
+    # after the new data fields. NOTE(review): assumes ReadCmpdString() returns
+    # the record including its terminator line, as the pass-through branch
+    # above implies - this is a no-op otherwise.
+    $CmpdData =~ s/\n\$\$\$\$\s*\z//;
+    print $NewSDFileHandle "$CmpdData\n";
+
+    # Modify specified data fields...
+    for $NewSDField (sort keys %{$OptionsInfo{SpecifiedNewToOldSDFieldMap}}) {
+      # Collect the non-empty old values in a list and join them afterwards: a
+      # truth test on the accumulated string would treat a value of "0" as empty
+      # and glue the next value to it without a new line...
+      @FieldValuesList = ();
+      for $OldSDField (@{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}}) {
+        if (exists($DataFieldAndValues{$OldSDField}) && length($DataFieldAndValues{$OldSDField})) {
+          push @FieldValuesList, $DataFieldAndValues{$OldSDField};
+        }
+      }
+      $FieldValues = join "\n", @FieldValuesList;
+      print $NewSDFileHandle "> <$NewSDField>\n$FieldValues\n\n";
+    }
+    # Add specified common fields...
+    for $CommonSDField (sort keys %{$OptionsInfo{SpecifiedCommonFieldMap}}) {
+      $Value = $OptionsInfo{SpecifiedCommonFieldMap}{$CommonSDField};
+      print $NewSDFileHandle "> <$CommonSDField>\n$Value\n\n";
+    }
+    if ($OptionsInfo{CreateDataFieldURL}) {
+      # The URL field is always written; its value stays empty when the
+      # compound ID field is missing or empty...
+      $Value = "";
+      $URLCmpdIdFieldName = $OptionsInfo{URLCmpdIdFieldName};
+      if (exists($DataFieldAndValues{$URLCmpdIdFieldName}) && length($DataFieldAndValues{$URLCmpdIdFieldName})) {
+        $Value = $DataFieldAndValues{$URLCmpdIdFieldName};
+        $Value = "$OptionsInfo{URLCGIScriptName}?$OptionsInfo{URLParamName}=${Value}";
+      }
+      print $NewSDFileHandle "> <$OptionsInfo{URLDataFieldLabel}>\n$Value\n\n";
+    }
+
+    # Handle old data fields and write 'em in the same order as they appear in the input
+    # files...
+    if ($OptionsInfo{KeepAllOldDataFields} || $OptionsInfo{KeepUnMappedOldDataFields}) {
+      @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
+      LABEL: for $Label (@DataFieldLabels) {
+        # Unless all old fields are kept, skip the ones mapped to a new field...
+        if (!$OptionsInfo{KeepAllOldDataFields} && exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$Label})) {
+          next LABEL;
+        }
+        $Value = $DataFieldAndValues{$Label};
+        print $NewSDFileHandle "> <$Label>\n$Value\n\n";
+      }
+    }
+
+    print $NewSDFileHandle "\$\$\$\$\n";
+  }
+  # Buffered write errors only surface at close time...
+  close($NewSDFileHandle) or die "Error: Couldn't close $NewSDFile: $! \n";
+  close $SDFileHandle;
+}
+
+# Process option values...
+#
+# Populate the file-scope %OptionsInfo hash from the raw command line values in
+# %Options. Besides plain copies of the option values, it sets up:
+#
+#   ModifyMolName, ModifyDataFields      - flags derived from --mode
+#   KeepAllOldDataFields,
+#   KeepUnMappedOldDataFields            - flags derived from --keepolddatafields
+#   UseDataFieldForMolName,
+#   MolNameDataField, MolNamePrefix      - derived from --molnamemode and --molname
+#   AlwaysReplaceMolName                 - flag derived from --molnamereplace
+#   SpecifiedNewToOldSDFieldMap          - new label => list of old labels
+#   SpecifiedOldToNewSDFieldMap          - old label => new label
+#   SpecifiedCommonFieldMap              - label => value from --datafieldscommon
+#   CreateDataFieldURL, URLDataFieldLabel, URLCGIScriptName, URLParamName,
+#   URLCmpdIdFieldName                   - derived from --datafieldurl
+#
+# Dies with an error message on conflicting, malformed or duplicate
+# specifications and when the data fields map file can't be opened.
+sub ProcessOptions {
+  %OptionsInfo = ();
+
+  $OptionsInfo{Mode} = $Options{mode};
+
+  $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 0;
+  if ($Options{mode} =~ /^both$/i) {
+    $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 1;
+  }
+  elsif ($Options{mode} =~ /^datafields$/i) {
+    $OptionsInfo{ModifyMolName} = 0; $OptionsInfo{ModifyDataFields} = 1;
+  }
+
+  $OptionsInfo{KeepOldDataFields} = $Options{keepolddatafields};
+  $OptionsInfo{KeepAllOldDataFields} = ($Options{keepolddatafields} =~ /^all$/i) ? 1 : 0;
+  $OptionsInfo{KeepUnMappedOldDataFields} = ($Options{keepolddatafields} =~ /^unmappedonly$/i) ? 1 : 0;
+
+  $OptionsInfo{MolNameMode} = $Options{molnamemode};
+  $OptionsInfo{UseDataFieldForMolName} = ($Options{molnamemode} =~ /^datafield$/i) ? 1 : 0;
+
+  # The --molname value is either a data field label or a prefix string,
+  # depending on --molnamemode...
+  $OptionsInfo{MolName} = $Options{molname};
+  $OptionsInfo{MolNameDataField} = ""; $OptionsInfo{MolNamePrefix} = "Cmpd";
+  if ($Options{molname}) {
+    if ($OptionsInfo{UseDataFieldForMolName}) {
+      $OptionsInfo{MolNameDataField} = $Options{molname};
+    }
+    else {
+      $OptionsInfo{MolNamePrefix} = $Options{molname};
+    }
+  }
+
+  $OptionsInfo{MolNameReplace} = $Options{molnamereplace};
+  $OptionsInfo{AlwaysReplaceMolName} = ($Options{molnamereplace} =~ /^always$/i) ? 1 : 0;
+
+  if ($Options{datafieldsmap} && $Options{datafieldsmapfile}) {
+    die "Error: Both \"--datafieldsmap\" and  \"--datafieldsmapfile\" options specified: only one is allowed at a time\n";
+  }
+
+  $OptionsInfo{DataFieldsMap} = $Options{datafieldsmap} ? $Options{datafieldsmap} : '';
+  $OptionsInfo{DataFieldsMapFile} = $Options{datafieldsmapfile} ? $Options{datafieldsmapfile} : '';
+
+  my($SpecifiedDataFieldMap);
+
+  %{$OptionsInfo{SpecifiedNewToOldSDFieldMap}} = ();
+  %{$OptionsInfo{SpecifiedOldToNewSDFieldMap}} = ();
+
+  $SpecifiedDataFieldMap = "";
+  if ($Options{datafieldsmap}) {
+    $SpecifiedDataFieldMap = $Options{datafieldsmap};
+  }
+  elsif ($Options{datafieldsmapfile}) {
+    my($Line, $DataFieldsFileHandle, @LineWords, @LineMaps);
+    # Three-argument open with a lexical handle, so that the file name is never
+    # interpreted as an open mode...
+    open($DataFieldsFileHandle, "<", $Options{datafieldsmapfile}) or die "Couldn't  open $Options{datafieldsmapfile}: $! \n";
+    @LineMaps = ();
+    while ($Line = GetTextLine($DataFieldsFileHandle)) {
+      @LineWords = quotewords(";", 0, $Line);
+      push @LineMaps, JoinWords(\@LineWords, ";", 0);
+    }
+    close $DataFieldsFileHandle;
+    # Separate the maps coming from different lines by a semicolon; appending
+    # the lines directly would merge the last label of one line with the first
+    # label of the next line...
+    $SpecifiedDataFieldMap = join ";", @LineMaps;
+  }
+
+  if ($SpecifiedDataFieldMap) {
+    my($DataFieldMap, $DataField, $NewSDField, @OldSDFields, @DataFieldMapSplit, @DataFieldsSplit);
+    @DataFieldMapSplit = split ";", $SpecifiedDataFieldMap;
+    DATAFIELDMAP: for $DataFieldMap (@DataFieldMapSplit) {
+      # Ignore empty maps caused by doubled or line ending semicolons...
+      if (!length($DataFieldMap)) {
+        next DATAFIELDMAP;
+      }
+      @DataFieldsSplit = split ",", $DataFieldMap;
+      # A map needs a new label and at least one old label; a map made up of
+      # commas only splits into an empty list and is rejected as well...
+      if (@DataFieldsSplit < 2) {
+	die "Error: Invalid number of comma delimited values, ", scalar(@DataFieldsSplit), ", specified,  @DataFieldsSplit, using \"--datafieldsmap or --datafieldsmapfile\" option: it must contain more than one value.\n";
+      }
+      for $DataField (@DataFieldsSplit) {
+	if (!(defined($DataField) && length($DataField))) {
+	  die "Error: One of the comma delimited values, \"", join(",", @DataFieldsSplit), "\", specified using \"--datafieldsmap or --datafieldsmapfile\" option is empty.\n";
+	}
+      }
+      # First value is the new label; the remaining ones are old labels...
+      ($NewSDField, @OldSDFields) = @DataFieldsSplit;
+
+      # Make sure a datafield is only specified once...
+      if (exists $OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}) {
+	die "Error: New data field, $NewSDField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
+      }
+      @{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}} = ();
+      push @{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}}, @OldSDFields;
+      for $DataField (@OldSDFields) {
+	if (exists $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataField} ) {
+	  die "Error: SD field, $DataField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
+	}
+	else {
+	  $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataField} = $NewSDField;
+	}
+      }
+
+    }
+  }
+
+  $OptionsInfo{DataFieldsCommon} = $Options{datafieldscommon} ? $Options{datafieldscommon} : '';
+  %{$OptionsInfo{SpecifiedCommonFieldMap}} = ();
+
+  if ($Options{datafieldscommon}) {
+    # Comma delimited list of label and value pairs...
+    my($DataFieldName, $DataFieldValue, $Index, @CommonDataFieldsSplit);
+    @CommonDataFieldsSplit = split ",", $Options{datafieldscommon};
+    if (@CommonDataFieldsSplit % 2) {
+	die "Error: Invalid number of comma delimited values, ", scalar(@CommonDataFieldsSplit), ", specified \"",  join(",", @CommonDataFieldsSplit), "\" using \"--datafieldscommon\" option: it must contain even number of values.\n";
+    }
+    for ($Index = 0; $Index < @CommonDataFieldsSplit; $Index += 2) {
+      $DataFieldName = $CommonDataFieldsSplit[$Index];
+      $DataFieldValue = $CommonDataFieldsSplit[$Index + 1];
+      if (exists $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName}) {
+	die "Error: Common data field, $DataFieldName, specified more than once using \"--datafieldscommon\" option.\n";
+      }
+      # A common field must not clash with any label used in the field map...
+      if (exists($OptionsInfo{SpecifiedNewToOldSDFieldMap}{$DataFieldName}) || exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataFieldName})) {
+	die "Error: Common data field, $DataFieldName, specified using \"--datafieldscommon\" option cannot be specified in \"--datafieldsmap or --datafieldsmapfile\" option.\n";
+      }
+      $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName} = $DataFieldValue;
+    }
+  }
+
+  $OptionsInfo{DataFieldURL} = $Options{datafieldurl} ? $Options{datafieldurl} : '';
+  $OptionsInfo{CreateDataFieldURL} = (exists($Options{datafieldurl}) && length($Options{datafieldurl}) ) ? 1 : 0;
+
+  $OptionsInfo{URLDataFieldLabel} = ""; $OptionsInfo{URLCGIScriptName} = "";
+  $OptionsInfo{URLParamName} = ""; $OptionsInfo{URLCmpdIdFieldName} = "";
+
+  if ($OptionsInfo{CreateDataFieldURL}) {
+    # Exactly four non-empty comma delimited values: URL field label, CGI
+    # script name, CGI parameter name and compound ID field label...
+    my(@DataFieldURLSplit, $Value);
+    @DataFieldURLSplit = split ",", $Options{datafieldurl};
+    if (@DataFieldURLSplit != 4) {
+      die "Error: Invalid number of values, ", scalar(@DataFieldURLSplit), ", specified using \"--datafieldURL\" option: it must contain 4 values.\n";
+    }
+    for $Value (@DataFieldURLSplit) {
+      if (!IsNotEmpty($Value)) {
+	die "Error: One of the values, $Options{datafieldurl}, specified using \"--datafieldURL\" option is empty.\n";
+      }
+    }
+    $OptionsInfo{URLDataFieldLabel} = $DataFieldURLSplit[0];
+    $OptionsInfo{URLCGIScriptName} = $DataFieldURLSplit[1];
+    $OptionsInfo{URLParamName}  = $DataFieldURLSplit[2];
+    $OptionsInfo{URLCmpdIdFieldName} = $DataFieldURLSplit[3];
+  }
+
+}
+
+# Retrieve information about input SD files...
+#
+# Fill the file-scope %SDFilesInfo hash with two lists parallel to
+# @SDFilesList: FileOkay (1 when the file is to be processed, 0 otherwise) and
+# OutFile (name of the file to generate, empty for ignored files). A warning
+# is issued for each ignored file.
+sub RetrieveSDFilesInfo {
+  %SDFilesInfo = (FileOkay => [], OutFile => []);
+
+  # The root name is used only for a single input file...
+  my $UseRootName = ($Options{root} && (@SDFilesList == 1)) ? 1 : 0;
+
+  FILELIST: for my $Index (0 .. $#SDFilesList) {
+    my $SDFile = $SDFilesList[$Index];
+
+    # Mark the file as ignored until it has passed all the checks...
+    $SDFilesInfo{FileOkay}[$Index] = 0;
+    $SDFilesInfo{OutFile}[$Index] = '';
+
+    unless (-e $SDFile) {
+      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
+      next FILELIST;
+    }
+    unless (CheckFileType($SDFile, "sd sdf")) {
+      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
+      next FILELIST;
+    }
+
+    # Only the name and extension are used; the output file name carries no
+    # directory part...
+    my(undef, $FileName, $FileExt) = ParseFileName($SDFile);
+
+    my $OutFileRoot;
+    if ($UseRootName) {
+      my(undef, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
+      # Drop the extension from a root specified as a complete file name...
+      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
+    }
+    else {
+      $OutFileRoot = $FileName . "ModifiedDataFields";
+    }
+
+    my $OutFile = $OutFileRoot . ".$FileExt";
+    if (lc($OutFile) eq lc($SDFile)) {
+      warn "Warning: Ignoring file $SDFile:Output file name, $OutFile, is same as input SD file name, $SDFile\n";
+      next FILELIST;
+    }
+    if (!$Options{overwrite} && -e $OutFile) {
+      warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
+      next FILELIST;
+    }
+
+    $SDFilesInfo{FileOkay}[$Index] = 1;
+    $SDFilesInfo{OutFile}[$Index] = $OutFile;
+  }
+}
+
+# Setup script usage  and retrieve command line arguments specified using various options...
+#
+# Reset the file-scope %Options hash to its default values, let GetOptions()
+# overlay the values found on the command line, change to the working
+# directory when one is specified, and validate the enumerated option values.
+# Dies with an error message on the first invalid option or value.
+sub SetupScriptUsage {
+
+  # Default values for the options...
+  %Options = (
+    detail => 1,
+    keepolddatafields => "none",
+    mode => "molname",
+    molnamemode => "labelprefix",
+    molnamereplace => "empty",
+  );
+
+  # Retrieve all the options...
+  my @OptionSpecs = (
+    "detail|d=i", "datafieldscommon=s", "datafieldsmap=s", "datafieldsmapfile=s",
+    "datafieldurl=s", "help|h", "keepolddatafields|k=s", "mode|m=s", "molname=s",
+    "molnamemode=s", "molnamereplace=s", "overwrite|o", "root|r=s", "workingdir|w=s",
+  );
+  GetOptions(\%Options, @OptionSpecs)
+    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
+
+  if ($Options{workingdir}) {
+    unless (-d $Options{workingdir}) {
+      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
+    }
+    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
+  }
+
+  # Validate option values; the matching is case insensitive...
+  unless ($Options{keepolddatafields} =~ /^(all|unmappedonly|none)$/i) {
+    die "Error: The value specified, $Options{keepolddatafields}, for option \"-k --keepolddatafields\" is not valid. Allowed values: all, unmappedonly, or none\n";
+  }
+  unless ($Options{mode} =~ /^(molname|datafields|both)$/i) {
+    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molname, datafields, or both\n";
+  }
+  unless ($Options{molnamemode} =~ /^(datafield|labelprefix)$/i) {
+    die "Error: The value specified, $Options{molnamemode}, for option \"--molnamemode\" is not valid. Allowed values: datafield or labelprefix\n";
+  }
+  unless ($Options{molnamereplace} =~ /^(always|empty)$/i) {
+    die "Error: The value specified, $Options{molnamereplace}, for option \"--molnamereplace\" is not valid. Allowed values: always or empty\n";
+  }
+  unless (IsPositiveInteger($Options{detail})) {
+    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
+  }
+}
+
+__END__
+
+=head1 NAME
+
+ModifySDFilesDataFields.pl - Modify data fields in SDFile(s)
+
+=head1 SYNOPSIS
+
+ModifySDFilesDataFields.pl SDFile(s)...
+
+ModifySDFilesDataFields.pl [B<-d, --detail> infolevel]
+[B<--datafieldscommon> newfieldlabel, newfieldvalue, [newfieldlabel, newfieldvalue,...]]
+[B<--datafieldsmap> newfieldlabel, oldfieldlabel, [oldfieldlabel,...]; [newfieldlabel, oldfieldlabel, [oldfieldlabel,...]]]
+[B<--datafieldsmapfile> filename] [B<--datafieldURL> URLDataFieldLabel, CGIScriptPath, CGIParamName, CmpdIDFieldLabel]
+[B<-h, --help>] [B<-k, --keepolddatafields> all | unmappedonly | none] [B<-m, --mode> molname | datafields | both]
+[B<--molnamemode> datafield | labelprefix] [B<--molname> datafieldname or prefixstring]
+[B<--molnamereplace> always | empty] [B<-o, --overwrite>] [B<-r, --root> rootname]
+[B<-w, --workingdir> dirname] SDFile(s)...
+
+=head1 DESCRIPTION
+
+Modify molname line and data fields in I<SDFile(s)>. Molname line can be replaced by a
+data field value or assigned a sequential ID prefixed with a specific string. For data
+fields and modification of their values, these types of options are supported: replace
+data field labels by another set of labels; combine values of multiple data fields and
+assign a new label; add specific set of data field labels and values to all compound
+records; and others.
+
+The file names are separated by space. The valid file extensions are I<.sdf> and I<.sd>.
+All other file names are ignored. All the SD files in a current directory can be specified
+either by I<*.sdf> or the current directory name.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-d, --detail> I<infolevel>
+
+Level of information to print about compound records being ignored. Default: I<1>. Possible
+values: I<1, 2 or 3>.
+
+=item B<--datafieldscommon> I<newfieldlabel, newfieldvalue, [newfieldlabel, newfieldvalue,...]>
+
+Specify data field labels and values for addition to each compound record. It's a comma delimited
+list of data field label and values pair. Default: I<none>.
+
+Examples:
+
+    DepositionDate,YYYY-MM-DD
+    Source,www.domainname.org,ReleaseDate,YYYY-MM-DD
+
+=item B<--datafieldsmap> I<newfieldlabel, oldfieldlabel, [oldfieldlabel,...]; [newfieldlabel, oldfieldlabel, [oldfieldlabel,...]]>
+
+Specify how various data field labels and values are combined to generate new data field
+labels and their values. All the comma delimited data fields, within a semicolon delimited set,
+are mapped to the first new data field label along with the data field values joined via new
+line character. Default: I<none>.
+
+Examples:
+
+    Synonym,Name,SystematicName,Synonym;CmpdID,Extreg
+    HBondDonors,SumNHOH
+
+=item B<--datafieldsmapfile> I<filename>
+
+Filename containing mapping of data fields. Format of data fields line in this file corresponds
+to B<--datafieldsmap> option. Example:
+
+    Line 1: Synonym,Name,SystematicName,Synonym;CmpdID,Extreg
+    Line 2: HBondDonors,SumNHOH
+
+
+=item B<--datafieldURL> I<URLDataFieldLabel, CGIScriptPath, CGIParamName, CmpdIDFieldLabel>
+
+Specify how to generate a URL for retrieving compound data from a web server and add it
+to each compound record. I<URLDataFieldLabel> is used as the data field label for URL value
+which is created by combining I<CGIScriptPath,CGIParamName,CmpdIDFieldLabel> values:
+CGIScriptPath?CGIParamName=CmpdIDFieldLabelValue. Default: I<none>.
+
+Example:
+
+    Source,http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID
+
+=item B<-h, --help>
+
+Print this help message.
+
+=item B<-k, --keepolddatafields> I<all | unmappedonly | none>
+
+Specify how to transfer old data fields from input SDFile(s) to new SDFile(s) during
+I<datafields | both> value of B<-m, --mode> option: keep all old data fields; write out the ones
+not mapped to new fields as specified by B<--datafieldsmap> or B<--datafieldsmapfile> options;
+or ignore all old data field labels. For I<molname> B<-m --mode>, old datafields are always kept.
+Possible values: I<all | unmappedonly | none>. Default: I<none>.
+
+=item B<-m, --mode> I<molname | datafields | both>
+
+Specify how to modify SDFile(s): I<molname> - change molname line by another datafield or value;
+I<datafields> - modify data field labels and values by replacing one label by another, combining
+multiple data field labels and values, adding specific set of data field labels and values to all compounds, or
+inserting an URL for compound retrieval to each record; I<both> - change molname line and datafields
+simultaneously. Possible values: I<molname | datafields | both>. Default: I<molname>
+
+=item B<--molnamemode> I<datafield | labelprefix>
+
+Specify how to change molname line for B<-m --mode> option values of I<molname | both>: use
+a datafield label value or assign a sequential ID prefixed with I<labelprefix>. Possible values:
+I<datafield | labelprefix>. Default: I<labelprefix>.
+
+=item B<--molname> I<datafieldname or prefixstring>
+
+Molname generation method. For I<datafield> value of B<--molnamemode> option, it corresponds
+to datafield label name whose value is used for molname; otherwise, it's a prefix string used for
+generating compound IDs like labelprefixstring<Number>. Default value, I<Cmpd>, generates
+compound IDs like Cmpd<Number> for molname.
+
+=item B<--molnamereplace> I<always | empty>
+
+Specify when to replace molname line for B<-m --mode> option values of I<molname | both>:
+always replace the molname line using B<--molname> option or only when it's empty. Possible
+values: I<always | empty>. Default: I<empty>.
+
+=item B<-o, --overwrite>
+
+Overwrite existing files.
+
+=item B<-r, --root> I<rootname>
+
+New SD file name is generated using the root: <Root>.<Ext>. Default new file
+name: <InitialSDFileName>ModifiedDataFields.<Ext>. This option is ignored for multiple
+input files.
+
+=item B<-w, --workingdir> I<dirname>
+
+Location of working directory. Default: current directory.
+
+=back
+
+=head1 EXAMPLES
+
+To replace empty molname lines by Cmpd<CmpdNumber> and generate a new SD file
+NewSample1.sdf, type:
+
+    % ModifySDFilesDataFields.pl -o -r NewSample1 Sample1.sdf
+
+To replace all molname lines by Mol_ID data field and generate a new SD file
+NewSample1.sdf, type:
+
+    % ModifySDFilesDataFields.pl --molnamemode datafield
+    --molnamereplace always --molname Mol_ID -r NewSample1 -o Sample1.sdf
+
+To replace all molname lines by Mol_ID data field, map Name and CompoundName to
+a new datafield Synonym, and generate a new SD file NewSample1.sdf, type:
+
+    % ModifySDFilesDataFields.pl --molnamemode datafield
+      --molnamereplace always --molname Mol_ID --mode both
+      --datafieldsmap "Synonym,Name,CompoundName" -r
+      NewSample1 -o Sample1.sdf
+
+To replace all molname lines by Mol_ID data field, map Name and CompoundName to
+a new datafield Synonym, add common fields ReleaseDate and Source, and
+generate a new SD file NewSample1.sdf without keeping any old SD data fields, type:
+
+    % ModifySDFilesDataFields.pl --molnamemode datafield
+      --molnamereplace always --molname Mol_ID --mode both
+      --datafieldsmap "Synonym,Name,CompoundName"
+      --datafieldscommon "ReleaseDate,yyyy-mm-dd,Source,
+      www.mayachemtools.org" --keepolddatafields none -r
+      NewSample1 -o Sample1.sdf
+
+B<Preparing SD files for PubChem deposition:>
+
+Consider a SD file with these fields: Mol_ID, Name, Synonyms and Systematic_Name.
+And Mol_ID data field uniquely identifies your compound.
+
+To prepare a new SD file CmpdDataForPubChem.sdf containing only required
+PUBCHEM_EXT_DATASOURCE_REGID field, type:
+
+    % ModifySDFilesDataFields.pl --m datafields
+      --datafieldsmap
+      "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID"
+      -r CmpdDataForPubChem -o Sample1.sdf
+
+To prepare a new SD file CmpdDataForPubChem.sdf containing only required
+PUBCHEM_EXT_DATASOURCE_REGID field and replace molname line with Mol_ID, type:
+
+    % ModifySDFilesDataFields.pl --molnamemode datafield
+      --molnamereplace always --molname Mol_ID --mode both
+      --datafieldsmap
+       "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID"
+      -r CmpdDataForPubChem -o Sample1.sdf
+
+In addition to required PubChem data field, you can also add optional PubChem data
+fields.
+
+To map your Name, Synonyms and Systematic_Name data fields to optional
+PUBCHEM_SUBSTANCE_SYNONYM data field along with required ID field, type:
+
+    % ModifySDFilesDataFields.pl --molnamemode datafield
+      --molnamereplace always --molname Mol_ID --mode both
+      --datafieldsmap
+      "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID;
+      PUBCHEM_SUBSTANCE_SYNONYM,Name,CompoundName"
+      -r CmpdDataForPubChem -o Sample1.sdf
+
+To add your <domain.org> as PUBCHEM_EXT_SUBSTANCE_URL and link substance
+retrieval to your CGI script <http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID>
+via PUBCHEM_EXT_DATASOURCE_REGID field along with optional and required
+data fields, type:
+
+    % ModifySDFilesDataFields.pl --molnamemode datafield
+      --molnamereplace always --molname Mol_ID --mode both
+      --datafieldsmap
+      "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID;
+      PUBCHEM_SUBSTANCE_SYNONYM,Name,CompoundName"
+      --datafieldscommon
+      "PUBCHEM_EXT_SUBSTANCE_URL,domain.org"
+      --datafieldURL "PUBCHEM_EXT_DATASOURCE_URL,
+      http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID"
+      -r CmpdDataForPubChem -o Sample1.sdf
+
+And to add a publication date and request a release date using
+PUBCHEM_PUBLICATION_DATE and PUBCHEM_DEPOSITOR_RECORD_DATE data fields
+along with all the data fields in earlier examples, type:
+
+    % ModifySDFilesDataFields.pl --molnamemode datafield
+      --molnamereplace always --molname Mol_ID --mode both
+      --datafieldsmap
+      "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID;
+      PUBCHEM_SUBSTANCE_SYNONYM,Name,CompoundName"
+      --datafieldURL "PUBCHEM_EXT_DATASOURCE_URL,
+      http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID"
+      --datafieldscommon
+      "PUBCHEM_EXT_SUBSTANCE_URL,domain.org,
+      PUBCHEM_PUBLICATION_DATE,YYYY-MM-DD,
+      PUBCHEM_DEPOSITOR_RECORD_DATE,YYYY-MM-DD"
+      -r CmpdDataForPubChem -o Sample1.sdf
+
+=head1 AUTHOR
+
+Manish Sud <msud@san.rr.com>
+
+=head1 SEE ALSO
+
+InfoSDFiles.pl, JoinSDFiles.pl, MergeTextFilesWithSD.pl, SplitSDFiles.pl, SDFilesToHTML.pl
+
+=head1 COPYRIGHT
+
+Copyright (C) 2015 Manish Sud. All rights reserved.
+
+This file is part of MayaChemTools.
+
+MayaChemTools is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+=cut