comparison bin/ModifySDFilesDataFields.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: ModifySDFilesDataFields.pl,v $
4 # $Date: 2015/02/28 20:46:20 $
5 # $Revision: 1.27 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability or fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use SDFileUtil;
37 use TextUtil;
38
# File-scope state shared with the subroutines below: $ScriptName and %Options
# are read by SetupScriptUsage; %Options is also read by ProcessOptions and
# RetrieveSDFilesInfo.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct method call instead of the ambiguous indirect object form "new Benchmark".
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Input file list; read by RetrieveSDFilesInfo and ModifySDFile.
my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files...
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for my $FileIndex (0 .. $#SDFilesList) {
  # Files flagged as not okay by RetrieveSDFilesInfo are skipped.
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    ModifySDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
83
84 ###############################################################################
85
# Modify SD file data fields....
#
# Rewrite one input SD file into its output file, one compound record at a
# time.
#
# Parameter:
#   $Index - position of the input file in @SDFilesList; the output file name
#            is read from $SDFilesInfo{OutFile} at the same position.
#
# For every compound record the molname line (first line of the record) is
# optionally replaced. When data fields are being modified, the text before
# the first data field header is written first, followed by the mapped fields,
# the common fields, the optional URL field and finally the old data fields
# that are to be kept.
#
# Dies when either file can't be opened or the output file can't be closed.
sub ModifySDFile {
  my($Index) = @_;
  my($SDFile, $NewSDFile, $SDFileFH, $NewSDFileFH);

  $SDFile = $SDFilesList[$Index];
  $NewSDFile = $SDFilesInfo{OutFile}[$Index];

  print "Generating new SD file $NewSDFile...\n";
  # Three-argument open keeps characters in the file names from being
  # interpreted as an open mode; lexical handles are closed even on early exit.
  open $NewSDFileFH, ">", $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
  open $SDFileFH, "<", $SDFile or die "Error: Can't open $SDFile: $! \n";

  my($CmpdCount, $CmpdString, @CmpdLines, %DataFieldAndValues);
  $CmpdCount = 0;

  # NOTE(review): assumes ReadCmpdString only reads from the handle reference
  # it is given, as it previously did with \*SDFILE - confirm in SDFileUtil.
  COMPOUND: while ($CmpdString = ReadCmpdString($SDFileFH)) {
    $CmpdCount++;
    @CmpdLines = split "\n", $CmpdString;
    if ($OptionsInfo{UseDataFieldForMolName} || $OptionsInfo{ModifyDataFields}) {
      %DataFieldAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    }

    if ($OptionsInfo{ModifyMolName}) {
      if ($OptionsInfo{AlwaysReplaceMolName} || !IsNotEmpty($CmpdLines[0])) {
        my($MolName, $MolNameDataField);

        $MolNameDataField = $OptionsInfo{MolNameDataField};
        if ($OptionsInfo{UseDataFieldForMolName} && exists($DataFieldAndValues{$MolNameDataField})) {
          $MolName = $DataFieldAndValues{$MolNameDataField};
          # Molname line is truncated to 80 characters.
          if (length($MolName) > 80) {
            $MolName = substr($MolName, 0, 80);
          }
        }
        else {
          # Fall back to a sequential ID such as Cmpd<Number>.
          $MolName = "$OptionsInfo{MolNamePrefix}${CmpdCount}";
        }
        $CmpdLines[0] = $MolName;
        $CmpdString = join "\n", @CmpdLines;
      }
    }

    if (!$OptionsInfo{ModifyDataFields}) {
      # Just write the data and get the next compound...
      print {$NewSDFileFH} "$CmpdString\n";
      next COMPOUND;
    }

    # Write out the structure data now and handle the old data fields later...
    my($CmpdData) = split /\n>/, $CmpdString;
    print {$NewSDFileFH} "$CmpdData\n";

    # Modify specified data fields...
    for my $NewSDField (sort keys %{$OptionsInfo{SpecifiedNewToOldSDFieldMap}}) {
      my(@FieldValues, $FieldValues);

      # Collect the non-empty old values and join them afterwards. Testing the
      # accumulated string for truth, as was done before, dropped the newline
      # separator whenever the text collected so far was "0".
      @FieldValues = ();
      for my $OldSDField (@{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}}) {
        if (exists($DataFieldAndValues{$OldSDField}) && length($DataFieldAndValues{$OldSDField})) {
          push @FieldValues, $DataFieldAndValues{$OldSDField};
        }
      }
      $FieldValues = join "\n", @FieldValues;
      print {$NewSDFileFH} "> <$NewSDField>\n$FieldValues\n\n";
    }

    # Add specified common fields...
    for my $CommonSDField (sort keys %{$OptionsInfo{SpecifiedCommonFieldMap}}) {
      my($Value) = $OptionsInfo{SpecifiedCommonFieldMap}{$CommonSDField};
      print {$NewSDFileFH} "> <$CommonSDField>\n$Value\n\n";
    }

    # Add URL data field; its value stays empty when the compound ID field is
    # missing or empty for this record...
    if ($OptionsInfo{CreateDataFieldURL}) {
      my($Value, $URLCmpdIdFieldName);

      $Value = "";
      $URLCmpdIdFieldName = $OptionsInfo{URLCmpdIdFieldName};
      if (exists($DataFieldAndValues{$URLCmpdIdFieldName}) && length($DataFieldAndValues{$URLCmpdIdFieldName})) {
        $Value = $DataFieldAndValues{$URLCmpdIdFieldName};
        $Value = "$OptionsInfo{URLCGIScriptName}?$OptionsInfo{URLParamName}=${Value}";
      }
      print {$NewSDFileFH} "> <$OptionsInfo{URLDataFieldLabel}>\n$Value\n\n";
    }

    # Handle old data fields and write 'em in the same order as they appear in the input
    # files...
    if ($OptionsInfo{KeepAllOldDataFields} || $OptionsInfo{KeepUnMappedOldDataFields}) {
      my(@DataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);

      LABEL: for my $Label (@DataFieldLabels) {
        # In unmapped only mode, skip old fields already mapped to a new field.
        if (!$OptionsInfo{KeepAllOldDataFields} && exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$Label})) {
          next LABEL;
        }
        my($Value) = $DataFieldAndValues{$Label};
        print {$NewSDFileFH} "> <$Label>\n$Value\n\n";
      }
    }

    print {$NewSDFileFH} "\$\$\$\$\n";
  }

  # Buffered write errors only surface at close; don't ignore them.
  close $NewSDFileFH or die "Error: Couldn't close $NewSDFile: $! \n";
  close $SDFileFH;
}
178
# Process option values...
#
# Translate the raw command line values held in %Options into the settings
# stored in %OptionsInfo, which ModifySDFile reads while writing output.
#
# Takes no parameters and returns nothing useful; results are left in
# %OptionsInfo. Dies with an error message when the data field map, common
# data fields or URL specification is malformed, or when the data fields map
# file can't be opened.
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  # Default mode is molname; both and datafields switch data field handling on.
  $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 0;
  if ($Options{mode} =~ /^both$/i) {
    $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 1;
  }
  elsif ($Options{mode} =~ /^datafields$/i) {
    $OptionsInfo{ModifyMolName} = 0; $OptionsInfo{ModifyDataFields} = 1;
  }

  $OptionsInfo{KeepOldDataFields} = $Options{keepolddatafields};
  $OptionsInfo{KeepAllOldDataFields} = ($Options{keepolddatafields} =~ /^all$/i) ? 1 : 0;
  $OptionsInfo{KeepUnMappedOldDataFields} = ($Options{keepolddatafields} =~ /^unmappedonly$/i) ? 1 : 0;

  $OptionsInfo{MolNameMode} = $Options{molnamemode};
  $OptionsInfo{UseDataFieldForMolName} = ($Options{molnamemode} =~ /^datafield$/i) ? 1 : 0;

  # --molname is either a data field label or a prefix string, depending on
  # --molnamemode.
  $OptionsInfo{MolName} = $Options{molname};
  $OptionsInfo{MolNameDataField} = ""; $OptionsInfo{MolNamePrefix} = "Cmpd";
  if ($Options{molname}) {
    if ($OptionsInfo{UseDataFieldForMolName}) {
      $OptionsInfo{MolNameDataField} = $Options{molname};
    }
    else {
      $OptionsInfo{MolNamePrefix} = $Options{molname};
    }
  }

  $OptionsInfo{MolNameReplace} = $Options{molnamereplace};
  $OptionsInfo{AlwaysReplaceMolName} = ($Options{molnamereplace} =~ /^always$/i) ? 1 : 0;

  if ($Options{datafieldsmap} && $Options{datafieldsmapfile}) {
    die "Error: Both \"--datafieldsmap\" and \"--datafieldsmapfile\" options specified: only one is allowed at a time\n";
  }

  $OptionsInfo{DataFieldsMap} = $Options{datafieldsmap} ? $Options{datafieldsmap} : '';
  $OptionsInfo{DataFieldsMapFile} = $Options{datafieldsmapfile} ? $Options{datafieldsmapfile} : '';

  my($SpecifiedDataFieldMap);

  %{$OptionsInfo{SpecifiedNewToOldSDFieldMap}} = ();
  %{$OptionsInfo{SpecifiedOldToNewSDFieldMap}} = ();

  $SpecifiedDataFieldMap = "";
  if ($Options{datafieldsmap}) {
    $SpecifiedDataFieldMap = $Options{datafieldsmap};
  }
  elsif ($Options{datafieldsmapfile}) {
    my($Line, $MapFileFH, @MapEntries);

    open $MapFileFH, "<", $Options{datafieldsmapfile} or die "Couldn't open $Options{datafieldsmapfile}: $! \n";
    # Gather the semicolon delimited entries from every line and join them
    # with ";" afterwards. Appending the lines directly to one another, as was
    # done before, fused the last label on a line with the first label on the
    # next line unless each line ended with a semicolon.
    @MapEntries = ();
    while ($Line = GetTextLine($MapFileFH)) {
      push @MapEntries, grep { defined($_) && length($_) } quotewords(";", 0, $Line);
    }
    close $MapFileFH;
    $SpecifiedDataFieldMap = join ";", @MapEntries;
  }

  if ($SpecifiedDataFieldMap) {
    for my $DataFieldMap (split ";", $SpecifiedDataFieldMap) {
      my($NewSDField, @OldSDFields, @DataFieldsSplit);

      # Each entry is: newfieldlabel,oldfieldlabel[,oldfieldlabel...]
      @DataFieldsSplit = split ",", $DataFieldMap;
      # Fewer than two values also covers an empty entry, e.g. from ";;".
      if (@DataFieldsSplit < 2) {
        die "Error: Invalid number of comma delimited values, ", scalar(@DataFieldsSplit), ", specified, @DataFieldsSplit, using \"--datafieldsmap or --datafieldsmapfile\" option: it must contain more than one value.\n";
      }
      for my $DataField (@DataFieldsSplit) {
        if (!(defined($DataField) && length($DataField))) {
          die "Error: One of the comma delimited values, \"", join(",", @DataFieldsSplit), "\", specified using \"--datafieldsmap or --datafieldsmapfile\" option is empty.\n";
        }
      }
      ($NewSDField, @OldSDFields) = @DataFieldsSplit;

      # Make sure a datafield is only specified once...
      if (exists $OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}) {
        die "Error: New data field, $NewSDField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
      }
      @{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}} = @OldSDFields;
      for my $DataField (@OldSDFields) {
        if (exists $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataField} ) {
          die "Error: SD field, $DataField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
        }
        $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataField} = $NewSDField;
      }
    }
  }

  $OptionsInfo{DataFieldsCommon} = $Options{datafieldscommon} ? $Options{datafieldscommon} : '';
  %{$OptionsInfo{SpecifiedCommonFieldMap}} = ();

  if ($Options{datafieldscommon}) {
    my($DataFieldName, $DataFieldValue, $Index, @CommonDataFieldsSplit);

    # Comma delimited label,value pairs...
    @CommonDataFieldsSplit = split ",", $Options{datafieldscommon};
    if (@CommonDataFieldsSplit % 2) {
      die "Error: Invalid number of comma delimited values, ", scalar(@CommonDataFieldsSplit), ", specified \"", join(",", @CommonDataFieldsSplit), "\" using \"--datafieldscommon\" option: it must contain even number of values.\n";
    }
    for ($Index = 0; $Index < @CommonDataFieldsSplit; $Index += 2) {
      $DataFieldName = $CommonDataFieldsSplit[$Index];
      $DataFieldValue = $CommonDataFieldsSplit[$Index + 1];
      if (exists $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName}) {
        die "Error: Common data field, $DataFieldName, specified more than once using \"--datafieldscommon\" option.\n";
      }
      # A common field may not reuse a label from the data fields map.
      if (exists($OptionsInfo{SpecifiedNewToOldSDFieldMap}{$DataFieldName}) || exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataFieldName})) {
        die "Error: Common data field, $DataFieldName, specified using \"--datafieldscommon\" option cannot be specified in \"--datafieldsmap or --datafieldsmapfile\" option.\n";
      }
      $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName} = $DataFieldValue;
    }
  }

  $OptionsInfo{DataFieldURL} = $Options{datafieldurl} ? $Options{datafieldurl} : '';
  $OptionsInfo{CreateDataFieldURL} = (exists($Options{datafieldurl}) && length($Options{datafieldurl}) ) ? 1 : 0;

  $OptionsInfo{URLDataFieldLabel} = ""; $OptionsInfo{URLCGIScriptName} = "";
  $OptionsInfo{URLParamName} = ""; $OptionsInfo{URLCmpdIdFieldName} = "";

  if ($OptionsInfo{CreateDataFieldURL}) {
    my(@DataFieldURLSplit);

    # Expected: URLDataFieldLabel,CGIScriptPath,CGIParamName,CmpdIDFieldLabel
    @DataFieldURLSplit = split ",", $Options{datafieldurl};
    if (@DataFieldURLSplit != 4) {
      die "Error: Invalid number of values, ", scalar(@DataFieldURLSplit), ", specified using \"--datafieldURL\" option: it must contain 4 values.\n";
    }
    for my $Value (@DataFieldURLSplit) {
      if (!IsNotEmpty($Value)) {
        die "Error: One of the values, $Options{datafieldurl}, specified using \"--datafieldURL\" option is empty.\n";
      }
    }
    $OptionsInfo{URLDataFieldLabel} = $DataFieldURLSplit[0];
    $OptionsInfo{URLCGIScriptName} = $DataFieldURLSplit[1];
    $OptionsInfo{URLParamName} = $DataFieldURLSplit[2];
    $OptionsInfo{URLCmpdIdFieldName} = $DataFieldURLSplit[3];
  }

}
326
# Retrieve information about input SD files...
#
# Fills %SDFilesInfo with two arrays indexed like @SDFilesList: FileOkay (1
# when the file is to be processed, 0 otherwise) and OutFile (the output file
# name, or '' for ignored files). A warning is issued for every ignored file.
sub RetrieveSDFilesInfo {
  %SDFilesInfo = (FileOkay => [], OutFile => []);

  FILELIST: for my $FileNum (0 .. $#SDFilesList) {
    my $InputFile = $SDFilesList[$FileNum];

    # Assume the worst until every check has passed.
    $SDFilesInfo{FileOkay}[$FileNum] = 0;
    $SDFilesInfo{OutFile}[$FileNum] = '';

    unless (-e $InputFile) {
      warn "Warning: Ignoring file $InputFile: It doesn't exist\n";
      next FILELIST;
    }
    unless (CheckFileType($InputFile, "sd sdf")) {
      warn "Warning: Ignoring file $InputFile: It's not a SD file\n";
      next FILELIST;
    }

    my (undef, $InputName, $InputExt) = ParseFileName($InputFile);

    # A user supplied root is honored only for a single input file; otherwise
    # the output name is derived from the input file name.
    my $OutputRoot;
    if ($Options{root} && (@SDFilesList == 1)) {
      my (undef, $RootName, $RootExt) = ParseFileName($Options{root});
      $OutputRoot = ($RootName && $RootExt) ? $RootName : $Options{root};
    }
    else {
      $OutputRoot = $InputName . "ModifiedDataFields";
    }

    my $OutputFile = "${OutputRoot}.${InputExt}";
    if (lc($OutputFile) eq lc($InputFile)) {
      warn "Warning: Ignoring file $InputFile:Output file name, $OutputFile, is same as input SD file name, $InputFile\n";
      next FILELIST;
    }
    if (!$Options{overwrite} && -e $OutputFile) {
      warn "Warning: Ignoring file $InputFile: The file $OutputFile already exists\n";
      next FILELIST;
    }

    $SDFilesInfo{FileOkay}[$FileNum] = 1;
    $SDFilesInfo{OutFile}[$FileNum] = $OutputFile;
  }
}
381
# Setup script usage and retrieve command line arguments specified using various options...
#
# Seeds %Options with default values, lets GetOptions overwrite them from the
# command line, changes into the working directory when one is given, and
# dies with an error message on the first invalid option value.
sub SetupScriptUsage {

  # Defaults for options not given on the command line...
  %Options = (
    detail => 1,
    keepolddatafields => "none",
    mode => "molname",
    molnamemode => "labelprefix",
    molnamereplace => "empty",
  );

  # Retrieve all the options...
  my @OptionSpecs = (
    "detail|d=i", "datafieldscommon=s", "datafieldsmap=s", "datafieldsmapfile=s",
    "datafieldurl=s", "help|h", "keepolddatafields|k=s", "mode|m=s", "molname=s",
    "molnamemode=s", "molnamereplace=s", "overwrite|o", "root|r=s", "workingdir|w=s",
  );
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  if ($Options{workingdir}) {
    -d $Options{workingdir}
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Keyword valued options, checked in this order: option key, pattern of
  # allowed values, switch names and allowed values as shown in the message.
  my @KeywordChecks = (
    ["keepolddatafields", qr/^(all|unmappedonly|none)$/i, "-k --keepolddatafields", "all, unmappedonly, or none"],
    ["mode", qr/^(molname|datafields|both)$/i, "-m --mode", "molname, datafields, or both"],
    ["molnamemode", qr/^(datafield|labelprefix)$/i, "--molnamemode", "datafield or labelprefix"],
    ["molnamereplace", qr/^(always|empty)$/i, "--molnamereplace", "always or empty"],
  );
  for my $Check (@KeywordChecks) {
    my($OptionKey, $AllowedPattern, $Switches, $AllowedText) = @{$Check};
    if ($Options{$OptionKey} !~ $AllowedPattern) {
      die "Error: The value specified, $Options{$OptionKey}, for option \"$Switches\" is not valid. Allowed values: $AllowedText\n";
    }
  }

  unless (IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
  }
}
418
419 __END__
420
421 =head1 NAME
422
423 ModifySDFilesDataFields.pl - Modify data fields in SDFile(s)
424
425 =head1 SYNOPSIS
426
427 ModifySDFilesDataFields.pl SDFile(s)...
428
429 ModifySDFilesDataFields.pl [B<-d, --detail> infolevel]
430 [B<--datafieldscommon> newfieldlabel, newfieldvalue, [newfieldlabel, newfieldvalue,...]]
431 [B<--datafieldsmap> newfieldlabel, oldfieldlabel, [oldfieldlabel,...]; [newfieldlabel, oldfieldlabel, [oldfieldlabel,...]]]
432 [B<--datafieldsmapfile> filename] [B<--datafieldURL> URLDataFieldLabel, CGIScriptPath, CGIParamName, CmpdIDFieldLabel]
433 [B<-h, --help>] [B<-k, --keepolddatafields> all | unmappedonly | none] [B<-m, --mode> molname | datafields | both]
434 [B<--molnamemode> datafield | labelprefix] [B<--molname> datafieldname or prefixstring]
435 [B<--molnamereplace> always | empty] [B<-o, --overwrite>] [B<-r, --root> rootname]
436 [B<-w, --workingdir> dirname] SDFile(s)...
437
438 =head1 DESCRIPTION
439
440 Modify molname line and data fields in I<SDFile(s)>. Molname line can be replaced by a
441 data field value or assigned a sequential ID prefixed with a specific string. For data
442 fields and modification of their values, these types of options are supported: replace
443 data field labels by another set of labels; combine values of multiple data fields and
444 assign a new label; add specific set of data field labels and values to all compound
445 records; and others.
446
447 The file names are separated by spaces. The valid file extensions are I<.sdf> and I<.sd>.
448 All other file names are ignored. All the SD files in a current directory can be specified
449 either by I<*.sdf> or the current directory name.
450
451 =head1 OPTIONS
452
453 =over 4
454
455 =item B<-d, --detail> I<infolevel>
456
457 Level of information to print about compound records being ignored. Default: I<1>. Possible
458 values: I<1, 2 or 3>.
459
460 =item B<--datafieldscommon> I<newfieldlabel, newfieldvalue, [newfieldlabel, newfieldvalue,...]>
461
462 Specify data field labels and values for addition to each compound record. It's a comma delimited
463 list of data field label and values pair. Default: I<none>.
464
465 Examples:
466
467 DepositionDate,YYYY-MM-DD
468 Source,www.domainname.org,ReleaseDate,YYYY-MM-DD
469
470 =item B<--datafieldsmap> I<newfieldlabel, oldfieldlabel, [oldfieldlabel,...]; [newfieldlabel, oldfieldlabel, [oldfieldlabel,...]]>
471
472 Specify how various data field labels and values are combined to generate new data field
473 labels and their values. All the comma delimited data fields, within a semicolon delimited set,
474 are mapped to the first new data field label along with the data field values joined via new
475 line character. Default: I<none>.
476
477 Examples:
478
479 Synonym,Name,SystematicName,Synonym;CmpdID,Extreg
480 HBondDonors,SumNHOH
481
482 =item B<--datafieldsmapfile> I<filename>
483
484 Filename containing mapping of data fields. Format of data fields line in this file corresponds
485 to B<--datafieldsmap> option. Example:
486
487 Line 1: Synonym,Name,SystematicName,Synonym;CmpdID,Extreg
488 Line 2: HBondDonors,SumNHOH
489
490
491 =item B<--datafieldURL> I<URLDataFieldLabel, CGIScriptPath, CGIParamName, CmpdIDFieldLabel>
492
493 Specify how to generate a URL for retrieving compound data from a web server and add it
494 to each compound record. I<URLDataFieldLabel> is used as the data field label for URL value
495 which is created by combining I<CGIScriptPath,CGIParamName,CmpdIDFieldLabel> values:
496 CGIScriptPath?CGIParamName=CmpdIDFieldLabelValue. Default: I<none>.
497
498 Example:
499
500 Source,http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID
501
502 =item B<-h, --help>
503
504 Print this help message.
505
506 =item B<-k, --keepolddatafields> I<all | unmappedonly | none>
507
508 Specify how to transfer old data fields from input SDFile(s) to new SDFile(s) during
509 I<datafields | both> value of B<-m, --mode> option: keep all old data fields; write out the ones
510 not mapped to new fields as specified by B<--datafieldsmap> or B<--datafieldsmapfile> options;
511 or ignore all old data field labels. For I<molname> B<-m --mode>, old datafields are always kept.
512 Possible values: I<all | unmappedonly | none>. Default: I<none>.
513
514 =item B<-m, --mode> I<molname | datafields | both>
515
516 Specify how to modify SDFile(s): I<molname> - change molname line by another datafield or value;
517 I<datafields> - modify data field labels and values by replacing one label by another, combining
518 multiple data field labels and values, adding specific set of data field labels and values to all compounds, or
519 inserting an URL for compound retrieval to each record; I<both> - change molname line and datafields
520 simultaneously. Possible values: I<molname | datafields | both>. Default: I<molname>
521
522 =item B<--molnamemode> I<datafield | labelprefix>
523
524 Specify how to change molname line for B<-m --mode> option values of I<molname | both>: use
525 a datafield label value or assign a sequential ID prefixed with I<labelprefix>. Possible values:
526 I<datafield | labelprefix>. Default: I<labelprefix>.
527
528 =item B<--molname> I<datafieldname or prefixstring>
529
530 Molname generation method. For I<datafield> value of B<--molnamemode> option, it corresponds
531 to datafield label name whose value is used for molname; otherwise, it's a prefix string used for
532 generating compound IDs like labelprefixstring<Number>. Default value, I<Cmpd>, generates
533 compound IDs like Cmpd<Number> for molname.
534
535 =item B<--molnamereplace> I<always | empty>
536
537 Specify when to replace molname line for B<-m --mode> option values of I<molname | both>:
538 always replace the molname line using B<--molname> option or only when it's empty. Possible
539 values: I<always | empty>. Default: I<empty>.
540
541 =item B<-o, --overwrite>
542
543 Overwrite existing files.
544
545 =item B<-r, --root> I<rootname>
546
547 New SD file name is generated using the root: <Root>.<Ext>. Default new file
548 name: <InitialSDFileName>ModifiedDataFields.<Ext>. This option is ignored for multiple
549 input files.
550
551 =item B<-w, --workingdir> I<dirname>
552
553 Location of working directory. Default: current directory.
554
555 =back
556
557 =head1 EXAMPLES
558
559 To replace empty molname lines by Cmpd<CmpdNumber> and generate a new SD file
560 NewSample1.sdf, type:
561
562 % ModifySDFilesDataFields.pl -o -r NewSample1 Sample1.sdf
563
564 To replace all molname lines by Mol_ID data field and generate a new SD file
565 NewSample1.sdf, type:
566
567 % ModifySDFilesDataFields.pl --molnamemode datafield
568 --molnamereplace always --molname Mol_ID -r NewSample1 -o Sample1.sdf
569
570 To replace all molname lines by Mol_ID data field, map Name and CompoundName to
571 a new datafield Synonym, and generate a new SD file NewSample1.sdf, type:
572
573 % ModifySDFilesDataFields.pl --molnamemode datafield
574 --molnamereplace always --molname Mol_ID --mode both
575 --datafieldsmap "Synonym,Name,CompoundName" -r
576 NewSample1 -o Sample1.sdf
577
578 To replace all molname lines by Mol_ID data field, map Name and CompoundName to
579 a new datafield Synonym, add common fields ReleaseDate and Source, and
580 generate a new SD file NewSample1.sdf without keeping any old SD data fields, type:
581
582 % ModifySDFilesDataFields.pl --molnamemode datafield
583 --molnamereplace always --molname Mol_ID --mode both
584 --datafieldsmap "Synonym,Name,CompoundName"
585 --datafieldscommon "ReleaseDate,yyyy-mm-dd,Source,
586 www.mayachemtools.org" --keepolddatafields none -r
587 NewSample1 -o Sample1.sdf
588
589 B<Preparing SD files for PubChem deposition:>
590
591 Consider a SD file with these fields: Mol_ID, Name, Synonyms and Systematic_Name.
592 And Mol_ID data field uniquely identifies your compound.
593
594 To prepare a new SD file CmpdDataForPubChem.sdf containing only required
595 PUBCHEM_EXT_DATASOURCE_REGID field, type:
596
597 % ModifySDFilesDataFields.pl --m datafields
598 --datafieldsmap
599 "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID"
600 -r CmpdDataForPubChem -o Sample1.sdf
601
602 To prepare a new SD file CmpdDataForPubChem.sdf containing only required
603 PUBCHEM_EXT_DATASOURCE_REGID field and replace molname line with Mol_ID, type:
604
605 % ModifySDFilesDataFields.pl --molnamemode datafield
606 --molnamereplace always --molname Mol_ID --mode both
607 --datafieldsmap
608 "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID"
609 -r CmpdDataForPubChem -o Sample1.sdf
610
611 In addition to required PubChem data field, you can also add optional PubChem data
612 fields.
613
614 To map your Name, Synonyms and Systematic_Name data fields to optional
615 PUBCHEM_SUBSTANCE_SYNONYM data field along with required ID field, type:
616
617 % ModifySDFilesDataFields.pl --molnamemode datafield
618 --molnamereplace always --molname Mol_ID --mode both
619 --datafieldsmap
620 "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID;
621 PUBCHEM_SUBSTANCE_SYNONYM,Name,CompoundName"
622 -r CmpdDataForPubChem -o Sample1.sdf
623
624 To add your <domain.org> as PUBCHEM_EXT_SUBSTANCE_URL and link substance
625 retrieval to your CGI script <http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID>
626 via PUBCHEM_EXT_DATASOURCE_REGID field along with optional and required
627 data fields, type:
628
629 % ModifySDFilesDataFields.pl --molnamemode datafield
630 --molnamereplace always --molname Mol_ID --mode both
631 --datafieldsmap
632 "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID;
633 PUBCHEM_SUBSTANCE_SYNONYM,Name,CompoundName"
634 --datafieldscommon
635 "PUBCHEM_EXT_SUBSTANCE_URL,domain.org"
636 --datafieldURL "PUBCHEM_EXT_DATASOURCE_URL,
637 http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID"
638 -r CmpdDataForPubChem -o Sample1.sdf
639
640 And to add a publication date and request a release date using
641 PUBCHEM_PUBLICATION_DATE and PUBCHEM_DEPOSITOR_RECORD_DATE data fields
642 along with all the required and optional data fields in earlier examples, type:
644
645 % ModifySDFilesDataFields.pl --molnamemode datafield
646 --molnamereplace always --molname Mol_ID --mode both
647 --datafieldsmap
648 "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID;
649 PUBCHEM_SUBSTANCE_SYNONYM,Name,CompoundName"
650 --datafieldURL "PUBCHEM_EXT_DATASOURCE_URL,
651 http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID"
652 --datafieldscommon
653 "PUBCHEM_EXT_SUBSTANCE_URL,domain.org,
654 PUBCHEM_PUBLICATION_DATE,YYYY-MM-DD,
655 PUBCHEM_DEPOSITOR_RECORD_DATE,YYYY-MM-DD"
656 -r CmpdDataForPubChem -o Sample1.sdf
657
658 =head1 AUTHOR
659
660 Manish Sud <msud@san.rr.com>
661
662 =head1 SEE ALSO
663
664 InfoSDFiles.pl, JoinSDFiles.pl, MergeTextFilesWithSD.pl, SplitSDFiles.pl, SDFilesToHTML.pl
665
666 =head1 COPYRIGHT
667
668 Copyright (C) 2015 Manish Sud. All rights reserved.
669
670 This file is part of MayaChemTools.
671
672 MayaChemTools is free software; you can redistribute it and/or modify it under
673 the terms of the GNU Lesser General Public License as published by the Free
674 Software Foundation; either version 3 of the License, or (at your option)
675 any later version.
676
677 =cut