0
|
1 #!/usr/bin/perl -w
|
|
2 #
|
|
3 # $RCSfile: ModifySDFilesDataFields.pl,v $
|
|
4 # $Date: 2015/02/28 20:46:20 $
|
|
5 # $Revision: 1.27 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability of fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use FindBin; use lib "$FindBin::Bin/../lib";
|
|
31 use Getopt::Long;
|
|
32 use File::Basename;
|
|
33 use Text::ParseWords;
|
|
34 use Benchmark;
|
|
35 use FileUtil;
|
|
36 use SDFileUtil;
|
|
37 use TextUtil;
|
|
38
|
|
# File-scoped state. %Options is filled by SetupScriptUsage(); the timing
# variables are used only by this top-level driver.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct method call instead of the indirect object form "new Benchmark",
# which is ambiguous to the parser and discouraged by perlobj.
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# File-scoped list of input files; read by RetrieveSDFilesInfo() and
# ModifySDFile().
my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
# File-scoped; populated by ProcessOptions() and read by ModifySDFile().
my(%OptionsInfo);
ProcessOptions();

print "Checking input SD file(s)...\n";
# File-scoped; populated by RetrieveSDFilesInfo() with per-file FileOkay and
# OutFile entries indexed like @SDFilesList.
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files...
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for my $FileIndex (0 .. $#SDFilesList) {
  # Files that failed the checks in RetrieveSDFilesInfo() are skipped.
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    ModifySDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
|
|
83
|
|
84 ###############################################################################
|
|
85
|
|
# Modify SD file data fields....
#
# Reads the input SD file at position $Index of @SDFilesList one compound
# record at a time and writes the modified records to the output file name
# stored in $SDFilesInfo{OutFile}[$Index].
#
# Depending on %OptionsInfo, each record may get:
#   . a new molname line (first line of the record), taken from a data field
#     value truncated to 80 characters or built as <MolNamePrefix><CmpdCount>;
#   . new data fields built from mapped old fields, common fields and an
#     optional URL field, followed by the old fields that are to be kept.
#
# Parameters:
#   $Index - index into @SDFilesList and $SDFilesInfo{OutFile}.
#
# Dies when either file cannot be opened or the output file cannot be closed.
#
sub ModifySDFile {
  my($Index) = @_;
  my($SDFile, $NewSDFile, $SDFileFH, $NewSDFileFH);

  $SDFile = $SDFilesList[$Index];
  $NewSDFile = $SDFilesInfo{OutFile}[$Index];

  print "Generating new SD file $NewSDFile...\n";
  # Three-argument open with lexical handles: the file name can no longer be
  # interpreted as an open mode or a pipe.
  open($NewSDFileFH, '>', $NewSDFile) or die "Error: Couldn't open $NewSDFile: $! \n";
  open($SDFileFH, '<', $SDFile) or die "Error: Can't open $SDFile: $! \n";

  my($CmpdCount, $CmpdString, $CmpdData, $MolName, $OldSDField, $NewSDField, $CommonSDField, $Label, $Value, $FieldValues, $MolNameDataField, $URLCmpdIdFieldName, @CmpdLines, %DataFieldAndValues, @DataFieldLabels);
  $CmpdCount = 0;

  COMPOUND: while ($CmpdString = ReadCmpdString($SDFileFH)) {
    $CmpdCount++;
    @CmpdLines = split "\n", $CmpdString;
    if ($OptionsInfo{UseDataFieldForMolName} || $OptionsInfo{ModifyDataFields}) {
      %DataFieldAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    }
    if ($OptionsInfo{ModifyMolName}) {
      if ($OptionsInfo{AlwaysReplaceMolName} || !IsNotEmpty($CmpdLines[0])) {
        $MolNameDataField = $OptionsInfo{MolNameDataField};
        if ($OptionsInfo{UseDataFieldForMolName} && exists($DataFieldAndValues{$MolNameDataField})) {
          $MolName = $DataFieldAndValues{$MolNameDataField};
          # Keep the molname line to at most 80 characters.
          if (length($MolName) > 80) {
            $MolName = substr($MolName, 0, 80);
          }
        }
        else {
          # No usable data field value: fall back to a sequential ID.
          $MolName = "$OptionsInfo{MolNamePrefix}${CmpdCount}";
        }
        $CmpdLines[0] = $MolName;
        $CmpdString = join "\n", @CmpdLines;
      }
    }
    if (!$OptionsInfo{ModifyDataFields}) {
      # Just write the data and get the next compound...
      print {$NewSDFileFH} "$CmpdString\n";
      next COMPOUND;
    }
    # Write out the structure data now and handle the old data fields later...
    # Everything before the first line starting with ">" is structure data.
    ($CmpdData) = split /\n>/, $CmpdString;
    print {$NewSDFileFH} "$CmpdData\n";

    # Modify specified data fields...
    for $NewSDField (sort keys %{$OptionsInfo{SpecifiedNewToOldSDFieldMap}}) {
      $FieldValues = "";
      for $OldSDField (@{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}}) {
        if (exists($DataFieldAndValues{$OldSDField}) && length($DataFieldAndValues{$OldSDField})) {
          $Value = $DataFieldAndValues{$OldSDField};
          # Test the length rather than the truth of the accumulated value, so
          # that a first value of "0" is still separated from the next value by
          # a new line.
          $FieldValues .= length($FieldValues) ? "\n$Value" : $Value;
        }
      }
      print {$NewSDFileFH} "> <$NewSDField>\n$FieldValues\n\n";
    }
    # Add specified common fields...
    for $CommonSDField (sort keys %{$OptionsInfo{SpecifiedCommonFieldMap}}) {
      $Value = $OptionsInfo{SpecifiedCommonFieldMap}{$CommonSDField};
      print {$NewSDFileFH} "> <$CommonSDField>\n$Value\n\n";
    }
    if ($OptionsInfo{CreateDataFieldURL}) {
      # The URL field is always written; its value stays empty when the
      # compound ID data field is missing or empty.
      $Value = "";
      $URLCmpdIdFieldName = $OptionsInfo{URLCmpdIdFieldName};
      if (exists($DataFieldAndValues{$URLCmpdIdFieldName}) && length($DataFieldAndValues{$URLCmpdIdFieldName})) {
        $Value = $DataFieldAndValues{$URLCmpdIdFieldName};
        $Value = "$OptionsInfo{URLCGIScriptName}?$OptionsInfo{URLParamName}=${Value}";
      }
      print {$NewSDFileFH} "> <$OptionsInfo{URLDataFieldLabel}>\n$Value\n\n";
    }

    # Handle old data fields and write 'em in the same order as they appear in the input
    # files...
    if ($OptionsInfo{KeepAllOldDataFields} || $OptionsInfo{KeepUnMappedOldDataFields}) {
      my($KeepLabel);
      @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
      LABEL: for $Label (@DataFieldLabels) {
        # In "unmapped only" mode, drop old fields that were mapped to a new field.
        $KeepLabel = $OptionsInfo{KeepAllOldDataFields} ? 1 : ( exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$Label}) ? 0 : 1 );
        if (!$KeepLabel) {
          next LABEL;
        }
        $Value = $DataFieldAndValues{$Label};
        print {$NewSDFileFH} "> <$Label>\n$Value\n\n";
      }
    }

    print {$NewSDFileFH} "\$\$\$\$\n";
  }
  # Buffered write errors surface at close time, so check the output handle.
  close($NewSDFileFH) or die "Error: Couldn't close $NewSDFile: $! \n";
  close $SDFileFH;
}
|
|
178
|
|
# Process option values...
#
# Translates the raw command line values in %Options into the file-scoped
# %OptionsInfo hash used during file processing:
#   . mode flags (ModifyMolName, ModifyDataFields);
#   . old data field handling flags (KeepAllOldDataFields, KeepUnMappedOldDataFields);
#   . molname settings (UseDataFieldForMolName, MolNameDataField, MolNamePrefix,
#     AlwaysReplaceMolName);
#   . data field maps (SpecifiedNewToOldSDFieldMap, SpecifiedOldToNewSDFieldMap)
#     built from "--datafieldsmap" or "--datafieldsmapfile";
#   . common data fields (SpecifiedCommonFieldMap) from "--datafieldscommon";
#   . URL settings (URLDataFieldLabel, URLCGIScriptName, URLParamName,
#     URLCmpdIdFieldName) from "--datafieldurl".
#
# Dies on conflicting, malformed or duplicate specifications and when the map
# file cannot be opened.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  # Default corresponds to "molname" mode.
  $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 0;
  if ($Options{mode} =~ /^both$/i) {
    $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 1;
  }
  elsif ($Options{mode} =~ /^datafields$/i) {
    $OptionsInfo{ModifyMolName} = 0; $OptionsInfo{ModifyDataFields} = 1;
  }

  $OptionsInfo{KeepOldDataFields} = $Options{keepolddatafields};
  $OptionsInfo{KeepAllOldDataFields} = ($Options{keepolddatafields} =~ /^all$/i) ? 1 : 0;
  $OptionsInfo{KeepUnMappedOldDataFields} = ($Options{keepolddatafields} =~ /^unmappedonly$/i) ? 1 : 0;

  $OptionsInfo{MolNameMode} = $Options{molnamemode};
  $OptionsInfo{UseDataFieldForMolName} = ($Options{molnamemode} =~ /^datafield$/i) ? 1 : 0;

  # "--molname" is either a data field label or a prefix string depending on
  # "--molnamemode".
  $OptionsInfo{MolName} = $Options{molname};
  $OptionsInfo{MolNameDataField} = ""; $OptionsInfo{MolNamePrefix} = "Cmpd";
  if ($Options{molname}) {
    if ($OptionsInfo{UseDataFieldForMolName}) {
      $OptionsInfo{MolNameDataField} = $Options{molname};
    }
    else {
      $OptionsInfo{MolNamePrefix} = $Options{molname};
    }
  }

  $OptionsInfo{MolNameReplace} = $Options{molnamereplace};
  $OptionsInfo{AlwaysReplaceMolName} = ($Options{molnamereplace} =~ /^always$/i) ? 1 : 0;

  if ($Options{datafieldsmap} && $Options{datafieldsmapfile}) {
    die "Error: Both \"--datafieldsmap\" and \"--datafieldsmapfile\" options specified: only one is allowed at a time\n";
  }

  $OptionsInfo{DataFieldsMap} = $Options{datafieldsmap} ? $Options{datafieldsmap} : '';
  $OptionsInfo{DataFieldsMapFile} = $Options{datafieldsmapfile} ? $Options{datafieldsmapfile} : '';

  my($SpecifiedDataFieldMap);

  %{$OptionsInfo{SpecifiedNewToOldSDFieldMap}} = ();
  %{$OptionsInfo{SpecifiedOldToNewSDFieldMap}} = ();

  $SpecifiedDataFieldMap = "";
  if ($Options{datafieldsmap}) {
    $SpecifiedDataFieldMap = $Options{datafieldsmap};
  }
  elsif ($Options{datafieldsmapfile}) {
    my($Line, $DataFieldsFileFH, @LineWords, @MapFileEntries);
    # Three-argument open with a lexical handle: the file name can no longer
    # be interpreted as an open mode or a pipe.
    open($DataFieldsFileFH, '<', $Options{datafieldsmapfile}) or die "Couldn't open $Options{datafieldsmapfile}: $! \n";
    # NOTE(review): GetTextLine is defined outside this file; presumably it
    # returns one line without its line terminator - confirm.
    while ($Line = GetTextLine($DataFieldsFileFH)) {
      @LineWords = quotewords(";", 0, $Line);
      push @MapFileEntries, JoinWords(\@LineWords, ";", 0);
    }
    close $DataFieldsFileFH;
    # Join the lines with the map delimiter. Plain concatenation would fuse
    # the last field of one line with the first field of the next line.
    $SpecifiedDataFieldMap = join ";", @MapFileEntries;
  }

  if ($SpecifiedDataFieldMap) {
    my($DataFieldMap, $DataField, $NewSDField, @OldSDFields, @DataFieldMapSplit, @DataFieldsSplit, $FirstField);
    @DataFieldMapSplit = split ";", $SpecifiedDataFieldMap;
    DATAFIELDMAP: for $DataFieldMap (@DataFieldMapSplit) {
      # Ignore empty entries caused by consecutive or trailing semicolons.
      if (!length($DataFieldMap)) {
        next DATAFIELDMAP;
      }
      @DataFieldsSplit = split ",", $DataFieldMap;
      # Each entry needs a new label followed by at least one old label. An
      # entry yielding no values at all (e.g. ",") is rejected here as well
      # instead of silently reusing the label of the previous entry.
      if (@DataFieldsSplit < 2) {
        die "Error: Invalid number of comma delimited values, ", scalar(@DataFieldsSplit), ", specified, @DataFieldsSplit, using \"--datafieldsmap or --datafieldsmapfile\" option: it must contain more than one value.\n";
      }
      $FirstField = 1;
      @OldSDFields = ();
      for $DataField (@DataFieldsSplit) {
        if (!(defined($DataField) && length($DataField))) {
          die "Error: One of the comma delimited values, \"", join(",", @DataFieldsSplit), "\", specified using \"--datafieldsmap or --datafieldsmapfile\" option is empty.\n";
        }
        if ($FirstField) {
          # First value is the new label; the rest are old labels mapped to it.
          $FirstField = 0;
          $NewSDField = $DataField;
        }
        else {
          push @OldSDFields, $DataField;
        }
      }
      # Make sure a datafield is only specified once...
      if (exists $OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}) {
        die "Error: New data field, $NewSDField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
      }
      @{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}} = ();
      push @{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}}, @OldSDFields;
      for $DataField (@OldSDFields) {
        if (exists $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataField} ) {
          die "Error: SD field, $DataField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
        }
        else {
          $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataField} = $NewSDField;
        }
      }

    }
  }

  $OptionsInfo{DataFieldsCommon} = $Options{datafieldscommon} ? $Options{datafieldscommon} : '';
  %{$OptionsInfo{SpecifiedCommonFieldMap}} = ();

  if ($Options{datafieldscommon}) {
    my($DataFieldName, $DataFieldValue, $Index, @CommonDataFieldsSplit);
    @CommonDataFieldsSplit = split ",", $Options{datafieldscommon};
    # Values come in label,value pairs.
    if (@CommonDataFieldsSplit % 2) {
      die "Error: Invalid number of comma delimited values, ", scalar(@CommonDataFieldsSplit), ", specified \"", join(",", @CommonDataFieldsSplit), "\" using \"--datafieldscommon\" option: it must contain even number of values.\n";
    }
    for ($Index = 0; $Index < @CommonDataFieldsSplit; $Index += 2) {
      $DataFieldName = $CommonDataFieldsSplit[$Index];
      $DataFieldValue = $CommonDataFieldsSplit[$Index + 1];
      if (exists $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName}) {
        die "Error: Common data field, $DataFieldName, specified more than once using \"--datafieldscommon\" option.\n";
      }
      # A common field may not clash with any label used in the data field map.
      if (exists($OptionsInfo{SpecifiedNewToOldSDFieldMap}{$DataFieldName}) || exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataFieldName})) {
        die "Error: Common data field, $DataFieldName, specified using \"--datafieldscommon\" option cannot be specified in \"--datafieldsmap or --datafieldsmapfile\" option.\n";
      }
      $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName} = $DataFieldValue;
    }
  }

  $OptionsInfo{DataFieldURL} = $Options{datafieldurl} ? $Options{datafieldurl} : '';
  $OptionsInfo{CreateDataFieldURL} = (exists($Options{datafieldurl}) && length($Options{datafieldurl}) ) ? 1 : 0;

  $OptionsInfo{URLDataFieldLabel} = ""; $OptionsInfo{URLCGIScriptName} = "";
  $OptionsInfo{URLParamName} = ""; $OptionsInfo{URLCmpdIdFieldName} = "";

  if ($OptionsInfo{CreateDataFieldURL}) {
    my(@DataFieldURLSplit, $Value);
    # Expected order: URL data field label, CGI script, CGI parameter name,
    # compound ID data field label.
    @DataFieldURLSplit = split ",", $Options{datafieldurl};
    if (@DataFieldURLSplit != 4) {
      die "Error: Invalid number of values, ", scalar(@DataFieldURLSplit), ", specified using \"--datafieldURL\" option: it must contain 4 values.\n";
    }
    for $Value (@DataFieldURLSplit) {
      if (!IsNotEmpty($Value)) {
        die "Error: One of the values, $Options{datafieldurl}, specified using \"--datafieldURL\" option is empty.\n";
      }
    }
    $OptionsInfo{URLDataFieldLabel} = $DataFieldURLSplit[0];
    $OptionsInfo{URLCGIScriptName} = $DataFieldURLSplit[1];
    $OptionsInfo{URLParamName} = $DataFieldURLSplit[2];
    $OptionsInfo{URLCmpdIdFieldName} = $DataFieldURLSplit[3];
  }

}
|
|
326
|
|
# Retrieve information about input SD files...
#
# Rebuilds the file-scoped %SDFilesInfo hash. For every entry of @SDFilesList
# it records, at the same index:
#   FileOkay - 1 when the file passed all the checks below, otherwise 0;
#   OutFile  - name of the output file, or '' for a rejected file.
#
# A file is rejected with a warning when it doesn't exist, isn't recognized as
# a SD file by CheckFileType, would be written onto itself, or when its output
# file already exists and "--overwrite" wasn't specified.
#
sub RetrieveSDFilesInfo {
  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFile}} = ();

  FILELIST: for my $FileNum (0 .. $#SDFilesList) {
    my $SDFile = $SDFilesList[$FileNum];

    # Mark the file as rejected until it has passed every check.
    $SDFilesInfo{FileOkay}[$FileNum] = 0;
    $SDFilesInfo{OutFile}[$FileNum] = '';

    unless (-e $SDFile) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    unless (CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    my($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    # "--root" is honored only for a single input file.
    my $NewFileRoot;
    if ($Options{root} && (@SDFilesList == 1)) {
      my($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      # Use the parsed name when the root parses into both a name and an
      # extension; otherwise use the root as specified.
      $NewFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
    }
    else {
      $NewFileRoot = $FileName . "ModifiedDataFields";
    }

    my $OutFile = $NewFileRoot . ".$FileExt";
    if (lc($OutFile) eq lc($SDFile)) {
      warn "Warning: Ignoring file $SDFile:Output file name, $OutFile, is same as input SD file name, $SDFile\n";
      next FILELIST;
    }
    if (!$Options{overwrite} && -e $OutFile) {
      warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
      next FILELIST;
    }

    $SDFilesInfo{FileOkay}[$FileNum] = 1;
    $SDFilesInfo{OutFile}[$FileNum] = $OutFile;
  }
}
|
|
381
|
|
# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets the file-scoped %Options hash to its default values, lets GetOptions
# overlay the values given on the command line, changes to the working
# directory when one is specified, and validates the values of the enumerated
# options and of the detail level. Dies with an error message on the first
# problem found.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = (
    detail            => 1,
    keepolddatafields => "none",
    mode              => "molname",
    molnamemode       => "labelprefix",
    molnamereplace    => "empty",
  );

  my @OptionSpecs = (
    "detail|d=i", "datafieldscommon=s", "datafieldsmap=s", "datafieldsmapfile=s",
    "datafieldurl=s", "help|h", "keepolddatafields|k=s", "mode|m=s", "molname=s",
    "molnamemode=s", "molnamereplace=s", "overwrite|o", "root|r=s", "workingdir|w=s",
  );
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch to the working directory before any file is looked at.
  if ($Options{workingdir}) {
    -d $Options{workingdir}
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate enumerated option values; matching is case insensitive.
  unless ($Options{keepolddatafields} =~ /^(all|unmappedonly|none)$/i) {
    die "Error: The value specified, $Options{keepolddatafields}, for option \"-k --keepolddatafields\" is not valid. Allowed values: all, unmappedonly, or none\n";
  }
  unless ($Options{mode} =~ /^(molname|datafields|both)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molname, datafields, or both\n";
  }
  unless ($Options{molnamemode} =~ /^(datafield|labelprefix)$/i) {
    die "Error: The value specified, $Options{molnamemode}, for option \"--molnamemode\" is not valid. Allowed values: datafield or labelprefix\n";
  }
  unless ($Options{molnamereplace} =~ /^(always|empty)$/i) {
    die "Error: The value specified, $Options{molnamereplace}, for option \"--molnamereplace\" is not valid. Allowed values: always or empty\n";
  }
  unless (IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
  }
}
|
|
418
|
|
419 __END__
|
|
420
|
|
421 =head1 NAME
|
|
422
|
|
423 ModifySDFilesDataFields.pl - Modify data fields in SDFile(s)
|
|
424
|
|
425 =head1 SYNOPSIS
|
|
426
|
|
427 ModifySDFilesDataFields.pl SDFile(s)...
|
|
428
|
|
429 ModifySDFilesDataFields.pl [B<-d, --detail> infolevel]
|
|
430 [B<--datafieldscommon> newfieldlabel, newfieldvalue, [newfieldlabel, newfieldvalue,...]]
|
|
431 [B<--datafieldsmap> newfieldlabel, oldfieldlabel, [oldfieldlabel,...]; [newfieldlabel, oldfieldlabel, [oldfieldlabel,...]]]
|
|
432 [B<--datafieldsmapfile> filename] [B<--datafieldURL> URLDataFieldLabel, CGIScriptPath, CGIParamName, CmpdIDFieldLabel]
|
|
433 [B<-h, --help>] [B<-k, --keepolddatafields> all | unmappedonly | none] [B<-m, --mode> molname | datafields | both]
|
|
434 [B<--molnamemode> datafield | labelprefix] [B<--molname> datafieldname or prefixstring]
|
|
435 [B<--molnamereplace> always | empty] [B<-o, --overwrite>] [B<-r, --root> rootname]
|
|
436 [B<-w, --workingdir> dirname] SDFile(s)...
|
|
437
|
|
438 =head1 DESCRIPTION
|
|
439
|
|
440 Modify molname line and data fields in I<SDFile(s)>. Molname line can be replaced by a
|
|
441 data field value or assigned a sequential ID prefixed with a specific string. For data
|
|
442 fields and modification of their values, these types of options are supported: replace
|
|
443 data field labels by another set of labels; combine values of multiple data fields and
|
|
444 assign a new label; add specific set of data field labels and values to all compound
|
|
445 records; and others.
|
|
446
|
|
447 The file names are separated by spaces. The valid file extensions are I<.sdf> and I<.sd>.
|
|
448 All other file names are ignored. All the SD files in a current directory can be specified
|
|
449 either by I<*.sdf> or the current directory name.
|
|
450
|
|
451 =head1 OPTIONS
|
|
452
|
|
453 =over 4
|
|
454
|
|
455 =item B<-d, --detail> I<infolevel>
|
|
456
|
|
457 Level of information to print about compound records being ignored. Default: I<1>. Possible
|
|
458 values: I<1, 2 or 3>.
|
|
459
|
|
460 =item B<--datafieldscommon> I<newfieldlabel, newfieldvalue, [newfieldlabel, newfieldvalue,...]>
|
|
461
|
|
462 Specify data field labels and values for addition to each compound record. It's a comma delimited
|
|
463 list of data field label and value pairs. Default: I<none>.
|
|
464
|
|
465 Examples:
|
|
466
|
|
467 DepositionDate,YYYY-MM-DD
|
|
468 Source,www.domainname.org,ReleaseData,YYYY-MM-DD
|
|
469
|
|
470 =item B<--datafieldsmap> I<newfieldlabel, oldfieldlabel, [oldfieldlabel,...]; [newfieldlabel, oldfieldlabel, [oldfieldlabel,...]]>
|
|
471
|
|
472 Specify how various data field labels and values are combined to generate new data field
|
|
473 labels and their values. All the comma delimited data fields, within a semicolon delimited set,
|
|
474 are mapped to the first new data field label along with the data field values joined via new
|
|
475 line character. Default: I<none>.
|
|
476
|
|
477 Examples:
|
|
478
|
|
479 Synonym,Name,SystematicName,Synonym;CmpdID,Extreg
|
|
480 HBondDonors,SumNHOH
|
|
481
|
|
482 =item B<--datafieldsmapfile> I<filename>
|
|
483
|
|
484 Filename containing mapping of data fields. Format of data fields line in this file corresponds
|
|
485 to B<--datafieldsmap> option. Example:
|
|
486
|
|
487 Line 1: Synonym,Name,SystematicName,Synonym;CmpdID,Extreg
|
|
488 Line 2: HBondDonors,SumNHOH
|
|
489
|
|
490
|
|
491 =item B<--datafieldURL> I<URLDataFieldLabel, CGIScriptPath, CGIParamName, CmpdIDFieldLabel>
|
|
492
|
|
493 Specify how to generate a URL for retrieving compound data from a web server and add it
|
|
494 to each compound record. I<URLDataFieldLabel> is used as the data field label for URL value
|
|
495 which is created by combining I<CGIScriptPath,CGIParamName,CmpdIDFieldLabel> values:
|
|
496 CGIScriptPath?CGIParamName=CmpdIDFieldLabelValue. Default: I<none>.
|
|
497
|
|
498 Example:
|
|
499
|
|
500 Source,http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID
|
|
501
|
|
502 =item B<-h, --help>
|
|
503
|
|
504 Print this help message.
|
|
505
|
|
506 =item B<-k, --keepolddatafields> I<all | unmappedonly | none>
|
|
507
|
|
508 Specify how to transfer old data fields from input SDFile(s) to new SDFile(s) during
|
|
509 I<datafields | both> value of B<-m, --mode> option: keep all old data fields; write out the ones
|
|
510 not mapped to new fields as specified by B<--datafieldsmap> or B<--datafieldsmapfile> options;
|
|
511 or ignore all old data field labels. For I<molname> B<-m --mode>, old datafields are always kept.
|
|
512 Possible values: I<all | unmappedonly | none>. Default: I<none>.
|
|
513
|
|
514 =item B<-m, --mode> I<molname | datafields | both>
|
|
515
|
|
516 Specify how to modify SDFile(s): I<molname> - change molname line by another datafield or value;
|
|
517 I<datafields> - modify data field labels and values by replacing one label by another, combining
|
|
518 multiple data field labels and values, adding specific set of data field labels and values to all compound, or
|
|
519 inserting an URL for compound retrieval to each record; I<both> - change molname line and datafields
|
|
520 simultaneously. Possible values: I<molname | datafields | both>. Default: I<molname>
|
|
521
|
|
522 =item B<--molnamemode> I<datafield | labelprefix>
|
|
523
|
|
524 Specify how to change molname line for B<-m --mode> option values of I<molname | both>: use
|
|
525 a datafield label value or assign a sequential ID prefixed with I<labelprefix>. Possible values:
|
|
526 I<datafield | labelprefix>. Default: I<labelprefix>.
|
|
527
|
|
528 =item B<--molname> I<datafieldname or prefixstring>
|
|
529
|
|
530 Molname generation method. For I<datafield> value of B<--molnamemode> option, it corresponds
|
|
531 to datafield label name whose value is used for molname; otherwise, it's a prefix string used for
|
|
532 generating compound IDs like labelprefixstring<Number>. Default value, I<Cmpd>, generates
|
|
533 compound IDs like Cmpd<Number> for molname.
|
|
534
|
|
535 =item B<--molnamereplace> I<always | empty>
|
|
536
|
|
537 Specify when to replace molname line for B<-m --mode> option values of I<molname | both>:
|
|
538 always replace the molname line using B<--molname> option or only when it's empty. Possible
|
|
539 values: I<always | empty>. Default: I<empty>.
|
|
540
|
|
541 =item B<-o, --overwrite>
|
|
542
|
|
543 Overwrite existing files.
|
|
544
|
|
545 =item B<-r, --root> I<rootname>
|
|
546
|
|
547 New SD file name is generated using the root: <Root>.<Ext>. Default new file
|
|
548 name: <InitialSDFileName>ModifiedDataFields.<Ext>. This option is ignored for multiple
|
|
549 input files.
|
|
550
|
|
551 =item B<-w, --workingdir> I<dirname>
|
|
552
|
|
553 Location of working directory. Default: current directory.
|
|
554
|
|
555 =back
|
|
556
|
|
557 =head1 EXAMPLES
|
|
558
|
|
559 To replace empty molname lines by Cmpd<CmpdNumber> and generate a new SD file
|
|
560 NewSample1.sdf, type:
|
|
561
|
|
562 % ModifySDFilesDataFields.pl -o -r NewSample1 Sample1.sdf
|
|
563
|
|
564 To replace all molname lines by Mol_ID data field and generate a new SD file
|
|
565 NewSample1.sdf, type:
|
|
566
|
|
567 % ModifySDFilesDataFields.pl --molnamemode datafield
|
|
568 --molnamereplace always --molname Mol_ID -r NewSample1 -o Sample1.sdf
|
|
569
|
|
570 To replace all molname lines by Mol_ID data field, map Name and CompoundName to
|
|
571 a new datafield Synonym, and generate a new SD file NewSample1.sdf, type:
|
|
572
|
|
573 % ModifySDFilesDataFields.pl --molnamemode datafield
|
|
574 --molnamereplace always --molname Mol_ID --mode both
|
|
575 --datafieldsmap "Synonym,Name,CompoundName" -r
|
|
576 NewSample1 -o Sample1.sdf
|
|
577
|
|
578 To replace all molname lines by Mol_ID data field, map Name and CompoundName to
|
|
579 a new datafield Synonym, add common fields ReleaseDate and Source, and
|
|
580 generate a new SD file NewSample1.sdf without keeping any old SD data fields, type:
|
|
581
|
|
582 % ModifySDFilesDataFields.pl --molnamemode datafield
|
|
583 --molnamereplace always --molname Mol_ID --mode both
|
|
584 --datafieldsmap "Synonym,Name,CompoundName"
|
|
585 --datafieldscommon "ReleaseDate,yyyy-mm-dd,Source,
|
|
586 www.mayachemtools.org" --keepolddatafields none -r
|
|
587 NewSample1 -o Sample1.sdf
|
|
588
|
|
589 B<Preparing SD files for PubChem deposition:>
|
|
590
|
|
591 Consider a SD file with these fields: Mol_ID, Name, Synonyms and Systematic_Name.
|
|
592 And Mol_ID data field uniquely identifies your compound.
|
|
593
|
|
594 To prepare a new SD file CmpdDataForPubChem.sdf containing only required
|
|
595 PUBCHEM_EXT_DATASOURCE_REGID field, type:
|
|
596
|
|
597 % ModifySDFilesDataFields.pl --m datafields
|
|
598 --datafieldsmap
|
|
599 "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID"
|
|
600 -r CmpdDataForPubChem -o Sample1.sdf
|
|
601
|
|
602 To prepare a new SD file CmpdDataForPubChem.sdf containing only required
|
|
603 PUBCHEM_EXT_DATASOURCE_REGID field and replace molname line with Mol_ID, type:
|
|
604
|
|
605 % ModifySDFilesDataFields.pl --molnamemode datafield
|
|
606 --molnamereplace always --molname Mol_ID --mode both
|
|
607 --datafieldsmap
|
|
608 "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID"
|
|
609 -r CmpdDataForPubChem -o Sample1.sdf
|
|
610
|
|
611 In addition to required PubChem data field, you can also add optional PubChem data
|
|
612 fields.
|
|
613
|
|
614 To map your Name, Synonyms and Systematic_Name data fields to optional
|
|
615 PUBCHEM_SUBSTANCE_SYNONYM data field along with required ID field, type:
|
|
616
|
|
617 % ModifySDFilesDataFields.pl --molnamemode datafield
|
|
618 --molnamereplace always --molname Mol_ID --mode both
|
|
619 --datafieldsmap
|
|
620 "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID;
|
|
621 PUBCHEM_SUBSTANCE_SYNONYM,Name,CompoundName"
|
|
622 -r CmpdDataForPubChem -o Sample1.sdf
|
|
623
|
|
624 To add your <domain.org> as PUBCHEM_EXT_SUBSTANCE_URL and link substance
|
|
625 retrieval to your CGI script <http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID>
|
|
626 via PUBCHEM_EXT_DATASOURCE_REGID field along with optional and required
|
|
627 data fields, type:
|
|
628
|
|
629 % ModifySDFilesDataFields.pl --molnamemode datafield
|
|
630 --molnamereplace always --molname Mol_ID --mode both
|
|
631 --datafieldsmap
|
|
632 "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID;
|
|
633 PUBCHEM_SUBSTANCE_SYNONYM,Name,CompoundName"
|
|
634 --datafieldscommon
|
|
635 "PUBCHEM_EXT_SUBSTANCE_URL,domain.org"
|
|
636 --datafieldURL "PUBCHEM_EXT_DATASOURCE_URL,
|
|
637 http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID"
|
|
638 -r CmpdDataForPubChem -o Sample1.sdf
|
|
639
|
|
640 And to add a publication date and request a release data using
|
|
641 PUBCHEM_PUBLICATION_DATE and PUBCHEM_DEPOSITOR_RECORD_DATE data fields
|
|
642 along with all the required and optional data fields in earlier examples, type:
|
|
643
|
|
644
|
|
645 % ModifySDFilesDataFields.pl --molnamemode datafield
|
|
646 --molnamereplace always --molname Mol_ID --mode both
|
|
647 --datafieldsmap
|
|
648 "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID;
|
|
649 PUBCHEM_SUBSTANCE_SYNONYM,Name,CompoundName"
|
|
650 --datafieldURL "PUBCHEM_EXT_DATASOURCE_URL,
|
|
651 http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID"
|
|
652 --datafieldscommon
|
|
653 "PUBCHEM_EXT_SUBSTANCE_URL,domain.org,
|
|
654 PUBCHEM_PUBLICATION_DATE,YYYY-MM-DD,
|
|
655 PUBCHEM_DEPOSITOR_RECORD_DATE,YYYY-MM-DD"
|
|
656 -r CmpdDataForPubChem -o Sample1.sdf
|
|
657
|
|
658 =head1 AUTHOR
|
|
659
|
|
660 Manish Sud <msud@san.rr.com>
|
|
661
|
|
662 =head1 SEE ALSO
|
|
663
|
|
664 InfoSDFiles.pl, JoinSDFiles.pl, MergeTextFilesWithSD.pl, SplitSDFiles.pl, SDFilesToHTML.pl
|
|
665
|
|
666 =head1 COPYRIGHT
|
|
667
|
|
668 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
669
|
|
670 This file is part of MayaChemTools.
|
|
671
|
|
672 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
673 the terms of the GNU Lesser General Public License as published by the Free
|
|
674 Software Foundation; either version 3 of the License, or (at your option)
|
|
675 any later version.
|
|
676
|
|
677 =cut
|