Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/ModifySDFilesDataFields.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4816e4a8ae95 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # $RCSfile: ModifySDFilesDataFields.pl,v $ | |
4 # $Date: 2015/02/28 20:46:20 $ | |
5 # $Revision: 1.27 $ | |
6 # | |
7 # Author: Manish Sud <msud@san.rr.com> | |
8 # | |
9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
10 # | |
11 # This file is part of MayaChemTools. | |
12 # | |
13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
14 # the terms of the GNU Lesser General Public License as published by the Free | |
15 # Software Foundation; either version 3 of the License, or (at your option) any | |
16 # later version. | |
17 # | |
18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
19 # any warranty; without even the implied warranty of merchantability or fitness | |
20 # for a particular purpose. See the GNU Lesser General Public License for more | |
21 # details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public License | |
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
26 # Boston, MA, 02111-1307, USA. | |
27 # | |
28 | |
29 use strict; | |
30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
31 use Getopt::Long; | |
32 use File::Basename; | |
33 use Text::ParseWords; | |
34 use Benchmark; | |
35 use FileUtil; | |
36 use SDFileUtil; | |
37 use TextUtil; | |
38 | |
# File-level state shared with the subroutines defined below...
my($ScriptName, %Options);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
my $StartTime = Benchmark->new;

# Get the options and setup script; show the usage and quit when help is
# requested or no input files are specified...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

my @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files, skipping any input file which didn't pass the checks...
print "\nProcessing SD files...\n" if (@SDFilesList > 1);

FILE: for my $FileIndex (0 .. $#SDFilesList) {
  next FILE unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  ModifySDFile($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report the elapsed time...
my $EndTime = Benchmark->new;
my $TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
83 | |
84 ############################################################################### | |
85 | |
# Modify SD file data fields....
#
# Read the compound records from the input SD file at position $Index in
# @SDFilesList and write them to the output file recorded for the same index
# in $SDFilesInfo{OutFile}, applying the changes selected in %OptionsInfo:
#
#   . ModifyMolName: replace the first line of a record, either always or only
#     when it is empty, by a data field value (cut to 80 characters) or by
#     <MolNamePrefix><CmpdCount>.
#   . ModifyDataFields: write the structure data followed by the mapped new
#     data fields, the common data fields, the optional URL data field and,
#     depending on the keep options, the old data fields in input order.
#
# Arguments: $Index - index of the file in @SDFilesList.
# Returns: nothing useful. Dies when a file can't be opened or the output
# file can't be closed.
#
sub ModifySDFile {
  my($Index) = @_;
  my($SDFile, $NewSDFile, $SDFileHandle, $NewSDFileHandle);

  $SDFile = $SDFilesList[$Index];
  $NewSDFile = $SDFilesInfo{OutFile}[$Index];

  print "Generating new SD file $NewSDFile...\n";

  # Three argument open with lexical handles: the file names are never
  # interpreted as part of the open mode...
  open($NewSDFileHandle, ">", $NewSDFile) or die "Error: Couldn't open $NewSDFile: $! \n";
  open($SDFileHandle, "<", $SDFile) or die "Error: Can't open $SDFile: $! \n";

  my($CmpdCount, $CmpdString, $CmpdData, $MolName, $OldSDField, $NewSDField, $CommonSDField, $Label, $Value, $FieldValues, $MolNameDataField, $URLCmpdIdFieldName, $KeepLabel, @CmpdLines, @FieldValues, %DataFieldAndValues, @DataFieldLabels);
  $CmpdCount = 0;

  COMPOUND: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;
    @CmpdLines = split "\n", $CmpdString;

    # Data field values are needed for both the molname lookup and the data fields modification...
    if ($OptionsInfo{UseDataFieldForMolName} || $OptionsInfo{ModifyDataFields}) {
      %DataFieldAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    }

    if ($OptionsInfo{ModifyMolName}) {
      if ($OptionsInfo{AlwaysReplaceMolName} || !IsNotEmpty($CmpdLines[0])) {
        $MolNameDataField = $OptionsInfo{MolNameDataField};
        if ($OptionsInfo{UseDataFieldForMolName} && exists($DataFieldAndValues{$MolNameDataField})) {
          $MolName = $DataFieldAndValues{$MolNameDataField};
          # Keep at most 80 characters for the molname line...
          if (length($MolName) > 80) {
            $MolName = substr($MolName, 0, 80);
          }
        }
        else {
          # No usable data field: fall back to a sequential ID...
          $MolName = "$OptionsInfo{MolNamePrefix}${CmpdCount}";
        }
        $CmpdLines[0] = $MolName;
        $CmpdString = join "\n", @CmpdLines;
      }
    }

    if (!$OptionsInfo{ModifyDataFields}) {
      # Just write the data and get the next compound...
      print $NewSDFileHandle "$CmpdString\n";
      next COMPOUND;
    }

    # Write out the structure data now and handle the old data fields later...
    ($CmpdData) = split /\n>/, $CmpdString;
    print $NewSDFileHandle "$CmpdData\n";

    # Modify specified data fields...
    for $NewSDField (sort keys %{$OptionsInfo{SpecifiedNewToOldSDFieldMap}}) {
      # Collect the non-empty old values and join them with new lines. Joining a
      # list, instead of testing the truth of the accumulated string, keeps a
      # leading value of "0" separated from the values which follow it...
      @FieldValues = ();
      for $OldSDField (@{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}}) {
        if (exists($DataFieldAndValues{$OldSDField}) && length($DataFieldAndValues{$OldSDField})) {
          push @FieldValues, $DataFieldAndValues{$OldSDField};
        }
      }
      $FieldValues = join "\n", @FieldValues;
      print $NewSDFileHandle "> <$NewSDField>\n$FieldValues\n\n";
    }

    # Add specified common fields...
    for $CommonSDField (sort keys %{$OptionsInfo{SpecifiedCommonFieldMap}}) {
      $Value = $OptionsInfo{SpecifiedCommonFieldMap}{$CommonSDField};
      print $NewSDFileHandle "> <$CommonSDField>\n$Value\n\n";
    }

    # Add the URL data field; its value stays empty when the compound ID field
    # is missing or empty...
    if ($OptionsInfo{CreateDataFieldURL}) {
      $Value = "";
      $URLCmpdIdFieldName = $OptionsInfo{URLCmpdIdFieldName};
      if (exists($DataFieldAndValues{$URLCmpdIdFieldName}) && length($DataFieldAndValues{$URLCmpdIdFieldName})) {
        $Value = $DataFieldAndValues{$URLCmpdIdFieldName};
        $Value = "$OptionsInfo{URLCGIScriptName}?$OptionsInfo{URLParamName}=${Value}";
      }
      print $NewSDFileHandle "> <$OptionsInfo{URLDataFieldLabel}>\n$Value\n\n";
    }

    # Handle old data fields and write 'em in the same order as they appear in the input
    # files...
    if ($OptionsInfo{KeepAllOldDataFields} || $OptionsInfo{KeepUnMappedOldDataFields}) {
      @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
      LABEL: for $Label (@DataFieldLabels) {
        # In unmapped only mode, drop the labels already mapped to a new field...
        $KeepLabel = $OptionsInfo{KeepAllOldDataFields} ? 1 : ( exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$Label}) ? 0 : 1 );
        if (!$KeepLabel) {
          next LABEL;
        }
        $Value = $DataFieldAndValues{$Label};
        print $NewSDFileHandle "> <$Label>\n$Value\n\n";
      }
    }

    print $NewSDFileHandle "\$\$\$\$\n";
  }

  # Buffered write errors surface at close time, so check it for the output file...
  close $NewSDFileHandle or die "Error: Couldn't close $NewSDFile: $! \n";
  close $SDFileHandle;
}
178 | |
# Process option values...
#
# Translate the validated command line values in %Options into the flags and
# lookup tables kept in %OptionsInfo:
#
#   . ModifyMolName, ModifyDataFields: derived from --mode.
#   . KeepAllOldDataFields, KeepUnMappedOldDataFields: derived from
#     --keepolddatafields.
#   . UseDataFieldForMolName, MolNameDataField, MolNamePrefix,
#     AlwaysReplaceMolName: molname line handling.
#   . SpecifiedNewToOldSDFieldMap (new label => list of old labels) and
#     SpecifiedOldToNewSDFieldMap (old label => new label): parsed from
#     --datafieldsmap or --datafieldsmapfile.
#   . SpecifiedCommonFieldMap (label => value): parsed from --datafieldscommon.
#   . URLDataFieldLabel, URLCGIScriptName, URLParamName, URLCmpdIdFieldName:
#     parsed from --datafieldurl.
#
# Takes no arguments. Dies with an error message on inconsistent or malformed
# option values.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  # Default mode is molname...
  $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 0;
  if ($Options{mode} =~ /^both$/i) {
    $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 1;
  }
  elsif ($Options{mode} =~ /^datafields$/i) {
    $OptionsInfo{ModifyMolName} = 0; $OptionsInfo{ModifyDataFields} = 1;
  }

  $OptionsInfo{KeepOldDataFields} = $Options{keepolddatafields};
  $OptionsInfo{KeepAllOldDataFields} = ($Options{keepolddatafields} =~ /^all$/i) ? 1 : 0;
  $OptionsInfo{KeepUnMappedOldDataFields} = ($Options{keepolddatafields} =~ /^unmappedonly$/i) ? 1 : 0;

  $OptionsInfo{MolNameMode} = $Options{molnamemode};
  $OptionsInfo{UseDataFieldForMolName} = ($Options{molnamemode} =~ /^datafield$/i) ? 1 : 0;

  # The --molname value is either a data field label or a prefix string
  # depending on --molnamemode. Test its length instead of its truth so that
  # a value of "0" is honored...
  $OptionsInfo{MolName} = $Options{molname};
  $OptionsInfo{MolNameDataField} = ""; $OptionsInfo{MolNamePrefix} = "Cmpd";
  if (defined($Options{molname}) && length($Options{molname})) {
    if ($OptionsInfo{UseDataFieldForMolName}) {
      $OptionsInfo{MolNameDataField} = $Options{molname};
    }
    else {
      $OptionsInfo{MolNamePrefix} = $Options{molname};
    }
  }

  $OptionsInfo{MolNameReplace} = $Options{molnamereplace};
  $OptionsInfo{AlwaysReplaceMolName} = ($Options{molnamereplace} =~ /^always$/i) ? 1 : 0;

  if ($Options{datafieldsmap} && $Options{datafieldsmapfile}) {
    die "Error: Both \"--datafieldsmap\" and \"--datafieldsmapfile\" options specified: only one is allowed at a time\n";
  }

  $OptionsInfo{DataFieldsMap} = $Options{datafieldsmap} ? $Options{datafieldsmap} : '';
  $OptionsInfo{DataFieldsMapFile} = $Options{datafieldsmapfile} ? $Options{datafieldsmapfile} : '';

  my($SpecifiedDataFieldMap);

  $OptionsInfo{SpecifiedNewToOldSDFieldMap} = {};
  $OptionsInfo{SpecifiedOldToNewSDFieldMap} = {};

  $SpecifiedDataFieldMap = "";
  if ($Options{datafieldsmap}) {
    $SpecifiedDataFieldMap = $Options{datafieldsmap};
  }
  elsif ($Options{datafieldsmapfile}) {
    my($Line, $MapFileHandle, @LineWords, @LineMaps);

    open($MapFileHandle, "<", $Options{datafieldsmapfile}) or die "Couldn't open $Options{datafieldsmapfile}: $! \n";
    @LineMaps = ();
    while ($Line = GetTextLine($MapFileHandle)) {
      # Drop the empty pieces, such as the one left behind by a trailing
      # semicolon, so that old style files with lines ending in ";" keep working...
      @LineWords = grep { defined($_) && length($_) } quotewords(";", 0, $Line);
      if (@LineWords) {
        push @LineMaps, JoinWords(\@LineWords, ";", 0);
      }
    }
    close $MapFileHandle;

    # Separate the maps coming from different lines by a semicolon; otherwise,
    # the last map on a line gets fused with the first map on the next line...
    $SpecifiedDataFieldMap = join ";", @LineMaps;
  }

  if ($SpecifiedDataFieldMap) {
    my($DataFieldMap, $DataField, @DataFieldMapSplit, @DataFieldsSplit);

    @DataFieldMapSplit = split ";", $SpecifiedDataFieldMap;
    for $DataFieldMap (@DataFieldMapSplit) {
      my($NewSDField, @OldSDFields);

      # A map needs a new label followed by at least one old label; an empty
      # map, which splits into no values at all, is rejected as well...
      @DataFieldsSplit = split ",", $DataFieldMap;
      if (@DataFieldsSplit < 2) {
        die "Error: Invalid number of comma delimited values, ", scalar(@DataFieldsSplit), ", specified, @DataFieldsSplit, using \"--datafieldsmap or --datafieldsmapfile\" option: it must contain more than one value.\n";
      }
      for $DataField (@DataFieldsSplit) {
        if (!(defined($DataField) && length($DataField))) {
          die "Error: One of the comma delimited values, \"", join(",", @DataFieldsSplit), "\", specified using \"--datafieldsmap or --datafieldsmapfile\" option is empty.\n";
        }
      }

      # First value is the new label; the rest are the old labels mapped to it...
      ($NewSDField, @OldSDFields) = @DataFieldsSplit;

      # Make sure a datafield is only specified once...
      if (exists $OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}) {
        die "Error: New data field, $NewSDField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
      }
      $OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField} = [ @OldSDFields ];

      for $DataField (@OldSDFields) {
        if (exists $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataField} ) {
          die "Error: SD field, $DataField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
        }
        $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataField} = $NewSDField;
      }
    }
  }

  $OptionsInfo{DataFieldsCommon} = $Options{datafieldscommon} ? $Options{datafieldscommon} : '';
  $OptionsInfo{SpecifiedCommonFieldMap} = {};

  if ($Options{datafieldscommon}) {
    my($DataFieldName, $DataFieldValue, $Index, @CommonDataFieldsSplit);

    # Values come in label and value pairs...
    @CommonDataFieldsSplit = split ",", $Options{datafieldscommon};
    if (@CommonDataFieldsSplit % 2) {
      die "Error: Invalid number of comma delimited values, ", scalar(@CommonDataFieldsSplit), ", specified \"", join(",", @CommonDataFieldsSplit), "\" using \"--datafieldscommon\" option: it must contain even number of values.\n";
    }
    for ($Index = 0; $Index < @CommonDataFieldsSplit; $Index += 2) {
      $DataFieldName = $CommonDataFieldsSplit[$Index];
      $DataFieldValue = $CommonDataFieldsSplit[$Index + 1];
      if (exists $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName}) {
        die "Error: Common data field, $DataFieldName, specified more than once using \"--datafieldscommon\" option.\n";
      }
      # A common field can't share its label with a mapped new or old data field...
      if (exists($OptionsInfo{SpecifiedNewToOldSDFieldMap}{$DataFieldName}) || exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataFieldName})) {
        die "Error: Common data field, $DataFieldName, specified using \"--datafieldscommon\" option cannot be specified in \"--datafieldsmap or --datafieldsmapfile\" option.\n";
      }
      $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName} = $DataFieldValue;
    }
  }

  $OptionsInfo{DataFieldURL} = $Options{datafieldurl} ? $Options{datafieldurl} : '';
  $OptionsInfo{CreateDataFieldURL} = (exists($Options{datafieldurl}) && length($Options{datafieldurl}) ) ? 1 : 0;

  $OptionsInfo{URLDataFieldLabel} = ""; $OptionsInfo{URLCGIScriptName} = "";
  $OptionsInfo{URLParamName} = ""; $OptionsInfo{URLCmpdIdFieldName} = "";

  if ($OptionsInfo{CreateDataFieldURL}) {
    my(@DataFieldURLSplit, $Value);

    # Expected values: URL data field label, CGI script, CGI parameter name and compound ID field label...
    @DataFieldURLSplit = split ",", $Options{datafieldurl};
    if (@DataFieldURLSplit != 4) {
      die "Error: Invalid number of values, ", scalar(@DataFieldURLSplit), ", specified using \"--datafieldURL\" option: it must contain 4 values.\n";
    }
    for $Value (@DataFieldURLSplit) {
      if (!IsNotEmpty($Value)) {
        die "Error: One of the values, $Options{datafieldurl}, specified using \"--datafieldURL\" option is empty.\n";
      }
    }
    $OptionsInfo{URLDataFieldLabel} = $DataFieldURLSplit[0];
    $OptionsInfo{URLCGIScriptName} = $DataFieldURLSplit[1];
    $OptionsInfo{URLParamName} = $DataFieldURLSplit[2];
    $OptionsInfo{URLCmpdIdFieldName} = $DataFieldURLSplit[3];
  }

}
326 | |
# Retrieve information about input SD files...
#
# For each file in @SDFilesList, record in %SDFilesInfo whether it can be
# processed (FileOkay) and the name of the output file to generate (OutFile).
# A file is ignored, with a warning, when it doesn't exist, isn't a SD file,
# would be overwritten by its own output, or when the output file already
# exists and overwriting hasn't been requested.
#
sub RetrieveSDFilesInfo {
  %SDFilesInfo = (FileOkay => [], OutFile => []);

  # The root name is honored only for a single input file...
  my $UseRootName = ($Options{root} && (@SDFilesList == 1)) ? 1 : 0;

  FILELIST: for my $Index (0 .. $#SDFilesList) {
    my $SDFile = $SDFilesList[$Index];

    # Assume the worst until all the checks have passed...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFile}[$Index] = '';

    unless (-e $SDFile) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    unless (CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    my($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    # Setup output file root: specified root name or a name derived from the input file...
    my $OutFileRoot;
    if ($UseRootName) {
      my(undef, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
    }
    else {
      $OutFileRoot = $FileName . "ModifiedDataFields";
    }

    my $OutFile = $OutFileRoot . ".$FileExt";

    if (lc($OutFile) eq lc($SDFile)) {
      warn "Warning: Ignoring file $SDFile:Output file name, $OutFile, is same as input SD file name, $SDFile\n";
      next FILELIST;
    }
    if (!$Options{overwrite} && -e $OutFile) {
      warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
      next FILELIST;
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{OutFile}[$Index] = $OutFile;
  }
}
381 | |
# Setup script usage and retrieve command line arguments specified using various options...
#
# Seed %Options with the default values, let GetOptions override them from the
# command line, change to the working directory when one is specified and
# validate the option values. Dies with an error message on any invalid value.
#
sub SetupScriptUsage {
  my(@OptionSpecs);

  # Default values; the ones specified on the command line replace them...
  %Options = (
    detail => 1,
    keepolddatafields => "none",
    mode => "molname",
    molnamemode => "labelprefix",
    molnamereplace => "empty",
  );

  # Retrieve all the options...
  @OptionSpecs = ("detail|d=i", "datafieldscommon=s", "datafieldsmap=s", "datafieldsmapfile=s", "datafieldurl=s", "help|h", "keepolddatafields|k=s", "mode|m=s", "molname=s", "molnamemode=s", "molnamereplace=s", "overwrite|o", "root|r=s", "workingdir|w=s");
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Move to the working directory before anything else looks at file names...
  if ($Options{workingdir}) {
    unless (-d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate the values of the options which accept a fixed set of keywords...
  unless ($Options{keepolddatafields} =~ /^(all|unmappedonly|none)$/i) {
    die "Error: The value specified, $Options{keepolddatafields}, for option \"-k --keepolddatafields\" is not valid. Allowed values: all, unmappedonly, or none\n";
  }
  unless ($Options{mode} =~ /^(molname|datafields|both)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molname, datafields, or both\n";
  }
  unless ($Options{molnamemode} =~ /^(datafield|labelprefix)$/i) {
    die "Error: The value specified, $Options{molnamemode}, for option \"--molnamemode\" is not valid. Allowed values: datafield or labelprefix\n";
  }
  unless ($Options{molnamereplace} =~ /^(always|empty)$/i) {
    die "Error: The value specified, $Options{molnamereplace}, for option \"--molnamereplace\" is not valid. Allowed values: always or empty\n";
  }
  unless (IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
  }
}
418 | |
419 __END__ | |
420 | |
421 =head1 NAME | |
422 | |
423 ModifySDFilesDataFields.pl - Modify data fields in SDFile(s) | |
424 | |
425 =head1 SYNOPSIS | |
426 | |
427 ModifySDFilesDataFields.pl SDFile(s)... | |
428 | |
429 ModifySDFilesDataFields.pl [B<-d, --detail> infolevel] | |
430 [B<--datafieldscommon> newfieldlabel, newfieldvalue, [newfieldlabel, newfieldvalue,...]] | |
431 [B<--datafieldsmap> newfieldlabel, oldfieldlabel, [oldfieldlabel,...]; [newfieldlabel, oldfieldlabel, [oldfieldlabel,...]]] | |
432 [B<--datafieldsmapfile> filename] [B<--datafieldURL> URLDataFieldLabel, CGIScriptPath, CGIParamName, CmpdIDFieldLabel] | |
433 [B<-h, --help>] [B<-k, --keepolddatafields> all | unmappedonly | none] [B<-m, --mode> molname | datafields | both] | |
434 [B<--molnamemode> datafield | labelprefix] [B<--molname> datafieldname or prefixstring] | |
435 [B<--molnamereplace> always | empty] [B<-o, --overwrite>] [B<-r, --root> rootname] | |
436 [B<-w, --workingdir> dirname] SDFile(s)... | |
437 | |
438 =head1 DESCRIPTION | |
439 | |
440 Modify molname line and data fields in I<SDFile(s)>. Molname line can be replaced by a | |
441 data field value or assigned a sequential ID prefixed with a specific string. For data | |
442 fields and modification of their values, these types of options are supported: replace | |
443 data field labels by another set of labels; combine values of multiple data fields and | |
444 assign a new label; add specific set of data field labels and values to all compound | |
445 records; and others. | |
446 | |
447 The file names are separated by space. The valid file extensions are I<.sdf> and I<.sd>. | |
448 All other file names are ignored. All the SD files in a current directory can be specified | |
449 either by I<*.sdf> or the current directory name. | |
450 | |
451 =head1 OPTIONS | |
452 | |
453 =over 4 | |
454 | |
455 =item B<-d, --detail> I<infolevel> | |
456 | |
457 Level of information to print about compound records being ignored. Default: I<1>. Possible | |
458 values: I<1, 2 or 3>. | |
459 | |
460 =item B<--datafieldscommon> I<newfieldlabel, newfieldvalue, [newfieldlabel, newfieldvalue,...]> | |
461 | |
462 Specify data field labels and values for addition to each compound record. It's a comma delimited | |
463 list of data field label and value pairs. Default: I<none>. | |
464 | |
465 Examples: | |
466 | |
467 DepositionDate,YYYY-MM-DD | |
468 Source,www.domainname.org,ReleaseDate,YYYY-MM-DD | |
469 | |
470 =item B<--datafieldsmap> I<newfieldlabel, oldfieldlabel, [oldfieldlabel,...]; [newfieldlabel, oldfieldlabel, [oldfieldlabel,...]]> | |
471 | |
472 Specify how various data field labels and values are combined to generate a new data field | |
473 labels and their values. All the comma delimited data fields, with in a semicolon delimited set, | |
474 are mapped to the first new data field label along with the data field values joined via new | |
475 line character. Default: I<none>. | |
476 | |
477 Examples: | |
478 | |
479 Synonym,Name,SystematicName,Synonym;CmpdID,Extreg | |
480 HBondDonors,SumNHOH | |
481 | |
482 =item B<--datafieldsmapfile> I<filename> | |
483 | |
484 Filename containing mapping of data fields. Format of data fields line in this file corresponds | |
485 to B<--datafieldsmap> option. Example: | |
486 | |
487 Line 1: Synonym,Name,SystematicName,Synonym;CmpdID,Extreg | |
488 Line 2: HBondDonors,SumNHOH | |
489 | |
490 | |
491 =item B<--datafieldURL> I<URLDataFieldLabel, CGIScriptPath, CGIParamName, CmpdIDFieldLabel> | |
492 | |
493 Specify how to generate a URL for retrieving compound data from a web server and add it | |
494 to each compound record. I<URLDataFieldLabel> is used as the data field label for URL value | |
495 which is created by combining I<CGIScriptPath,CGIParamName,CmpdIDFieldLabel> values: | |
496 CGIScriptPath?CGIParamName=CmpdIDFieldLabelValue. Default: I<none>. | |
497 | |
498 Example: | |
499 | |
500 Source,http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID | |
501 | |
502 =item B<-h, --help> | |
503 | |
504 Print this help message. | |
505 | |
506 =item B<-k, --keepolddatafields> I<all | unmappedonly | none> | |
507 | |
508 Specify how to transfer old data fields from input SDFile(s) to new SDFile(s) during | |
509 I<datafields | both> value of B<-m, --mode> option: keep all old data fields; write out the ones | |
510 not mapped to new fields as specified by B<--datafieldsmap> or B<--datafieldsmapfile> options; | |
511 or ignore all old data field labels. For I<molname> B<-m --mode>, old datafields are always kept. | |
512 Possible values: I<all | unmappedonly | none>. Default: I<none>. | |
513 | |
514 =item B<-m, --mode> I<molname | datafields | both> | |
515 | |
516 Specify how to modify SDFile(s): I<molname> - change molname line by another datafield or value; | |
517 I<datafields> - modify data field labels and values by replacing one label by another, combining | |
518 multiple data field labels and values, adding specific set of data field labels and values to all compounds, or | |
519 inserting a URL for compound retrieval to each record; I<both> - change molname line and datafields | |
520 simultaneously. Possible values: I<molname | datafields | both>. Default: I<molname> | |
521 | |
522 =item B<--molnamemode> I<datafield | labelprefix> | |
523 | |
524 Specify how to change molname line for B<-m --mode> option values of I<molname | both>: use | |
525 a datafield label value or assign a sequential ID prefixed with I<labelprefix>. Possible values: | |
526 I<datafield | labelprefix>. Default: I<labelprefix>. | |
527 | |
528 =item B<--molname> I<datafieldname or prefixstring> | |
529 | |
530 Molname generation method. For I<datafield> value of B<--molnamemode> option, it corresponds | |
531 to datafield label name whose value is used for molname; otherwise, it's a prefix string used for | |
532 generating compound IDs like labelprefixstring<Number>. Default value, I<Cmpd>, generates | |
533 compound IDs like Cmpd<Number> for molname. | |
534 | |
535 =item B<--molnamereplace> I<always | empty> | |
536 | |
537 Specify when to replace molname line for B<-m --mode> option values of I<molname | both>: | |
538 always replace the molname line using B<--molname> option or only when it's empty. Possible | |
539 values: I<always | empty>. Default: I<empty>. | |
540 | |
541 =item B<-o, --overwrite> | |
542 | |
543 Overwrite existing files. | |
544 | |
545 =item B<-r, --root> I<rootname> | |
546 | |
547 New SD file name is generated using the root: <Root>.<Ext>. Default new file | |
548 name: <InitialSDFileName>ModifiedDataFields.<Ext>. This option is ignored for multiple | |
549 input files. | |
550 | |
551 =item B<-w, --workingdir> I<dirname> | |
552 | |
553 Location of working directory. Default: current directory. | |
554 | |
555 =back | |
556 | |
557 =head1 EXAMPLES | |
558 | |
559 To replace empty molname lines by Cmpd<CmpdNumber> and generate a new SD file | |
560 NewSample1.sdf, type: | |
561 | |
562 % ModifySDFilesDataFields.pl -o -r NewSample1 Sample1.sdf | |
563 | |
564 To replace all molname lines by Mol_ID data field and generate a new SD file | |
565 NewSample1.sdf, type: | |
566 | |
567 % ModifySDFilesDataFields.pl --molnamemode datafield | |
568 --molnamereplace always --molname Mol_ID -r NewSample1 -o Sample1.sdf | |
569 | |
570 To replace all molname lines by Mol_ID data field, map Name and CompoundName to | |
571 a new datafield Synonym, and generate a new SD file NewSample1.sdf, type: | |
572 | |
573 % ModifySDFilesDataFields.pl --molnamemode datafield | |
574 --molnamereplace always --molname Mol_ID --mode both | |
575 --datafieldsmap "Synonym,Name,CompoundName" -r | |
576 NewSample1 -o Sample1.sdf | |
577 | |
578 To replace all molname lines by Mol_ID data field, map Name and CompoundName to | |
579 a new datafield Synonym, add common fields ReleaseDate and Source, and | |
580 generate a new SD file NewSample1.sdf without keeping any old SD data fields, type: | |
581 | |
582 % ModifySDFilesDataFields.pl --molnamemode datafield | |
583 --molnamereplace always --molname Mol_ID --mode both | |
584 --datafieldsmap "Synonym,Name,CompoundName" | |
585 --datafieldscommon "ReleaseDate,yyyy-mm-dd,Source, | |
586 www.mayachemtools.org" --keepolddatafields none -r | |
587 NewSample1 -o Sample1.sdf | |
588 | |
589 B<Preparing SD files for PubChem deposition:> | |
590 | |
591 Consider a SD file with these fields: Mol_ID, Name, Synonyms and Systematic_Name. | |
592 And Mol_ID data field uniquely identifies your compound. | |
593 | |
594 To prepare a new SD file CmpdDataForPubChem.sdf containing only required | |
595 PUBCHEM_EXT_DATASOURCE_REGID field, type: | |
596 | |
597 % ModifySDFilesDataFields.pl --m datafields | |
598 --datafieldsmap | |
599 "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID" | |
600 -r CmpdDataForPubChem -o Sample1.sdf | |
601 | |
602 To prepare a new SD file CmpdDataForPubChem.sdf containing only required | |
603 PUBCHEM_EXT_DATASOURCE_REGID field and replace molname line with Mol_ID, type: | |
604 | |
605 % ModifySDFilesDataFields.pl --molnamemode datafield | |
606 --molnamereplace always --molname Mol_ID --mode both | |
607 --datafieldsmap | |
608 "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID" | |
609 -r CmpdDataForPubChem -o Sample1.sdf | |
610 | |
611 In addition to required PubChem data field, you can also add optional PubChem data | |
612 fields. | |
613 | |
614 To map your Name, Synonyms and Systematic_Name data fields to optional | |
615 PUBCHEM_SUBSTANCE_SYNONYM data field along with required ID field, type: | |
616 | |
617 % ModifySDFilesDataFields.pl --molnamemode datafield | |
618 --molnamereplace always --molname Mol_ID --mode both | |
619 --datafieldsmap | |
620 "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID; | |
621 PUBCHEM_SUBSTANCE_SYNONYM,Name,CompoundName" | |
622 -r CmpdDataForPubChem -o Sample1.sdf | |
623 | |
624 To add your <domain.org> as PUBCHEM_EXT_SUBSTANCE_URL and link substance | |
625 retrieval to your CGI script <http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID> | |
626 via PUBCHEM_EXT_DATASOURCE_REGID field along with optional and required | |
627 data fields, type: | |
628 | |
629 % ModifySDFilesDataFields.pl --molnamemode datafield | |
630 --molnamereplace always --molname Mol_ID --mode both | |
631 --datafieldsmap | |
632 "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID; | |
633 PUBCHEM_SUBSTANCE_SYNONYM,Name,CompoundName" | |
634 --datafieldscommon | |
635 "PUBCHEM_EXT_SUBSTANCE_URL,domain.org" | |
636 --datafieldURL "PUBCHEM_EXT_DATASOURCE_URL, | |
637 http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID" | |
638 -r CmpdDataForPubChem -o Sample1.sdf | |
639 | |
640 And to add a publication date and request a release date using | |
641 PUBCHEM_PUBLICATION_DATE and PUBCHEM_DEPOSITOR_RECORD_DATE data fields | |
642 along with all the required and optional data fields in earlier examples, | |
643 type: | |
644 | |
645 % ModifySDFilesDataFields.pl --molnamemode datafield | |
646 --molnamereplace always --molname Mol_ID --mode both | |
647 --datafieldsmap | |
648 "PUBCHEM_EXT_DATASOURCE_REGID,Mol_ID; | |
649 PUBCHEM_SUBSTANCE_SYNONYM,Name,CompoundName" | |
650 --datafieldURL "PUBCHEM_EXT_DATASOURCE_URL, | |
651 http://www.yourdomain.org/GetCmpd.pl,Reg_ID,Mol_ID" | |
652 --datafieldscommon | |
653 "PUBCHEM_EXT_SUBSTANCE_URL,domain.org, | |
654 PUBCHEM_PUBLICATION_DATE,YYYY-MM-DD, | |
655 PUBCHEM_DEPOSITOR_RECORD_DATE,YYYY-MM-DD" | |
656 -r CmpdDataForPubChem -o Sample1.sdf | |
657 | |
658 =head1 AUTHOR | |
659 | |
660 Manish Sud <msud@san.rr.com> | |
661 | |
662 =head1 SEE ALSO | |
663 | |
664 InfoSDFiles.pl, JoinSDFiles.pl, MergeTextFilesWithSD.pl, SplitSDFiles.pl, SDFilesToHTML.pl | |
665 | |
666 =head1 COPYRIGHT | |
667 | |
668 Copyright (C) 2015 Manish Sud. All rights reserved. | |
669 | |
670 This file is part of MayaChemTools. | |
671 | |
672 MayaChemTools is free software; you can redistribute it and/or modify it under | |
673 the terms of the GNU Lesser General Public License as published by the Free | |
674 Software Foundation; either version 3 of the License, or (at your option) | |
675 any later version. | |
676 | |
677 =cut |