0
|
1 package SDFileUtil;
|
|
2 #
|
|
3 # $RCSfile: SDFileUtil.pm,v $
|
|
4 # $Date: 2015/02/28 20:47:18 $
|
|
5 # $Revision: 1.49 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability of fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use Exporter;
|
|
31 use Carp;
|
|
32 use PeriodicTable qw(IsElement);
|
|
33 use TimeUtil ();
|
|
34
|
|
35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
|
|
36
|
|
37 @ISA = qw(Exporter);
|
|
38 @EXPORT = qw(GenerateCmpdAtomLine GenerateCmpdBondLine GenerateCmpdChargePropertyLines GenerateCmpdCommentsLine GenerateCmpdCountsLine GenerateCmpdAtomAliasPropertyLines GenerateCmpdIsotopePropertyLines GenerateCmpdDataHeaderLabelsAndValuesLines GenerateCmpdMiscInfoLine GenerateCmpdRadicalPropertyLines GenerateCmpdMolNameLine GenerateEmptyCtabBlockLines GenerateMiscLineDateStamp GetAllAndCommonCmpdDataHeaderLabels GetCmpdDataHeaderLabels GetCmpdDataHeaderLabelsAndValues GetCmpdFragments GetCtabLinesCount GetUnknownAtoms GetInvalidAtomNumbers MDLChargeToInternalCharge InternalChargeToMDLCharge MDLBondTypeToInternalBondOrder InternalBondOrderToMDLBondType MDLBondStereoToInternalBondStereochemistry InternalBondStereochemistryToMDLBondStereo InternalSpinMultiplicityToMDLRadical MDLRadicalToInternalSpinMultiplicity IsCmpd3D IsCmpd2D ParseCmpdAtomLine ParseCmpdBondLine ParseCmpdCommentsLine ParseCmpdCountsLine ParseCmpdMiscInfoLine ParseCmpdMolNameLine ParseCmpdAtomAliasPropertyLine ParseCmpdChargePropertyLine ParseCmpdIsotopePropertyLine ParseCmpdRadicalPropertyLine ReadCmpdString RemoveCmpdDataHeaderLabelAndValue WashCmpd);
|
|
39 @EXPORT_OK = qw();
|
|
40 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);
|
|
41
|
|
42 # Format data for compounds count line...
|
|
43 sub GenerateCmpdCountsLine {
|
|
44 my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version, $Line);
|
|
45
|
|
46 if (@_ == 5) {
|
|
47 ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = @_;
|
|
48 }
|
|
49 elsif (@_ == 3) {
|
|
50 ($AtomCount, $BondCount, $ChiralFlag) = @_;
|
|
51 $PropertyCount = 999;
|
|
52 $Version = "V2000";
|
|
53 }
|
|
54 else {
|
|
55 ($AtomCount, $BondCount) = @_;
|
|
56 $ChiralFlag = 0;
|
|
57 $PropertyCount = 999;
|
|
58 $Version = "V2000";
|
|
59 }
|
|
60 if ($AtomCount > 999) {
|
|
61 croak "Error: SDFileUtil::GenerateCmpdCountsLine: The atom count, $AtomCount, exceeds maximum of 999 allowed for CTAB version 2000. The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools...";
|
|
62 }
|
|
63 $Line = sprintf "%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%6s", $AtomCount, $BondCount, 0, 0, $ChiralFlag, 0, 0, 0, 0, 0, $PropertyCount, $Version;
|
|
64
|
|
65 return ($Line);
|
|
66 }
|
|
67
|
|
68 # Generate comments line...
|
|
69 sub GenerateCmpdCommentsLine {
|
|
70 my($Comments) = @_;
|
|
71 my($Line);
|
|
72
|
|
73 $Line = (length($Comments) > 80) ? substr($Comments, 0, 80) : $Comments;
|
|
74
|
|
75 return $Line;
|
|
76 }
|
|
77
|
|
78 # Generate molname line...
|
|
79 sub GenerateCmpdMolNameLine {
|
|
80 my($MolName) = @_;
|
|
81 my($Line);
|
|
82
|
|
83 $Line = (length($MolName) > 80) ? substr($MolName, 0, 80) : $MolName;
|
|
84
|
|
85 return $Line;
|
|
86 }
|
|
87
|
|
88 # Generate data for compounds misc info line...
|
|
89 sub GenerateCmpdMiscInfoLine {
|
|
90 my($ProgramName, $UserInitial, $Code) = @_;
|
|
91 my($Date, $Line);
|
|
92
|
|
93 if (!(defined($ProgramName) && $ProgramName)) {
|
|
94 $ProgramName = "MayaChem";
|
|
95 }
|
|
96 if (!(defined($UserInitial) && $UserInitial)) {
|
|
97 $UserInitial = " ";
|
|
98 }
|
|
99 if (!(defined($Code) && $Code)) {
|
|
100 $Code = "2D";
|
|
101 }
|
|
102
|
|
103 if (length($ProgramName) > 8) {
|
|
104 $ProgramName = substr($ProgramName, 0, 8);
|
|
105 }
|
|
106 if (length($UserInitial) > 2) {
|
|
107 $UserInitial = substr($UserInitial, 0, 2);
|
|
108 }
|
|
109 if (length($Code) > 2) {
|
|
110 $Code = substr($Code, 0, 2);
|
|
111 }
|
|
112 $Date = GenerateMiscLineDateStamp();
|
|
113
|
|
114 $Line = "${UserInitial}${ProgramName}${Date}${Code}";
|
|
115
|
|
116 return $Line;
|
|
117 }
|
|
118
|
|
119 # Generate data for compounds misc info line...
|
|
120 sub GenerateEmptyCtabBlockLines {
|
|
121 my($Date, $Lines);
|
|
122
|
|
123 if (@_ == 1) {
|
|
124 ($Date) = @_;
|
|
125 }
|
|
126 else {
|
|
127 $Date = GenerateMiscLineDateStamp();
|
|
128 }
|
|
129 # First line: Blank molname line...
|
|
130 # Second line: Misc info...
|
|
131 # Third line: Blank comments line...
|
|
132 # Fourth line: Counts line reflecting empty structure data block...
|
|
133 $Lines = "\n";
|
|
134 $Lines .= " MayaChem${Date}2D\n";
|
|
135 $Lines .= "\n";
|
|
136 $Lines .= GenerateCmpdCountsLine(0, 0, 0) . "\n";
|
|
137 $Lines .= "M END";
|
|
138
|
|
139 return $Lines;
|
|
140 }
|
|
141
|
|
142 # Generate SD file data stamp...
|
|
143 sub GenerateMiscLineDateStamp {
|
|
144 return TimeUtil::SDFileTimeStamp();
|
|
145 }
|
|
146
|
|
147 # Generate data for compound atom line...
|
|
148 #
|
|
149 sub GenerateCmpdAtomLine {
|
|
150 my($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity) = @_;
|
|
151 my($Line);
|
|
152
|
|
153 if (!defined $MassDifference) {
|
|
154 $MassDifference = 0;
|
|
155 }
|
|
156 if (!defined $Charge) {
|
|
157 $Charge = 0;
|
|
158 }
|
|
159 if (!defined $StereoParity) {
|
|
160 $StereoParity = 0;
|
|
161 }
|
|
162 $Line = sprintf "%10.4f%10.4f%10.4f %-3s%2i%3i%3i 0 0 0 0 0 0 0 0 0", $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity;
|
|
163
|
|
164 return $Line
|
|
165 }
|
|
166
|
|
167 # Generate data for compound bond line...
|
|
168 #
|
|
169 sub GenerateCmpdBondLine {
|
|
170 my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = @_;
|
|
171 my($Line);
|
|
172
|
|
173 if (!defined $BondStereo) {
|
|
174 $BondStereo = 0;
|
|
175 }
|
|
176 $Line = sprintf "%3i%3i%3i%3i 0 0 0", $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo;
|
|
177
|
|
178 return $Line
|
|
179 }
|
|
180
|
|
181 # Generate charge property lines for CTAB block...
|
|
182 #
|
|
183 sub GenerateCmpdChargePropertyLines {
|
|
184 my($ChargeValuePairsRef) = @_;
|
|
185
|
|
186 return _GenerateCmpdGenericPropertyLines('Charge', $ChargeValuePairsRef);
|
|
187 }
|
|
188
|
|
189 # Generate isotope property lines for CTAB block...
|
|
190 #
|
|
191 sub GenerateCmpdIsotopePropertyLines {
|
|
192 my($IsotopeValuePairsRef) = @_;
|
|
193
|
|
194 return _GenerateCmpdGenericPropertyLines('Isotope', $IsotopeValuePairsRef);
|
|
195 }
|
|
196
|
|
197 # Generate radical property line property lines for CTAB block...
|
|
198 #
|
|
199 sub GenerateCmpdRadicalPropertyLines {
|
|
200 my($RadicalValuePairsRef) = @_;
|
|
201
|
|
202 return _GenerateCmpdGenericPropertyLines('Radical', $RadicalValuePairsRef);
|
|
203 }
|
|
204
|
|
205 # Generate atom alias property line property lines for CTAB block...
|
|
206 #
|
|
207 # Atom alias property line format:
|
|
208 #
|
|
209 # A aaa
|
|
210 # x...
|
|
211 #
|
|
212 # aaa: Atom number
|
|
213 # x: Atom alias in next line
|
|
214 #
|
|
215 sub GenerateCmpdAtomAliasPropertyLines {
|
|
216 my($PropertyValuePairsRef) = @_;
|
|
217 my($Index, $AtomNum, $AtomAlias, $Line, @PropertyLines);
|
|
218
|
|
219 @PropertyLines = ();
|
|
220
|
|
221 for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) {
|
|
222 $AtomNum = $PropertyValuePairsRef->[$Index];
|
|
223 $AtomAlias = $PropertyValuePairsRef->[$Index + 1];
|
|
224
|
|
225 $Line = "A " . sprintf "%3i", $AtomNum;
|
|
226
|
|
227 push @PropertyLines, $Line;
|
|
228 push @PropertyLines, $AtomAlias;
|
|
229 }
|
|
230
|
|
231 return @PropertyLines;
|
|
232 }
|
|
233
|
|
234 # Generate data header labels and values lines...
|
|
235 #
|
|
236 sub GenerateCmpdDataHeaderLabelsAndValuesLines {
|
|
237 my($DataHeaderLabelsRef, $DataHeaderLabelsAndValuesRef, $SortDataLabels) = @_;
|
|
238 my($DataLabel, $DataValue, @DataLabels, @DataLines);
|
|
239
|
|
240 if (!defined $SortDataLabels) {
|
|
241 $SortDataLabels = 0;
|
|
242 }
|
|
243
|
|
244 @DataLines = ();
|
|
245 @DataLabels = ();
|
|
246 if ($SortDataLabels) {
|
|
247 push @DataLabels, sort @{$DataHeaderLabelsRef};
|
|
248 }
|
|
249 else {
|
|
250 push @DataLabels, @{$DataHeaderLabelsRef};
|
|
251 }
|
|
252 for $DataLabel (@DataLabels) {
|
|
253 $DataValue = '';
|
|
254 if (exists $DataHeaderLabelsAndValuesRef->{$DataLabel}) {
|
|
255 $DataValue = $DataHeaderLabelsAndValuesRef->{$DataLabel};
|
|
256 }
|
|
257 push @DataLines, ("> <${DataLabel}>", "$DataValue", "");
|
|
258 }
|
|
259 return @DataLines;
|
|
260 }
|
|
261
|
|
262 # Parse data field header in SD file and return lists of all and common data field
|
|
263 # labels.
|
|
264 sub GetAllAndCommonCmpdDataHeaderLabels {
|
|
265 my($SDFileRef) = @_;
|
|
266 my($CmpdCount, $CmpdString, $Label, @CmpdLines, @DataFieldLabels, @CommonDataFieldLabels, %DataFieldLabelsMap);
|
|
267
|
|
268 $CmpdCount = 0;
|
|
269 @DataFieldLabels = ();
|
|
270 @CommonDataFieldLabels = ();
|
|
271 %DataFieldLabelsMap = ();
|
|
272
|
|
273 while ($CmpdString = ReadCmpdString($SDFileRef)) {
|
|
274 $CmpdCount++;
|
|
275 @CmpdLines = split "\n", $CmpdString;
|
|
276 # Process compound data header labels and figure out which ones are present for
|
|
277 # all the compounds...
|
|
278 if (@DataFieldLabels) {
|
|
279 my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);
|
|
280 my(%CmpdDataFieldLabelsMap) = ();
|
|
281 # Setup a map for the current labels...
|
|
282 for $Label (@CmpdDataFieldLabels) {
|
|
283 $CmpdDataFieldLabelsMap{$Label} = "PresentInSome";
|
|
284 }
|
|
285 # Check the presence old labels for this compound; otherwise, mark 'em new...
|
|
286 for $Label (@DataFieldLabels) {
|
|
287 if (!$CmpdDataFieldLabelsMap{$Label}) {
|
|
288 $DataFieldLabelsMap{$Label} = "PresentInSome";
|
|
289 }
|
|
290 }
|
|
291 # Check the presence this compound in the old labels; otherwise, add 'em...
|
|
292 for $Label (@CmpdDataFieldLabels ) {
|
|
293 if (!$DataFieldLabelsMap{$Label}) {
|
|
294 # It's a new label...
|
|
295 push @DataFieldLabels, $Label;
|
|
296 $DataFieldLabelsMap{$Label} = "PresentInSome";
|
|
297 }
|
|
298 }
|
|
299 }
|
|
300 else {
|
|
301 # Get the initial label set and set up a map...
|
|
302 @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
|
|
303 for $Label (@DataFieldLabels) {
|
|
304 $DataFieldLabelsMap{$Label} = "PresentInAll";
|
|
305 }
|
|
306 }
|
|
307 }
|
|
308 # Identify the common data field labels...
|
|
309 @CommonDataFieldLabels = ();
|
|
310 for $Label (@DataFieldLabels) {
|
|
311 if ($DataFieldLabelsMap{$Label} eq "PresentInAll") {
|
|
312 push @CommonDataFieldLabels, $Label;
|
|
313 }
|
|
314 }
|
|
315 return ($CmpdCount, \@DataFieldLabels, \@CommonDataFieldLabels);
|
|
316 }
|
|
317
|
|
318 # Parse all the data header labels and return 'em as an list...
|
|
319 #
|
|
320 # Format:
|
|
321 #
|
|
322 #> Data header line
|
|
323 #Data line(s)
|
|
324 #Blank line
|
|
325 #
|
|
326 # [Data Header] (one line) precedes each item of data, starts with a greater than (>) sign, and
|
|
327 # contains at least one of the following:
|
|
328 # The field name enclosed in angle brackets. For example: <melting.point>
|
|
329 # The field number, DTn , where n represents the number assigned to the field in a MACCS-II database
|
|
330 #
|
|
331 #Optional information for the data header includes:
|
|
332 # The compound’s external and internal registry numbers. External registry numbers must be enclosed in parentheses.
|
|
333 # Any combination of information
|
|
334 #
|
|
335 #The following are examples of valid data headers:
|
|
336 #> <MELTING.POINT>
|
|
337 #> 55 (MD-08974) <BOILING.POINT> DT12
|
|
338 #> DT12 55
|
|
339 #> (MD-0894) <BOILING.POINT> FROM ARCHIVES
|
|
340 #
|
|
341 #Notes: Sometimes last blank line is missing and can be just followed by $$$$
|
|
342 #
|
|
343 sub GetCmpdDataHeaderLabels {
|
|
344 my($CmpdLines) = @_;
|
|
345 my($CmpdLine, $Label, @Labels);
|
|
346
|
|
347 @Labels = ();
|
|
348 CMPDLINE: for $CmpdLine (@$CmpdLines) {
|
|
349 if ($CmpdLine !~ /^>/) {
|
|
350 next CMPDLINE;
|
|
351 }
|
|
352 # Does the line contains field name enclosed in angular brackets?
|
|
353 ($Label) = $CmpdLine =~ /<.*?>/g;
|
|
354 if (!defined($Label)) {
|
|
355 next CMPDLINE;
|
|
356 }
|
|
357 $Label =~ s/(<|>)//g;
|
|
358 push @Labels, $Label;
|
|
359 }
|
|
360 return (@Labels);
|
|
361 }
|
|
362
|
|
363 # Parse all the data header labels and values
|
|
364 sub GetCmpdDataHeaderLabelsAndValues {
|
|
365 my($CmpdLines) = @_;
|
|
366 my($CmpdLine, $CurrentLabel, $Label, $Value, $ValueCount, $ProcessingLabelData, @Values, %DataFields);
|
|
367
|
|
368 %DataFields = ();
|
|
369 $ProcessingLabelData = 0;
|
|
370 $ValueCount = 0;
|
|
371 CMPDLINE: for $CmpdLine (@$CmpdLines) {
|
|
372 if ($CmpdLine =~ /^\$\$\$\$/) {
|
|
373 last CMPDLINE;
|
|
374 }
|
|
375 if ($CmpdLine =~ /^>/) {
|
|
376 # Does the line contains field name enclosed in angular brackets?
|
|
377 ($Label) = $CmpdLine =~ /<.*?>/g;
|
|
378 if (defined $Label) {
|
|
379 $CurrentLabel = $Label;
|
|
380 $CurrentLabel =~ s/(<|>)//g;
|
|
381 $ProcessingLabelData = 0;
|
|
382 $ValueCount = 0;
|
|
383
|
|
384 if ($CurrentLabel) {
|
|
385 $ProcessingLabelData = 1;
|
|
386 $DataFields{$CurrentLabel} = '';
|
|
387 next CMPDLINE;
|
|
388 }
|
|
389 }
|
|
390 else {
|
|
391 if (!$ProcessingLabelData) {
|
|
392 # Data line containing no <label> as allowed by SDF format. Just ignore it...
|
|
393 next CMPDLINE;
|
|
394 }
|
|
395 }
|
|
396 }
|
|
397 if (!$ProcessingLabelData) {
|
|
398 next CMPDLINE;
|
|
399 }
|
|
400 if (!(defined($CmpdLine) && length($CmpdLine))) {
|
|
401 # Blank line terminates value for a label...
|
|
402 $CurrentLabel = '';
|
|
403 $ValueCount = 0;
|
|
404 $ProcessingLabelData = 0;
|
|
405 next CMPDLINE;
|
|
406 }
|
|
407 $ValueCount++;
|
|
408 $Value = $CmpdLine;
|
|
409
|
|
410 if ($ValueCount > 1) {
|
|
411 $DataFields{$CurrentLabel} .= "\n" . $Value;
|
|
412 }
|
|
413 else {
|
|
414 $DataFields{$CurrentLabel} = $Value;
|
|
415 }
|
|
416 }
|
|
417 return (%DataFields);
|
|
418 }
|
|
419
|
|
420 # Return an updated compoud string after removing data header label along with its
|
|
421 # value from the specified compound string...
|
|
422 #
|
|
423 sub RemoveCmpdDataHeaderLabelAndValue {
|
|
424 my($CmpdString, $DataHeaderLabel) = @_;
|
|
425 my($Line, $PorcessingDataHeaderLabel, @CmpdLines);
|
|
426
|
|
427 @CmpdLines = ();
|
|
428 $PorcessingDataHeaderLabel = 0;
|
|
429
|
|
430 CMPDLINE: for $Line (split "\n", $CmpdString) {
|
|
431 if ($Line =~ /^>/ && $Line =~ /<$DataHeaderLabel>/i) {
|
|
432 $PorcessingDataHeaderLabel = 1;
|
|
433 next CMPDLINE;
|
|
434 }
|
|
435
|
|
436 if ($PorcessingDataHeaderLabel) {
|
|
437 # Blank line indicates end of fingerprints data value...
|
|
438 if ($Line =~ /^\$\$\$\$/) {
|
|
439 push @CmpdLines, $Line;
|
|
440 $PorcessingDataHeaderLabel = 0;
|
|
441 }
|
|
442 elsif (!length($Line)) {
|
|
443 $PorcessingDataHeaderLabel = 0;
|
|
444 }
|
|
445 next CMPDLINE;
|
|
446 }
|
|
447
|
|
448 # Track compound lines without fingerprints data...
|
|
449 push @CmpdLines, $Line;
|
|
450 }
|
|
451
|
|
452 return join "\n", @CmpdLines;
|
|
453 }
|
|
454
|
|
455 #
|
|
456 # Using bond blocks, figure out the number of disconnected fragments and
|
|
457 # return their values along with the atom numbers in a string delimited by new
|
|
458 # line character.
|
|
459 #
|
|
460 sub GetCmpdFragments {
|
|
461 my($CmpdLines) = @_;
|
|
462 my($AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, @AtomConnections, $BondType, $FragmentString, $FragmentCount, $LineIndex, $Index, $AtomNum, $NbrAtomNum, @ProcessedAtoms, $ProcessedAtomCount, $ProcessAtomNum, @ProcessingAtoms, @ConnectedAtoms, %Fragments, $FragmentNum, $AFragmentString);
|
|
463
|
|
464 # Setup the connection table for each atom...
|
|
465 @AtomConnections = ();
|
|
466 ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
|
|
467 for $AtomNum (1 .. $AtomCount) {
|
|
468 %{$AtomConnections[$AtomNum]} = ();
|
|
469 }
|
|
470 for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
|
|
471 ($FirstAtomNum, $SecondAtomNum, $BondType) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
|
|
472 if (!$AtomConnections[$FirstAtomNum]{$SecondAtomNum}) {
|
|
473 $AtomConnections[$FirstAtomNum]{$SecondAtomNum} = $BondType;
|
|
474 }
|
|
475 if (!$AtomConnections[$SecondAtomNum]{$FirstAtomNum}) {
|
|
476 $AtomConnections[$SecondAtomNum]{$FirstAtomNum} = $BondType;
|
|
477 }
|
|
478 }
|
|
479
|
|
480 #Get set to count fragments...
|
|
481 $ProcessedAtomCount = 0;
|
|
482 $FragmentNum = 0;
|
|
483 %Fragments = ();
|
|
484 @ProcessedAtoms = ();
|
|
485 for $AtomNum (1 .. $AtomCount) {
|
|
486 $ProcessedAtoms[$AtomNum] = 0;
|
|
487 }
|
|
488 while ($ProcessedAtomCount < $AtomCount) {
|
|
489 @ProcessingAtoms = ();
|
|
490 @ConnectedAtoms = ();
|
|
491 ATOMNUM: for $AtomNum (1 .. $AtomCount) {
|
|
492 if (!$ProcessedAtoms[$AtomNum]) {
|
|
493 $ProcessedAtomCount++;
|
|
494 $ProcessedAtoms[$AtomNum] = 1;
|
|
495 push @ProcessingAtoms, $AtomNum;
|
|
496 $FragmentNum++;
|
|
497 @{$Fragments{$FragmentNum} } = ();
|
|
498 push @{$Fragments{$FragmentNum} }, $AtomNum;
|
|
499 last ATOMNUM;
|
|
500 }
|
|
501 }
|
|
502
|
|
503 # Go over the neighbors and follow the connection trail while collecting the
|
|
504 # atoms numbers present in the connected fragment...
|
|
505 while (@ProcessingAtoms) {
|
|
506 for ($Index = 0; $Index < @ProcessingAtoms; $Index++) {
|
|
507 $ProcessAtomNum = $ProcessingAtoms[$Index];
|
|
508 for $NbrAtomNum (keys %{$AtomConnections[$ProcessAtomNum]}) {
|
|
509 if (!$ProcessedAtoms[$NbrAtomNum]) {
|
|
510 $ProcessedAtomCount++;
|
|
511 $ProcessedAtoms[$NbrAtomNum] = 1;
|
|
512 push @ConnectedAtoms, $NbrAtomNum;
|
|
513 push @{ $Fragments{$FragmentNum} }, $NbrAtomNum;
|
|
514 }
|
|
515 }
|
|
516 }
|
|
517 @ProcessingAtoms = ();
|
|
518 @ProcessingAtoms = @ConnectedAtoms;
|
|
519 @ConnectedAtoms = ();
|
|
520 }
|
|
521 }
|
|
522 $FragmentCount = $FragmentNum;
|
|
523 $FragmentString = "";
|
|
524
|
|
525 # Sort out the fragments by size...
|
|
526 for $FragmentNum (sort { @{$Fragments{$b}} <=> @{$Fragments{$a}} } keys %Fragments ) {
|
|
527 # Sort the atoms in a fragment by their numbers...
|
|
528 $AFragmentString = join " ", sort { $a <=> $b } @{ $Fragments{$FragmentNum} };
|
|
529 if ($FragmentString) {
|
|
530 $FragmentString .= "\n" . $AFragmentString;
|
|
531 }
|
|
532 else {
|
|
533 $FragmentString = $AFragmentString;
|
|
534 }
|
|
535 }
|
|
536 return ($FragmentCount, $FragmentString);
|
|
537 }
|
|
538
|
|
539 # Count number of lines present in between 4th and line containg "M END"
|
|
540 sub GetCtabLinesCount {
|
|
541 my($CmpdLines) = @_;
|
|
542 my($LineIndex, $CtabLinesCount);
|
|
543
|
|
544 $CtabLinesCount = 0;
|
|
545 LINE: for ($LineIndex = 4; $LineIndex < @$CmpdLines; $LineIndex++) {
|
|
546 #
|
|
547 # Any line after atom and bond data starting with anything other than space or
|
|
548 # a digit indicates end of Ctab atom/bond data block...
|
|
549 #
|
|
550 if (@$CmpdLines[$LineIndex] !~ /^[0-9 ]/) {
|
|
551 $CtabLinesCount = $LineIndex - 4;
|
|
552 last LINE;
|
|
553 }
|
|
554 }
|
|
555 return $CtabLinesCount;
|
|
556 }
|
|
557
|
|
558 # Using atom blocks, count the number of atoms which contain special element
|
|
559 # symbols not present in the periodic table.
|
|
560 sub GetUnknownAtoms {
|
|
561 my($CmpdLines) = @_;
|
|
562 my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines, $LineIndex, $AtomCount, $AtomSymbol);
|
|
563
|
|
564 $UnknownAtomCount = 0;
|
|
565 $UnknownAtoms = "";
|
|
566 $UnknownAtomLines = "";
|
|
567 ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
|
|
568 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
|
|
569 ($AtomSymbol) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]);
|
|
570 if (!IsElement($AtomSymbol)) {
|
|
571 $UnknownAtomCount++;
|
|
572 $UnknownAtoms .= " $AtomSymbol";
|
|
573 if ($UnknownAtomLines) {
|
|
574 $UnknownAtomLines .= "\n" . @$CmpdLines[$LineIndex];
|
|
575 }
|
|
576 else {
|
|
577 $UnknownAtomLines = @$CmpdLines[$LineIndex];
|
|
578 }
|
|
579 }
|
|
580 }
|
|
581 return ($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines);
|
|
582 }
|
|
583
|
|
584 # Check z coordinates of all atoms to see whether any of them is non-zero
|
|
585 # which makes the compound geometry three dimensional...
|
|
586 #
|
|
587 sub IsCmpd3D {
|
|
588 my($CmpdLines) = @_;
|
|
589 my($LineIndex, $AtomCount, $AtomSymbol, $AtomX, $AtomY, $AtomZ);
|
|
590
|
|
591 ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
|
|
592 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
|
|
593 ($AtomSymbol, $AtomX, $AtomY, $AtomZ) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]);
|
|
594 if ($AtomZ != 0) {
|
|
595 return 1;
|
|
596 }
|
|
597 }
|
|
598 return 0;
|
|
599 }
|
|
600
|
|
601 # Check whether it's a 2D compound...
|
|
602 #
|
|
603 sub IsCmpd2D {
|
|
604 my($CmpdLines) = @_;
|
|
605
|
|
606 return IsCmpd3D($CmpdLines) ? 0 : 1;
|
|
607 }
|
|
608
|
|
609 # Using bond blocks, count the number of bond lines which contain atom numbers
|
|
610 # greater than atom count specified in compound count line...
|
|
611 #
|
|
612 sub GetInvalidAtomNumbers {
|
|
613 my($CmpdLines) = @_;
|
|
614 my($LineIndex, $AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, $InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines, $Line, $InvalidAtomPropertyLine, $ValuePairIndex, $AtomNum, $Value, @ValuePairs);
|
|
615
|
|
616 ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
|
|
617
|
|
618 $InvalidAtomNumbersCount = 0;
|
|
619 $InvalidAtomNumbers = "";
|
|
620 $InvalidAtomNumberLines = "";
|
|
621
|
|
622 # Go over bond block lines...
|
|
623 LINE: for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
|
|
624 ($FirstAtomNum, $SecondAtomNum) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
|
|
625 if ($FirstAtomNum <= $AtomCount && $SecondAtomNum <= $AtomCount) {
|
|
626 next LINE;
|
|
627 }
|
|
628 if ($FirstAtomNum > $AtomCount) {
|
|
629 $InvalidAtomNumbersCount++;
|
|
630 $InvalidAtomNumbers .= " $FirstAtomNum";
|
|
631 }
|
|
632 if ($SecondAtomNum > $AtomCount) {
|
|
633 $InvalidAtomNumbersCount++;
|
|
634 $InvalidAtomNumbers .= " $SecondAtomNum";
|
|
635 }
|
|
636 if ($InvalidAtomNumberLines) {
|
|
637 $InvalidAtomNumberLines .= "\n" . @$CmpdLines[$LineIndex];
|
|
638 }
|
|
639 else {
|
|
640 $InvalidAtomNumberLines = @$CmpdLines[$LineIndex];
|
|
641 }
|
|
642 }
|
|
643 # Go over property lines before M END...
|
|
644 #
|
|
645 LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) {
|
|
646 $Line = @$CmpdLines[$LineIndex];
|
|
647 @ValuePairs = ();
|
|
648 if ($Line =~ /^M END/i) {
|
|
649 last LINE;
|
|
650 }
|
|
651 @ValuePairs = ();
|
|
652 if ($Line =~ /^M CHG/i) {
|
|
653 @ValuePairs = ParseCmpdChargePropertyLine($Line);
|
|
654 }
|
|
655 elsif ($Line =~ /^M RAD/i) {
|
|
656 @ValuePairs = ParseCmpdRadicalPropertyLine($Line);
|
|
657 }
|
|
658 elsif ($Line =~ /^M ISO/i) {
|
|
659 @ValuePairs = ParseCmpdIsotopePropertyLine($Line);
|
|
660 }
|
|
661 elsif ($Line =~ /^A /i) {
|
|
662 my($NextLine);
|
|
663 $LineIndex++;
|
|
664 $NextLine = @$CmpdLines[$LineIndex];
|
|
665 @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine);
|
|
666 }
|
|
667 else {
|
|
668 next LINE;
|
|
669 }
|
|
670
|
|
671 $InvalidAtomPropertyLine = 0;
|
|
672 for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) {
|
|
673 $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1];
|
|
674 if ($AtomNum > $AtomCount) {
|
|
675 $InvalidAtomPropertyLine = 1;
|
|
676 $InvalidAtomNumbersCount++;
|
|
677 $InvalidAtomNumbers .= " $AtomNum";
|
|
678 }
|
|
679 }
|
|
680 if ($InvalidAtomPropertyLine) {
|
|
681 if ($InvalidAtomNumberLines) {
|
|
682 $InvalidAtomNumberLines .= "\n" . $Line;
|
|
683 }
|
|
684 else {
|
|
685 $InvalidAtomNumberLines = $Line;
|
|
686 }
|
|
687 }
|
|
688 }
|
|
689
|
|
690 return ($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines);
|
|
691 }
|
|
692
|
|
693 # Ctab lines: Atom block
|
|
694 #
|
|
695 # Format: xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee
|
|
696 # A10 A10 A10 xA3 A2A3 A3 A3 A3 A3 A3 A3 A3 A3 A3 A3
|
|
697 # x,y,z: Atom coordinates
|
|
698 # aaa: Atom symbol. Entry in periodic table or L for atom list, A, Q, * for unspecified
|
|
699 # atom, and LP for lone pair, or R# for Rgroup label
|
|
700 # dd: Mass difference. -3, -2, -1, 0, 1, 2, 3, 4 (0 for value beyond these limits)
|
|
701 # ccc: Charge. 0 = uncharged or value other than these, 1 = +3, 2 = +2, 3 = +1,
|
|
702 # 4 = doublet radical, 5 = -1, 6 = -2, 7 = -3
|
|
703 # sss: Atom stereo parity. 0 = not stereo, 1 = odd, 2 = even, 3 = either or unmarked stereo center
|
|
704 # hhh: Hydrogen count + 1. 1 = H0, 2 = H1, 3 = H2, 4 = H3, 5 = H4
|
|
705 # bbb: Stereo care box. 0 = ignore stereo configuration of this double bond atom, 1 = stereo
|
|
706 # configuration of double bond atom must match
|
|
707 # vvv: Valence. 0 = no marking (default)(1 to 14) = (1 to 14) 15 = zero valence
|
|
708 # HHH: H0 designator. 0 = not specified, 1 = no H atoms allowed (redundant due to hhh)
|
|
709 # rrr: Not used
|
|
710 # iii: Not used
|
|
711 # mmm: Atom-atom mapping number. 1 - number of atoms
|
|
712 # nnn: Inversion/retention flag. 0 = property not applied, 1 = configuration is inverted,
|
|
713 # 2 = configuration is retained.
|
|
714 # eee: Exact change flag. 0 = property not applied, 1 = change on atom must be
|
|
715 # exactly as shown
|
|
716 #
|
|
717 # Notes:
|
|
718 # . StereoParity: 1 - ClockwiseStereo, 2 - AntiClockwiseStereo; 3 - Either; 0 - none. These
|
|
719 # values determine chirailty around the chiral center; a non zero value indicates atom
|
|
720 # has been marked as chiral center.
|
|
721 #
|
|
722 sub ParseCmpdAtomLine {
|
|
723 my($Line) = @_;
|
|
724 my ($LineIndex, $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity);
|
|
725
|
|
726 ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = ('') x 7;
|
|
727 if (length($Line) > 31) {
|
|
728 ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = unpack("A10A10A10xA3A2A3A3", $Line);
|
|
729 }
|
|
730 else {
|
|
731 ($AtomX, $AtomY, $AtomZ, $AtomSymbol) = unpack("A10A10A10", $Line);
|
|
732 }
|
|
733 return ($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity);
|
|
734 }
|
|
735
|
|
736 # Map MDL charge value used in SD and MOL files to internal charge used by MayaChemTools.
|
|
737 #
|
|
738 sub MDLChargeToInternalCharge {
|
|
739 my($MDLCharge) = @_;
|
|
740 my($InternalCharge);
|
|
741
|
|
742 CHARGE: {
|
|
743 if ($MDLCharge == 0) { $InternalCharge = 0; last CHARGE;}
|
|
744 if ($MDLCharge == 1) { $InternalCharge = 3; last CHARGE;}
|
|
745 if ($MDLCharge == 2) { $InternalCharge = 2; last CHARGE;}
|
|
746 if ($MDLCharge == 3) { $InternalCharge = 1; last CHARGE;}
|
|
747 if ($MDLCharge == 5) { $InternalCharge = -1; last CHARGE;}
|
|
748 if ($MDLCharge == 6) { $InternalCharge = -2; last CHARGE;}
|
|
749 if ($MDLCharge == 7) { $InternalCharge = -3; last CHARGE;}
|
|
750 # All other MDL charge values, including 4 corresponding to "doublet radical",
|
|
751 # are assigned internal value of 0.
|
|
752 $InternalCharge = 0;
|
|
753 if ($MDLCharge != 4) {
|
|
754 carp "Warning: MDLChargeToInternalCharge: MDL charge value, $MDLCharge, is not supported: An internal charge value, 0, has been assigned...";
|
|
755 }
|
|
756 }
|
|
757 return $InternalCharge;
|
|
758 }
|
|
759
|
|
760 # Map internal charge used by MayaChemTools to MDL charge value used in SD and MOL files.
|
|
761 #
|
|
762 sub InternalChargeToMDLCharge {
|
|
763 my($InternalCharge) = @_;
|
|
764 my($MDLCharge);
|
|
765
|
|
766 CHARGE: {
|
|
767 if ($InternalCharge == 3) { $MDLCharge = 1; last CHARGE;}
|
|
768 if ($InternalCharge == 2) { $MDLCharge = 2; last CHARGE;}
|
|
769 if ($InternalCharge == 1) { $MDLCharge = 3; last CHARGE;}
|
|
770 if ($InternalCharge == -1) { $MDLCharge = 5; last CHARGE;}
|
|
771 if ($InternalCharge == -2) { $MDLCharge = 6; last CHARGE;}
|
|
772 if ($InternalCharge == -3) { $MDLCharge = 7; last CHARGE;}
|
|
773 # All other MDL charge values, including 4 corresponding to "doublet radical",
|
|
774 # are assigned internal value of 0.
|
|
775 $MDLCharge = 0;
|
|
776 }
|
|
777 return $MDLCharge;
|
|
778 }
|
|
779
|
|
780 # Ctab lines: Bond block
|
|
781 #
|
|
782 # Format: 111222tttsssxxxrrrccc
|
|
783 #
|
|
784 # 111: First atom number.
|
|
785 # 222: Second atom number.
|
|
786 # ttt: Bond type. 1 = Single, 2 = Double, 3 = Triple, 4 = Aromatic, 5 = Single or Double,
|
|
787 # 6 = Single or Aromatic, 7 = Double or Aromatic, 8 = Any
|
|
788 # sss: Bond stereo. Single bonds: 0 = not stereo, 1 = Up, 4 = Either, 6 = Down,
|
|
789 # Double bonds: 0 = Use x-, y-, z-coords from atom block to determine cis or trans,
|
|
790 # 3 = Cis or trans (either) double bond
|
|
791 # xxx: Not used
|
|
792 # rrr: Bond topology. 0 = Either, 1 = Ring, 2 = Chain
|
|
793 # ccc: Reacting center status. 0 = unmarked, 1 = a center, -1 = not a center,
|
|
794 # Additional: 2 = no change,4 = bond made/broken, 8 = bond order changes 12 = 4+8
|
|
795 # (both made/broken and changes); 5 = (4 + 1), 9 = (8 + 1), and 13 = (12 + 1) are also possible
|
|
796 #
|
|
797 sub ParseCmpdBondLine {
|
|
798 my($Line) = @_;
|
|
799 my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo);
|
|
800
|
|
801 ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = map {s/ //g; $_} unpack("A3A3A3A3", $Line);
|
|
802 return ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo);
|
|
803 }
|
|
804
|
|
805 # Map MDL bond type value used in SD and MOL files to internal bond order and bond types
|
|
806 # values used by MayaChemTools...
|
|
807 #
|
|
808 sub MDLBondTypeToInternalBondOrder {
|
|
809 my($MDLBondType) = @_;
|
|
810 my($InternalBondOrder, $InternalBondType);
|
|
811
|
|
812 $InternalBondType = '';
|
|
813
|
|
814 BONDTYPE: {
|
|
815 if ($MDLBondType == 1) { $InternalBondOrder = 1; $InternalBondType = 'Single'; last BONDTYPE;}
|
|
816 if ($MDLBondType == 2) { $InternalBondOrder = 2; $InternalBondType = 'Double'; last BONDTYPE;}
|
|
817 if ($MDLBondType == 3) { $InternalBondOrder = 3; $InternalBondType = 'Triple'; last BONDTYPE;}
|
|
818 if ($MDLBondType == 4) { $InternalBondOrder = 1.5; $InternalBondType = 'Aromatic'; last BONDTYPE;} # Aromatic
|
|
819 if ($MDLBondType == 5) { $InternalBondOrder = 1; $InternalBondType = 'SingleOrDouble'; last BONDTYPE;} # Aromatic
|
|
820 if ($MDLBondType == 6) { $InternalBondOrder = 1; $InternalBondType = 'SingleOrAromatic'; last BONDTYPE;} # Aromatic
|
|
821 if ($MDLBondType == 7) { $InternalBondOrder = 2; $InternalBondType = 'DoubleOrAromatic'; last BONDTYPE;} # Aromatic
|
|
822 if ($MDLBondType == 8) { $InternalBondOrder = 1; $InternalBondType = 'Any'; last BONDTYPE;} # Aromatic
|
|
823 #
|
|
824 # Although MDL aromatic bond values are used for query only and explicit Kekule bond order
|
|
825 # values must be assigned, internal value of 1.5 is allowed to indicate aromatic bond orders.
|
|
826 #
|
|
827 # All other MDL bond type values - 5 = Single or Double, 6 = Single or Aromatic, 7 = Double or Aromatic,
|
|
828 # 8 = Any - are also assigned appropriate internal value of 1: These are meant to be used for
|
|
829 # structure queries by MDL products.
|
|
830 #
|
|
831 $InternalBondOrder = 1;
|
|
832 $InternalBondType = 'Single';
|
|
833
|
|
834 carp "Warning: MDLBondTypeToInternalBondOrder: MDL bond type value, $MDLBondType, is not supported: An internal bond order value, 0, has been assigned...";
|
|
835 }
|
|
836 return ($InternalBondOrder, $InternalBondType);
|
|
837 }
|
|
838
|
|
839 # Map internal bond order and bond type values used by MayaChemTools to MDL bond type value used
|
|
840 # in SD and MOL files...
|
|
841 #
|
|
842 sub InternalBondOrderToMDLBondType {
|
|
843 my($InternalBondOrder, $InternalBondType) = @_;
|
|
844 my($MDLBondType);
|
|
845
|
|
846 BONDTYPE: {
|
|
847 if ($InternalBondOrder == 1) {
|
|
848 if ($InternalBondType =~ /^SingleOrDouble$/i) {
|
|
849 $MDLBondType = 5;
|
|
850 }
|
|
851 elsif ($InternalBondType =~ /^SingleOrAromatic$/i) {
|
|
852 $MDLBondType = 6;
|
|
853 }
|
|
854 elsif ($InternalBondType =~ /^Any$/i) {
|
|
855 $MDLBondType = 8;
|
|
856 }
|
|
857 else {
|
|
858 $MDLBondType = 1;
|
|
859 }
|
|
860 $MDLBondType = 1;
|
|
861 last BONDTYPE;
|
|
862 }
|
|
863 if ($InternalBondOrder == 2) {
|
|
864 if ($InternalBondType =~ /^DoubleOrAromatic$/i) {
|
|
865 $MDLBondType = 7;
|
|
866 }
|
|
867 else {
|
|
868 $MDLBondType = 2;
|
|
869 }
|
|
870 last BONDTYPE;
|
|
871 }
|
|
872 if ($InternalBondOrder == 3) { $MDLBondType = 3; last BONDTYPE;}
|
|
873 if ($InternalBondOrder == 1.5) { $MDLBondType = 4; last BONDTYPE;}
|
|
874 if ($InternalBondType =~ /^Any$/i) { $MDLBondType = 8; last BONDTYPE;}
|
|
875
|
|
876 $MDLBondType = 1;
|
|
877
|
|
878 carp "Warning: InternalBondOrderToMDLBondType: Internal bond order and type values, $InternalBondOrder and $InternalBondType, don't match any valid MDL bond type: MDL bond type value, 1, has been assigned...";
|
|
879 }
|
|
880 return $MDLBondType;
|
|
881 }
|
|
882
|
|
883 # Third line: Comments - A blank line is also allowed.
|
|
884 sub ParseCmpdCommentsLine {
|
|
885 my($Line) = @_;
|
|
886 my($Comments);
|
|
887
|
|
888 $Comments = unpack("A80", $Line);
|
|
889
|
|
890 return ($Comments);
|
|
891 }
|
|
892
|
|
893 # Map MDL bond stereo value used in SD and MOL files to internal bond stereochemistry values used by MayaChemTools...
|
|
894 #
|
|
895 sub MDLBondStereoToInternalBondStereochemistry {
|
|
896 my($MDLBondStereo) = @_;
|
|
897 my($InternalBondStereo);
|
|
898
|
|
899 $InternalBondStereo = '';
|
|
900
|
|
901 BONDSTEREO: {
|
|
902 if ($MDLBondStereo == 1) { $InternalBondStereo = 'Up'; last BONDSTEREO;}
|
|
903 if ($MDLBondStereo == 4) { $InternalBondStereo = 'UpOrDown'; last BONDSTEREO;}
|
|
904 if ($MDLBondStereo == 6) { $InternalBondStereo = 'Down'; last BONDSTEREO;}
|
|
905 if ($MDLBondStereo == 3) { $InternalBondStereo = 'CisOrTrans'; last BONDSTEREO;}
|
|
906 if ($MDLBondStereo == 0) { $InternalBondStereo = 'None'; last BONDSTEREO;}
|
|
907
|
|
908 $InternalBondStereo = '';
|
|
909 carp "Warning: MDLBondStereoToInternalBondType: MDL bond stereo value, $MDLBondStereo, is not supported: It has been ignored and bond order would be used to determine bond type...";
|
|
910 }
|
|
911 return $InternalBondStereo;
|
|
912 }
|
|
913
|
|
914 # Map internal bond stereochemistry values used by MayaChemTools to MDL bond stereo value used in SD and MOL files...
|
|
915 #
|
|
916 sub InternalBondStereochemistryToMDLBondStereo {
|
|
917 my($InternalBondStereo) = @_;
|
|
918 my($MDLBondStereo);
|
|
919
|
|
920 $MDLBondStereo = 0;
|
|
921
|
|
922 BONDSTEREO: {
|
|
923 if ($InternalBondStereo =~ /^Up$/i) { $MDLBondStereo = 1; last BONDSTEREO;}
|
|
924 if ($InternalBondStereo =~ /^UpOrDown$/i) { $MDLBondStereo = 4; last BONDSTEREO;}
|
|
925 if ($InternalBondStereo =~ /^Down$/) { $MDLBondStereo = 6; last BONDSTEREO;}
|
|
926 if ($InternalBondStereo =~ /^CisOrTrans$/) { $MDLBondStereo = 3; last BONDSTEREO;}
|
|
927
|
|
928 $MDLBondStereo = 0;
|
|
929 }
|
|
930 return $MDLBondStereo;
|
|
931 }
|
|
932
|
|
933 # Fourth line: Counts
|
|
934 #
|
|
935 # Format: aaabbblllfffcccsssxxxrrrpppiiimmmvvvvvv
|
|
936 #
|
|
937 # aaa: number of atoms; bbb: number of bonds; lll: number of atom lists; fff: (obsolete)
|
|
938 # ccc: chiral flag: 0=not chiral, 1=chiral; sss: number of stext entries; xxx,rrr,ppp,iii:
|
|
939 # (obsolete); mmm: number of lines of additional properties, including the M END line, No
|
|
940 # longer supported, default is set to 999; vvvvvv: version
|
|
941
|
|
942 sub ParseCmpdCountsLine {
|
|
943 my($Line) = @_;
|
|
944 my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version);
|
|
945
|
|
946 if (length($Line) >= 39) {
|
|
947 ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = unpack("A3A3x3x3A3x3x3x3x3x3A3A6", $Line);
|
|
948 }
|
|
949 elsif (length($Line) >= 15) {
|
|
950 ($PropertyCount, $Version) = ("999", "v2000");
|
|
951 ($AtomCount, $BondCount, $ChiralFlag) = unpack("A3A3x3x3A3", $Line);
|
|
952 }
|
|
953 else {
|
|
954 ($ChiralFlag, $PropertyCount, $Version) = ("0", "999", "v2000");
|
|
955 ($AtomCount, $BondCount) = unpack("A3A3", $Line);
|
|
956 }
|
|
957
|
|
958 if ($Version =~ /V3000/i) {
|
|
959 # Current version of MayaChemTools modules and classes for processing MDL MOL and SD don't support
|
|
960 # V3000. So instead of relying on callers, just exit with an error to disable any processing of V3000
|
|
961 # format.
|
|
962 croak "Error: SDFileUtil::ParseCmpdCountsLine: The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools...";
|
|
963 }
|
|
964
|
|
965 return ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version);
|
|
966 }
|
|
967
|
|
968 # Second line: Misc info
|
|
969 #
|
|
970 # Format: IIPPPPPPPPMMDDYYHHmmddSSssssssssssEEEEEEEEEEEERRRRRR
|
|
971 # A2A8 A10 A2I2A10 A12 A6
|
|
972 # User's first and last initials (I), program name (P), date/time (M/D/Y,H:m),
|
|
973 # dimensional codes - 2D or 3D (d),scaling factors (S, s), energy (E) if modeling program input,
|
|
974 # internal registry number (R) if input through MDL form. A blank line is also allowed.
|
|
975 sub ParseCmpdMiscInfoLine {
|
|
976 my($Line) = @_;
|
|
977 my($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum);
|
|
978
|
|
979 ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum) = unpack("A2A8A10A2A2A10A12A6", $Line);
|
|
980 return ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum);
|
|
981 }
|
|
982
|
|
983 # First line: Molecule name. This line is unformatted, but like all other lines in a
|
|
984 # molfile may not extend beyond column 80. A blank line is also allowed.
|
|
985 sub ParseCmpdMolNameLine {
|
|
986 my($Line) = @_;
|
|
987 my($MolName);
|
|
988
|
|
989 $MolName = unpack("A80", $Line);
|
|
990
|
|
991 return ($MolName);
|
|
992 }
|
|
993
|
|
994 # Parse atom alias property line in CTAB generic properties block.
|
|
995 #
|
|
996 # Atom alias property line format:
|
|
997 #
|
|
998 # A aaa
|
|
999 # x...
|
|
1000 #
|
|
1001 # aaa: Atom number
|
|
1002 # x: Atom alias in next line
|
|
1003 #
|
|
1004 sub ParseCmpdAtomAliasPropertyLine {
|
|
1005 my($Line, $NextLine) = @_;
|
|
1006 my($Label, $AtomNumber, $AtomAlias);
|
|
1007
|
|
1008 ($Label, $AtomNumber) = split(' ', $Line);
|
|
1009 $AtomAlias = $NextLine;
|
|
1010
|
|
1011 if (!$AtomAlias) {
|
|
1012 carp "Warning: _ParseCmpdAtomAliasPropertyLine: No atom alias value specified on the line following atom alias property line...";
|
|
1013 }
|
|
1014
|
|
1015 return ($AtomNumber, $AtomAlias);
|
|
1016 }
|
|
1017
|
|
1018 # Parse charge property line in CTAB generic properties block.
|
|
1019 #
|
|
1020 # Charge property line format:
|
|
1021 #
|
|
1022 # M CHGnn8 aaa vvv ...
|
|
1023 #
|
|
1024 # nn8: Number of value pairs. Maximum of 8 pairs allowed.
|
|
1025 # aaa: Atom number
|
|
1026 # vvv: -15 to +15. Default of 0 = uncharged atom. When present, this property supersedes
|
|
1027 # all charge and radical values in the atom block, forcing a 0 charge on all atoms not
|
|
1028 # listed in an M CHG or M RAD line.
|
|
1029 #
|
|
1030 sub ParseCmpdChargePropertyLine {
|
|
1031 my($Line) = @_;
|
|
1032
|
|
1033 return _ParseCmpdGenericPropertyLine('Charge', $Line);
|
|
1034 }
|
|
1035
|
|
1036
|
|
1037 # Parse isotope property line in CTAB generic properties block.
|
|
1038 #
|
|
1039 # Isoptope property line format:
|
|
1040 #
|
|
1041 # M ISOnn8 aaa vvv ...
|
|
1042 #
|
|
1043 # nn8: Number of value paris. Maximum of 8 pairs allowed.
|
|
1044 # aaa: Atom number
|
|
1045 # vvv: Absolute mass of the atom isotope as a positive integer. When present, this property
|
|
1046 # supersedes all isotope values in the atom block. Default (no entry) means natural
|
|
1047 # abundance. The difference between this absolute mass value and the natural
|
|
1048 # abundance value specified in the PTABLE.DAT file must be within the range of -18
|
|
1049 # to +12
|
|
1050 #
|
|
1051 # Notes:
|
|
1052 # . Values correspond to mass numbers...
|
|
1053 #
|
|
1054 sub ParseCmpdIsotopePropertyLine {
|
|
1055 my($Line) = @_;
|
|
1056
|
|
1057 return _ParseCmpdGenericPropertyLine('Isotope', $Line);
|
|
1058 }
|
|
1059
|
|
1060 # Parse radical property line in CTAB generic properties block.
|
|
1061 #
|
|
1062 # Radical property line format:
|
|
1063 #
|
|
1064 # M RADnn8 aaa vvv ...
|
|
1065 #
|
|
1066 # nn8: Number of value paris. Maximum of 8 pairs allowed.
|
|
1067 # aaa: Atom number
|
|
1068 # vvv: Default of 0 = no radical, 1 = singlet, 2 = doublet, 3 = triplet . When
|
|
1069 # present, this property supersedes all charge and radical values in the atom block,
|
|
1070 # forcing a 0 (zero) charge and radical on all atoms not listed in an M CHG or
|
|
1071 # M RAD line.
|
|
1072 #
|
|
1073 sub ParseCmpdRadicalPropertyLine {
|
|
1074 my($Line) = @_;
|
|
1075
|
|
1076 return _ParseCmpdGenericPropertyLine('Radical', $Line);
|
|
1077 }
|
|
1078
|
|
1079 # Map MDL radical stereo value used in SD and MOL files to internal spin multiplicity values used by MayaChemTools...
|
|
1080 #
|
|
1081 sub MDLRadicalToInternalSpinMultiplicity {
|
|
1082 my($MDLRadical) = @_;
|
|
1083 my($InternalSpinMultiplicity);
|
|
1084
|
|
1085 $InternalSpinMultiplicity = '';
|
|
1086
|
|
1087 SPINMULTIPLICITY: {
|
|
1088 if ($MDLRadical == 0) { $InternalSpinMultiplicity = 0; last SPINMULTIPLICITY;}
|
|
1089 if ($MDLRadical == 1) { $InternalSpinMultiplicity = 1; last SPINMULTIPLICITY;}
|
|
1090 if ($MDLRadical == 2) { $InternalSpinMultiplicity = 2; last SPINMULTIPLICITY;}
|
|
1091 if ($MDLRadical == 3) { $InternalSpinMultiplicity = 3; last SPINMULTIPLICITY;}
|
|
1092 $InternalSpinMultiplicity = '';
|
|
1093 carp "Warning: MDLRadicalToInternalSpinMultiplicity: MDL radical value, $MDLRadical, specifed on line M RAD is not supported...";
|
|
1094 }
|
|
1095 return $InternalSpinMultiplicity;
|
|
1096 }
|
|
1097
|
|
1098 # Map internal spin multiplicity values used by MayaChemTools to MDL radical stereo value used in SD and MOL files...
|
|
1099 #
|
|
1100 sub InternalSpinMultiplicityToMDLRadical {
|
|
1101 my($InternalSpinMultiplicity) = @_;
|
|
1102 my($MDLRadical);
|
|
1103
|
|
1104 $MDLRadical = 0;
|
|
1105
|
|
1106 SPINMULTIPLICITY: {
|
|
1107 if ($InternalSpinMultiplicity == 1) { $MDLRadical = 1; last SPINMULTIPLICITY;}
|
|
1108 if ($InternalSpinMultiplicity == 2) { $MDLRadical = 2; last SPINMULTIPLICITY;}
|
|
1109 if ($InternalSpinMultiplicity == 3) { $MDLRadical = 3; last SPINMULTIPLICITY;}
|
|
1110 $MDLRadical = 0;
|
|
1111 }
|
|
1112 return $MDLRadical;
|
|
1113 }
|
|
1114
|
|
1115 # Process generic CTAB property line...
|
|
1116 sub _ParseCmpdGenericPropertyLine {
|
|
1117 my($PropertyName, $Line) = @_;
|
|
1118
|
|
1119 my($Label, $PropertyLabel, $ValuesCount, $ValuePairsCount, @ValuePairs);
|
|
1120
|
|
1121 @ValuePairs = ();
|
|
1122 ($Label, $PropertyLabel, $ValuesCount, @ValuePairs) = split(' ', $Line);
|
|
1123 $ValuePairsCount = (scalar @ValuePairs)/2;
|
|
1124 if ($ValuesCount != $ValuePairsCount) {
|
|
1125 carp "Warning: _ParseCmpdGenericPropertyLine: Number of atom number and $PropertyName value paris specified on $Label $PropertyLabel property line, $ValuePairsCount, does not match expected value of $ValuesCount...";
|
|
1126 }
|
|
1127
|
|
1128 return (@ValuePairs);
|
|
1129 }
|
|
1130
|
|
1131 # Generic CTAB property lines for charge, istope and radical properties...
|
|
1132 #
|
|
1133 sub _GenerateCmpdGenericPropertyLines {
|
|
1134 my($PropertyName, $PropertyValuePairsRef) = @_;
|
|
1135 my($Index, $PropertyLabel, $Line, $PropertyCount, $AtomNum, $PropertyValue, @PropertyLines);
|
|
1136
|
|
1137 @PropertyLines = ();
|
|
1138 NAME: {
|
|
1139 if ($PropertyName =~ /^Charge$/i) { $PropertyLabel = "M CHG"; last NAME; }
|
|
1140 if ($PropertyName =~ /^Isotope$/i) { $PropertyLabel = "M ISO"; last NAME; }
|
|
1141 if ($PropertyName =~ /^Radical$/i) { $PropertyLabel = "M RAD"; last NAME; }
|
|
1142 carp "Warning: _GenerateCmpdGenericPropertyLines: Unknown property name, $PropertyName, specified...";
|
|
1143 return @PropertyLines;
|
|
1144 }
|
|
1145
|
|
1146 # A maximum of 8 property pair values allowed per line...
|
|
1147 $PropertyCount = 0;
|
|
1148 $Line = '';
|
|
1149 for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) {
|
|
1150 if ($PropertyCount > 8) {
|
|
1151 # Setup property line...
|
|
1152 $Line = "${PropertyLabel} 8${Line}";
|
|
1153 push @PropertyLines, $Line;
|
|
1154
|
|
1155 $PropertyCount = 0;
|
|
1156 $Line = '';
|
|
1157 }
|
|
1158 $PropertyCount++;
|
|
1159 $AtomNum = $PropertyValuePairsRef->[$Index];
|
|
1160 $PropertyValue = $PropertyValuePairsRef->[$Index + 1];
|
|
1161 $Line .= sprintf " %3i %3i", $AtomNum, $PropertyValue;
|
|
1162 }
|
|
1163 if ($Line) {
|
|
1164 $Line = "${PropertyLabel} ${PropertyCount}${Line}";
|
|
1165 push @PropertyLines, $Line;
|
|
1166 }
|
|
1167 return @PropertyLines;
|
|
1168 }
|
|
1169
|
|
1170 #
|
|
1171 # Read compound data into a string and return its value
|
|
1172 sub ReadCmpdString {
|
|
1173 my($SDFileRef) = @_;
|
|
1174 my($CmpdString);
|
|
1175
|
|
1176 $CmpdString = "";
|
|
1177 LINE: while (defined($_ = <$SDFileRef>)) {
|
|
1178 # Change Windows and Mac new line char to UNIX...
|
|
1179 s/(\r\n)|(\r)/\n/g;
|
|
1180
|
|
1181 if (/^\$\$\$\$/) {
|
|
1182 # Take out any new line char at the end by explicitly removing it instead of using
|
|
1183 # chomp, which might not always work correctly on files generated on a system
|
|
1184 # with a value of input line separator different from the current system...
|
|
1185 s/\n$//g;
|
|
1186
|
|
1187 # Doesn't hurt to chomp...
|
|
1188 chomp;
|
|
1189
|
|
1190 $CmpdString .= $_;
|
|
1191 last LINE;
|
|
1192 }
|
|
1193 else {
|
|
1194 $CmpdString .= $_;
|
|
1195 }
|
|
1196 }
|
|
1197 return $CmpdString;
|
|
1198 }
|
|
1199
|
|
1200 # Find out the number of fragements in the compounds. And for the compound with
|
|
1201 # more than one fragment, remove all the others besides the largest one.
|
|
1202 sub WashCmpd {
|
|
1203 my($CmpdLines) = @_;
|
|
1204 my($WashedCmpdString, $FragmentCount, $Fragments);
|
|
1205
|
|
1206 $WashedCmpdString = "";
|
|
1207 ($FragmentCount, $Fragments) = GetCmpdFragments($CmpdLines);
|
|
1208 if ($FragmentCount > 1) {
|
|
1209 # Go over the compound data for the largest fragment including property
|
|
1210 # data...
|
|
1211 my (@AllFragments, @LargestFragment, %LargestFragmentAtoms, @WashedCmpdLines, $Index, $LineIndex, $AtomCount, $BondCount, $NewAtomCount, $NewBondCount, $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo, $FirstNewAtomNum, $SecondNewAtomNum, $AtomNum, $ChiralFlag, $BondLine, $MENDLineIndex, $Line, $Value, @ValuePairs, @NewValuePairs, $ValuePairIndex, $NewAtomNum, @NewPropertyLines);
|
|
1212
|
|
1213 @AllFragments = (); @LargestFragment = ();
|
|
1214 %LargestFragmentAtoms = ();
|
|
1215 @AllFragments = split "\n", $Fragments;
|
|
1216 @LargestFragment = split " ", $AllFragments[0];
|
|
1217 for $Index (0 .. $#LargestFragment) {
|
|
1218 # Map old atom numbers to new atom numbers as the fragment atom numbers are sorted
|
|
1219 # from lowest to highest old atom numbers...
|
|
1220 $LargestFragmentAtoms{$LargestFragment[$Index]} = $Index + 1;
|
|
1221 }
|
|
1222 @WashedCmpdLines = ();
|
|
1223 push @WashedCmpdLines, @$CmpdLines[0], @$CmpdLines[1], @$CmpdLines[2], @$CmpdLines[3];
|
|
1224 ($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine(@$CmpdLines[3]);
|
|
1225 $NewAtomCount = @LargestFragment;
|
|
1226 $NewBondCount = 0;
|
|
1227 $AtomNum = 0;
|
|
1228 # Retrieve the largest fragment atom lines...
|
|
1229 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
|
|
1230 $AtomNum++;
|
|
1231 if ($LargestFragmentAtoms{$AtomNum}) {
|
|
1232 push @WashedCmpdLines, @$CmpdLines[$LineIndex];
|
|
1233 }
|
|
1234 }
|
|
1235 # Retrieve the largest fragment bond lines...
|
|
1236 for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
|
|
1237 ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
|
|
1238 if ($LargestFragmentAtoms{$FirstAtomNum} && $LargestFragmentAtoms{$SecondAtomNum}) {
|
|
1239 $NewBondCount++;
|
|
1240 # Set up bond line with new atom number mapping...
|
|
1241 $FirstNewAtomNum = $LargestFragmentAtoms{$FirstAtomNum};
|
|
1242 $SecondNewAtomNum = $LargestFragmentAtoms{$SecondAtomNum};
|
|
1243 $BondLine = GenerateCmpdBondLine($FirstNewAtomNum, $SecondNewAtomNum, $BondType, $BondStereo);
|
|
1244 push @WashedCmpdLines, $BondLine;
|
|
1245 }
|
|
1246 }
|
|
1247 # Get property lines for CHG, ISO and RAD label and map the old atom numbers to new
|
|
1248 # atom numners; Others, property lines before M END line are skipped as atom numbers for
|
|
1249 # other properties might not valid anymore...
|
|
1250 #
|
|
1251 $MENDLineIndex = $LineIndex;
|
|
1252 LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) {
|
|
1253 $Line = @$CmpdLines[$LineIndex];
|
|
1254 if ($Line =~ /^M END/i) {
|
|
1255 push @WashedCmpdLines, "M END";
|
|
1256 $MENDLineIndex = $LineIndex;
|
|
1257 last LINE;
|
|
1258 }
|
|
1259
|
|
1260 @ValuePairs = ();
|
|
1261 if ($Line =~ /^M CHG/i) {
|
|
1262 @ValuePairs = ParseCmpdChargePropertyLine($Line);
|
|
1263 }
|
|
1264 elsif ($Line =~ /^M RAD/i) {
|
|
1265 @ValuePairs = ParseCmpdRadicalPropertyLine($Line);
|
|
1266 }
|
|
1267 elsif ($Line =~ /^M ISO/i) {
|
|
1268 @ValuePairs = ParseCmpdIsotopePropertyLine($Line);
|
|
1269 }
|
|
1270 elsif ($Line =~ /^A /i) {
|
|
1271 my($NextLine);
|
|
1272 $LineIndex++;
|
|
1273 $NextLine = @$CmpdLines[$LineIndex];
|
|
1274 @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine);
|
|
1275 }
|
|
1276 else {
|
|
1277 next LINE;
|
|
1278 }
|
|
1279
|
|
1280 if (!@ValuePairs) {
|
|
1281 next LINE;
|
|
1282 }
|
|
1283
|
|
1284 # Collect values for valid atom numbers with mapping to new atom numbers...
|
|
1285 @NewValuePairs = ();
|
|
1286 VALUEINDEX: for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) {
|
|
1287 $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1];
|
|
1288 if (!exists $LargestFragmentAtoms{$AtomNum}) {
|
|
1289 next VALUEINDEX;
|
|
1290 }
|
|
1291 $NewAtomNum = $LargestFragmentAtoms{$AtomNum};
|
|
1292 push @NewValuePairs, ($NewAtomNum, $Value)
|
|
1293 }
|
|
1294 if (!@NewValuePairs) {
|
|
1295 next LINE;
|
|
1296 }
|
|
1297 @NewPropertyLines = ();
|
|
1298 if ($Line =~ /^M CHG/i) {
|
|
1299 @NewPropertyLines = GenerateCmpdChargePropertyLines(\@NewValuePairs);
|
|
1300 }
|
|
1301 elsif ($Line =~ /^M RAD/i) {
|
|
1302 @NewPropertyLines = GenerateCmpdRadicalPropertyLines(\@NewValuePairs);
|
|
1303 }
|
|
1304 elsif ($Line =~ /^M ISO/i) {
|
|
1305 @NewPropertyLines = GenerateCmpdIsotopePropertyLines(\@NewValuePairs);
|
|
1306 }
|
|
1307 elsif ($Line =~ /^A /i) {
|
|
1308 @NewPropertyLines = GenerateCmpdAtomAliasPropertyLines(\@NewValuePairs);
|
|
1309 }
|
|
1310 push @WashedCmpdLines, @NewPropertyLines;
|
|
1311 }
|
|
1312
|
|
1313 # Retrieve rest of the data label and value property data...
|
|
1314 for ($LineIndex = (1 + $MENDLineIndex); $LineIndex < @$CmpdLines; $LineIndex++) {
|
|
1315 push @WashedCmpdLines, @$CmpdLines[$LineIndex];
|
|
1316 }
|
|
1317 # Update atom and bond count line...
|
|
1318 $WashedCmpdLines[3] = GenerateCmpdCountsLine($NewAtomCount, $NewBondCount, $ChiralFlag);
|
|
1319
|
|
1320 $WashedCmpdString = join "\n", @WashedCmpdLines;
|
|
1321 }
|
|
1322 return ($FragmentCount, $Fragments, $WashedCmpdString);
|
|
1323 }
|
|
1324
|
|
1325 1;
|
|
1326
|
|
1327 __END__
|
|
1328
|
|
1329 =head1 NAME
|
|
1330
|
|
1331 SDFileUtil
|
|
1332
|
|
1333 =head1 SYNOPSIS
|
|
1334
|
|
1335 use SDFileUtil ;
|
|
1336
|
|
1337 use SDFileUtil qw(:all);
|
|
1338
|
|
1339 =head1 DESCRIPTION
|
|
1340
|
|
1341 B<SDFileUtil> module provides the following functions:
|
|
1342
|
|
1343 GenerateCmpdAtomAliasPropertyLines, GenerateCmpdAtomLine, GenerateCmpdBondLine,
|
|
1344 GenerateCmpdChargePropertyLines, GenerateCmpdCommentsLine, GenerateCmpdCountsLine,
|
|
1345 GenerateCmpdDataHeaderLabelsAndValuesLines, GenerateCmpdIsotopePropertyLines,
|
|
1346 GenerateCmpdMiscInfoLine, GenerateCmpdMolNameLine,
|
|
1347 GenerateCmpdRadicalPropertyLines, GenerateEmptyCtabBlockLines,
|
|
1348 GenerateMiscLineDateStamp, GetAllAndCommonCmpdDataHeaderLabels,
|
|
1349 GetCmpdDataHeaderLabels, GetCmpdDataHeaderLabelsAndValues, GetCmpdFragments,
|
|
1350 GetCtabLinesCount, GetInvalidAtomNumbers, GetUnknownAtoms,
|
|
1351 InternalBondOrderToMDLBondType, InternalBondStereochemistryToMDLBondStereo,
|
|
1352 InternalChargeToMDLCharge, InternalSpinMultiplicityToMDLRadical, IsCmpd2D,
|
|
1353 IsCmpd3D, MDLBondStereoToInternalBondStereochemistry,
|
|
1354 MDLBondTypeToInternalBondOrder, MDLChargeToInternalCharge,
|
|
1355 MDLRadicalToInternalSpinMultiplicity, ParseCmpdAtomAliasPropertyLine,
|
|
1356 ParseCmpdAtomLine, ParseCmpdBondLine, ParseCmpdChargePropertyLine,
|
|
1357 ParseCmpdCommentsLine, ParseCmpdCountsLine, ParseCmpdIsotopePropertyLine,
|
|
1358 ParseCmpdMiscInfoLine, ParseCmpdMolNameLine, ParseCmpdRadicalPropertyLine,
|
|
1359 ReadCmpdString, RemoveCmpdDataHeaderLabelAndValue, WashCmpd
|
|
1360
|
|
1361 =head1 METHODS
|
|
1362
|
|
1363 =over 4
|
|
1364
|
|
1365 =item B<GenerateCmpdAtomAliasPropertyLines>
|
|
1366
|
|
1367 @Lines = GenerateCmpdAtomAliasPropertyLines($AliasValuePairsRef);
|
|
1368
|
|
1369 Returns a formatted atom alias property lines corresponding to successive pairs
|
|
1370 of atom number and alias values specified by a refernce to an array. Two lines
|
|
1371 are generate for each atom number and alias value pairs: First line - A <AtomNum>;
|
|
1372 Second line:<AtomAlias>.
|
|
1373
|
|
1374 =item B<GenerateCmpdAtomLine>
|
|
1375
|
|
1376 $Line = GenerateCmpdAtomLine($AtomSymbol, $AtomX, $AtomY,
|
|
1377 $AtomZ, [$MassDifference, $Charge, $StereoParity]);
|
|
1378
|
|
1379 Returns a formatted atom data line containing all the input values.
|
|
1380
|
|
1381 =item B<GenerateCmpdBondLine>
|
|
1382
|
|
1383 $Line = GenerateCmpdBondLine($FirstAtomNum, $SecondAtomNum,
|
|
1384 $BondType, [$BondStereo]);
|
|
1385
|
|
1386 Returns a formatted bond data line containing all the input values.
|
|
1387
|
|
1388 =item B<GenerateCmpdChargePropertyLines>
|
|
1389
|
|
1390 @Lines = GenerateCmpdChargePropertyLines($ChargeValuePairsRef);
|
|
1391
|
|
1392 Returns a formatted M CHG property lines corresponding to successive pairs of
|
|
1393 atom number and charge values specified by a refernce to an array.
|
|
1394
|
|
1395 =item B<GenerateCmpdCommentsLine>
|
|
1396
|
|
1397 $Line = GenerateCmpdCommentsLine($Comments);
|
|
1398
|
|
1399 Returns a formatted comments data line.
|
|
1400
|
|
1401 =item B<GenerateCmpdCountsLine>
|
|
1402
|
|
1403 $Line = GenerateCmpdCountsLine($AtomCount, $BondCount,
|
|
1404 $ChiralFlag, [$PropertyCount, $Version]);
|
|
1405
|
|
1406 Returns a formatted line containing all the input values. The default values of 999
|
|
1407 and V2000 are used for I<PropertyCount> and I<Version>.
|
|
1408
|
|
1409 =item B<GenerateCmpdDataHeaderLabelsAndValuesLines>
|
|
1410
|
|
1411 @Lines = GenerateCmpdDataHeaderLabelsAndValuesLines(
|
|
1412 $DataHeaderLabelsRef, $DataHeaderLabelsAndValuesRef,
|
|
1413 [$SortDataLabels]);
|
|
1414
|
|
1415 Returns formatted data lines containing header label and values lines corresponding to
|
|
1416 all data header labels in array reference I<DataHeaderLabelsRef> with values in hash
|
|
1417 reference I<DataHeaderLabelsAndValuesRef>. By default, data header labels are
|
|
1418 not sorted and correspond to the label order in array reference I<DataHeaderLabelsRef>.
|
|
1419
|
|
1420 =item B<GenerateCmpdIsotopePropertyLines>
|
|
1421
|
|
1422 @Lines = GenerateCmpdIsotopePropertyLines($IsotopeValuePairsRef);
|
|
1423
|
|
1424 Returns a formatted M ISO property lines corresponding to successive pairs of
|
|
1425 atom number and isotope values specified by a refernce to an array.
|
|
1426
|
|
1427 =item B<GenerateCmpdMiscInfoLine>
|
|
1428
|
|
1429 $Line = GenerateCmpdMiscInfoLine([$ProgramName, $UserInitial,
|
|
1430 $Code]);
|
|
1431
|
|
1432 Returns a formatted line containing specified user initial, program name, date and code.
|
|
1433 Default values are: I<ProgramName - MayaChem; UserInitial - NULL; Code - 2D>.
|
|
1434
|
|
1435 =item B<GenerateCmpdMolNameLine>
|
|
1436
|
|
1437 $Line = GenerateCmpdMolNameLine($MolName);
|
|
1438
|
|
1439 Returns a formatted molecule name data line.
|
|
1440
|
|
1441 =item B<GenerateCmpdRadicalPropertyLines>
|
|
1442
|
|
1443 @Lines = GenerateCmpdRadicalPropertyLines($RadicalValuePairsRef);
|
|
1444
|
|
1445 Returns a formatted M CHG property lines corresponding to successive pairs of
|
|
1446 atom number and multiplicity values specified by a refernce to an array.
|
|
1447
|
|
1448 =item B<GenerateEmptyCtabBlockLines>
|
|
1449
|
|
1450 $Lines = GenerateCmpdMiscInfoLine([$Date]);
|
|
1451
|
|
1452 Returns formatted lines representing empty CTAB block.
|
|
1453
|
|
1454 =item B<GenerateMiscLineDateStamp>
|
|
1455
|
|
1456 $Line = GenerateMiscLineDateStamp();
|
|
1457
|
|
1458 Returns date stamp for misc line.
|
|
1459
|
|
1460 =item B<GetAllAndCommonCmpdDataHeaderLabels>
|
|
1461
|
|
1462 ($CmpdCount, $DataFieldLabelsArrayRef,
|
|
1463 $CommonDataFieldLabelsArrayRef) =
|
|
1464 GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
|
|
1465
|
|
1466 Returns number of comopunds, a reference to an array containing all unique data header
|
|
1467 label and a reference to an array containing common data field labels for all compounds
|
|
1468 in SD file.
|
|
1469
|
|
1470 =item B<GetCmpdDataHeaderLabels>
|
|
1471
|
|
1472 (@Labels) = GetCmpdDataHeaderLabels(\@CmpdLines);
|
|
1473
|
|
1474 Returns an array containg data header labels for a compound
|
|
1475
|
|
1476 =item B<GetCmpdDataHeaderLabelsAndValues>
|
|
1477
|
|
1478 (%DataValues) = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
|
|
1479
|
|
1480 Returns a hash conating data header labes and values for a compound.
|
|
1481
|
|
1482 =item B<GetCmpdFragments>
|
|
1483
|
|
1484 ($FragmentCount, $FragmentString) = GetCmpdFragments(\@CmpLines);
|
|
1485
|
|
1486 Figures out the number of disconnected fragments and return their values along
|
|
1487 with the atom numbers in a string delimited by new line character. Fragment data
|
|
1488 in B<FragmentString> is sorted on based on its size.
|
|
1489
|
|
1490 =item B<GetCtabLinesCount>
|
|
1491
|
|
1492 $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);
|
|
1493
|
|
1494 Returns number of lines present between the 4th line and the line containg "M END".
|
|
1495
|
|
1496 =item B<GetInvalidAtomNumbers>
|
|
1497
|
|
1498 ($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines) =
|
|
1499 GetInvalidAtomNumbers(\@CmpdLines);
|
|
1500
|
|
1501 Returns a list of values containing information about invalid atom numbers present
|
|
1502 in block or atom property lines.
|
|
1503
|
|
1504 =item B<GetUnknownAtoms>
|
|
1505
|
|
1506 ($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines) =
|
|
1507 GetUnknownAtoms(\@CmpdLines);
|
|
1508
|
|
1509 Returns a list of values containing information about atoms which contain special element
|
|
1510 symbols not present in the periodic table.
|
|
1511
|
|
1512 =item B<InternalBondOrderToMDLBondType>
|
|
1513
|
|
1514 $MDLBondType = InternalBondOrderToMDLBondType($InternalBondOrder);
|
|
1515
|
|
1516 Returns value of I<MDLBondType> corresponding to I<InternalBondOrder>.
|
|
1517
|
|
1518 InternalBondOrder MDLBondType
|
|
1519
|
|
1520 1 1
|
|
1521 2 2
|
|
1522 3 3
|
|
1523 1.5 4
|
|
1524
|
|
1525 =item B<InternalBondStereochemistryToMDLBondStereo>
|
|
1526
|
|
1527 $MDLBondStereo = InternalBondStereochemistryToMDLBondStereo(
|
|
1528 $InternalBondStereo);
|
|
1529
|
|
1530 Returns value of I<MDLBondStereo> corresponding to I<InternalBondStereo> using following
|
|
1531 mapping:
|
|
1532
|
|
1533 InternalBondStereo MDLBondStereo
|
|
1534
|
|
1535 Up 1
|
|
1536 UpOrDown 4
|
|
1537 Down 6
|
|
1538 CisOrTrans 3
|
|
1539 Other 0
|
|
1540
|
|
1541 =item B<InternalChargeToMDLCharge>
|
|
1542
|
|
1543 $MDLCharge = InternalChargeToMDLCharge($InternalCharge);
|
|
1544
|
|
1545 Returns value of I<MDLCharge> corresponding to I<InternalCharge> using following
|
|
1546 mapping:
|
|
1547
|
|
1548 InternalCharge MDLCharge
|
|
1549
|
|
1550 3 1
|
|
1551 2 2
|
|
1552 1 3
|
|
1553 -1 5
|
|
1554 -2 6
|
|
1555 -3 7
|
|
1556
|
|
1557 =item B<InternalSpinMultiplicityToMDLRadical>
|
|
1558
|
|
1559 $MDLRadical = InternalSpinMultiplicityToMDLRadical(
|
|
1560 $InternalSpinMultiplicity);
|
|
1561
|
|
1562 Returns value of I<MDLRadical> corresponding to I<InternalSpinMultiplicity>. These
|
|
1563 value are equivalent.
|
|
1564
|
|
1565 =item B<MDLBondStereoToInternalBondType>
|
|
1566
|
|
1567 $InternalBondType = MDLBondStereoToInternalBondType($MDLBondStereo);
|
|
1568
|
|
1569 Returns value of I<InternalBondType> corresponding to I<MDLBondStereo> using
|
|
1570 mapping shown for B<InternalBondTypeToMDLBondStereo> function.
|
|
1571
|
|
1572 =item B<IsCmpd2D>
|
|
1573
|
|
1574 $Status = IsCmpd2D();
|
|
1575
|
|
1576 Returns 1 or 0 based on whether z-coordinate of any atom is non-zero.
|
|
1577
|
|
1578 =item B<IsCmpd3D>
|
|
1579
|
|
1580 $Status = IsCmpd3D();
|
|
1581
|
|
1582 Returns 1 or 0 based on whether z-coordinate of any atom is non-zero.
|
|
1583
|
|
1584 =item B<MDLBondStereoToInternalBondStereochemistry>
|
|
1585
|
|
1586 $InternalBondStereo = MDLBondStereoToInternalBondStereochemistry(
|
|
1587 $MDLBondStereo);
|
|
1588
|
|
1589 Returns value of I<InternalBondStereo> corresponding to I<MDLBondStereo> using
|
|
1590 mapping shown for B<InternalBondStereochemistryToMDLBondStereo> function.
|
|
1591
|
|
1592 =item B<MDLBondTypeToInternalBondOrder>
|
|
1593
|
|
1594 $InternalBondOrder = MDLBondTypeToInternalBondOrder($MDLBondType);
|
|
1595
|
|
1596 Returns value of I<InternalBondOrder> corresponding to I<MDLBondType> using
|
|
1597 mapping shown for B<InternalBondOrderToMDLBondType> function.
|
|
1598
|
|
1599 =item B<MDLChargeToInternalCharge>
|
|
1600
|
|
1601 $InternalCharge = MDLChargeToInternalCharge($MDLCharge);
|
|
1602
|
|
1603 Returns value of I<$InternalCharge> corresponding to I<MDLCharge> using
|
|
1604 mapping shown for B<InternalChargeToMDLCharge> function.
|
|
1605
|
|
1606 =item B<MDLRadicalToInternalSpinMultiplicity>
|
|
1607
|
|
1608 $InternalSpinMultiplicity = MDLRadicalToInternalSpinMultiplicity(
|
|
1609 $MDLRadical);
|
|
1610
|
|
1611 Returns value of I<InternalSpinMultiplicity> corresponding to I<MDLRadical>. These
|
|
1612 value are equivalent.
|
|
1613
|
|
1614 =item B<ParseCmpdAtomAliasPropertyLine>
|
|
1615
|
|
1616 @AtomNumAndValuePairs = ParseCmpdAtomAliasPropertyLine(
|
|
1617 $CurrentLine, $NexLine);
|
|
1618
|
|
1619 Parses atom alias propery lines in CTAB generic properties block and returns an array
|
|
1620 with successive pairs of values corresponding to atom number and its alias.
|
|
1621
|
|
1622 =item B<ParseCmpdAtomLine>
|
|
1623
|
|
1624 ($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge,
|
|
1625 $StereoParity) = ParseCmpdAtomLine($AtomDataLine);
|
|
1626
|
|
1627 Parses compound data line containing atom information and returns a list
|
|
1628 of values.
|
|
1629
|
|
1630 =item B<ParseCmpdBondLine>
|
|
1631
|
|
1632 ($FirstAtomNum, $SecondAtomNum, $BondType) =
|
|
1633 ParseCmpdBondLine($BondDataLine);
|
|
1634
|
|
1635 Parses compound data line containing bond information and returns a list of
|
|
1636 values.
|
|
1637
|
|
1638 =item B<ParseCmpdCommentsLine>
|
|
1639
|
|
1640 $Comments = ParseCmpdCommentsLine($CommentsDataLine);
|
|
1641
|
|
1642 Returns the comment string.
|
|
1643
|
|
1644 =item B<ParseCmpdChargePropertyLine>
|
|
1645
|
|
1646 @AtomNumAndValuePairs = ParseCmpdChargePropertyLine(
|
|
1647 $ChargeDataLine);
|
|
1648
|
|
1649 Parses charge propery line in CTAB generic properties block and returns an array
|
|
1650 with successive pairs of values corresponding to atom number and its charge.
|
|
1651
|
|
1652 =item B<ParseCmpdCountsLine>
|
|
1653
|
|
1654 ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) =
|
|
1655 ParseCmpdCountsLine(\@CountDataLines);
|
|
1656
|
|
1657 Returns a list of values containing count information.
|
|
1658
|
|
1659 =item B<ParseCmpdMiscInfoLine>
|
|
1660
|
|
1661 ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2,
|
|
1662 $Energy, $RegistryNum) = ParseCmpdMiscInfoLine($Line);
|
|
1663
|
|
1664 Returns a list of values containing miscellaneous information.
|
|
1665
|
|
1666 =item B<ParseCmpdIsotopePropertyLine>
|
|
1667
|
|
1668 @AtomNumAndValuePairs = ParseCmpdIsotopePropertyLine(
|
|
1669 $IsotopeDataLine);
|
|
1670
|
|
1671 Parses isotopic propery line in CTAB generic properties block and returns an array
|
|
1672 with successive pairs of values corresponding to atom number and absolute mass of
|
|
1673 atom isotope.
|
|
1674
|
|
1675 =item B<ParseCmpdMolNameLine>
|
|
1676
|
|
1677 $MolName = ParseCmpdMolNameLine($Line);
|
|
1678
|
|
1679 Returns a string containing molecule name.
|
|
1680
|
|
1681 =item B<ParseCmpdRadicalPropertyLine>
|
|
1682
|
|
1683 @AtomNumAndValuePairs = ParseCmpdRadicalPropertyLine(
|
|
1684 $RadicalDataLine);
|
|
1685
|
|
1686 Parses radical propery line in CTAB generic properties block and returns an array
|
|
1687 with successive pairs of values corresponding to atom number and radical number
|
|
1688 value.
|
|
1689
|
|
1690 =item B<RemoveCmpdDataHeaderLabelAndValue>
|
|
1691
|
|
1692 $NewCmpdString = RemoveCmpdDataHeaderLabelAndValue($CmpdString,
|
|
1693 $DataHeaderLabel);
|
|
1694
|
|
1695 Returns a B<NewCmpdString> after removing I<DataHeaderLabel> along with its
|
|
1696 value from I<CmpdString>.
|
|
1697
|
|
1698 =item B<ReadCmpdString>
|
|
1699
|
|
1700 $CmpdString = ReadCmpdString(\*SDFILEHANDLE);
|
|
1701
|
|
1702 Returns a string containing all the data lines for the next available compound
|
|
1703 in an already open file indicated by SDFILEHANDLE. A NULL string is returned
|
|
1704 on EOF.
|
|
1705
|
|
1706 =item B<WashCmpd>
|
|
1707
|
|
1708 ($FragmentCount, $Fragments, $WashedCmpdString) =
|
|
1709 WashCmpd(\@CmpdLines);
|
|
1710
|
|
1711 Figures out the number of disconnected fragments and return their values along
|
|
1712 with the atom numbers in a string delimited by new line character. Fragment data
|
|
1713 in B<FragmentString> is sorted on based on its size.
|
|
1714
|
|
1715 =back
|
|
1716
|
|
1717 =head1 AUTHOR
|
|
1718
|
|
1719 Manish Sud <msud@san.rr.com>
|
|
1720
|
|
1721 =head1 SEE ALSO
|
|
1722
|
|
1723 TextUtil.pm
|
|
1724
|
|
1725 =head1 COPYRIGHT
|
|
1726
|
|
1727 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
1728
|
|
1729 This file is part of MayaChemTools.
|
|
1730
|
|
1731 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
1732 the terms of the GNU Lesser General Public License as published by the Free
|
|
1733 Software Foundation; either version 3 of the License, or (at your option)
|
|
1734 any later version.
|
|
1735
|
|
1736 =cut
|