comparison lib/SDFileUtil.pm @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 package SDFileUtil;
2 #
3 # $RCSfile: SDFileUtil.pm,v $
4 # $Date: 2015/02/28 20:47:18 $
5 # $Revision: 1.49 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use Exporter;
31 use Carp;
32 use PeriodicTable qw(IsElement);
33 use TimeUtil ();
34
35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
36
37 @ISA = qw(Exporter);
38 @EXPORT = qw(GenerateCmpdAtomLine GenerateCmpdBondLine GenerateCmpdChargePropertyLines GenerateCmpdCommentsLine GenerateCmpdCountsLine GenerateCmpdAtomAliasPropertyLines GenerateCmpdIsotopePropertyLines GenerateCmpdDataHeaderLabelsAndValuesLines GenerateCmpdMiscInfoLine GenerateCmpdRadicalPropertyLines GenerateCmpdMolNameLine GenerateEmptyCtabBlockLines GenerateMiscLineDateStamp GetAllAndCommonCmpdDataHeaderLabels GetCmpdDataHeaderLabels GetCmpdDataHeaderLabelsAndValues GetCmpdFragments GetCtabLinesCount GetUnknownAtoms GetInvalidAtomNumbers MDLChargeToInternalCharge InternalChargeToMDLCharge MDLBondTypeToInternalBondOrder InternalBondOrderToMDLBondType MDLBondStereoToInternalBondStereochemistry InternalBondStereochemistryToMDLBondStereo InternalSpinMultiplicityToMDLRadical MDLRadicalToInternalSpinMultiplicity IsCmpd3D IsCmpd2D ParseCmpdAtomLine ParseCmpdBondLine ParseCmpdCommentsLine ParseCmpdCountsLine ParseCmpdMiscInfoLine ParseCmpdMolNameLine ParseCmpdAtomAliasPropertyLine ParseCmpdChargePropertyLine ParseCmpdIsotopePropertyLine ParseCmpdRadicalPropertyLine ReadCmpdString RemoveCmpdDataHeaderLabelAndValue WashCmpd);
39 @EXPORT_OK = qw();
40 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);
41
42 # Format data for compounds count line...
43 sub GenerateCmpdCountsLine {
44 my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version, $Line);
45
46 if (@_ == 5) {
47 ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = @_;
48 }
49 elsif (@_ == 3) {
50 ($AtomCount, $BondCount, $ChiralFlag) = @_;
51 $PropertyCount = 999;
52 $Version = "V2000";
53 }
54 else {
55 ($AtomCount, $BondCount) = @_;
56 $ChiralFlag = 0;
57 $PropertyCount = 999;
58 $Version = "V2000";
59 }
60 if ($AtomCount > 999) {
61 croak "Error: SDFileUtil::GenerateCmpdCountsLine: The atom count, $AtomCount, exceeds maximum of 999 allowed for CTAB version 2000. The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools...";
62 }
63 $Line = sprintf "%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%6s", $AtomCount, $BondCount, 0, 0, $ChiralFlag, 0, 0, 0, 0, 0, $PropertyCount, $Version;
64
65 return ($Line);
66 }
67
68 # Generate comments line...
69 sub GenerateCmpdCommentsLine {
70 my($Comments) = @_;
71 my($Line);
72
73 $Line = (length($Comments) > 80) ? substr($Comments, 0, 80) : $Comments;
74
75 return $Line;
76 }
77
78 # Generate molname line...
79 sub GenerateCmpdMolNameLine {
80 my($MolName) = @_;
81 my($Line);
82
83 $Line = (length($MolName) > 80) ? substr($MolName, 0, 80) : $MolName;
84
85 return $Line;
86 }
87
88 # Generate data for compounds misc info line...
89 sub GenerateCmpdMiscInfoLine {
90 my($ProgramName, $UserInitial, $Code) = @_;
91 my($Date, $Line);
92
93 if (!(defined($ProgramName) && $ProgramName)) {
94 $ProgramName = "MayaChem";
95 }
96 if (!(defined($UserInitial) && $UserInitial)) {
97 $UserInitial = " ";
98 }
99 if (!(defined($Code) && $Code)) {
100 $Code = "2D";
101 }
102
103 if (length($ProgramName) > 8) {
104 $ProgramName = substr($ProgramName, 0, 8);
105 }
106 if (length($UserInitial) > 2) {
107 $UserInitial = substr($UserInitial, 0, 2);
108 }
109 if (length($Code) > 2) {
110 $Code = substr($Code, 0, 2);
111 }
112 $Date = GenerateMiscLineDateStamp();
113
114 $Line = "${UserInitial}${ProgramName}${Date}${Code}";
115
116 return $Line;
117 }
118
119 # Generate data for compounds misc info line...
120 sub GenerateEmptyCtabBlockLines {
121 my($Date, $Lines);
122
123 if (@_ == 1) {
124 ($Date) = @_;
125 }
126 else {
127 $Date = GenerateMiscLineDateStamp();
128 }
129 # First line: Blank molname line...
130 # Second line: Misc info...
131 # Third line: Blank comments line...
132 # Fourth line: Counts line reflecting empty structure data block...
133 $Lines = "\n";
134 $Lines .= " MayaChem${Date}2D\n";
135 $Lines .= "\n";
136 $Lines .= GenerateCmpdCountsLine(0, 0, 0) . "\n";
137 $Lines .= "M END";
138
139 return $Lines;
140 }
141
142 # Generate SD file data stamp...
143 sub GenerateMiscLineDateStamp {
144 return TimeUtil::SDFileTimeStamp();
145 }
146
147 # Generate data for compound atom line...
148 #
149 sub GenerateCmpdAtomLine {
150 my($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity) = @_;
151 my($Line);
152
153 if (!defined $MassDifference) {
154 $MassDifference = 0;
155 }
156 if (!defined $Charge) {
157 $Charge = 0;
158 }
159 if (!defined $StereoParity) {
160 $StereoParity = 0;
161 }
162 $Line = sprintf "%10.4f%10.4f%10.4f %-3s%2i%3i%3i 0 0 0 0 0 0 0 0 0", $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity;
163
164 return $Line
165 }
166
167 # Generate data for compound bond line...
168 #
169 sub GenerateCmpdBondLine {
170 my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = @_;
171 my($Line);
172
173 if (!defined $BondStereo) {
174 $BondStereo = 0;
175 }
176 $Line = sprintf "%3i%3i%3i%3i 0 0 0", $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo;
177
178 return $Line
179 }
180
181 # Generate charge property lines for CTAB block...
182 #
183 sub GenerateCmpdChargePropertyLines {
184 my($ChargeValuePairsRef) = @_;
185
186 return _GenerateCmpdGenericPropertyLines('Charge', $ChargeValuePairsRef);
187 }
188
189 # Generate isotope property lines for CTAB block...
190 #
191 sub GenerateCmpdIsotopePropertyLines {
192 my($IsotopeValuePairsRef) = @_;
193
194 return _GenerateCmpdGenericPropertyLines('Isotope', $IsotopeValuePairsRef);
195 }
196
197 # Generate radical property line property lines for CTAB block...
198 #
199 sub GenerateCmpdRadicalPropertyLines {
200 my($RadicalValuePairsRef) = @_;
201
202 return _GenerateCmpdGenericPropertyLines('Radical', $RadicalValuePairsRef);
203 }
204
205 # Generate atom alias property line property lines for CTAB block...
206 #
207 # Atom alias property line format:
208 #
209 # A aaa
210 # x...
211 #
212 # aaa: Atom number
213 # x: Atom alias in next line
214 #
215 sub GenerateCmpdAtomAliasPropertyLines {
216 my($PropertyValuePairsRef) = @_;
217 my($Index, $AtomNum, $AtomAlias, $Line, @PropertyLines);
218
219 @PropertyLines = ();
220
221 for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) {
222 $AtomNum = $PropertyValuePairsRef->[$Index];
223 $AtomAlias = $PropertyValuePairsRef->[$Index + 1];
224
225 $Line = "A " . sprintf "%3i", $AtomNum;
226
227 push @PropertyLines, $Line;
228 push @PropertyLines, $AtomAlias;
229 }
230
231 return @PropertyLines;
232 }
233
234 # Generate data header labels and values lines...
235 #
236 sub GenerateCmpdDataHeaderLabelsAndValuesLines {
237 my($DataHeaderLabelsRef, $DataHeaderLabelsAndValuesRef, $SortDataLabels) = @_;
238 my($DataLabel, $DataValue, @DataLabels, @DataLines);
239
240 if (!defined $SortDataLabels) {
241 $SortDataLabels = 0;
242 }
243
244 @DataLines = ();
245 @DataLabels = ();
246 if ($SortDataLabels) {
247 push @DataLabels, sort @{$DataHeaderLabelsRef};
248 }
249 else {
250 push @DataLabels, @{$DataHeaderLabelsRef};
251 }
252 for $DataLabel (@DataLabels) {
253 $DataValue = '';
254 if (exists $DataHeaderLabelsAndValuesRef->{$DataLabel}) {
255 $DataValue = $DataHeaderLabelsAndValuesRef->{$DataLabel};
256 }
257 push @DataLines, ("> <${DataLabel}>", "$DataValue", "");
258 }
259 return @DataLines;
260 }
261
262 # Parse data field header in SD file and return lists of all and common data field
263 # labels.
264 sub GetAllAndCommonCmpdDataHeaderLabels {
265 my($SDFileRef) = @_;
266 my($CmpdCount, $CmpdString, $Label, @CmpdLines, @DataFieldLabels, @CommonDataFieldLabels, %DataFieldLabelsMap);
267
268 $CmpdCount = 0;
269 @DataFieldLabels = ();
270 @CommonDataFieldLabels = ();
271 %DataFieldLabelsMap = ();
272
273 while ($CmpdString = ReadCmpdString($SDFileRef)) {
274 $CmpdCount++;
275 @CmpdLines = split "\n", $CmpdString;
276 # Process compound data header labels and figure out which ones are present for
277 # all the compounds...
278 if (@DataFieldLabels) {
279 my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);
280 my(%CmpdDataFieldLabelsMap) = ();
281 # Setup a map for the current labels...
282 for $Label (@CmpdDataFieldLabels) {
283 $CmpdDataFieldLabelsMap{$Label} = "PresentInSome";
284 }
285 # Check the presence old labels for this compound; otherwise, mark 'em new...
286 for $Label (@DataFieldLabels) {
287 if (!$CmpdDataFieldLabelsMap{$Label}) {
288 $DataFieldLabelsMap{$Label} = "PresentInSome";
289 }
290 }
291 # Check the presence this compound in the old labels; otherwise, add 'em...
292 for $Label (@CmpdDataFieldLabels ) {
293 if (!$DataFieldLabelsMap{$Label}) {
294 # It's a new label...
295 push @DataFieldLabels, $Label;
296 $DataFieldLabelsMap{$Label} = "PresentInSome";
297 }
298 }
299 }
300 else {
301 # Get the initial label set and set up a map...
302 @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
303 for $Label (@DataFieldLabels) {
304 $DataFieldLabelsMap{$Label} = "PresentInAll";
305 }
306 }
307 }
308 # Identify the common data field labels...
309 @CommonDataFieldLabels = ();
310 for $Label (@DataFieldLabels) {
311 if ($DataFieldLabelsMap{$Label} eq "PresentInAll") {
312 push @CommonDataFieldLabels, $Label;
313 }
314 }
315 return ($CmpdCount, \@DataFieldLabels, \@CommonDataFieldLabels);
316 }
317
318 # Parse all the data header labels and return 'em as an list...
319 #
320 # Format:
321 #
322 #> Data header line
323 #Data line(s)
324 #Blank line
325 #
326 # [Data Header] (one line) precedes each item of data, starts with a greater than (>) sign, and
327 # contains at least one of the following:
328 # The field name enclosed in angle brackets. For example: <melting.point>
329 # The field number, DTn , where n represents the number assigned to the field in a MACCS-II database
330 #
331 #Optional information for the data header includes:
332 # The compound’s external and internal registry numbers. External registry numbers must be enclosed in parentheses.
333 # Any combination of information
334 #
335 #The following are examples of valid data headers:
336 #> <MELTING.POINT>
337 #> 55 (MD-08974) <BOILING.POINT> DT12
338 #> DT12 55
339 #> (MD-0894) <BOILING.POINT> FROM ARCHIVES
340 #
341 #Notes: Sometimes last blank line is missing and can be just followed by $$$$
342 #
343 sub GetCmpdDataHeaderLabels {
344 my($CmpdLines) = @_;
345 my($CmpdLine, $Label, @Labels);
346
347 @Labels = ();
348 CMPDLINE: for $CmpdLine (@$CmpdLines) {
349 if ($CmpdLine !~ /^>/) {
350 next CMPDLINE;
351 }
352 # Does the line contains field name enclosed in angular brackets?
353 ($Label) = $CmpdLine =~ /<.*?>/g;
354 if (!defined($Label)) {
355 next CMPDLINE;
356 }
357 $Label =~ s/(<|>)//g;
358 push @Labels, $Label;
359 }
360 return (@Labels);
361 }
362
363 # Parse all the data header labels and values
364 sub GetCmpdDataHeaderLabelsAndValues {
365 my($CmpdLines) = @_;
366 my($CmpdLine, $CurrentLabel, $Label, $Value, $ValueCount, $ProcessingLabelData, @Values, %DataFields);
367
368 %DataFields = ();
369 $ProcessingLabelData = 0;
370 $ValueCount = 0;
371 CMPDLINE: for $CmpdLine (@$CmpdLines) {
372 if ($CmpdLine =~ /^\$\$\$\$/) {
373 last CMPDLINE;
374 }
375 if ($CmpdLine =~ /^>/) {
376 # Does the line contains field name enclosed in angular brackets?
377 ($Label) = $CmpdLine =~ /<.*?>/g;
378 if (defined $Label) {
379 $CurrentLabel = $Label;
380 $CurrentLabel =~ s/(<|>)//g;
381 $ProcessingLabelData = 0;
382 $ValueCount = 0;
383
384 if ($CurrentLabel) {
385 $ProcessingLabelData = 1;
386 $DataFields{$CurrentLabel} = '';
387 next CMPDLINE;
388 }
389 }
390 else {
391 if (!$ProcessingLabelData) {
392 # Data line containing no <label> as allowed by SDF format. Just ignore it...
393 next CMPDLINE;
394 }
395 }
396 }
397 if (!$ProcessingLabelData) {
398 next CMPDLINE;
399 }
400 if (!(defined($CmpdLine) && length($CmpdLine))) {
401 # Blank line terminates value for a label...
402 $CurrentLabel = '';
403 $ValueCount = 0;
404 $ProcessingLabelData = 0;
405 next CMPDLINE;
406 }
407 $ValueCount++;
408 $Value = $CmpdLine;
409
410 if ($ValueCount > 1) {
411 $DataFields{$CurrentLabel} .= "\n" . $Value;
412 }
413 else {
414 $DataFields{$CurrentLabel} = $Value;
415 }
416 }
417 return (%DataFields);
418 }
419
420 # Return an updated compoud string after removing data header label along with its
421 # value from the specified compound string...
422 #
423 sub RemoveCmpdDataHeaderLabelAndValue {
424 my($CmpdString, $DataHeaderLabel) = @_;
425 my($Line, $PorcessingDataHeaderLabel, @CmpdLines);
426
427 @CmpdLines = ();
428 $PorcessingDataHeaderLabel = 0;
429
430 CMPDLINE: for $Line (split "\n", $CmpdString) {
431 if ($Line =~ /^>/ && $Line =~ /<$DataHeaderLabel>/i) {
432 $PorcessingDataHeaderLabel = 1;
433 next CMPDLINE;
434 }
435
436 if ($PorcessingDataHeaderLabel) {
437 # Blank line indicates end of fingerprints data value...
438 if ($Line =~ /^\$\$\$\$/) {
439 push @CmpdLines, $Line;
440 $PorcessingDataHeaderLabel = 0;
441 }
442 elsif (!length($Line)) {
443 $PorcessingDataHeaderLabel = 0;
444 }
445 next CMPDLINE;
446 }
447
448 # Track compound lines without fingerprints data...
449 push @CmpdLines, $Line;
450 }
451
452 return join "\n", @CmpdLines;
453 }
454
455 #
456 # Using bond blocks, figure out the number of disconnected fragments and
457 # return their values along with the atom numbers in a string delimited by new
458 # line character.
459 #
460 sub GetCmpdFragments {
461 my($CmpdLines) = @_;
462 my($AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, @AtomConnections, $BondType, $FragmentString, $FragmentCount, $LineIndex, $Index, $AtomNum, $NbrAtomNum, @ProcessedAtoms, $ProcessedAtomCount, $ProcessAtomNum, @ProcessingAtoms, @ConnectedAtoms, %Fragments, $FragmentNum, $AFragmentString);
463
464 # Setup the connection table for each atom...
465 @AtomConnections = ();
466 ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
467 for $AtomNum (1 .. $AtomCount) {
468 %{$AtomConnections[$AtomNum]} = ();
469 }
470 for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
471 ($FirstAtomNum, $SecondAtomNum, $BondType) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
472 if (!$AtomConnections[$FirstAtomNum]{$SecondAtomNum}) {
473 $AtomConnections[$FirstAtomNum]{$SecondAtomNum} = $BondType;
474 }
475 if (!$AtomConnections[$SecondAtomNum]{$FirstAtomNum}) {
476 $AtomConnections[$SecondAtomNum]{$FirstAtomNum} = $BondType;
477 }
478 }
479
480 #Get set to count fragments...
481 $ProcessedAtomCount = 0;
482 $FragmentNum = 0;
483 %Fragments = ();
484 @ProcessedAtoms = ();
485 for $AtomNum (1 .. $AtomCount) {
486 $ProcessedAtoms[$AtomNum] = 0;
487 }
488 while ($ProcessedAtomCount < $AtomCount) {
489 @ProcessingAtoms = ();
490 @ConnectedAtoms = ();
491 ATOMNUM: for $AtomNum (1 .. $AtomCount) {
492 if (!$ProcessedAtoms[$AtomNum]) {
493 $ProcessedAtomCount++;
494 $ProcessedAtoms[$AtomNum] = 1;
495 push @ProcessingAtoms, $AtomNum;
496 $FragmentNum++;
497 @{$Fragments{$FragmentNum} } = ();
498 push @{$Fragments{$FragmentNum} }, $AtomNum;
499 last ATOMNUM;
500 }
501 }
502
503 # Go over the neighbors and follow the connection trail while collecting the
504 # atoms numbers present in the connected fragment...
505 while (@ProcessingAtoms) {
506 for ($Index = 0; $Index < @ProcessingAtoms; $Index++) {
507 $ProcessAtomNum = $ProcessingAtoms[$Index];
508 for $NbrAtomNum (keys %{$AtomConnections[$ProcessAtomNum]}) {
509 if (!$ProcessedAtoms[$NbrAtomNum]) {
510 $ProcessedAtomCount++;
511 $ProcessedAtoms[$NbrAtomNum] = 1;
512 push @ConnectedAtoms, $NbrAtomNum;
513 push @{ $Fragments{$FragmentNum} }, $NbrAtomNum;
514 }
515 }
516 }
517 @ProcessingAtoms = ();
518 @ProcessingAtoms = @ConnectedAtoms;
519 @ConnectedAtoms = ();
520 }
521 }
522 $FragmentCount = $FragmentNum;
523 $FragmentString = "";
524
525 # Sort out the fragments by size...
526 for $FragmentNum (sort { @{$Fragments{$b}} <=> @{$Fragments{$a}} } keys %Fragments ) {
527 # Sort the atoms in a fragment by their numbers...
528 $AFragmentString = join " ", sort { $a <=> $b } @{ $Fragments{$FragmentNum} };
529 if ($FragmentString) {
530 $FragmentString .= "\n" . $AFragmentString;
531 }
532 else {
533 $FragmentString = $AFragmentString;
534 }
535 }
536 return ($FragmentCount, $FragmentString);
537 }
538
539 # Count number of lines present in between 4th and line containg "M END"
540 sub GetCtabLinesCount {
541 my($CmpdLines) = @_;
542 my($LineIndex, $CtabLinesCount);
543
544 $CtabLinesCount = 0;
545 LINE: for ($LineIndex = 4; $LineIndex < @$CmpdLines; $LineIndex++) {
546 #
547 # Any line after atom and bond data starting with anything other than space or
548 # a digit indicates end of Ctab atom/bond data block...
549 #
550 if (@$CmpdLines[$LineIndex] !~ /^[0-9 ]/) {
551 $CtabLinesCount = $LineIndex - 4;
552 last LINE;
553 }
554 }
555 return $CtabLinesCount;
556 }
557
558 # Using atom blocks, count the number of atoms which contain special element
559 # symbols not present in the periodic table.
560 sub GetUnknownAtoms {
561 my($CmpdLines) = @_;
562 my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines, $LineIndex, $AtomCount, $AtomSymbol);
563
564 $UnknownAtomCount = 0;
565 $UnknownAtoms = "";
566 $UnknownAtomLines = "";
567 ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
568 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
569 ($AtomSymbol) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]);
570 if (!IsElement($AtomSymbol)) {
571 $UnknownAtomCount++;
572 $UnknownAtoms .= " $AtomSymbol";
573 if ($UnknownAtomLines) {
574 $UnknownAtomLines .= "\n" . @$CmpdLines[$LineIndex];
575 }
576 else {
577 $UnknownAtomLines = @$CmpdLines[$LineIndex];
578 }
579 }
580 }
581 return ($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines);
582 }
583
584 # Check z coordinates of all atoms to see whether any of them is non-zero
585 # which makes the compound geometry three dimensional...
586 #
587 sub IsCmpd3D {
588 my($CmpdLines) = @_;
589 my($LineIndex, $AtomCount, $AtomSymbol, $AtomX, $AtomY, $AtomZ);
590
591 ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
592 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
593 ($AtomSymbol, $AtomX, $AtomY, $AtomZ) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]);
594 if ($AtomZ != 0) {
595 return 1;
596 }
597 }
598 return 0;
599 }
600
601 # Check whether it's a 2D compound...
602 #
603 sub IsCmpd2D {
604 my($CmpdLines) = @_;
605
606 return IsCmpd3D($CmpdLines) ? 0 : 1;
607 }
608
609 # Using bond blocks, count the number of bond lines which contain atom numbers
610 # greater than atom count specified in compound count line...
611 #
612 sub GetInvalidAtomNumbers {
613 my($CmpdLines) = @_;
614 my($LineIndex, $AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, $InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines, $Line, $InvalidAtomPropertyLine, $ValuePairIndex, $AtomNum, $Value, @ValuePairs);
615
616 ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
617
618 $InvalidAtomNumbersCount = 0;
619 $InvalidAtomNumbers = "";
620 $InvalidAtomNumberLines = "";
621
622 # Go over bond block lines...
623 LINE: for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
624 ($FirstAtomNum, $SecondAtomNum) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
625 if ($FirstAtomNum <= $AtomCount && $SecondAtomNum <= $AtomCount) {
626 next LINE;
627 }
628 if ($FirstAtomNum > $AtomCount) {
629 $InvalidAtomNumbersCount++;
630 $InvalidAtomNumbers .= " $FirstAtomNum";
631 }
632 if ($SecondAtomNum > $AtomCount) {
633 $InvalidAtomNumbersCount++;
634 $InvalidAtomNumbers .= " $SecondAtomNum";
635 }
636 if ($InvalidAtomNumberLines) {
637 $InvalidAtomNumberLines .= "\n" . @$CmpdLines[$LineIndex];
638 }
639 else {
640 $InvalidAtomNumberLines = @$CmpdLines[$LineIndex];
641 }
642 }
643 # Go over property lines before M END...
644 #
645 LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) {
646 $Line = @$CmpdLines[$LineIndex];
647 @ValuePairs = ();
648 if ($Line =~ /^M END/i) {
649 last LINE;
650 }
651 @ValuePairs = ();
652 if ($Line =~ /^M CHG/i) {
653 @ValuePairs = ParseCmpdChargePropertyLine($Line);
654 }
655 elsif ($Line =~ /^M RAD/i) {
656 @ValuePairs = ParseCmpdRadicalPropertyLine($Line);
657 }
658 elsif ($Line =~ /^M ISO/i) {
659 @ValuePairs = ParseCmpdIsotopePropertyLine($Line);
660 }
661 elsif ($Line =~ /^A /i) {
662 my($NextLine);
663 $LineIndex++;
664 $NextLine = @$CmpdLines[$LineIndex];
665 @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine);
666 }
667 else {
668 next LINE;
669 }
670
671 $InvalidAtomPropertyLine = 0;
672 for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) {
673 $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1];
674 if ($AtomNum > $AtomCount) {
675 $InvalidAtomPropertyLine = 1;
676 $InvalidAtomNumbersCount++;
677 $InvalidAtomNumbers .= " $AtomNum";
678 }
679 }
680 if ($InvalidAtomPropertyLine) {
681 if ($InvalidAtomNumberLines) {
682 $InvalidAtomNumberLines .= "\n" . $Line;
683 }
684 else {
685 $InvalidAtomNumberLines = $Line;
686 }
687 }
688 }
689
690 return ($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines);
691 }
692
693 # Ctab lines: Atom block
694 #
695 # Format: xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee
696 # A10 A10 A10 xA3 A2A3 A3 A3 A3 A3 A3 A3 A3 A3 A3 A3
697 # x,y,z: Atom coordinates
698 # aaa: Atom symbol. Entry in periodic table or L for atom list, A, Q, * for unspecified
699 # atom, and LP for lone pair, or R# for Rgroup label
700 # dd: Mass difference. -3, -2, -1, 0, 1, 2, 3, 4 (0 for value beyond these limits)
701 # ccc: Charge. 0 = uncharged or value other than these, 1 = +3, 2 = +2, 3 = +1,
702 # 4 = doublet radical, 5 = -1, 6 = -2, 7 = -3
703 # sss: Atom stereo parity. 0 = not stereo, 1 = odd, 2 = even, 3 = either or unmarked stereo center
704 # hhh: Hydrogen count + 1. 1 = H0, 2 = H1, 3 = H2, 4 = H3, 5 = H4
705 # bbb: Stereo care box. 0 = ignore stereo configuration of this double bond atom, 1 = stereo
706 # configuration of double bond atom must match
707 # vvv: Valence. 0 = no marking (default)(1 to 14) = (1 to 14) 15 = zero valence
708 # HHH: H0 designator. 0 = not specified, 1 = no H atoms allowed (redundant due to hhh)
709 # rrr: Not used
710 # iii: Not used
711 # mmm: Atom-atom mapping number. 1 - number of atoms
712 # nnn: Inversion/retention flag. 0 = property not applied, 1 = configuration is inverted,
713 # 2 = configuration is retained.
714 # eee: Exact change flag. 0 = property not applied, 1 = change on atom must be
715 # exactly as shown
716 #
717 # Notes:
718 # . StereoParity: 1 - ClockwiseStereo, 2 - AntiClockwiseStereo; 3 - Either; 0 - none. These
719 # values determine chirailty around the chiral center; a non zero value indicates atom
720 # has been marked as chiral center.
721 #
722 sub ParseCmpdAtomLine {
723 my($Line) = @_;
724 my ($LineIndex, $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity);
725
726 ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = ('') x 7;
727 if (length($Line) > 31) {
728 ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = unpack("A10A10A10xA3A2A3A3", $Line);
729 }
730 else {
731 ($AtomX, $AtomY, $AtomZ, $AtomSymbol) = unpack("A10A10A10", $Line);
732 }
733 return ($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity);
734 }
735
736 # Map MDL charge value used in SD and MOL files to internal charge used by MayaChemTools.
737 #
738 sub MDLChargeToInternalCharge {
739 my($MDLCharge) = @_;
740 my($InternalCharge);
741
742 CHARGE: {
743 if ($MDLCharge == 0) { $InternalCharge = 0; last CHARGE;}
744 if ($MDLCharge == 1) { $InternalCharge = 3; last CHARGE;}
745 if ($MDLCharge == 2) { $InternalCharge = 2; last CHARGE;}
746 if ($MDLCharge == 3) { $InternalCharge = 1; last CHARGE;}
747 if ($MDLCharge == 5) { $InternalCharge = -1; last CHARGE;}
748 if ($MDLCharge == 6) { $InternalCharge = -2; last CHARGE;}
749 if ($MDLCharge == 7) { $InternalCharge = -3; last CHARGE;}
750 # All other MDL charge values, including 4 corresponding to "doublet radical",
751 # are assigned internal value of 0.
752 $InternalCharge = 0;
753 if ($MDLCharge != 4) {
754 carp "Warning: MDLChargeToInternalCharge: MDL charge value, $MDLCharge, is not supported: An internal charge value, 0, has been assigned...";
755 }
756 }
757 return $InternalCharge;
758 }
759
760 # Map internal charge used by MayaChemTools to MDL charge value used in SD and MOL files.
761 #
762 sub InternalChargeToMDLCharge {
763 my($InternalCharge) = @_;
764 my($MDLCharge);
765
766 CHARGE: {
767 if ($InternalCharge == 3) { $MDLCharge = 1; last CHARGE;}
768 if ($InternalCharge == 2) { $MDLCharge = 2; last CHARGE;}
769 if ($InternalCharge == 1) { $MDLCharge = 3; last CHARGE;}
770 if ($InternalCharge == -1) { $MDLCharge = 5; last CHARGE;}
771 if ($InternalCharge == -2) { $MDLCharge = 6; last CHARGE;}
772 if ($InternalCharge == -3) { $MDLCharge = 7; last CHARGE;}
773 # All other MDL charge values, including 4 corresponding to "doublet radical",
774 # are assigned internal value of 0.
775 $MDLCharge = 0;
776 }
777 return $MDLCharge;
778 }
779
780 # Ctab lines: Bond block
781 #
782 # Format: 111222tttsssxxxrrrccc
783 #
784 # 111: First atom number.
785 # 222: Second atom number.
786 # ttt: Bond type. 1 = Single, 2 = Double, 3 = Triple, 4 = Aromatic, 5 = Single or Double,
787 # 6 = Single or Aromatic, 7 = Double or Aromatic, 8 = Any
788 # sss: Bond stereo. Single bonds: 0 = not stereo, 1 = Up, 4 = Either, 6 = Down,
789 # Double bonds: 0 = Use x-, y-, z-coords from atom block to determine cis or trans,
790 # 3 = Cis or trans (either) double bond
791 # xxx: Not used
792 # rrr: Bond topology. 0 = Either, 1 = Ring, 2 = Chain
793 # ccc: Reacting center status. 0 = unmarked, 1 = a center, -1 = not a center,
794 # Additional: 2 = no change,4 = bond made/broken, 8 = bond order changes 12 = 4+8
795 # (both made/broken and changes); 5 = (4 + 1), 9 = (8 + 1), and 13 = (12 + 1) are also possible
796 #
797 sub ParseCmpdBondLine {
798 my($Line) = @_;
799 my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo);
800
801 ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = map {s/ //g; $_} unpack("A3A3A3A3", $Line);
802 return ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo);
803 }
804
805 # Map MDL bond type value used in SD and MOL files to internal bond order and bond types
806 # values used by MayaChemTools...
807 #
808 sub MDLBondTypeToInternalBondOrder {
809 my($MDLBondType) = @_;
810 my($InternalBondOrder, $InternalBondType);
811
812 $InternalBondType = '';
813
814 BONDTYPE: {
815 if ($MDLBondType == 1) { $InternalBondOrder = 1; $InternalBondType = 'Single'; last BONDTYPE;}
816 if ($MDLBondType == 2) { $InternalBondOrder = 2; $InternalBondType = 'Double'; last BONDTYPE;}
817 if ($MDLBondType == 3) { $InternalBondOrder = 3; $InternalBondType = 'Triple'; last BONDTYPE;}
818 if ($MDLBondType == 4) { $InternalBondOrder = 1.5; $InternalBondType = 'Aromatic'; last BONDTYPE;} # Aromatic
819 if ($MDLBondType == 5) { $InternalBondOrder = 1; $InternalBondType = 'SingleOrDouble'; last BONDTYPE;} # Aromatic
820 if ($MDLBondType == 6) { $InternalBondOrder = 1; $InternalBondType = 'SingleOrAromatic'; last BONDTYPE;} # Aromatic
821 if ($MDLBondType == 7) { $InternalBondOrder = 2; $InternalBondType = 'DoubleOrAromatic'; last BONDTYPE;} # Aromatic
822 if ($MDLBondType == 8) { $InternalBondOrder = 1; $InternalBondType = 'Any'; last BONDTYPE;} # Aromatic
823 #
824 # Although MDL aromatic bond values are used for query only and explicit Kekule bond order
825 # values must be assigned, internal value of 1.5 is allowed to indicate aromatic bond orders.
826 #
827 # All other MDL bond type values - 5 = Single or Double, 6 = Single or Aromatic, 7 = Double or Aromatic,
828 # 8 = Any - are also assigned appropriate internal value of 1: These are meant to be used for
829 # structure queries by MDL products.
830 #
831 $InternalBondOrder = 1;
832 $InternalBondType = 'Single';
833
834 carp "Warning: MDLBondTypeToInternalBondOrder: MDL bond type value, $MDLBondType, is not supported: An internal bond order value, 0, has been assigned...";
835 }
836 return ($InternalBondOrder, $InternalBondType);
837 }
838
839 # Map internal bond order and bond type values used by MayaChemTools to MDL bond type value used
840 # in SD and MOL files...
841 #
842 sub InternalBondOrderToMDLBondType {
843 my($InternalBondOrder, $InternalBondType) = @_;
844 my($MDLBondType);
845
846 BONDTYPE: {
847 if ($InternalBondOrder == 1) {
848 if ($InternalBondType =~ /^SingleOrDouble$/i) {
849 $MDLBondType = 5;
850 }
851 elsif ($InternalBondType =~ /^SingleOrAromatic$/i) {
852 $MDLBondType = 6;
853 }
854 elsif ($InternalBondType =~ /^Any$/i) {
855 $MDLBondType = 8;
856 }
857 else {
858 $MDLBondType = 1;
859 }
860 $MDLBondType = 1;
861 last BONDTYPE;
862 }
863 if ($InternalBondOrder == 2) {
864 if ($InternalBondType =~ /^DoubleOrAromatic$/i) {
865 $MDLBondType = 7;
866 }
867 else {
868 $MDLBondType = 2;
869 }
870 last BONDTYPE;
871 }
872 if ($InternalBondOrder == 3) { $MDLBondType = 3; last BONDTYPE;}
873 if ($InternalBondOrder == 1.5) { $MDLBondType = 4; last BONDTYPE;}
874 if ($InternalBondType =~ /^Any$/i) { $MDLBondType = 8; last BONDTYPE;}
875
876 $MDLBondType = 1;
877
878 carp "Warning: InternalBondOrderToMDLBondType: Internal bond order and type values, $InternalBondOrder and $InternalBondType, don't match any valid MDL bond type: MDL bond type value, 1, has been assigned...";
879 }
880 return $MDLBondType;
881 }
882
883 # Third line: Comments - A blank line is also allowed.
884 sub ParseCmpdCommentsLine {
885 my($Line) = @_;
886 my($Comments);
887
888 $Comments = unpack("A80", $Line);
889
890 return ($Comments);
891 }
892
893 # Map MDL bond stereo value used in SD and MOL files to internal bond stereochemistry values used by MayaChemTools...
894 #
895 sub MDLBondStereoToInternalBondStereochemistry {
896 my($MDLBondStereo) = @_;
897 my($InternalBondStereo);
898
899 $InternalBondStereo = '';
900
901 BONDSTEREO: {
902 if ($MDLBondStereo == 1) { $InternalBondStereo = 'Up'; last BONDSTEREO;}
903 if ($MDLBondStereo == 4) { $InternalBondStereo = 'UpOrDown'; last BONDSTEREO;}
904 if ($MDLBondStereo == 6) { $InternalBondStereo = 'Down'; last BONDSTEREO;}
905 if ($MDLBondStereo == 3) { $InternalBondStereo = 'CisOrTrans'; last BONDSTEREO;}
906 if ($MDLBondStereo == 0) { $InternalBondStereo = 'None'; last BONDSTEREO;}
907
908 $InternalBondStereo = '';
909 carp "Warning: MDLBondStereoToInternalBondType: MDL bond stereo value, $MDLBondStereo, is not supported: It has been ignored and bond order would be used to determine bond type...";
910 }
911 return $InternalBondStereo;
912 }
913
914 # Map internal bond stereochemistry values used by MayaChemTools to MDL bond stereo value used in SD and MOL files...
915 #
916 sub InternalBondStereochemistryToMDLBondStereo {
917 my($InternalBondStereo) = @_;
918 my($MDLBondStereo);
919
920 $MDLBondStereo = 0;
921
922 BONDSTEREO: {
923 if ($InternalBondStereo =~ /^Up$/i) { $MDLBondStereo = 1; last BONDSTEREO;}
924 if ($InternalBondStereo =~ /^UpOrDown$/i) { $MDLBondStereo = 4; last BONDSTEREO;}
925 if ($InternalBondStereo =~ /^Down$/) { $MDLBondStereo = 6; last BONDSTEREO;}
926 if ($InternalBondStereo =~ /^CisOrTrans$/) { $MDLBondStereo = 3; last BONDSTEREO;}
927
928 $MDLBondStereo = 0;
929 }
930 return $MDLBondStereo;
931 }
932
933 # Fourth line: Counts
934 #
935 # Format: aaabbblllfffcccsssxxxrrrpppiiimmmvvvvvv
936 #
937 # aaa: number of atoms; bbb: number of bonds; lll: number of atom lists; fff: (obsolete)
938 # ccc: chiral flag: 0=not chiral, 1=chiral; sss: number of stext entries; xxx,rrr,ppp,iii:
939 # (obsolete); mmm: number of lines of additional properties, including the M END line, No
940 # longer supported, default is set to 999; vvvvvv: version
941
942 sub ParseCmpdCountsLine {
943 my($Line) = @_;
944 my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version);
945
946 if (length($Line) >= 39) {
947 ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = unpack("A3A3x3x3A3x3x3x3x3x3A3A6", $Line);
948 }
949 elsif (length($Line) >= 15) {
950 ($PropertyCount, $Version) = ("999", "v2000");
951 ($AtomCount, $BondCount, $ChiralFlag) = unpack("A3A3x3x3A3", $Line);
952 }
953 else {
954 ($ChiralFlag, $PropertyCount, $Version) = ("0", "999", "v2000");
955 ($AtomCount, $BondCount) = unpack("A3A3", $Line);
956 }
957
958 if ($Version =~ /V3000/i) {
959 # Current version of MayaChemTools modules and classes for processing MDL MOL and SD don't support
960 # V3000. So instead of relying on callers, just exit with an error to disable any processing of V3000
961 # format.
962 croak "Error: SDFileUtil::ParseCmpdCountsLine: The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools...";
963 }
964
965 return ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version);
966 }
967
968 # Second line: Misc info
969 #
970 # Format: IIPPPPPPPPMMDDYYHHmmddSSssssssssssEEEEEEEEEEEERRRRRR
971 # A2A8 A10 A2I2A10 A12 A6
972 # User's first and last initials (I), program name (P), date/time (M/D/Y,H:m),
973 # dimensional codes - 2D or 3D (d),scaling factors (S, s), energy (E) if modeling program input,
974 # internal registry number (R) if input through MDL form. A blank line is also allowed.
975 sub ParseCmpdMiscInfoLine {
976 my($Line) = @_;
977 my($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum);
978
979 ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum) = unpack("A2A8A10A2A2A10A12A6", $Line);
980 return ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum);
981 }
982
983 # First line: Molecule name. This line is unformatted, but like all other lines in a
984 # molfile may not extend beyond column 80. A blank line is also allowed.
985 sub ParseCmpdMolNameLine {
986 my($Line) = @_;
987 my($MolName);
988
989 $MolName = unpack("A80", $Line);
990
991 return ($MolName);
992 }
993
994 # Parse atom alias property line in CTAB generic properties block.
995 #
996 # Atom alias property line format:
997 #
998 # A aaa
999 # x...
1000 #
1001 # aaa: Atom number
1002 # x: Atom alias in next line
1003 #
1004 sub ParseCmpdAtomAliasPropertyLine {
1005 my($Line, $NextLine) = @_;
1006 my($Label, $AtomNumber, $AtomAlias);
1007
1008 ($Label, $AtomNumber) = split(' ', $Line);
1009 $AtomAlias = $NextLine;
1010
1011 if (!$AtomAlias) {
1012 carp "Warning: _ParseCmpdAtomAliasPropertyLine: No atom alias value specified on the line following atom alias property line...";
1013 }
1014
1015 return ($AtomNumber, $AtomAlias);
1016 }
1017
1018 # Parse charge property line in CTAB generic properties block.
1019 #
1020 # Charge property line format:
1021 #
1022 # M CHGnn8 aaa vvv ...
1023 #
1024 # nn8: Number of value pairs. Maximum of 8 pairs allowed.
1025 # aaa: Atom number
1026 # vvv: -15 to +15. Default of 0 = uncharged atom. When present, this property supersedes
1027 # all charge and radical values in the atom block, forcing a 0 charge on all atoms not
1028 # listed in an M CHG or M RAD line.
1029 #
1030 sub ParseCmpdChargePropertyLine {
1031 my($Line) = @_;
1032
1033 return _ParseCmpdGenericPropertyLine('Charge', $Line);
1034 }
1035
1036
1037 # Parse isotope property line in CTAB generic properties block.
1038 #
1039 # Isoptope property line format:
1040 #
1041 # M ISOnn8 aaa vvv ...
1042 #
1043 # nn8: Number of value paris. Maximum of 8 pairs allowed.
1044 # aaa: Atom number
1045 # vvv: Absolute mass of the atom isotope as a positive integer. When present, this property
1046 # supersedes all isotope values in the atom block. Default (no entry) means natural
1047 # abundance. The difference between this absolute mass value and the natural
1048 # abundance value specified in the PTABLE.DAT file must be within the range of -18
1049 # to +12
1050 #
1051 # Notes:
1052 # . Values correspond to mass numbers...
1053 #
1054 sub ParseCmpdIsotopePropertyLine {
1055 my($Line) = @_;
1056
1057 return _ParseCmpdGenericPropertyLine('Isotope', $Line);
1058 }
1059
1060 # Parse radical property line in CTAB generic properties block.
1061 #
1062 # Radical property line format:
1063 #
1064 # M RADnn8 aaa vvv ...
1065 #
1066 # nn8: Number of value paris. Maximum of 8 pairs allowed.
1067 # aaa: Atom number
1068 # vvv: Default of 0 = no radical, 1 = singlet, 2 = doublet, 3 = triplet . When
1069 # present, this property supersedes all charge and radical values in the atom block,
1070 # forcing a 0 (zero) charge and radical on all atoms not listed in an M CHG or
1071 # M RAD line.
1072 #
1073 sub ParseCmpdRadicalPropertyLine {
1074 my($Line) = @_;
1075
1076 return _ParseCmpdGenericPropertyLine('Radical', $Line);
1077 }
1078
1079 # Map MDL radical stereo value used in SD and MOL files to internal spin multiplicity values used by MayaChemTools...
1080 #
1081 sub MDLRadicalToInternalSpinMultiplicity {
1082 my($MDLRadical) = @_;
1083 my($InternalSpinMultiplicity);
1084
1085 $InternalSpinMultiplicity = '';
1086
1087 SPINMULTIPLICITY: {
1088 if ($MDLRadical == 0) { $InternalSpinMultiplicity = 0; last SPINMULTIPLICITY;}
1089 if ($MDLRadical == 1) { $InternalSpinMultiplicity = 1; last SPINMULTIPLICITY;}
1090 if ($MDLRadical == 2) { $InternalSpinMultiplicity = 2; last SPINMULTIPLICITY;}
1091 if ($MDLRadical == 3) { $InternalSpinMultiplicity = 3; last SPINMULTIPLICITY;}
1092 $InternalSpinMultiplicity = '';
1093 carp "Warning: MDLRadicalToInternalSpinMultiplicity: MDL radical value, $MDLRadical, specifed on line M RAD is not supported...";
1094 }
1095 return $InternalSpinMultiplicity;
1096 }
1097
1098 # Map internal spin multiplicity values used by MayaChemTools to MDL radical stereo value used in SD and MOL files...
1099 #
1100 sub InternalSpinMultiplicityToMDLRadical {
1101 my($InternalSpinMultiplicity) = @_;
1102 my($MDLRadical);
1103
1104 $MDLRadical = 0;
1105
1106 SPINMULTIPLICITY: {
1107 if ($InternalSpinMultiplicity == 1) { $MDLRadical = 1; last SPINMULTIPLICITY;}
1108 if ($InternalSpinMultiplicity == 2) { $MDLRadical = 2; last SPINMULTIPLICITY;}
1109 if ($InternalSpinMultiplicity == 3) { $MDLRadical = 3; last SPINMULTIPLICITY;}
1110 $MDLRadical = 0;
1111 }
1112 return $MDLRadical;
1113 }
1114
1115 # Process generic CTAB property line...
1116 sub _ParseCmpdGenericPropertyLine {
1117 my($PropertyName, $Line) = @_;
1118
1119 my($Label, $PropertyLabel, $ValuesCount, $ValuePairsCount, @ValuePairs);
1120
1121 @ValuePairs = ();
1122 ($Label, $PropertyLabel, $ValuesCount, @ValuePairs) = split(' ', $Line);
1123 $ValuePairsCount = (scalar @ValuePairs)/2;
1124 if ($ValuesCount != $ValuePairsCount) {
1125 carp "Warning: _ParseCmpdGenericPropertyLine: Number of atom number and $PropertyName value paris specified on $Label $PropertyLabel property line, $ValuePairsCount, does not match expected value of $ValuesCount...";
1126 }
1127
1128 return (@ValuePairs);
1129 }
1130
1131 # Generic CTAB property lines for charge, istope and radical properties...
1132 #
1133 sub _GenerateCmpdGenericPropertyLines {
1134 my($PropertyName, $PropertyValuePairsRef) = @_;
1135 my($Index, $PropertyLabel, $Line, $PropertyCount, $AtomNum, $PropertyValue, @PropertyLines);
1136
1137 @PropertyLines = ();
1138 NAME: {
1139 if ($PropertyName =~ /^Charge$/i) { $PropertyLabel = "M CHG"; last NAME; }
1140 if ($PropertyName =~ /^Isotope$/i) { $PropertyLabel = "M ISO"; last NAME; }
1141 if ($PropertyName =~ /^Radical$/i) { $PropertyLabel = "M RAD"; last NAME; }
1142 carp "Warning: _GenerateCmpdGenericPropertyLines: Unknown property name, $PropertyName, specified...";
1143 return @PropertyLines;
1144 }
1145
1146 # A maximum of 8 property pair values allowed per line...
1147 $PropertyCount = 0;
1148 $Line = '';
1149 for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) {
1150 if ($PropertyCount > 8) {
1151 # Setup property line...
1152 $Line = "${PropertyLabel} 8${Line}";
1153 push @PropertyLines, $Line;
1154
1155 $PropertyCount = 0;
1156 $Line = '';
1157 }
1158 $PropertyCount++;
1159 $AtomNum = $PropertyValuePairsRef->[$Index];
1160 $PropertyValue = $PropertyValuePairsRef->[$Index + 1];
1161 $Line .= sprintf " %3i %3i", $AtomNum, $PropertyValue;
1162 }
1163 if ($Line) {
1164 $Line = "${PropertyLabel} ${PropertyCount}${Line}";
1165 push @PropertyLines, $Line;
1166 }
1167 return @PropertyLines;
1168 }
1169
1170 #
1171 # Read compound data into a string and return its value
1172 sub ReadCmpdString {
1173 my($SDFileRef) = @_;
1174 my($CmpdString);
1175
1176 $CmpdString = "";
1177 LINE: while (defined($_ = <$SDFileRef>)) {
1178 # Change Windows and Mac new line char to UNIX...
1179 s/(\r\n)|(\r)/\n/g;
1180
1181 if (/^\$\$\$\$/) {
1182 # Take out any new line char at the end by explicitly removing it instead of using
1183 # chomp, which might not always work correctly on files generated on a system
1184 # with a value of input line separator different from the current system...
1185 s/\n$//g;
1186
1187 # Doesn't hurt to chomp...
1188 chomp;
1189
1190 $CmpdString .= $_;
1191 last LINE;
1192 }
1193 else {
1194 $CmpdString .= $_;
1195 }
1196 }
1197 return $CmpdString;
1198 }
1199
1200 # Find out the number of fragements in the compounds. And for the compound with
1201 # more than one fragment, remove all the others besides the largest one.
1202 sub WashCmpd {
1203 my($CmpdLines) = @_;
1204 my($WashedCmpdString, $FragmentCount, $Fragments);
1205
1206 $WashedCmpdString = "";
1207 ($FragmentCount, $Fragments) = GetCmpdFragments($CmpdLines);
1208 if ($FragmentCount > 1) {
1209 # Go over the compound data for the largest fragment including property
1210 # data...
1211 my (@AllFragments, @LargestFragment, %LargestFragmentAtoms, @WashedCmpdLines, $Index, $LineIndex, $AtomCount, $BondCount, $NewAtomCount, $NewBondCount, $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo, $FirstNewAtomNum, $SecondNewAtomNum, $AtomNum, $ChiralFlag, $BondLine, $MENDLineIndex, $Line, $Value, @ValuePairs, @NewValuePairs, $ValuePairIndex, $NewAtomNum, @NewPropertyLines);
1212
1213 @AllFragments = (); @LargestFragment = ();
1214 %LargestFragmentAtoms = ();
1215 @AllFragments = split "\n", $Fragments;
1216 @LargestFragment = split " ", $AllFragments[0];
1217 for $Index (0 .. $#LargestFragment) {
1218 # Map old atom numbers to new atom numbers as the fragment atom numbers are sorted
1219 # from lowest to highest old atom numbers...
1220 $LargestFragmentAtoms{$LargestFragment[$Index]} = $Index + 1;
1221 }
1222 @WashedCmpdLines = ();
1223 push @WashedCmpdLines, @$CmpdLines[0], @$CmpdLines[1], @$CmpdLines[2], @$CmpdLines[3];
1224 ($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine(@$CmpdLines[3]);
1225 $NewAtomCount = @LargestFragment;
1226 $NewBondCount = 0;
1227 $AtomNum = 0;
1228 # Retrieve the largest fragment atom lines...
1229 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
1230 $AtomNum++;
1231 if ($LargestFragmentAtoms{$AtomNum}) {
1232 push @WashedCmpdLines, @$CmpdLines[$LineIndex];
1233 }
1234 }
1235 # Retrieve the largest fragment bond lines...
1236 for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
1237 ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
1238 if ($LargestFragmentAtoms{$FirstAtomNum} && $LargestFragmentAtoms{$SecondAtomNum}) {
1239 $NewBondCount++;
1240 # Set up bond line with new atom number mapping...
1241 $FirstNewAtomNum = $LargestFragmentAtoms{$FirstAtomNum};
1242 $SecondNewAtomNum = $LargestFragmentAtoms{$SecondAtomNum};
1243 $BondLine = GenerateCmpdBondLine($FirstNewAtomNum, $SecondNewAtomNum, $BondType, $BondStereo);
1244 push @WashedCmpdLines, $BondLine;
1245 }
1246 }
1247 # Get property lines for CHG, ISO and RAD label and map the old atom numbers to new
1248 # atom numners; Others, property lines before M END line are skipped as atom numbers for
1249 # other properties might not valid anymore...
1250 #
1251 $MENDLineIndex = $LineIndex;
1252 LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) {
1253 $Line = @$CmpdLines[$LineIndex];
1254 if ($Line =~ /^M END/i) {
1255 push @WashedCmpdLines, "M END";
1256 $MENDLineIndex = $LineIndex;
1257 last LINE;
1258 }
1259
1260 @ValuePairs = ();
1261 if ($Line =~ /^M CHG/i) {
1262 @ValuePairs = ParseCmpdChargePropertyLine($Line);
1263 }
1264 elsif ($Line =~ /^M RAD/i) {
1265 @ValuePairs = ParseCmpdRadicalPropertyLine($Line);
1266 }
1267 elsif ($Line =~ /^M ISO/i) {
1268 @ValuePairs = ParseCmpdIsotopePropertyLine($Line);
1269 }
1270 elsif ($Line =~ /^A /i) {
1271 my($NextLine);
1272 $LineIndex++;
1273 $NextLine = @$CmpdLines[$LineIndex];
1274 @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine);
1275 }
1276 else {
1277 next LINE;
1278 }
1279
1280 if (!@ValuePairs) {
1281 next LINE;
1282 }
1283
1284 # Collect values for valid atom numbers with mapping to new atom numbers...
1285 @NewValuePairs = ();
1286 VALUEINDEX: for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) {
1287 $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1];
1288 if (!exists $LargestFragmentAtoms{$AtomNum}) {
1289 next VALUEINDEX;
1290 }
1291 $NewAtomNum = $LargestFragmentAtoms{$AtomNum};
1292 push @NewValuePairs, ($NewAtomNum, $Value)
1293 }
1294 if (!@NewValuePairs) {
1295 next LINE;
1296 }
1297 @NewPropertyLines = ();
1298 if ($Line =~ /^M CHG/i) {
1299 @NewPropertyLines = GenerateCmpdChargePropertyLines(\@NewValuePairs);
1300 }
1301 elsif ($Line =~ /^M RAD/i) {
1302 @NewPropertyLines = GenerateCmpdRadicalPropertyLines(\@NewValuePairs);
1303 }
1304 elsif ($Line =~ /^M ISO/i) {
1305 @NewPropertyLines = GenerateCmpdIsotopePropertyLines(\@NewValuePairs);
1306 }
1307 elsif ($Line =~ /^A /i) {
1308 @NewPropertyLines = GenerateCmpdAtomAliasPropertyLines(\@NewValuePairs);
1309 }
1310 push @WashedCmpdLines, @NewPropertyLines;
1311 }
1312
1313 # Retrieve rest of the data label and value property data...
1314 for ($LineIndex = (1 + $MENDLineIndex); $LineIndex < @$CmpdLines; $LineIndex++) {
1315 push @WashedCmpdLines, @$CmpdLines[$LineIndex];
1316 }
1317 # Update atom and bond count line...
1318 $WashedCmpdLines[3] = GenerateCmpdCountsLine($NewAtomCount, $NewBondCount, $ChiralFlag);
1319
1320 $WashedCmpdString = join "\n", @WashedCmpdLines;
1321 }
1322 return ($FragmentCount, $Fragments, $WashedCmpdString);
1323 }
1324
1325 1;
1326
1327 __END__
1328
1329 =head1 NAME
1330
1331 SDFileUtil
1332
1333 =head1 SYNOPSIS
1334
1335 use SDFileUtil ;
1336
1337 use SDFileUtil qw(:all);
1338
1339 =head1 DESCRIPTION
1340
1341 B<SDFileUtil> module provides the following functions:
1342
1343 GenerateCmpdAtomAliasPropertyLines, GenerateCmpdAtomLine, GenerateCmpdBondLine,
1344 GenerateCmpdChargePropertyLines, GenerateCmpdCommentsLine, GenerateCmpdCountsLine,
1345 GenerateCmpdDataHeaderLabelsAndValuesLines, GenerateCmpdIsotopePropertyLines,
1346 GenerateCmpdMiscInfoLine, GenerateCmpdMolNameLine,
1347 GenerateCmpdRadicalPropertyLines, GenerateEmptyCtabBlockLines,
1348 GenerateMiscLineDateStamp, GetAllAndCommonCmpdDataHeaderLabels,
1349 GetCmpdDataHeaderLabels, GetCmpdDataHeaderLabelsAndValues, GetCmpdFragments,
1350 GetCtabLinesCount, GetInvalidAtomNumbers, GetUnknownAtoms,
1351 InternalBondOrderToMDLBondType, InternalBondStereochemistryToMDLBondStereo,
1352 InternalChargeToMDLCharge, InternalSpinMultiplicityToMDLRadical, IsCmpd2D,
1353 IsCmpd3D, MDLBondStereoToInternalBondStereochemistry,
1354 MDLBondTypeToInternalBondOrder, MDLChargeToInternalCharge,
1355 MDLRadicalToInternalSpinMultiplicity, ParseCmpdAtomAliasPropertyLine,
1356 ParseCmpdAtomLine, ParseCmpdBondLine, ParseCmpdChargePropertyLine,
1357 ParseCmpdCommentsLine, ParseCmpdCountsLine, ParseCmpdIsotopePropertyLine,
1358 ParseCmpdMiscInfoLine, ParseCmpdMolNameLine, ParseCmpdRadicalPropertyLine,
1359 ReadCmpdString, RemoveCmpdDataHeaderLabelAndValue, WashCmpd
1360
1361 =head1 METHODS
1362
1363 =over 4
1364
1365 =item B<GenerateCmpdAtomAliasPropertyLines>
1366
1367 @Lines = GenerateCmpdAtomAliasPropertyLines($AliasValuePairsRef);
1368
1369 Returns a formatted atom alias property lines corresponding to successive pairs
1370 of atom number and alias values specified by a refernce to an array. Two lines
1371 are generate for each atom number and alias value pairs: First line - A <AtomNum>;
1372 Second line:<AtomAlias>.
1373
1374 =item B<GenerateCmpdAtomLine>
1375
1376 $Line = GenerateCmpdAtomLine($AtomSymbol, $AtomX, $AtomY,
1377 $AtomZ, [$MassDifference, $Charge, $StereoParity]);
1378
1379 Returns a formatted atom data line containing all the input values.
1380
1381 =item B<GenerateCmpdBondLine>
1382
1383 $Line = GenerateCmpdBondLine($FirstAtomNum, $SecondAtomNum,
1384 $BondType, [$BondStereo]);
1385
1386 Returns a formatted bond data line containing all the input values.
1387
1388 =item B<GenerateCmpdChargePropertyLines>
1389
1390 @Lines = GenerateCmpdChargePropertyLines($ChargeValuePairsRef);
1391
1392 Returns a formatted M CHG property lines corresponding to successive pairs of
1393 atom number and charge values specified by a refernce to an array.
1394
1395 =item B<GenerateCmpdCommentsLine>
1396
1397 $Line = GenerateCmpdCommentsLine($Comments);
1398
1399 Returns a formatted comments data line.
1400
1401 =item B<GenerateCmpdCountsLine>
1402
1403 $Line = GenerateCmpdCountsLine($AtomCount, $BondCount,
1404 $ChiralFlag, [$PropertyCount, $Version]);
1405
1406 Returns a formatted line containing all the input values. The default values of 999
1407 and V2000 are used for I<PropertyCount> and I<Version>.
1408
1409 =item B<GenerateCmpdDataHeaderLabelsAndValuesLines>
1410
1411 @Lines = GenerateCmpdDataHeaderLabelsAndValuesLines(
1412 $DataHeaderLabelsRef, $DataHeaderLabelsAndValuesRef,
1413 [$SortDataLabels]);
1414
1415 Returns formatted data lines containing header label and values lines corresponding to
1416 all data header labels in array reference I<DataHeaderLabelsRef> with values in hash
1417 reference I<DataHeaderLabelsAndValuesRef>. By default, data header labels are
1418 not sorted and correspond to the label order in array reference I<DataHeaderLabelsRef>.
1419
1420 =item B<GenerateCmpdIsotopePropertyLines>
1421
1422 @Lines = GenerateCmpdIsotopePropertyLines($IsotopeValuePairsRef);
1423
1424 Returns a formatted M ISO property lines corresponding to successive pairs of
1425 atom number and isotope values specified by a refernce to an array.
1426
1427 =item B<GenerateCmpdMiscInfoLine>
1428
1429 $Line = GenerateCmpdMiscInfoLine([$ProgramName, $UserInitial,
1430 $Code]);
1431
1432 Returns a formatted line containing specified user initial, program name, date and code.
1433 Default values are: I<ProgramName - MayaChem; UserInitial - NULL; Code - 2D>.
1434
1435 =item B<GenerateCmpdMolNameLine>
1436
1437 $Line = GenerateCmpdMolNameLine($MolName);
1438
1439 Returns a formatted molecule name data line.
1440
1441 =item B<GenerateCmpdRadicalPropertyLines>
1442
1443 @Lines = GenerateCmpdRadicalPropertyLines($RadicalValuePairsRef);
1444
1445 Returns a formatted M CHG property lines corresponding to successive pairs of
1446 atom number and multiplicity values specified by a refernce to an array.
1447
1448 =item B<GenerateEmptyCtabBlockLines>
1449
1450 $Lines = GenerateCmpdMiscInfoLine([$Date]);
1451
1452 Returns formatted lines representing empty CTAB block.
1453
1454 =item B<GenerateMiscLineDateStamp>
1455
1456 $Line = GenerateMiscLineDateStamp();
1457
1458 Returns date stamp for misc line.
1459
1460 =item B<GetAllAndCommonCmpdDataHeaderLabels>
1461
1462 ($CmpdCount, $DataFieldLabelsArrayRef,
1463 $CommonDataFieldLabelsArrayRef) =
1464 GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
1465
1466 Returns number of comopunds, a reference to an array containing all unique data header
1467 label and a reference to an array containing common data field labels for all compounds
1468 in SD file.
1469
1470 =item B<GetCmpdDataHeaderLabels>
1471
1472 (@Labels) = GetCmpdDataHeaderLabels(\@CmpdLines);
1473
1474 Returns an array containg data header labels for a compound
1475
1476 =item B<GetCmpdDataHeaderLabelsAndValues>
1477
1478 (%DataValues) = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
1479
1480 Returns a hash conating data header labes and values for a compound.
1481
1482 =item B<GetCmpdFragments>
1483
1484 ($FragmentCount, $FragmentString) = GetCmpdFragments(\@CmpLines);
1485
1486 Figures out the number of disconnected fragments and return their values along
1487 with the atom numbers in a string delimited by new line character. Fragment data
1488 in B<FragmentString> is sorted on based on its size.
1489
1490 =item B<GetCtabLinesCount>
1491
1492 $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);
1493
1494 Returns number of lines present between the 4th line and the line containg "M END".
1495
1496 =item B<GetInvalidAtomNumbers>
1497
1498 ($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines) =
1499 GetInvalidAtomNumbers(\@CmpdLines);
1500
1501 Returns a list of values containing information about invalid atom numbers present
1502 in block or atom property lines.
1503
1504 =item B<GetUnknownAtoms>
1505
1506 ($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines) =
1507 GetUnknownAtoms(\@CmpdLines);
1508
1509 Returns a list of values containing information about atoms which contain special element
1510 symbols not present in the periodic table.
1511
1512 =item B<InternalBondOrderToMDLBondType>
1513
1514 $MDLBondType = InternalBondOrderToMDLBondType($InternalBondOrder);
1515
1516 Returns value of I<MDLBondType> corresponding to I<InternalBondOrder>.
1517
1518 InternalBondOrder MDLBondType
1519
1520 1 1
1521 2 2
1522 3 3
1523 1.5 4
1524
1525 =item B<InternalBondStereochemistryToMDLBondStereo>
1526
1527 $MDLBondStereo = InternalBondStereochemistryToMDLBondStereo(
1528 $InternalBondStereo);
1529
1530 Returns value of I<MDLBondStereo> corresponding to I<InternalBondStereo> using following
1531 mapping:
1532
1533 InternalBondStereo MDLBondStereo
1534
1535 Up 1
1536 UpOrDown 4
1537 Down 6
1538 CisOrTrans 3
1539 Other 0
1540
1541 =item B<InternalChargeToMDLCharge>
1542
1543 $MDLCharge = InternalChargeToMDLCharge($InternalCharge);
1544
1545 Returns value of I<MDLCharge> corresponding to I<InternalCharge> using following
1546 mapping:
1547
1548 InternalCharge MDLCharge
1549
1550 3 1
1551 2 2
1552 1 3
1553 -1 5
1554 -2 6
1555 -3 7
1556
1557 =item B<InternalSpinMultiplicityToMDLRadical>
1558
1559 $MDLRadical = InternalSpinMultiplicityToMDLRadical(
1560 $InternalSpinMultiplicity);
1561
1562 Returns value of I<MDLRadical> corresponding to I<InternalSpinMultiplicity>. These
1563 value are equivalent.
1564
1565 =item B<MDLBondStereoToInternalBondType>
1566
1567 $InternalBondType = MDLBondStereoToInternalBondType($MDLBondStereo);
1568
1569 Returns value of I<InternalBondType> corresponding to I<MDLBondStereo> using
1570 mapping shown for B<InternalBondTypeToMDLBondStereo> function.
1571
1572 =item B<IsCmpd2D>
1573
1574 $Status = IsCmpd2D();
1575
1576 Returns 1 or 0 based on whether z-coordinate of any atom is non-zero.
1577
1578 =item B<IsCmpd3D>
1579
1580 $Status = IsCmpd3D();
1581
1582 Returns 1 or 0 based on whether z-coordinate of any atom is non-zero.
1583
1584 =item B<MDLBondStereoToInternalBondStereochemistry>
1585
1586 $InternalBondStereo = MDLBondStereoToInternalBondStereochemistry(
1587 $MDLBondStereo);
1588
1589 Returns value of I<InternalBondStereo> corresponding to I<MDLBondStereo> using
1590 mapping shown for B<InternalBondStereochemistryToMDLBondStereo> function.
1591
1592 =item B<MDLBondTypeToInternalBondOrder>
1593
1594 $InternalBondOrder = MDLBondTypeToInternalBondOrder($MDLBondType);
1595
1596 Returns value of I<InternalBondOrder> corresponding to I<MDLBondType> using
1597 mapping shown for B<InternalBondOrderToMDLBondType> function.
1598
1599 =item B<MDLChargeToInternalCharge>
1600
1601 $InternalCharge = MDLChargeToInternalCharge($MDLCharge);
1602
1603 Returns value of I<$InternalCharge> corresponding to I<MDLCharge> using
1604 mapping shown for B<InternalChargeToMDLCharge> function.
1605
1606 =item B<MDLRadicalToInternalSpinMultiplicity>
1607
1608 $InternalSpinMultiplicity = MDLRadicalToInternalSpinMultiplicity(
1609 $MDLRadical);
1610
1611 Returns value of I<InternalSpinMultiplicity> corresponding to I<MDLRadical>. These
1612 value are equivalent.
1613
1614 =item B<ParseCmpdAtomAliasPropertyLine>
1615
1616 @AtomNumAndValuePairs = ParseCmpdAtomAliasPropertyLine(
1617 $CurrentLine, $NexLine);
1618
1619 Parses atom alias propery lines in CTAB generic properties block and returns an array
1620 with successive pairs of values corresponding to atom number and its alias.
1621
1622 =item B<ParseCmpdAtomLine>
1623
1624 ($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge,
1625 $StereoParity) = ParseCmpdAtomLine($AtomDataLine);
1626
1627 Parses compound data line containing atom information and returns a list
1628 of values.
1629
1630 =item B<ParseCmpdBondLine>
1631
1632 ($FirstAtomNum, $SecondAtomNum, $BondType) =
1633 ParseCmpdBondLine($BondDataLine);
1634
1635 Parses compound data line containing bond information and returns a list of
1636 values.
1637
1638 =item B<ParseCmpdCommentsLine>
1639
1640 $Comments = ParseCmpdCommentsLine($CommentsDataLine);
1641
1642 Returns the comment string.
1643
1644 =item B<ParseCmpdChargePropertyLine>
1645
1646 @AtomNumAndValuePairs = ParseCmpdChargePropertyLine(
1647 $ChargeDataLine);
1648
1649 Parses charge propery line in CTAB generic properties block and returns an array
1650 with successive pairs of values corresponding to atom number and its charge.
1651
1652 =item B<ParseCmpdCountsLine>
1653
1654 ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) =
1655 ParseCmpdCountsLine(\@CountDataLines);
1656
1657 Returns a list of values containing count information.
1658
1659 =item B<ParseCmpdMiscInfoLine>
1660
1661 ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2,
1662 $Energy, $RegistryNum) = ParseCmpdMiscInfoLine($Line);
1663
1664 Returns a list of values containing miscellaneous information.
1665
1666 =item B<ParseCmpdIsotopePropertyLine>
1667
1668 @AtomNumAndValuePairs = ParseCmpdIsotopePropertyLine(
1669 $IsotopeDataLine);
1670
1671 Parses isotopic propery line in CTAB generic properties block and returns an array
1672 with successive pairs of values corresponding to atom number and absolute mass of
1673 atom isotope.
1674
1675 =item B<ParseCmpdMolNameLine>
1676
1677 $MolName = ParseCmpdMolNameLine($Line);
1678
1679 Returns a string containing molecule name.
1680
1681 =item B<ParseCmpdRadicalPropertyLine>
1682
1683 @AtomNumAndValuePairs = ParseCmpdRadicalPropertyLine(
1684 $RadicalDataLine);
1685
1686 Parses radical propery line in CTAB generic properties block and returns an array
1687 with successive pairs of values corresponding to atom number and radical number
1688 value.
1689
1690 =item B<RemoveCmpdDataHeaderLabelAndValue>
1691
1692 $NewCmpdString = RemoveCmpdDataHeaderLabelAndValue($CmpdString,
1693 $DataHeaderLabel);
1694
1695 Returns a B<NewCmpdString> after removing I<DataHeaderLabel> along with its
1696 value from I<CmpdString>.
1697
1698 =item B<ReadCmpdString>
1699
1700 $CmpdString = ReadCmpdString(\*SDFILEHANDLE);
1701
1702 Returns a string containing all the data lines for the next available compound
1703 in an already open file indicated by SDFILEHANDLE. A NULL string is returned
1704 on EOF.
1705
1706 =item B<WashCmpd>
1707
1708 ($FragmentCount, $Fragments, $WashedCmpdString) =
1709 WashCmpd(\@CmpdLines);
1710
1711 Figures out the number of disconnected fragments and return their values along
1712 with the atom numbers in a string delimited by new line character. Fragment data
1713 in B<FragmentString> is sorted on based on its size.
1714
1715 =back
1716
1717 =head1 AUTHOR
1718
1719 Manish Sud <msud@san.rr.com>
1720
1721 =head1 SEE ALSO
1722
1723 TextUtil.pm
1724
1725 =head1 COPYRIGHT
1726
1727 Copyright (C) 2015 Manish Sud. All rights reserved.
1728
1729 This file is part of MayaChemTools.
1730
1731 MayaChemTools is free software; you can redistribute it and/or modify it under
1732 the terms of the GNU Lesser General Public License as published by the Free
1733 Software Foundation; either version 3 of the License, or (at your option)
1734 any later version.
1735
1736 =cut