Mercurial > repos > bcclaywell > argo_navis
comparison venv/lib/python2.7/site-packages/docutils/parsers/rst/tableparser.py @ 0:d67268158946 draft
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author | bcclaywell |
---|---|
date | Mon, 12 Oct 2015 17:43:33 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d67268158946 |
---|---|
1 # $Id: tableparser.py 7320 2012-01-19 22:33:02Z milde $ | |
2 # Author: David Goodger <goodger@python.org> | |
3 # Copyright: This module has been placed in the public domain. | |
4 | |
5 """ | |
6 This module defines table parser classes, which parse plaintext-graphic tables | |
7 and produce a well-formed data structure suitable for building a CALS table. | |
8 | |
9 :Classes: | |
10 - `GridTableParser`: Parse fully-formed tables represented with a grid. | |
11 - `SimpleTableParser`: Parse simple tables, delimited by top & bottom | |
12 borders. | |
13 | |
14 :Exception class: `TableMarkupError` | |
15 | |
16 :Function: | |
17 `update_dict_of_lists()`: Merge two dictionaries containing list values. | |
18 """ | |
19 | |
20 __docformat__ = 'reStructuredText' | |
21 | |
22 | |
23 import re | |
24 import sys | |
25 from docutils import DataError | |
26 from docutils.utils import strip_combining_chars | |
27 | |
28 | |
class TableMarkupError(DataError):

    """
    Signal a problem with the markup of a table.

    The optional keyword argument `offset` gives the position of the
    problem relative to the table's start line.  It is stored on the
    instance as `self.offset` and defaults to 0.  Positional arguments are
    handed on to `DataError`; any other keyword arguments are ignored.
    """

    def __init__(self, *args, **kwargs):
        offset = kwargs.get('offset', 0)
        self.offset = offset
        DataError.__init__(self, *args)
41 | |
42 | |
class TableParser:

    """
    Abstract superclass for the common parts of the syntax-specific parsers.

    `parse()` drives the work by calling, in order, `setup()`,
    `find_head_body_sep()`, `parse_table()` and `structure_from_cells()`.
    Only `find_head_body_sep()` is implemented here; the other three, and
    a compiled `head_body_separator_pat`, must come from the subclass.
    """

    head_body_separator_pat = None
    """Matches the row separator between head rows and body rows."""

    double_width_pad_char = '\x00'
    """Padding character for East Asian double-width text."""

    def parse(self, block):
        """
        Analyze the text `block` and return a table data structure.

        Given a plaintext-graphic table in `block` (list of lines of text; no
        whitespace padding), parse the table, construct and return the data
        necessary to construct a CALS table or equivalent.

        Raise `TableMarkupError` if there is any problem with the markup.
        """
        self.setup(block)
        self.find_head_body_sep()
        self.parse_table()
        return self.structure_from_cells()

    def find_head_body_sep(self):
        """
        Look for a head/body row separator line; store the line index.

        Every line of `self.block` is tested against
        `head_body_separator_pat`.  The index of the matching line is stored
        in `self.head_body_sep` (which `setup()` is expected to have
        initialised to None) and the line's '=' characters are replaced by
        '-' so that it subsequently reads like an ordinary row separator.

        Raise `TableMarkupError` if more than one separator is found, or if
        the separator is the first or last line of the table.  The error's
        `offset` is the index of the offending line.
        """
        for i, line in enumerate(self.block):
            if not self.head_body_separator_pat.match(line):
                continue
            # Compare against None: index 0 is a (misplaced) separator too
            # and must still count as "already seen".
            if self.head_body_sep is not None:
                raise TableMarkupError(
                    'Multiple head/body row separators '
                    '(table lines %s and %s); only one allowed.'
                    % (self.head_body_sep+1, i+1), offset=i)
            self.head_body_sep = i
            self.block[i] = line.replace('=', '-')
        last = len(self.block) - 1
        if self.head_body_sep == 0 or self.head_body_sep == last:
            # Report the separator's own line, not wherever the scan ended.
            raise TableMarkupError('The head/body row separator may not be '
                                   'the first or last line of the table.',
                                   offset=self.head_body_sep)
89 | |
90 | |
class GridTableParser(TableParser):

    """
    Parse a grid table using `parse()`.

    Here's an example of a grid table::

        +------------------------+------------+----------+----------+
        | Header row, column 1   | Header 2   | Header 3 | Header 4 |
        +========================+============+==========+==========+
        | body row 1, column 1   | column 2   | column 3 | column 4 |
        +------------------------+------------+----------+----------+
        | body row 2             | Cells may span columns.          |
        +------------------------+------------+---------------------+
        | body row 3             | Cells may  | - Table cells       |
        +------------------------+ span rows. | - contain           |
        | body row 4             |            | - body elements.    |
        +------------------------+------------+---------------------+

    Intersections use '+', row separators use '-' (except for one optional
    head/body row separator, which uses '='), and column separators use '|'.

    Passing the above table to the `parse()` method will result in the
    following data structure::

        ([24, 12, 10, 10],
         [[(0, 0, 1, ['Header row, column 1']),
           (0, 0, 1, ['Header 2']),
           (0, 0, 1, ['Header 3']),
           (0, 0, 1, ['Header 4'])]],
         [[(0, 0, 3, ['body row 1, column 1']),
           (0, 0, 3, ['column 2']),
           (0, 0, 3, ['column 3']),
           (0, 0, 3, ['column 4'])],
          [(0, 0, 5, ['body row 2']),
           (0, 2, 5, ['Cells may span columns.']),
           None,
           None],
          [(0, 0, 7, ['body row 3']),
           (1, 0, 7, ['Cells may', 'span rows.', '']),
           (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
           None],
          [(0, 0, 9, ['body row 4']), None, None, None]])

    The first item is a list containing column widths (colspecs). The second
    item is a list of head rows, and the third is a list of body rows. Each
    row contains a list of cells. Each cell is either None (for a cell unused
    because of another cell's span), or a tuple. A cell tuple contains four
    items: the number of extra rows used by the cell in a vertical span
    (morerows); the number of extra columns used by the cell in a horizontal
    span (morecols); the line offset of the first line of the cell contents;
    and the cell contents, a list of lines of text.
    """

    head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')

    def setup(self, block):
        """
        Initialise the parser state for the table text `block`.

        `block` must support slicing, `disconnect()` and `get_2D_block()`
        (presumably a docutils `StringList` -- confirm against the caller).
        """
        self.block = block[:]           # make a copy; it may be modified
        self.block.disconnect()         # don't propagate changes to parent
        self.bottom = len(block) - 1    # index of the last line
        self.right = len(block[0]) - 1  # index of the last text column
        self.head_body_sep = None
        # Per text column: the last line index already covered by a cell.
        self.done = [-1] * len(block[0])
        self.cells = []
        # Boundary position -> list of positions where it was observed.
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        Start with a queue of upper-left corners, containing the upper-left
        corner of the table itself. Trace out one rectangular cell, remember
        it, and add its upper-right and lower-left corners to the queue of
        potential upper-left corners of further cells. Process the queue in
        top-to-bottom order, keeping track of how much of each text column has
        been seen.

        We'll end up knowing all the row and column boundaries, cell positions
        and their dimensions.

        Raise `TableMarkupError` if some text column was not fully covered.
        """
        corners = [(0, 0)]
        while corners:
            top, left = corners.pop(0)
            # Skip corners on the bottom/right border and those lying in
            # territory an earlier cell has already claimed.
            if top == self.bottom or left == self.right \
                   or top <= self.done[left]:
                continue
            result = self.scan_cell(top, left)
            if not result:
                continue
            bottom, right, rowseps, colseps = result
            update_dict_of_lists(self.rowseps, rowseps)
            update_dict_of_lists(self.colseps, colseps)
            self.mark_done(top, left, bottom, right)
            cellblock = self.block.get_2D_block(top + 1, left + 1,
                                                bottom, right)
            cellblock.disconnect()      # lines in cell can't sync with parent
            cellblock.replace(self.double_width_pad_char, '')
            self.cells.append((top, left, bottom, right, cellblock))
            corners.extend([(top, right), (bottom, left)])
            corners.sort()              # keep top-to-bottom, left-to-right
        if not self.check_parse_complete():
            raise TableMarkupError('Malformed table; parse incomplete.')

    def mark_done(self, top, left, bottom, right):
        """For keeping track of how much of each text column has been seen."""
        before = top - 1
        after = bottom - 1
        for col in range(left, right):
            # Cells must tile each column without gaps or overlaps.
            assert self.done[col] == before
            self.done[col] = after

    def check_parse_complete(self):
        """Each text column should have been completely seen."""
        last = self.bottom - 1
        for col in range(self.right):
            if self.done[col] != last:
                return False
        return True

    def scan_cell(self, top, left):
        """
        Starting at the top-left corner, start tracing out a cell.

        Return a `(bottom, right, rowseps, colseps)` tuple, or None if no
        rectangular cell starts at this corner.
        """
        assert self.block[top][left] == '+'
        return self.scan_right(top, left)

    def scan_right(self, top, left):
        """
        Look for the top-right corner of the cell, and make note of all column
        boundaries ('+').
        """
        colseps = {}
        line = self.block[top]
        for i in range(left + 1, self.right + 1):
            if line[i] == '+':
                colseps[i] = [top]
                # Each '+' is a candidate corner; try to close the cell.
                result = self.scan_down(top, left, i)
                if result:
                    bottom, rowseps, newcolseps = result
                    update_dict_of_lists(colseps, newcolseps)
                    return bottom, i, rowseps, colseps
            elif line[i] != '-':
                return None
        return None

    def scan_down(self, top, left, right):
        """
        Look for the bottom-right corner of the cell, making note of all row
        boundaries.
        """
        rowseps = {}
        for i in range(top + 1, self.bottom + 1):
            if self.block[i][right] == '+':
                rowseps[i] = [right]
                result = self.scan_left(top, left, i, right)
                if result:
                    newrowseps, colseps = result
                    update_dict_of_lists(rowseps, newrowseps)
                    return i, rowseps, colseps
            elif self.block[i][right] != '|':
                return None
        return None

    def scan_left(self, top, left, bottom, right):
        """
        Noting column boundaries, look for the bottom-left corner of the cell.
        It must line up with the starting point.
        """
        colseps = {}
        line = self.block[bottom]
        for i in range(right - 1, left, -1):
            if line[i] == '+':
                colseps[i] = [bottom]
            elif line[i] != '-':
                return None
        if line[left] != '+':
            return None
        rowseps = self.scan_up(top, left, bottom, right)
        # An empty dict is a valid result, so test against None explicitly.
        if rowseps is not None:
            return rowseps, colseps
        return None

    def scan_up(self, top, left, bottom, right):
        """
        Noting row boundaries, see if we can return to the starting point.
        """
        rowseps = {}
        for i in range(bottom - 1, top, -1):
            if self.block[i][left] == '+':
                rowseps[i] = [left]
            elif self.block[i][left] != '|':
                return None
        return rowseps

    def structure_from_cells(self):
        """
        From the data collected by `scan_cell()`, convert to the final data
        structure.

        Return a `(colspecs, headrows, bodyrows)` tuple as described in the
        class docstring.
        """
        # sorted() gives a list whether keys() is a list or a dict view.
        rowseps = sorted(self.rowseps)      # list of row boundaries
        rowindex = {}
        for number, boundary in enumerate(rowseps):
            rowindex[boundary] = number     # row boundary -> row number
        colseps = sorted(self.colseps)      # list of column boundaries
        colindex = {}
        for number, boundary in enumerate(colseps):
            colindex[boundary] = number     # column boundary -> col number
        colspecs = [(colseps[i] - colseps[i - 1] - 1)
                    for i in range(1, len(colseps))]  # list of column widths
        # prepare an empty table with the correct number of rows & columns
        numrows = len(rowseps) - 1
        numcols = len(colseps) - 1
        rows = [[None] * numcols for _ in range(numrows)]
        # keep track of # of cells remaining; should reduce to zero
        remaining = numrows * numcols
        for top, left, bottom, right, block in self.cells:
            rownum = rowindex[top]
            colnum = colindex[left]
            assert rows[rownum][colnum] is None, (
                  'Cell (row %s, column %s) already used.'
                  % (rownum + 1, colnum + 1))
            morerows = rowindex[bottom] - rownum - 1
            morecols = colindex[right] - colnum - 1
            remaining -= (morerows + 1) * (morecols + 1)
            # write the cell into the table
            rows[rownum][colnum] = (morerows, morecols, top + 1, block)
        assert remaining == 0, 'Unused cells remaining.'
        if self.head_body_sep:          # separate head rows from body rows
            numheadrows = rowindex[self.head_body_sep]
            headrows = rows[:numheadrows]
            bodyrows = rows[numheadrows:]
        else:
            headrows = []
            bodyrows = rows
        return (colspecs, headrows, bodyrows)
326 | |
327 | |
class SimpleTableParser(TableParser):

    """
    Parse a simple table using `parse()`.

    Here's an example of a simple table::

        =====  =====
        col 1  col 2
        =====  =====
        1      Second column of row 1.
        2      Second column of row 2.
               Second line of paragraph.
        3      - Second column of row 3.

               - Second item in bullet
                 list (row 3, column 2).
        4 is a span
        ------------
        5
        =====  =====

    Top and bottom borders use '=', column span underlines use '-', column
    separation is indicated with spaces.

    Passing the above table to the `parse()` method will result in the
    following data structure, whose interpretation is the same as for
    `GridTableParser`::

        ([5, 25],
         [[(0, 0, 1, ['col 1']),
           (0, 0, 1, ['col 2'])]],
         [[(0, 0, 3, ['1']),
           (0, 0, 3, ['Second column of row 1.'])],
          [(0, 0, 4, ['2']),
           (0, 0, 4, ['Second column of row 2.',
                      'Second line of paragraph.'])],
          [(0, 0, 6, ['3']),
           (0, 0, 6, ['- Second column of row 3.',
                      '',
                      '- Second item in bullet',
                      '  list (row 3, column 2).'])],
          [(0, 1, 10, ['4 is a span'])],
          [(0, 0, 12, ['5']),
           (0, 0, 12, [''])]])
    """

    head_body_separator_pat = re.compile('=[ =]*$')
    span_pat = re.compile('-[ -]*$')

    def setup(self, block):
        """
        Initialise the parser state for the table text `block`.

        `block` must support slicing, `disconnect()` and `get_2D_block()`
        (presumably a docutils `StringList` -- confirm against the caller).
        """
        self.block = block[:]           # make a copy; it will be modified
        self.block.disconnect()         # don't propagate changes to parent
        # Convert top & bottom borders to column span underlines:
        self.block[0] = self.block[0].replace('=', '-')
        self.block[-1] = self.block[-1].replace('=', '-')
        self.head_body_sep = None
        self.columns = []
        self.border_end = None
        self.table = []
        self.done = [-1] * len(block[0])
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        First determine the column boundaries from the top border, then
        process rows.  Each row may consist of multiple lines; accumulate
        lines until a row is complete.  Call `self.parse_row` to finish the
        job.
        """
        # Top border must fully describe all table columns.
        self.columns = self.parse_columns(self.block[0], 0)
        self.border_end = self.columns[-1][1]
        firststart, firstend = self.columns[0]
        offset = 1                      # skip top border
        start = 1
        text_found = None
        while offset < len(self.block):
            line = self.block[offset]
            if self.span_pat.match(line):
                # Column span underline or border; row is complete.
                self.parse_row(self.block[start:offset], start,
                               (line.rstrip(), offset))
                start = offset + 1
                text_found = None
            elif line[firststart:firstend].strip():
                # First column not blank, therefore it's a new row.
                if text_found and offset != start:
                    self.parse_row(self.block[start:offset], start)
                start = offset
                text_found = 1
            elif not text_found:
                # Nothing accumulated yet; skip over this line.
                start = offset + 1
            offset += 1

    def parse_columns(self, line, offset):
        """
        Given a column span underline, return a list of (begin, end) pairs.

        `offset` is the index of `line` within the table and is used for
        error reporting only.  Once the top border has been parsed
        (`self.columns` is set), raise `TableMarkupError` if the underline
        does not end where the top border ends.
        """
        cols = []
        end = 0
        while True:
            begin = line.find('-', end)
            if begin < 0:
                break                   # no more runs of '-'
            end = line.find(' ', begin)
            if end < 0:
                end = len(line)
            cols.append((begin, end))
        if self.columns:
            if cols[-1][1] != self.border_end:
                raise TableMarkupError('Column span incomplete in table '
                                       'line %s.' % (offset+1),
                                       offset=offset)
            # Allow for an unbounded rightmost column:
            cols[-1] = (cols[-1][0], self.columns[-1][1])
        return cols

    def init_row(self, colspec, offset):
        """
        Return a list of empty cells, one per `(start, end)` pair in `colspec`.

        Each cell is a list `[0, morecols, offset, []]`, where `morecols` is
        the number of additional table columns the cell spans.  Raise
        `TableMarkupError` if a span does not start and end on column
        boundaries from `self.columns`.
        """
        i = 0
        cells = []
        for start, end in colspec:
            morecols = 0
            # Explicit check rather than `assert`, which is stripped
            # under -O and would let misaligned spans through.
            try:
                aligned = (start == self.columns[i][0])
                while aligned and end != self.columns[i][1]:
                    i += 1
                    morecols += 1
            except IndexError:
                # Ran past the last column without finding a matching end.
                aligned = False
            if not aligned:
                raise TableMarkupError('Column span alignment problem '
                                       'in table line %s.' % (offset+2),
                                       offset=offset+1)
            cells.append([0, morecols, offset, []])
            i += 1
        return cells

    def parse_row(self, lines, start, spanline=None):
        """
        Given the text `lines` of a row, parse it and append to `self.table`.

        The row is parsed according to the current column spec (either
        `spanline` if provided or `self.columns`).  For each column, extract
        text from each line, and check for text in column margins.  Finally,
        adjust for insignificant whitespace.

        `start` is the index of the row's first line within the table.
        `spanline`, if given, is a `(text, offset)` pair describing the
        column span underline that ends the row.
        """
        if not (lines or spanline):
            # No new row, just blank lines.
            return
        if spanline:
            columns = self.parse_columns(*spanline)
        else:
            columns = self.columns[:]
        self.check_columns(lines, start, columns)
        row = self.init_row(columns, start)
        for i, (colstart, colend) in enumerate(columns):
            cellblock = lines.get_2D_block(0, colstart, len(lines), colend)
            cellblock.disconnect()      # lines in cell can't sync with parent
            cellblock.replace(self.double_width_pad_char, '')
            row[i][3] = cellblock
        self.table.append(row)

    def check_columns(self, lines, first_line, columns):
        """
        Check for text in column margins and text overflow in the last column.
        Raise TableMarkupError if anything but whitespace is in column margins.
        Adjust the end value for the last column if there is text overflow.

        `columns` is modified in place; `self.columns[-1]` is widened as well
        when the overflow extends beyond its current end.
        """
        # "Infinite" value for a dummy last column's beginning, used to
        # check for text overflow (sys.maxsize exists on Python 2.6+ and 3):
        columns.append((sys.maxsize, None))
        lastcol = len(columns) - 2
        # combining characters do not contribute to the column width
        lines = [strip_combining_chars(line) for line in lines]

        for i in range(len(columns) - 1):
            start, end = columns[i]
            nextstart = columns[i+1][0]
            for offset, line in enumerate(lines):
                if i == lastcol and line[end:].strip():
                    # Text runs past the last column: widen the column.
                    text = line[start:].rstrip()
                    new_end = start + len(text)
                    columns[i] = (start, new_end)
                    main_start, main_end = self.columns[-1]
                    if new_end > main_end:
                        self.columns[-1] = (main_start, new_end)
                elif line[end:nextstart].strip():
                    raise TableMarkupError('Text in column margin '
                        'in table line %s.' % (first_line+offset+1),
                        offset=first_line+offset)
        columns.pop()                   # remove the dummy column again

    def structure_from_cells(self):
        """
        Return the `(colspecs, headrows, bodyrows)` tuple for the parsed table.

        Rows whose first cell starts after the head/body separator line are
        body rows; without a separator every row is a body row.
        """
        colspecs = [end - start for start, end in self.columns]
        first_body_row = 0
        if self.head_body_sep:
            for i, row in enumerate(self.table):
                if row[0][2] > self.head_body_sep:
                    first_body_row = i
                    break
        return (colspecs, self.table[:first_body_row],
                self.table[first_body_row:])
535 | |
536 | |
def update_dict_of_lists(master, newdata):
    """
    Merge the list values of `newdata` into `master`, in place.

    Both arguments are dictionaries whose values are lists.  For each key
    of `newdata`, its list is appended to the list stored under the same
    key in `master`; keys missing from `master` start from an empty list.
    """
    for key in newdata:
        target = master.setdefault(key, [])
        target.extend(newdata[key])