In The Art of Unix Programming, Eric S. Raymond describes a data file metaformat based on RFC 822. [http://www.faqs.org/docs/artu/ch05s02.html#id2902039] This is a simple parser for that format.
"""Parse files stored in the RFC 822 metaformat."""
from extensions.itertools import two_finger
from re import compile as Regex
def lines(string):
    """Return the logical lines of *string*.

    The string is split into physical lines, which are then handed to
    merge_lines() so that continuation lines are folded into the line they
    continue.
    """
    physical_lines = string.splitlines()
    return merge_lines(physical_lines)
def load(string):
    """Parse the given string into a dictionary of key/value pairs.

    Comments are removed from the physical lines *before* continuation
    lines are merged. Merging first would glue a comment that sits on or
    between continuation lines into the logical line, and stripping the
    comment afterwards would then discard all continuation text following
    it.

    May raise a ParseError.
    """
    physical_lines = string.splitlines()
    return pairs(merge_lines(remove_comments(physical_lines)))
def merge_lines(lines):
    """Merge every line that begins with whitespace with its predecessor.

    Leading blank (empty or whitespace-only) lines are skipped. Each
    remaining line that is empty or starts with whitespace is treated as a
    continuation and appended to the line before it. The merged lines are
    returned with surrounding whitespace stripped.

    Returns an empty list when *lines* holds no non-blank line.

    May raise a ParseError if the first non-blank line is indented, because
    a continuation needs a preceding line to attach to. The message carries
    the 1-based position of that line within *lines* and its text.
    """
    lines = list(lines)
    # Find the first line with real content; blank lines before it are
    # ignored.
    for offset, line in enumerate(lines):
        if line and not line.isspace():
            break
    else:
        # Empty input, or nothing but blank lines: there is nothing to merge.
        return []
    first = lines[offset]
    if starts_with_whitespace(first):
        raise ParseError("%d: '%s': Keys cannot be indented." %
                         (offset + 1, first))
    new_lines = []
    for line in lines[offset:]:
        if starts_with_whitespace(line):
            # Continuation (or blank) line: fold it into the previous line.
            new_lines[-1] += line
        else:
            new_lines.append(line)
    return [line.strip() for line in new_lines]
def pairify(string):
    """Split a "key: value" string into the tuple ("key", "value").

    The split happens at the first colon only, so the value may itself
    contain colons. Both parts are stripped of surrounding whitespace.

    May raise a ParseError when the string contains no colon.
    """
    pieces = string.split(":", 1)
    if len(pieces) < 2:
        raise ParseError("'%s': Keys must be terminated with a colon." %
                         string)
    key, value = pieces
    return key.strip(), value.strip()
def pairs(lines):
    """Build a dictionary from a list of "key: value" lines.

    Each line is converted with pairify(); when a key occurs more than
    once, the last occurrence wins. May raise a ParseError.
    """
    return {key: value for key, value in map(pairify, lines)}
def remove_comments(lines):
    """Strip comments from the given lines.

    Lines that consist solely of a comment (optional leading whitespace
    followed by "#") are dropped. On every other line, the text from the
    first "#" that is not immediately preceded by a backslash up to the end
    of the line is deleted; whitespace in front of that "#" is kept.

    Returns a new list; the input is not modified.
    """
    # Raw strings keep "\s" and "\\" as regex escapes instead of relying on
    # Python passing an invalid string escape through unchanged.
    comment_line = Regex(r"^\s*#.*$")
    eol_comment = Regex(r"(?<!\\)#.*$")
    return [eol_comment.sub("", line)
            for line in lines
            if not comment_line.match(line)]
def starts_with_whitespace(line):
    """Return True if *line* is empty or its first character is whitespace."""
    if not line:
        return True
    return line[0].isspace()
class ParseError(Exception):
    """Raised when the input cannot be parsed as key/value pairs."""
|
I don't have the book, but it seems more logical (to me at least) to remove comments before joining continuation lines. That is, load() should return:
so this text:
is parsed as:
True. When I first wrote load(), I hadn't planned on allowing end-of-line comments, so remove_comments() was only going to remove lines matching /^\s*#.*$/. When I added end-of-line comments, I did not make the necessary modifications.