In The Art of Unix Programming, Eric S. Raymond describes a data file metaformat based on RFC 822. [http://www.faqs.org/docs/artu/ch05s02.html#id2902039] This is a simple parser for that format.
"""Parse files stored in the RFC 822 metaformat."""
from extensions.itertools import two_finger
from re import compile as Regex
def lines(string):
    """Return the logical lines of the string.

    The string is split into physical lines, which are then handed to
    merge_lines() so that continuation lines are folded into the line they
    continue.
    """
    physical_lines = string.splitlines()
    return merge_lines(physical_lines)
def load(string):
    """Parse the given string into a dictionary of key/value pairs.

    Comments are removed from the physical lines *before* continuation
    lines are merged.  Stripping them afterwards would let an end-of-line
    comment swallow every continuation line that follows it, because the
    merged logical line is a single string.

    Returns an empty dictionary when the string holds nothing but comments
    and blank lines.  May raise a ParseError.
    """
    uncommented = remove_comments(string.splitlines())
    # Nothing but comments and blank lines: there is nothing to merge or
    # pair up.
    if not any(line.strip() for line in uncommented):
        return {}
    return pairs(merge_lines(uncommented))
def merge_lines(lines):
    """Merge every line that begins with whitespace with its predecessor.

    Leading blank (empty or whitespace-only) lines are skipped.  Every
    following line that is empty or starts with whitespace is treated as a
    continuation and appended to the line before it.  The merged lines are
    returned with surrounding whitespace stripped.

    Returns an empty list when there is no non-blank line at all.

    Raises a ParseError if the first non-blank line is indented, since it
    has no preceding line to continue.
    """
    # Locate the first line holding something other than whitespace; the
    # else branch runs when the loop never breaks (empty or all-blank input).
    for first, line in enumerate(lines):
        if line and not line.isspace():
            break
    else:
        return []
    lines = lines[first:]
    if starts_with_whitespace(lines[0]):
        # Report the 1-based position of the offending line in the input.
        raise ParseError("%d: '%s': Keys cannot be indented." %
                         (first + 1, lines[0]))
    new_lines = []
    for line in lines:
        if starts_with_whitespace(line):
            new_lines[-1] += line
        else:
            new_lines.append(line)
    return [line.strip() for line in new_lines]
def pairify(string):
    """Split a "key: value" string at its first colon.

    Returns the tuple (key, value) with surrounding whitespace stripped from
    both parts.  Raises a ParseError when the string contains no colon.
    """
    key, colon, value = string.partition(":")
    if not colon:
        raise ParseError("'%s': Keys must be terminated with a colon." %
                         string)
    return key.strip(), value.strip()
def pairs(lines):
    """Build a dictionary from a list of "key: value" lines.

    Each line is split with pairify(); when a key occurs more than once the
    last occurrence wins.  May raise a ParseError.
    """
    result = {}
    for line in lines:
        key, value = pairify(line)
        result[key] = value
    return result
def remove_comments(lines):
    """Strip comments from the given lines.

    Lines that consist only of a comment (optional leading whitespace
    followed by "#") are dropped entirely.  On every other line, the text
    from the first "#" that is not immediately preceded by a backslash up to
    the end of the line is removed.  A "#" preceded by a backslash is not
    treated as the start of a comment and is left untouched, backslash
    included.

    Returns a new list.
    """
    # Raw strings keep the regex escapes out of Python's own string-escape
    # processing.
    comment_line = Regex(r"^\s*#.*$")
    eol_comment = Regex(r"(?<!\\)#.*$")
    return [eol_comment.sub("", line)
            for line in lines
            if not comment_line.match(line)]
def starts_with_whitespace(line):
    """Return True if the line is empty or its first character is whitespace."""
    return not line or line[0].isspace()
class ParseError(Exception):
    """Raised when the input does not follow the key/value line format."""
 | 

 Download
Download Copy to clipboard
Copy to clipboard

I don't have the book, but it seems more logical (to me at least) to remove comments before joining continuation lines. That is, load() should return:
so this text:
is parsed as:
True. When I first wrote load(), I hadn't planned on allowing end-of-line comments, so remove_comments() was only going to remove lines matching /^\s*#.*$/. When I added end-of-line comments, I did not make the necessary modifications.