uniform matcher( "re pattern" / re / func / dict / list / tuple / set ) « Python recipes

matcher() makes a string matcher function from any of:

"RE pattern string"
re.compile()
a function, i.e. callable
a dict / list / tuple / set / container

This uniformity is simple, useful, a Good Thing.

A few example functions that use matchers are included too: grep, getfields and kwgrep.

      """
matcher() makes a string matcher function from any of:
    "RE pattern string"
    re.compile()
    a function, i.e. callable
    a dict / list / tuple / set / container

This uniformity is simple, useful, a Good Thing.

Usage:
    matchf = matcher( "re pattern" / re / func / dict / list / tuple / set )
    ...
    if matchf( str ):
        -- re.search( str ) / func( str ) / str in the container

A few example functions using matchers are here too:

  grep( matcher(), afile )
    -- print matching lines + header and trailer, in a file or iterable

  getfields( "kw kw2 ...".split(), afile )
    -- lines starting "kw:"  ->  [ ("kw", "kw: line") ... ]

  kwgrep, combined grep and getfields:
    kwgrep(
        "name: ^mp3  version: ^1.2.3"   -- match these
        "home-page: summary:"           -- and get these too
        )
    -> lines, nhit, nre
    e.g. [( "name", "name: mp3xx" )], 1, 2  -- 1 kw match only, not both

"""

# care: the pattern "end$" doesn't match "end " or "end\r"
# ("$" only skips a single final "\n"), so rstrip lines early
# see also: pls egg-info list + search,  gmatch goopat

# 5may: matcher( ">= NJ" )  ->  lambda x: x >= "NJ"  -- str not num


import re, sys

__version__ = "2009-05-06-May"
__author_email__ = "denis-bz-py@t-online.de"
__credits__ = "BeautifulSoup"
# debug switch: when true, kwgrep / str_grepdict / compare_func trace to stderr
Test = 0

# type of a compiled regular expression, for the isinstance() check in matcher()
_re_type = type( re.compile( "" ))
# verbose-mode pattern: one of < = > optionally followed by "=", then the operand text
_relop_re = re.compile( r" ([<=>] =?) \s* (.*) ", re.X )

#-------------------------------------------------------------------------------
def matcher( x, negate=False ):
    """ matcher( "string" / compiled re / func / dict / list / tuple / set )
        -> a uniform match func, see above

        x may be:
          "" / ".*" / "*"       -> always matches
          "<op> text"           -> string comparison, see compare_func()
          any other string      -> re.compile( x ).search
          a compiled re         -> its .search
          a callable            -> returned as is, so matcher( matcher( x )) works
          a list / tuple        -> membership in a set built from it (a snapshot)
          a dict / set / other container  -> its own __contains__ (live view)
          0 / 1 / True / False / None     -> constant result x
        negate: if true, return the logical inverse of the match func.
        Raises AssertionError for any other kind of x.
    """
    if x in ( "", ".*", "*" ):  # always match
        f = lambda _: True
    elif isinstance( x, basestring ):
        if x[0] in "<=>":
            f = compare_func( x )  # ">= NJ"
        else:
            f = re.compile( x ) .search  # not BSoup lambda s: s == x
    elif isinstance( x, _re_type ):
        f = x.search
    elif callable( x ):  # e.g. re.compile().match
        f = x
    elif isinstance( x, (list, tuple) ):
        f = set( x ).__contains__
    elif hasattr( x, "__contains__" ):  # dict, set -- care if they change later ?!
        f = x.__contains__
    elif x in ( 0, 1, True, False, None ):
        f = lambda _: x
    else:
        # Raise explicitly instead of "assert 0": asserts are stripped under
        # python -O, which left f unbound here.  Same exception type as before.
        raise AssertionError(
            "matcher: %s must be one of: str re callable dict list tuple" % (x,) )
    if negate:
        return lambda s: not f( s )  # match this - that, cf goopat
    return f

#...............................................................................
def grep( matchf, afile, header="", indent="", trailer="", out=sys.stdout ):
    """ print lines matching matchf ("re" / re / func / dict) + header and trailer
        -> nmatch

        matchf:  anything matcher() accepts
        afile:   a filename (opened here, and closed again) or an iterable of lines
        header:  written once, before the first matching line
        indent:  prefix for each matching line
        trailer: written after the last line if anything matched
        out:     file-like object that receives ALL output;
                 out None: just return 1 on first match / 0
        Trailing whitespace, including the line end, is stripped before
        matching and printing, so "end$" and container matchers work on
        lines read from a file and output is not double-spaced.
    """
    matchf = matcher( matchf )
    opened_here = isinstance( afile, basestring )
    if opened_here:
        afile = open( afile )  # IOError: [Errno 2] No such file or directory
    nmatch = 0
    try:
        for line in afile:
            line = line.rstrip()
            if not matchf( line ):
                continue
            if out is None:
                return 1
            if header:
                out.write( "%s\n" % (header,) )
                header = None  # only once
            out.write( "%s%s\n" % (indent, line) )
            nmatch += 1
    finally:
        if opened_here:  # never close a file object the caller handed in
            afile.close()
    if nmatch and out and trailer:
        out.write( "%s\n" % trailer.rstrip( " " ) )
    return nmatch


# verbose-mode patterns for "keyword: ..." lines; a keyword is word chars, "." or "-"
_kw_re = re.compile(      r" \s*  ([\w.-]+)  \s*  :  \s* ", re.X )  # group 1 = keyword
_kw_rest_re = re.compile( r" \s*  ([\w.-]+)  \s*  :  \s*  (.*)  ", re.X )  # + group 2 = rest
                                #  kw: rest of line

#...............................................................................
def getfields( fields, afile, lower=True ):
    """ grep lines starting with given keywords / field names ":"
        e.g. "name version".split()
       -> lines [ ("name", "name: ...") ... ]

        fields: a string of names separated by whitespace, ":" or ",",
                or anything matcher() accepts (list / set / dict / func ...)
        afile:  a filename (opened here, and closed again) or an iterable of lines
        lower:  if true, the keyword found in each line is lowercased
                before it is tested and returned
    """
    if isinstance( fields, basestring ):
        # a real list, so that matcher() sees a list whatever filter() returns
        fields = [name for name in re.split( r"[\s:,]+", fields ) if name]
    matchf = matcher( fields )
    opened_here = isinstance( afile, basestring )
    if opened_here:
        afile = open( afile )
    lines = []
    try:
        for line in afile:
            m = _kw_re.match( line )  # kw: ...
            if not m:
                continue
            kw = m.group( 1 )
            if lower:
                kw = kw.lower()
            if matchf( kw ):  # kw in dict list tuple or set  or func( kw )
                lines.append( (kw, line.rstrip()) )
    finally:
        if opened_here:  # never close a file object the caller handed in
            afile.close()
    return lines

#...............................................................................
def kwgrep( grepdict, afile, lower=True ):
    """ combined grep + getfields:
        "name: ^mp3  version: ^1.2.3"   -- match these
        "home-page: summary:"           -- and get these too
        or a dict, kw -> matcher() / "" for also-gets
    usage:
        lines, nmatch, nre = kwgrep()
        if nmatch == nre:
            ... all REs matched, here both name: and version:
            ... or 0 == 0, just getfields

        afile: a filename (opened here, and closed again) or an iterable of lines
        lower: if true, the keyword found in each line is lowercased
               before it is looked up in grepdict
        -> ( [ (kw, line) ... ],
             number of different keywords whose matcher hit,
             number of keywords that have a matcher at all )
    """
    if isinstance( grepdict, basestring ):
        grepdict = str_grepdict( grepdict )
    opened_here = isinstance( afile, basestring )
    if opened_here:
        afile = open( afile )
    lines = []
    hits = {}  # nr diff keywords w RE matches
    try:
        for line in afile:
            m = _kw_rest_re.match( line )  # kw: ...
            if not m:
                continue
            kw = m.group( 1 )
            if lower:
                kw = kw.lower()
            if kw not in grepdict:
                continue
            matchf = grepdict[kw]
            if not matchf:
                lines.append( (kw, line.rstrip()) )  # also-gets
                continue
            restofline = m.group( 2 ) .rstrip()
            if matchf( restofline ):
                lines.append( (kw, line.rstrip()) )
                hits[kw] = 1
    finally:
        if opened_here:  # never close a file object the caller handed in
            afile.close()
    # count the keywords with a real matcher; the loop name must not shadow module re
    nre = sum( [bool( val ) for val in grepdict.values()] )
    if Test:
        sys.stderr.write( "test kwgrep: %s %s %s\n" % (lines, len(hits), nre) )
    return (lines, len(hits), nre)


def str_grepdict( s ):  # for kwgrep
    """ "kw: pattern  kw2:  ..."  ->  dict  kw -> matcher( pattern ) / ""
        A string without any ":" is a plain list of field names,
        each mapped to "" (get the field, nothing to match).
    """
    if ":" not in s:
        return dict.fromkeys( s.split(), "" )
    # split on "kw:" -> [ text before the first kw, kw, value, kw, value ... ]
    parts = _kw_re.split( s.rstrip() )
    keywords = parts[1::2]
    values = parts[2::2]
    grepdict = {}
    for kw, val in zip( keywords, values ):
        if val:
            grepdict[kw] = matcher( val )
        else:
            grepdict[kw] = ""  # keyword only: an also-get
    if Test:
        print >>sys.stderr, "test str_grepdict:", grepdict
    return grepdict

#...............................................................................
def putlines( lines, header="", indent="", trailer="", out=sys.stdout ):
    """ kwlines = getfields( "name version".split(), afile )
        putlines( map( itemgetter(1), kwlines ), header=afile )

        Write header, then each line prefixed with indent, then trailer
        (trailing blanks stripped), all to out.
        Nothing at all is written if lines is empty.
    """
    if not lines:
        return
    if header:
        out.write( "%s\n" % (header,) )
    for line in lines:
        out.write( "%s%s\n" % (indent, line) )  # to out, like header and trailer
    if trailer:
        out.write( "%s\n" % trailer.rstrip( " " ) )

#...............................................................................
def compare_func( relopstr ):
    """ "< 3"  ->  the function  x -> (x < "3")  NB str "3" not num 3

        relopstr: one of  <  <=  =  ==  >  >=  followed by the operand text;
                  "=" means "==".  Trailing whitespace and surrounding
                  double quotes are stripped from the operand.
        The operand is compared literally: quotes or backslashes inside it
        are ordinary characters, nothing is eval'ed.
        Raises ValueError if relopstr does not start with such an operator.
    """
    # (could lambda x: (x < 3) if isnum(x)  else (x < "3")
    # but then version: > 0.1  is num compare,
    #          version: > 0.1.0  str

    m = _relop_re.match( relopstr )
    if m is None:
        raise ValueError(
            "compare_func: %r must start with one of < <= = == > >=" % (relopstr,) )
    relop, s = m.groups()
    if relop == "=":
        relop = "=="
    s = s.rstrip() .strip( "\"" )
    if Test:
        sys.stderr.write( "compare_func: lambda x: x %s \"%s\"\n" % (relop, s) )
    # closures over s instead of eval() of a built-up source string,
    # which broke (or ran arbitrary code) when the operand contained a '"'
    comparisons = {
        "<":  lambda x: x < s,
        "<=": lambda x: x <= s,
        "==": lambda x: x == s,
        ">":  lambda x: x > s,
        ">=": lambda x: x >= s,
    }
    return comparisons[relop]


#...............................................................................
if __name__ == "__main__":
    # Tiny interactive driver: each input line is split at ";" into "kw: value"
    # lines and run through kwgrep();  "!statement" executes a Python statement.

#     for pat in ( "a", re.compile( "^a" ), matcher( "b" ), dict( a=1 ) ):
#         matchf = matcher( pat )
#         for s in "ab":
#             print "matcher( %s )( %r ) = %s" % (  pat, s, matchf( s ))
#         print ""
#     grep( "^def", __file__ )

    try:
        import readline  # imported for its side effect on raw_input() only
    except ImportError:
        pass  # module not available on this platform: carry on without it

    # for afile in sys.argv[1:]:
    while 1:
        try:
            line = raw_input( "matcher: " )
        except EOFError:
            break
        if line.startswith( "!" ):  # not line[0]: IndexError on an empty line
            exec( line[1:] .strip() )
        else:
            result = kwgrep( "name: ^mp3  version: ", line.split( ";" ))
            sys.stdout.write( "%s\n" % (result,) )

# end matcher.py

      

Uniformity is a Good Thing. matcher() makes uniform string match functions out of "RE pattern string" / re.compile() / dict ... The technique is surely well-known, a re-invented wheel -- let me know.

Tags: grep, re, text_processing, uniform

◄	Python recipes (4591)	►
◄	denis's recipes (5)	►

uniform matcher( "re pattern" / re / func / dict / list / tuple / set ) (Python recipe) by denis
ActiveState Code (http://code.activestate.com/recipes/576741/)

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

uniform matcher( "re pattern" / re / func / dict / list / tuple / set ) (Python recipe) by denis ActiveState Code (http://code.activestate.com/recipes/576741/)