Welcome, guest | Sign In | My Account | Store | Cart

Python's os.walk() standard library iterator is useful if you want to walk an entire directory tree, but you're on your own when it comes to implementing name filtering and recursive depth limiting on top of it.

This recipe supports these features with an interface that is just as convenient as the underlying os.walk() API, while being significantly more powerful.

Python, 75 lines
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import fnmatch
import os
import os.path
import collections
import sys

# Record yielded for every visited directory: its path, the (filtered) names
# of its subdirectories and files, and its depth below the walk's starting
# directory (the starting directory itself has depth 0).
WalkedDir = collections.namedtuple("WalkedDir", "path subdirs files depth")


def _depth_below(top, path):
    """Return how many directory levels *path* lies below *top*.

    *path* must be a path produced by os.walk(top), i.e. *top* itself or
    *top* with one or more names joined onto it. Measuring only the part
    that follows *top* keeps the result correct when *top* ends with a
    separator (e.g. "dir/") or is the filesystem root, where simply
    comparing separator counts would under-report the depth by one.
    """
    separators = os.sep + (os.altsep or "")
    remainder = path[len(top):].lstrip(separators)
    if not remainder:
        return 0
    return remainder.count(os.sep) + 1


def filter_walk(top, file_pattern=None, dir_pattern=None, depth=None, onerror=None, followlinks=False, onloop=None):
    """filter_walk is similar to os.walk, but offers the following additional features:
        - yields a named tuple of (path, subdirs, files, depth)
        - allows a recursion depth limit to be specified
        - allows independent glob-style filters for filenames and subdirectories
        - emits a message to stderr and skips the directory if a symlink loop is encountered when following links

       Selective walks are always top down, as the directory listings must be altered to provide
       the above features.

       If not None, depth must be at least 0. A depth of zero can be useful to get separate
       filtered subdirectory and file listings for a given directory. The reported depth is
       relative to top (which is depth 0), whether or not top ends with a path separator.

       file_pattern and dir_pattern (if provided) are fnmatch-style patterns applied to the
       file names and subdirectory names of each directory. Subdirectories that do not match
       dir_pattern are neither reported nor descended into.

       onerror is passed to os.walk to handle os.listdir errors
       followlinks is passed to os.walk and enables the symbolic loop detection
       onloop (if provided) can be used to override the default symbolic loop handling. It is
       called with the directory path as an argument when a loop is detected. Any false return
       value will skip the directory as normal, any true value means the directory will be processed.

       Raises ValueError if depth is negative.
    """
    if depth is not None and depth < 0:
        msg = "Depth limit must be None or at least 0 ({!r} provided)"
        raise ValueError(msg.format(depth))
    if onloop is None:
        # Default loop handler: report on stderr and return None (false),
        # which tells the walk to skip the offending directory.
        def onloop(path):
            msg = "Symlink {!r} refers to a parent directory, skipping\n"
            sys.stderr.write(msg.format(path))
            sys.stderr.flush()
    # The resolved starting directory is only needed for loop detection
    real_top = None
    if followlinks:
        real_top = os.path.abspath(os.path.realpath(top))
    sep = os.sep
    for path, walk_subdirs, files in os.walk(top, topdown=True,
                                             onerror=onerror,
                                             followlinks=followlinks):
        # Check for symlink loops
        if followlinks and os.path.islink(path):
            # We just descended into a directory via a symbolic link.
            # Compare where the link nominally sits (below the resolved
            # top) with where it really points, component by component.
            # If they agree for as long as both paths last, one is a
            # prefix of the other, so the link refers to a directory
            # that is a parent of its nominal location.
            relative = os.path.relpath(path, top)
            nominal_path = os.path.join(real_top, relative)
            real_path = os.path.abspath(os.path.realpath(path))
            components = zip(nominal_path.split(sep), real_path.split(sep))
            is_loop = all(nominal == real for nominal, real in components)
            if is_loop and not onloop(path):
                # Clearing in place stops os.walk from descending further
                walk_subdirs[:] = []
                continue
        # Filter files, if requested
        if file_pattern is not None:
            files = fnmatch.filter(files, file_pattern)
        # We hide the underlying generator's subdirectory list, since we
        # clear it internally when we reach the depth limit (if any)
        if dir_pattern is None:
            subdirs = walk_subdirs[:]
        else:
            subdirs = fnmatch.filter(walk_subdirs, dir_pattern)
        # Report depth
        current_depth = _depth_below(top, path)
        yield WalkedDir(path, subdirs, files, current_depth)
        # Filter directories and implement depth limiting. This happens
        # after the yield, so any pruning the caller did on the reported
        # subdirs list is also honoured by the underlying os.walk.
        if depth is not None and current_depth >= depth:
            walk_subdirs[:] = []
        else:
            walk_subdirs[:] = subdirs

"file_pattern=''" can be a useful trick if you only want to walk the directories and don't care about the files at all.

The following example output:

>>> for info in filter_walk("path_fodder", followlinks=True):
...     print(info)
... 
WalkedDir(path='path_fodder', subdirs=['subdir', 'subdir2'], files=['data.txt'], depth=0)
WalkedDir(path='path_fodder/subdir', subdirs=['peerdirlink', 'subsubdir'], files=['data.txt'], depth=1)
WalkedDir(path='path_fodder/subdir/peerdirlink', subdirs=[], files=[], depth=2)
WalkedDir(path='path_fodder/subdir/subsubdir', subdirs=['parentdirlink'], files=[], depth=2)
Symlink 'path_fodder/subdir/subsubdir/parentdirlink' refers to a parent directory, skipping
WalkedDir(path='path_fodder/subdir2', subdirs=[], files=[], depth=1)

Is produced for the following directory structure:

path_fodder/
    data.txt
    subdir/
        data.txt
        peerdirlink --> ../subdir2
        subsubdir/
            parentdirlink --> ..
    subdir2/

It's fairly easy to create a simple "find" utility based on the above:

def find_files(top, file_pattern=None):
    """Yield the absolute path of each file found by walking *top*.

    top is the directory to start from; file_pattern (if not None) is a
    glob-style pattern passed to filter_walk to select file names.
    """
    for location, _subdirs, names, _depth in filter_walk(top, file_pattern):
        # Resolve the directory once, then reuse it for all of its files
        absolute_dir = os.path.abspath(location)
        for name in names:
            yield os.path.join(absolute_dir, name)

>>> print('\n'.join(find_files('path_fodder', '*.txt')))
/home/example/path_fodder/data.txt
/home/example/path_fodder/subdir/data.txt

1 comment

Nick Coghlan (author) 10 years, 5 months ago  # | flag

Rather than using this recipe directly, you may prefer to check out the packaged version on PyPI: http://walkdir.readthedocs.org