Python's os.walk() standard library iterator is useful if you want to walk an entire directory tree, but you're on your own when it comes to implementing name filtering and recursive depth limiting on top of it.
This recipe supports these features with an interface that is just as convenient as the underlying os.walk() API, while being significantly more powerful.
import fnmatch
import os
import os.path
import collections
import sys
# Record yielded for every directory visited by filter_walk().
WalkedDir = collections.namedtuple("WalkedDir", "path subdirs files depth")


def _report_symlink_loop(path):
    """Default ``onloop`` handler: warn on stderr and decline to descend.

    Implicitly returns None (false), which tells filter_walk to skip *path*.
    """
    msg = "Symlink {!r} refers to a parent directory, skipping\n"
    sys.stderr.write(msg.format(path))
    sys.stderr.flush()


def _resolves_to_ancestor(path, relative, real_top):
    """Return True if the symlink *path* points at one of its own ancestors.

    *relative* is *path* expressed relative to the walk root, and *real_top*
    is the resolved absolute location of that root.
    """
    # Where the link nominally lives inside the resolved tree...
    nominal_path = os.path.join(real_top, relative)
    # ...versus where it actually leads.
    real_path = os.path.abspath(os.path.realpath(path))
    if nominal_path == real_path:
        return True
    # Compare against "<target><sep>" so that "/a/bc" is not mistaken for a
    # child of "/a/b", and so that a target of "/" (which strips down to the
    # empty string) is still recognised as an ancestor of everything.
    return nominal_path.startswith(real_path.rstrip(os.sep) + os.sep)


def filter_walk(top, file_pattern=None, dir_pattern=None, depth=None, onerror=None, followlinks=False, onloop=None):
    """Walk a directory tree like os.walk, with filtering and depth limiting.

    Compared to os.walk, this generator:

    - yields a WalkedDir named tuple of (path, subdirs, files, depth)
    - allows a recursion depth limit to be specified
    - allows independent glob-style filters for filenames and subdirectories
    - skips a directory (after reporting it) if a symlink loop is encountered
      while following links

    Walks are always top down, as the directory listings must be altered to
    provide the above features.

    Parameters:
      top          - directory at which the walk starts (reported as depth 0)
      file_pattern - fnmatch-style pattern applied to each file listing;
                     None disables file filtering
      dir_pattern  - fnmatch-style pattern applied to each subdirectory
                     listing; non-matching subdirectories are neither
                     reported nor descended into; None disables filtering
      depth        - maximum depth to descend to, or None for no limit. A
                     depth of zero can be useful to get separate filtered
                     subdirectory and file listings for a single directory
      onerror      - passed to os.walk to handle directory listing errors
      followlinks  - passed to os.walk; also enables symlink loop detection
      onloop       - called with the directory path when a symlink loop is
                     detected. Any false return value skips the directory,
                     any true value means it is processed as normal. The
                     default handler writes a message to stderr and skips.

    Raises ValueError if depth is negative. Since this is a generator, the
    check runs when iteration starts rather than when filter_walk is called.
    """
    if depth is not None and depth < 0:
        msg = "Depth limit must be None or at least 0 ({!r} provided)"
        raise ValueError(msg.format(depth))
    if onloop is None:
        onloop = _report_symlink_loop
    # The resolved root is only needed for symlink loop detection.
    real_top = os.path.abspath(os.path.realpath(top)) if followlinks else None
    for path, walk_subdirs, files in os.walk(top, topdown=True,
                                             onerror=onerror,
                                             followlinks=followlinks):
        # Position of this directory relative to the root of the walk. Using
        # relpath (instead of counting separators in the raw strings) keeps
        # the depth correct when top has a trailing separator or is the
        # filesystem root.
        relative = os.path.relpath(path, top)
        is_top = relative == os.curdir
        # Check for symlink loops. The starting directory is exempt: nothing
        # has been walked yet, so a symlinked top cannot form a loop.
        if followlinks and not is_top and os.path.islink(path):
            # We just descended into a directory via a symbolic link
            if _resolves_to_ancestor(path, relative, real_top):
                if not onloop(path):
                    walk_subdirs[:] = []
                    continue
        # Filter files, if requested
        if file_pattern is not None:
            files = fnmatch.filter(files, file_pattern)
        # We hide the underlying generator's subdirectory list, since we
        # clear it internally when we reach the depth limit (if any)
        if dir_pattern is None:
            subdirs = walk_subdirs[:]
        else:
            subdirs = fnmatch.filter(walk_subdirs, dir_pattern)
        # Report depth
        current_depth = 0 if is_top else relative.count(os.sep) + 1
        yield WalkedDir(path, subdirs, files, current_depth)
        # Filter directories and implement depth limiting. os.walk consults
        # walk_subdirs after we resume, so it must be edited in place.
        if depth is not None and current_depth >= depth:
            walk_subdirs[:] = []
        else:
            walk_subdirs[:] = subdirs
Passing file_pattern='' can be a useful trick if you only want to walk the directories and don't care about the files at all, since an empty pattern matches no file names.
The following example output:
>>> for info in filter_walk("path_fodder", followlinks=True):
... print(info)
...
WalkedDir(path='path_fodder', subdirs=['subdir', 'subdir2'], files=['data.txt'], depth=0)
WalkedDir(path='path_fodder/subdir', subdirs=['peerdirlink', 'subsubdir'], files=['data.txt'], depth=1)
WalkedDir(path='path_fodder/subdir/peerdirlink', subdirs=[], files=[], depth=2)
WalkedDir(path='path_fodder/subdir/subsubdir', subdirs=['parentdirlink'], files=[], depth=2)
Symlink 'path_fodder/subdir/subsubdir/parentdirlink' refers to a parent directory, skipping
WalkedDir(path='path_fodder/subdir2', subdirs=[], files=[], depth=1)
Is produced for the following directory structure:
path_fodder/
data.txt
subdir/
data.txt
peerdirlink --> ../subdir2
subsubdir/
parentdirlink --> ..
subdir2/
It's fairly easy to create a simple "find" utility based on the above:
def find_files(top, file_pattern=None):
    """Yield the absolute path of each file found under *top*.

    The tree is walked with filter_walk(); *file_pattern* is handed to it
    as the file name filter, so None reports every file.
    """
    # Each walked entry unpacks as (path, subdirs, files, depth); only the
    # directory path and its (already filtered) file names are needed here.
    for walked_path, _subdirs, matched_files, _depth in filter_walk(top, file_pattern):
        absolute_dir = os.path.abspath(walked_path)
        for filename in matched_files:
            yield os.path.join(absolute_dir, filename)
>>> print('\n'.join(find_files('path_fodder', '*.txt')))
/home/example/path_fodder/data.txt
/home/example/path_fodder/subdir/data.txt
Rather than using this recipe directly, you may prefer to check out the packaged version on PyPI: http://walkdir.readthedocs.org