def _try_log(level, msg, *args):
    """Best-effort logging through an optional module-level `log` object.

    "level" is the name of the logger method to call (e.g. "debug").
    "msg" and "args" are passed through to that method unchanged.

    These helpers are meant to be pasted into scripts that may or may
    not define a global `log`; if it is missing, or has no such method,
    the message is silently dropped.
    """
    try:
        getattr(log, level)(msg, *args)
    except (NameError, AttributeError):
        # No usable `log` global: logging is optional here.
        pass


def _should_include_path(path, includes, excludes):
    """Return True iff the given path should be included.

    Only the basename of "path" is matched (with `fnmatch`).

    "includes" is a list of patterns; if non-empty, the basename must
        match at least one of them.
    "excludes" is a list of patterns; the basename must match none.
    """
    from os.path import basename
    from fnmatch import fnmatch

    base = basename(path)
    if includes:
        for include in includes:
            if fnmatch(base, include):
                _try_log("debug", "include `%s' (matches `%s')",
                         path, include)
                break
        else:
            _try_log("debug", "exclude `%s' (matches no includes)", path)
            return False
    for exclude in excludes:
        if fnmatch(base, exclude):
            _try_log("debug", "exclude `%s' (matches `%s')", path, exclude)
            return False
    return True


def _walk(top, topdown=True, onerror=None, follow_symlinks=False):
    """A version of `os.walk()` with a couple differences regarding symlinks.

    1. follow_symlinks=False (the default):
        A symlink to a dir is returned as a *non*-dir. In `os.walk()`, a
        symlink to a dir is returned in the *dirs* list, but it is not
        recursed into.
    2. follow_symlinks=True:
        A symlink to a dir is returned in the *dirs* list (as with
        `os.walk()`) but it *is conditionally* recursed into (unlike
        `os.walk()`).

        A symlinked dir is only recursed into if it is to a deeper dir
        within the same tree. This is my understanding of how `find -L
        DIR` works.

    Yields `(top, dirs, nondirs)` tuples. With topdown=True the caller
    may prune `dirs` in place to stop descent into those sub-dirs.
    "onerror", if given, is called with the OSError raised when a
    directory cannot be listed; that directory is then skipped.
    """
    import os
    from os.path import join, isdir, islink, abspath

    # We may not have read permission for top, in which case we can't
    # get a list of the files the directory contains.  os.path.walk
    # always suppressed the exception then, rather than blow up for a
    # minor reason when (say) a thousand readable directories are still
    # left to visit.  That logic is copied here.
    try:
        names = os.listdir(top)
    except OSError as err:
        if onerror is not None:
            onerror(err)
        return

    dirs, nondirs = [], []
    for name in names:
        path = join(top, name)
        # Without follow_symlinks a symlink is never treated as a dir.
        if isdir(path) and (follow_symlinks or not islink(path)):
            dirs.append(name)
        else:
            nondirs.append(name)

    if topdown:
        yield top, dirs, nondirs
    for name in dirs:
        path = join(top, name)
        if follow_symlinks and islink(path):
            # Only walk this path if it links deeper in the same tree.
            # NOTE: when `top` is the filesystem root, `top_abs + os.sep`
            # doubles the separator, so no symlink there is followed.
            top_abs = abspath(top)
            link_abs = abspath(join(top, os.readlink(path)))
            if not link_abs.startswith(top_abs + os.sep):
                continue
        for x in _walk(path, topdown, onerror,
                       follow_symlinks=follow_symlinks):
            yield x
    if not topdown:
        yield top, dirs, nondirs


# Sentinel for "on_error was not passed" (None is a meaningful value).
_NOT_SPECIFIED = ("NOT", "SPECIFIED")


def _paths_from_path_patterns(path_patterns, files=True, dirs="never",
                              recursive=True, includes=None, excludes=None,
                              skip_dupe_dirs=False, follow_symlinks=False,
                              on_error=_NOT_SPECIFIED):
    """_paths_from_path_patterns(path_patterns[, ...]) -> file paths

    Generate the paths (files and/or dirs) represented by the given
    path patterns. This is a generator.

        "path_patterns" is a list of paths optionally using the '*', '?'
            and '[seq]' glob patterns. It must not be a single string.
        "files" is boolean (default True) indicating if file paths
            should be yielded
        "dirs" is string indicating under what conditions dirs are
            yielded. It must be one of:
              never             (default) never yield dirs
              always            yield all dirs matching given patterns
              if-not-recursive  only yield dirs for invocations when
                                recursive=False
            See use cases below for more details.
        "recursive" is boolean (default True) indicating if paths should
            be recursively yielded under given dirs.
        "includes" is a list of file patterns to include in recursive
            searches.
        "excludes" is a list of file and dir patterns to exclude.
            (Note: This is slightly different than GNU grep's --exclude
            option which only excludes *files*.  I.e. you cannot exclude
            a ".svn" dir.)
        "skip_dupe_dirs" can be set True to watch for and skip
            descending into a dir that has already been yielded. Dirs
            are compared by normalized absolute path; symlinks are only
            dereferenced for that comparison when follow_symlinks=True.
        "follow_symlinks" is a boolean indicating whether to follow
            symlinks (default False). To guard against infinite loops
            with circular dir symlinks, only dir symlinks to *deeper*
            dirs are followed.
        "on_error" is an error callback called when a given path pattern
            matches nothing:
                on_error(PATH_PATTERN)
            If not specified, the default is look for a "log" global and
            call:
                log.error("`%s': No such file or directory")
            Specify None to do nothing.

    Typically this is useful for a command-line tool that takes a list
    of paths as arguments. (For Unix-heads: the shell on Windows does
    NOT expand glob chars, that is left to the app.)

    Use case #1: like `grep -r`
      {files=True, dirs='never', recursive=(if '-r' in opts)}
        script FILE     # yield FILE, else call on_error(FILE)
        script DIR      # yield nothing
        script PATH*    # yield all files matching PATH*; if none,
                        # call on_error(PATH*) callback
        script -r DIR   # yield files (not dirs) recursively under DIR
        script -r PATH* # yield files matching PATH* and files recursively
                        # under dirs matching PATH*; if none, call
                        # on_error(PATH*) callback

    Use case #2: like `file -r` (if it had a recursive option)
      {files=True, dirs='if-not-recursive', recursive=(if '-r' in opts)}
        script FILE     # yield FILE, else call on_error(FILE)
        script DIR      # yield DIR, else call on_error(DIR)
        script PATH*    # yield all files and dirs matching PATH*; if none,
                        # call on_error(PATH*) callback
        script -r DIR   # yield files (not dirs) recursively under DIR
        script -r PATH* # yield files matching PATH* and files recursively
                        # under dirs matching PATH*; if none, call
                        # on_error(PATH*) callback

    Use case #3: kind of like `find .`
      {files=True, dirs='always', recursive=(if '-r' in opts)}
        script FILE     # yield FILE, else call on_error(FILE)
        script DIR      # yield DIR, else call on_error(DIR)
        script PATH*    # yield all files and dirs matching PATH*; if none,
                        # call on_error(PATH*) callback
        script -r DIR   # yield files and dirs recursively under DIR
                        # (including DIR)
        script -r PATH* # yield files and dirs matching PATH* and recursively
                        # under dirs; if none, call on_error(PATH*)
                        # callback
    """
    from os.path import exists, isdir, join, normpath, abspath, \
                        lexists, islink, realpath
    from glob import glob

    # `basestring` only exists on Python 2; fall back to the Python 3
    # text/bytes types so the guard below works on both.
    try:
        string_types = basestring
    except NameError:
        string_types = (str, bytes)
    assert not isinstance(path_patterns, string_types), \
        "'path_patterns' must be a sequence, not a string: %r" % path_patterns
    if includes is None:
        includes = []
    if excludes is None:
        excludes = []
    GLOB_CHARS = '*?['

    # Canonical paths of dirs already handled (used with skip_dupe_dirs).
    searched_dirs = set()

    def already_searched(dir_path):
        """Record dir_path as searched; return True if it already was."""
        canon = normpath(abspath(dir_path))
        if follow_symlinks:
            canon = realpath(canon)
        if canon in searched_dirs:
            return True
        searched_dirs.add(canon)
        return False

    for path_pattern in path_patterns:
        # Determine the set of paths matching this path_pattern.
        if any(glob_char in path_pattern for glob_char in GLOB_CHARS):
            paths = glob(path_pattern)
        else:
            # When not following symlinks a dangling symlink still counts
            # as an existing path (lexists).
            found = exists if follow_symlinks else lexists
            paths = [path_pattern] if found(path_pattern) else []

        if not paths:
            if on_error is None:
                pass
            elif on_error is _NOT_SPECIFIED:
                _try_log("error", "`%s': No such file or directory",
                         path_pattern)
            else:
                on_error(path_pattern)

        for path in paths:
            if (follow_symlinks or not islink(path)) and isdir(path):
                if skip_dupe_dirs and already_searched(path):
                    continue

                # 'includes' SHOULD affect whether a dir is yielded.
                if (dirs == "always"
                    or (dirs == "if-not-recursive" and not recursive)
                   ) and _should_include_path(path, includes, excludes):
                    yield path

                # However, if recursive, 'includes' should NOT affect
                # whether a dir is recursed into. Otherwise you could
                # not:
                #   script -r --include="*.py" DIR
                if recursive and _should_include_path(path, [], excludes):
                    for dirpath, dirnames, filenames in _walk(
                            path, follow_symlinks=follow_symlinks):
                        kept_dirnames = []
                        for dirname in dirnames:
                            d = join(dirpath, dirname)
                            if skip_dupe_dirs and already_searched(d):
                                continue
                            if dirs == "always" \
                               and _should_include_path(d, includes,
                                                        excludes):
                                yield d
                            if _should_include_path(d, [], excludes):
                                kept_dirnames.append(dirname)
                        # Prune in place so _walk() does not descend into
                        # excluded or already-searched dirs.
                        dirnames[:] = kept_dirnames
                        if files:
                            for filename in sorted(filenames):
                                f = join(dirpath, filename)
                                if _should_include_path(f, includes,
                                                        excludes):
                                    yield f

            elif files and _should_include_path(path, includes, excludes):
                yield path