Find duplicate code in Python 2/3 source files. Write a nice report about it.
It works at the Abstract Syntax Tree level, which is a robust way to detect clones; see for example the code it finds duplicated in the Python 3.2 standard library.
Update: I cleaned the code a little bit, made it Python 2.7 compatible and faster.
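To see why comparing syntax trees is robust, here is a minimal standalone check (plain ast from the standard library, independent of the script below): the parser discards comments and layout, so two snippets that differ only in formatting produce identical trees.

import ast

one = "total = price + tax  # add tax"
two = "total=price+tax"

# Comments and whitespace never reach the AST, so the two
# dumps compare equal despite the different source layout.
print(ast.dump(ast.parse(one)) == ast.dump(ast.parse(two)))  # True

Here is the script: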
"""
Python code clone detector,
using Abstract Syntax Trees.
"""
import ast
import collections

__author__ = 'FBV'
__license__ = 'MIT'
__version__ = '0.0.2'
__status__ = 'alpha'


class Position(ast.NodeVisitor):
    '''
    Find a clone position in the code (its line-span).
    Count child nodes.
    '''
    def init(self, clone):
        '''
        Lazy initialization.
        '''
        self.begin_line = self.end_line = clone.node.lineno
        self.node_count = 0
        self.generic_visit(clone.node)

    def visit(self, node):
        '''
        Find the node's line span and count every sub-node.
        '''
        if hasattr(node, 'lineno'):
            self.begin_line = min(self.begin_line, node.lineno)
            self.end_line = max(self.end_line, node.lineno)
        self.node_count += 1
        self.generic_visit(node)


class Clone(collections.namedtuple('Clone', 'node file position')):
    '''
    A set of code.
    '''
    def source(self, indent=''):
        '''
        Retrieve the original source code.
        '''
        if not hasattr(self.position, 'begin_line'):
            self.position.init(self)
        lines = self.file.source[
            self.position.begin_line - 1:self.position.end_line]
        return (self.position.begin_line, self.position.end_line,
                '\n'.join(indent + line.rstrip() for line in lines))


class Clones(list):
    '''
    A list of identical code snippets.
    '''
    def score(self):
        '''
        Provide a score for ordering clones while reporting.
        This sorts by number of nodes in the sub-tree, number
        of clones of the node, and code size.
        '''
        candidate = self[0]  # Pick the first clone.
        size = len(candidate.source()[-1])  # Also initializes its position.
        return (candidate.position.node_count, len(self), size)


class File:
    '''
    A source file.
    '''
    def __init__(self, name, source):
        '''
        Create a file with its name and source code (a list of lines).
        '''
        self.name = name
        self.source = source


def digest(node):
    '''
    Return an unambiguous string representation of the sub-tree in node.
    Emulates ast.dump(node, False), with caching on each AST node.
    '''
    if isinstance(node, ast.AST):
        if not hasattr(node, '_cached'):
            node._cached = '%s(%s)' % (node.__class__.__name__, ', '.join(
                digest(b) for a, b in ast.iter_fields(node)))
        return node._cached
    elif isinstance(node, list):
        return '[%s]' % ', '.join(digest(x) for x in node)
    return repr(node)


class Index(ast.NodeVisitor):
    '''
    A source code repository.
    '''
    def __init__(self, exclude):
        '''
        Create a new file indexer.
        '''
        self.nodes = collections.defaultdict(Clones)
        self.blacklist = frozenset(exclude)

    def add(self, file):
        '''
        Add a file to the index and parse it.
        '''
        source = open(file).readlines()
        tree = ast.parse(''.join(source))
        self._file = File(file, source)
        self.generic_visit(tree)

    def visit(self, node):
        '''
        Walk the Abstract Syntax Tree of a file.
        Convert each sub-tree to a string, which is used
        as a key in the clones dictionary.
        '''
        if hasattr(node, 'lineno'):
            if node.__class__.__name__ not in self.blacklist:
                expr = digest(node)
                self.nodes[expr].append(
                    Clone(node, self._file, Position()))
        self.generic_visit(node)

    def clones(self):
        '''
        Return a list of duplicate constructs,
        most significant first.
        '''
        return sorted(
            ((expr, nodes)
             for expr, nodes in self.nodes.items() if len(nodes) > 1),
            key=lambda n: n[1].score(), reverse=True)


if __name__ == '__main__':
    import argparse
    import itertools

    # Parse command-line arguments.
    args = argparse.ArgumentParser(description=__doc__)
    args.add_argument('files',
                      metavar='FILE', nargs='+',
                      help='set of Python files to check for duplicate code')
    args.add_argument('--ignore', '-i',
                      metavar='NODE', nargs='+', default=['Name'],
                      help='skip some syntactic constructs (default: Name)')
    args.add_argument('--min', '-m',
                      metavar='N', type=int, default=0,
                      help='report items duplicated at least N times')
    args.add_argument('--version', '-v',
                      action='version', version='%(prog)s ' + __version__)
    input = args.parse_args()

    # Process files.
    sources = Index(input.ignore)
    for file in input.files:
        sources.add(file)

    # Report clones.
    for expr, clones in sources.clones():
        if len(clones) >= input.min:
            print("+%d repetitions of: %s ->" %
                  (len(clones), expr))
            for file, group in itertools.groupby(
                    clones, lambda clone: clone.file.name):
                print(" - in %s" % file)
                for clone in group:
                    begin, end, source = clone.source(' ' * 8)
                    if begin == end:
                        line = 'line %d' % begin
                    else:
                        line = 'lines %d to %d' % (begin, end)
                    print('   %s:\n%s' % (line, source))
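As a quick sanity check of the digest function above, here is a hypothetical session; it assumes the script was saved as dry.py so it can be imported:

import ast
from dry import digest  # assumes the script above was saved as dry.py

# On Python 2.7/3.2 this prints: Assign([Name('x', Store())], Num(1))
# (newer Pythons use Constant instead of Num). Field names are
# omitted, emulating ast.dump(node, False).
node = ast.parse('x = 1').body[0]
print(digest(node))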
Run it with some Python file(s) as argument(s):
./dry somefile.py […] | less
This will produce a list of all statements and expressions that appear more than once in the code, along with their positions and source code, ordered by criticality.
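For illustration, each report entry follows the format of the print statements above; the construct, file name and line numbers here are made up:

+2 repetitions of: Return(BinOp(Name('n', Load()), Mult(), Name('n', Load()))) ->
 - in somefile.py
   line 12:
        return n * n
   line 47:
        return n * n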
A command-line interface is provided which allows for basic customization of the results.
./dry --help
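For instance, to report only constructs that appear at least three times, while skipping names and number literals (Num is the numeric-literal node in the 2.7/3.2 grammar; note that passing --ignore replaces the default, so Name must be repeated):

./dry --min 3 --ignore Name Num somefile.py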
Although it has not been optimized much, the script finds all duplicates in my dist-packages in a few seconds.
This was hacked in a couple of hours, as a proof of concept. It could be extended if deemed interesting.