Welcome, guest | Sign In | My Account | Store | Cart

Find duplicate code in Python 2/3 source files. Write a nice report about it.

Works at the Abstract Syntax Tree level, which is a robust way to detect clones. See this code duplicated in the Python 3.2 standard library.

Update: I cleaned the code a little bit, made it Python 2.7 compatible and faster.

Python, 166 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
Python code clone detector,
using Abstract Syntax Trees.
"""

import ast
import collections

__author__  = 'FBV'
__license__ = 'MIT'
__version__ = '0.0.2'
__status__  = 'alpha'

class Position(ast.NodeVisitor):
    '''
    Find a clone position in the code (its line-span).
    Count child nodes.
    '''
    def init(self, clone):
        '''
        Lazy initialization.
        '''
        self.begin_line = self.end_line = clone.node.lineno
        self.node_count = 0
        self.generic_visit(clone.node)
    def visit(self, node):
        '''
        Find node's line and column span.
        '''
        if hasattr(node, 'lineno'):
            self.begin_line = min(self.begin_line, node.lineno)
            self.end_line = max(self.end_line, node.lineno)
            self.node_count += 1
            self.generic_visit(node)

class Clone(collections.namedtuple('Clone', 'node file position')):
    '''
    A set of code.
    '''
    def source(self, indent=''):
        '''
        Retrieve original source code.
        '''
        if not hasattr(self.position, 'begin_line'):
            self.position.init(self)
        lines = self.file.source[
            self.position.begin_line-1:self.position.end_line]
        return (self.position.begin_line, self.position.end_line,
                '\n'.join(indent + line.rstrip() for line in lines))

class Clones(list):
    '''
    A list of identical code snippets.
    '''
    def score(self):
        '''
        Provide a score for ordering clones while reporting.
        This sorts by number of nodes in the subtree, number
        of clones of the node, and code size.
        '''
        candidate = self[0] # Pick the first clone.
        size = len(candidate.source()[-1])
        return (candidate.position.node_count, len(self), size)

class File:
    '''
    A source file.
    '''
    def __init__(self, name, source):
        '''
        Create a file with name and source-code.
        '''
        self.name = name
        self.source = source
            
def digest(node):
    '''
    Return an unambiguous string representation of a sub-tree in node.
    Emulates ast.dump(node, False).
    '''
    if isinstance(node, ast.AST):
        if not hasattr(node, '_cached'):
            node._cached = '%s(%s)' % (node.__class__.__name__, ', '.join(
                digest(b) for a, b in ast.iter_fields(node)))
        return node._cached
    elif isinstance(node, list):
        return '[%s]' % ', '.join(digest(x) for x in node)
    return repr(node)

class Index(ast.NodeVisitor):
    '''
    A source code repository.
    '''
    def __init__(self, exclude):
        '''
        Create a new file indexer.
        '''
        self.nodes = collections.defaultdict(Clones)
        self.blacklist = frozenset(exclude)
    def add(self, file):
        '''
        Add a file to the index and parse it.
        '''
        source = open(file).readlines()
        tree = ast.parse(''.join(source))
        self._file = File(file, source)
        self.generic_visit(tree)
    def visit(self, node):
        '''
        Walk the Abstract Syntax Tree of a file.
        Convert each sub-tree to a string, which is used 
        as a key in the clones dictionnary.
        '''
        if hasattr(node, 'lineno'):
            if node.__class__.__name__ not in self.blacklist:
                expr = digest(node)
                self.nodes[expr].append(
                    Clone(node, self._file, Position()))
        self.generic_visit(node)
    def clones(self):
        '''
        Returns a list of duplicate constructs.
        '''
        return sorted(((expr, nodes) 
            for expr, nodes in self.nodes.items() if len(nodes)>1),
                key=lambda n: n[1].score(), reverse=True)        

if __name__ == '__main__':
    import argparse
    import itertools

    # Parse command-line arguments.
    args = argparse.ArgumentParser(description=__doc__)
    args.add_argument('files', 
        metavar='FILE', nargs='+', 
        help='set of Python files to check for duplicate code')
    args.add_argument('--ignore', '-i',
        metavar='NODE', nargs='+', default=['Name'], 
        help='skip some syntactic constructs (default: Name)')
    args.add_argument('--min', '-m',
        metavar='N', type=int, default=0, 
        help='report items duplicated at least N times')
    args.add_argument('--version', '-v', 
        action='version', version='%(prog)s ' + __version__)
    input = args.parse_args()

    # Process files.
    sources = Index(input.ignore)    
    for file in input.files:
        sources.add(file)

    # Report clones.
    for expr, clones in sources.clones():
        if len(clones) >= input.min:
            print("+%d repetitions of: %s ->" % 
                    (len(clones), expr))
            for file, group in itertools.groupby(clones, 
                    lambda clone: clone.file.name):
                print("  - in %s" % file)
                for clone in group:
                    begin, end, source = clone.source(' '*8)
                    if begin == end:
                        line = 'line %d' % begin
                    else:
                        line = 'lines %d to %d' % (begin, end)
                    print('      %s:\n%s' % (line, source))

Run it with some Python file(s) as argument(s):

./dry somefile.py […] | less

This will produce a list of all statements/expressions that appear more than once in the code, along with their position and source-code; ordered by criticality.

A command-line interface is provided which allows for basic customization of the results.

./dry --help

Although not beeing optimized much, the script finds all duplicates of my dist-packages in a few seconds. This was hacked in a couple of hours, as a proof of concept. It could be extended if deemed interesting.

1 comment

Achal Rastogi 11 years, 5 months ago  # | flag

there are problems with the script execution. please fix the errors

thanks