Welcome, guest | Sign In | My Account | Store | Cart

This module provides 3 ways of detecting which objects have been allocated (methods 1 and 3) or have become unreclaimable (method 2) between 2 points in the code. It can be very useful for detecting memory leaks (e.g. cycles involving objects with a __del__ method).

Python, 346 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
# Author: David Decotigny 2008 Oct 3
#  @brief Routines to determine which new objects are reachable
#  between 2 points in the code

import gc, cPickle as pickle, weakref, sys, traceback


#
# Method 1: use weak ref to track new live objects
# Advantages: we have live pointers to the new live objects. And fast
# Drawbacks: doesn't track many types (such as list, dict, etc.) but
#            generally this is not a problem because: if they contain
#            sub-objects, these objects might most probably be track-able
#
class RefTracker(object):
    """
    Report the objects that appeared between two calls to scan().

    Every candidate object that accepts a weak reference is remembered
    under its id().  scan() applies the given callback to the objects
    that were not known at the previous call to scan() (or at
    construction, for the 1st call).  Objects that cannot be weakly
    referenced are skipped; the names of their types are collected in
    not_tracked_types.
    """

    def __init__(self):
        # str(type(obj)) of every object weakref.ref() refused so far
        self._not_tracked_types = set()
        # id(obj) -> weakref.ref(obj) for every object recorded so far
        self._current_refs = dict()
        # Record what already exists, so that the next scan() only
        # reports what was created after construction
        self.scan()

    def _get_objects(self):
        """Return the list of candidate objects to examine (hook for
        subclasses)"""
        return gc.get_objects()

    def _scan(self, callback_new_object):
        """
        Record the candidate objects not seen before and pass each of
        them to callback_new_object (when one is given).

        An exception raised by the callback is printed with
        traceback.print_exc() and the scan goes on.

        This is NOT MT-safe and will not work for most builtin types
        """
        # Strong references: nothing in this list can die (and have
        # its id reused) while we are scanning
        objs = self._get_objects()

        # First: forget the recorded objects that are not alive anymore
        dead_ids = [oid for oid, ref in self._current_refs.items()
                    if ref() is None]
        for oid in dead_ids:
            del self._current_refs[oid]

        # Then: find the objects that are brand new
        for obj in objs:
            known_ref = self._current_refs.get(id(obj))
            if known_ref is not None:
                # Already recorded last time. Dead entries were purged
                # above, so the recorded referent has to be this very
                # object. Compare by identity: '==' would run the
                # object's own __eq__, which may raise or be costly
                assert known_ref() is obj
                continue

            # This is a new object. Try to make a weak-ref out of it:
            try:
                new_ref = weakref.ref(obj)
            except TypeError:
                # Track only weak-ref-friendly objects, remember the
                # types of the objects we couldn't weak-reference
                self._not_tracked_types.add(str(type(obj)))
                continue
            self._current_refs[id(obj)] = new_ref

            # We also want to tell the caller that it's a new thing
            if callback_new_object:
                try:
                    callback_new_object(obj)
                except Exception:
                    # A faulty callback must not abort the scan, but
                    # KeyboardInterrupt/SystemExit are left alone
                    traceback.print_exc()

    def scan(self, callback_new_object=None):
        """Call the callback on each new object"""
        # Collect before, so that reclaimable garbage is not reported,
        # and after, in order to free the refs still held by _scan due
        # to the callback (approx explanation...)
        gc.collect()
        self._scan(callback_new_object)
        gc.collect()

    @property
    def not_tracked_types(self):
        """Return the set of type names (str(type(obj))) of the
        objects that could not be tracked"""
        return self._not_tracked_types

    @staticmethod
    def _print_new_obj(obj):
        """Callback used by scan_and_print_new_objs"""
        # Single-argument print(...): same output whether print is a
        # statement or a function
        print("New obj: %s" % (repr(obj),))

    def scan_and_print_new_objs(self, msg=None):
        """Print the repr() of each new object, between a header line
        showing msg (default: "New objects") and a footer line"""
        # The objects are printed one by one by the callback: no list
        # of new objects is kept alive here
        print("\n# -- %s:" % (msg or "New objects"))
        self.scan(self._print_new_obj)
        print("# ---------------\n")



#
# Method 2: Keep track of the garbage list
# Advantages: we have live pointers to the new live objects. And fast
# Drawbacks: will only show the objects /after/ the GC has tried to
#            reclaim them, not as soon as they have been
#            created. Still useful to debug leaks... But: are we sure
#            that lost objects are only found in cycles ??? Same
#            type restrictions as for method 1 ???
#
class GarbageTracker(RefTracker):
    """RefTracker whose candidate objects are the entries of
    gc.garbage instead of everything the collector tracks."""

    def _get_objects(self):
        """Return gc.garbage (the list itself, not a copy)"""
        uncollectable = gc.garbage
        return uncollectable


#
# Method 3: approximate method storing signatures of objects to a file
#           and comparing the signatures. The signature consists of a pair
#           object id / str(type(obj))
# Advantages: all object types can potentially be tracked. Can allow
#             basic offline analysis
# Drawbacks: might not see some new objects if they are at the same address
#            as previous ones having the same signature. Slow
#

# True until the first snapshot has been taken; make_gc_snapShot runs
# one gc.collect() before that first snapshot only
first_time = True


def make_gc_snapShot(filename, name):
    """Append the signatures to a file, giving them the given
    'name'. A signature is a pair object_id / type_name.

    The signatures of all the objects returned by gc.get_objects()
    are pickled as one (name, list of signatures) record appended to
    'filename'. The file is opened in binary mode, as pickle expects;
    read it back with read_snapshots()."""
    global first_time
    if first_time:
        gc.collect()
        first_time = False

    contents = []
    for o in gc.get_objects():
        try:
            tname = o.__class__.__name__
        except Exception:
            # Not only AttributeError: the attribute lookup is run by
            # the object itself (a dead weakref.proxy, for instance,
            # raises ReferenceError). One odd object must not abort
            # the whole snapshot
            tname = str(type(o))
        contents.append((id(o), tname))

    f = open(filename, 'ab')
    try:
        pickle.dump((name, contents), f)
    finally:
        # Close the file even when pickling fails
        f.close()


class GCSnapshot(object):
    """Used to read a set of signatures from the file"""

    def __init__(self, stream):
        """Load the next (name, signatures) record from 'stream', a
        file object opened for binary reading. Whatever pickle.load()
        raises (EOFError at the end of the stream) is propagated."""
        # NOTE: unpickling can execute arbitrary code; only load
        # snapshot files written by make_gc_snapShot
        self.name, contents = pickle.load(stream)
        self._contents = set(contents)

    def __sub__(self, other):
        """Give the differences between 2 sets of
        signatures. Return a set of pairs object_id /
        type_name"""
        return self._contents - other._contents

    def reach(self, ids):
        """
        ids: iterable of object id, as returned by x[0], with x in
        the result of (snapshot2 - snapshot1)

        Return a dict id -> object with that id currently known.

        The objects recorded with these id might have been
        replaced by new ones... so we might end-up seeing objects
        that don't correspond to the original ones. This is
        especially true after a gc.collect()
        """
        # Build the set once: constant-time lookups, and 'ids' may
        # then be a one-shot iterator
        wanted = set(ids)
        result = dict()
        for obj in gc.get_objects():
            if id(obj) in wanted:
                result[id(obj)] = obj
        return result


def read_snapshots(filename):
    """Sequentially reads the sets of signatures from a file. For
    each set of signatures, a GCSnapshot is created with the
    stored name. return the dict set name -> GCSnapshot object.

    Reading stops at the end of the file or at the first record that
    cannot be unpickled; the snapshots read so far are returned. When
    several records share a name, the last one wins."""
    result = dict()
    f = open(filename, 'rb')
    try:
        while True:
            try:
                snap = GCSnapshot(f)
            except (EOFError, pickle.UnpicklingError):
                break
            result[snap.name] = snap
    finally:
        # Close the file whatever happened while reading
        f.close()
    return result


#### BEGIN: ONLY FOR THE TESTS
class Dummy:
    """Test fixture that prints one line when it is constructed and
    one line when its __del__ method runs."""
    def __init__(self):
        print "INFO: ctor", self
    def __del__(self):
        print "INFO: dtor", self

# A pair of mutually-referencing objects with __del__ methods
# See http://docs.python.org/library/gc.html#gc.garbage
# for an explanation why they are not automatically reclaimable
class ObjectReferencer:
    """Test fixture that keeps a reference to the object it is given
    and prints a line on construction and in __del__."""
    def __init__( self, obj ):
        print "INFO: ctor", self
        # Strong reference to the given object, kept in self.reference
        self.reference = obj

    def __del__(self):
        print "INFO: dtor", self


class ReferencerCreator:
    """Test fixture that creates an ObjectReferencer pointing back to
    itself, so the two instances reference each other until
    break_cycle() is called."""
    def __init__( self ):
        print "INFO: ctor", self
        # self -> ObjectReferencer -> self: a reference cycle
        self.attribute = ObjectReferencer( self )

    def __del__(self):
        print "INFO: dtor", self

    def break_cycle(self):
        """Drop the reference to the ObjectReferencer."""
        # Necessary to break the cycle that prevents the GC from
        # doing its job
        print "INFO: break_cycle", self
        self.attribute = None


def _test1():
    """Tests for method 1 (RefTracker)"""
    print "*** Method 1 (RefTracker) ***"

    r = RefTracker()
    d = Dummy()
    print "del dummy now..."
    del d
    r.scan_and_print_new_objs("After creation/del of Dummy()")

    # Contains a cycle: will not be freed by GC...
    o = ReferencerCreator()
    print "del obj now..."
    del o
    r.scan_and_print_new_objs("After creation/del of ReferencerCreator")

    # The same, but we break the cycle
    o = ReferencerCreator()
    print "break_cycle now..."
    o.break_cycle()
    print "del obj now..."
    del o        
    r.scan_and_print_new_objs("After creation/break_cycle/del of ReferencerCreator")

    print "Types not tracked:"
    for typ in r.not_tracked_types:
        print "  %s" % typ

    print "End of test method 1."


def _test2():
    """Tests for method 2 (GarbageTracker)"""
    print "*** Method 2 (GarbageTracker) ***"

    r = GarbageTracker()
    d = Dummy()
    print "del dummy now..."
    del d
    r.scan_and_print_new_objs("After creation/del of Dummy()")

    # Contains a cycle: will not be freed by GC...
    o = ReferencerCreator()
    print "del obj now..."
    del o
    r.scan_and_print_new_objs("After creation/del of ReferencerCreator")

    # The same, but we break the cycle
    o = ReferencerCreator()
    print "break_cycle now..."
    o.break_cycle()
    print "del obj now..."
    del o        
    r.scan_and_print_new_objs("After creation/break_cycle/del of ReferencerCreator")

    print "Types not tracked:"
    for typ in r.not_tracked_types:
        print "  %s" % typ

    print "End of test method 2."


def _test3():
    """Tests for method 3 (compare signatures)"""
    import os
    
    print "*** Method 3 (compare signatures) ***"
    fname = "/tmp/gc-%s-snapshot" % os.environ["USER"]
    make_gc_snapShot(fname, "0")
    make_gc_snapShot(fname, "1")
    l = list()
    l.append(l)
    make_gc_snapShot(fname, "2")
    l.append(42)
    t = ReferencerCreator()
    make_gc_snapShot(fname, "3")

    # Now analyzing
    snaps = read_snapshots(fname)
    os.remove(fname)

    print "Between 2 and 1, diff is:"
    diff21 = snaps["2"] - snaps["1"]
    for d in diff21:
        print "  ", d

    print "Between 2 and 1, diff as live objects is:"
    for obj in snaps["3"].reach([d[0] for d in diff21]).itervalues():
        print "  ", obj

    print "Between 3 and 2, diff is:"
    diff32 = snaps["3"] - snaps["2"]
    for d in diff32:
        print "  ", d
        
    print "Between 3 and 2, diff as live objects is:"
    for obj in snaps["3"].reach([d[0] for d in diff32]).itervalues():
        print "  ", obj

    print "Between 3 and 1, diff is:"
    diff31 = snaps["3"] - snaps["1"]
    for d in diff31:
        print "  ", d
        
    print "Between 3 and 1, diff as live objects is:"
    for obj in snaps["3"].reach([d[0] for d in diff31]).itervalues():
        print "  ", obj

    print "End of test method 3."
#### END: ONLY FOR THE TESTS


if __name__ == "__main__":
    _test1()
    _test2()
    _test3()
    print "Bye."

The descriptions of the different methods are given in the code, with a summary of their advantages and drawbacks. The main difference between method 1 and method 2 is that method 1 reports new objects as soon as they have been created, whereas method 2 only reports new objects that could not be reclaimed (after the GC has tried to free them). A caveat: methods 1 and 2 will not report all types of new objects, only those supported by weakref. The type names of the unreported objects are available through the not_tracked_types property.

Also refer to http://docs.python.org/library/gc.html#gc.garbage for the reason that motivated this module.

Tested with python 2.4, 2.5 and 2.6.