
Files on damaged media (eg old CD-ROMs) are not necessarily completely lost. Often one reader will successfully read sectors that are unreadable in another. The "trick" is to merge the successful reads from several devices.

Python, 173 lines
#!/usr/local/bin/python -O
# read faulty
# for each of its arguments, it tries to copy the faulty file to the cwd

import sys, os, errno
import collections
import cPickle as Pickle
import gzip

# use the correct errno reported per platform
if sys.platform == 'win32':
    read_failed= lambda exc: exc.errno == errno.EACCES
else:
    read_failed= lambda exc: exc.errno == errno.EIO

class Chunk(object):
    "A description of a data chunk to be read"
    UNIT= 2048
    BIG_UNIT= 32*UNIT

    def __init__(self, offset, size):
        self.offset= offset
        self.size= size

    def next_attempt(self):
        "Return sequence of chunks to retry"
        if self.size == self.UNIT: # a single sector failed
            yield self # try again in the next phase
        else:
            for ix in xrange(self.offset, self.offset+self.size, self.UNIT):
                yield self.__class__(ix, self.UNIT)

    def __getstate__(self):
        return self.offset, self.size
    def __setstate__(self, tpl):
        self.offset, self.size= tpl

    def description(self):
        "Return textual description of chunk"
        unit1= self.offset // self.UNIT
        unit2= (self.offset+self.size) // self.UNIT - 1
        if unit1 == unit2:
            return "%dMiB:%d" % (self.offset//1048576, unit1)
        else:
            return "%dMiB:%d-%d" % (self.offset//1048576, unit1, unit2)

class SuspectFile(object):
    "A file to be copied"
    destination= "."

    def __init__(self, filename, destination=None):
        # phase 1 contains big chunks to be read
        # phase 2 contains sectors to re-read
        # phase 3 contains chunks to store as completely failed
        self.filename= filename
        if destination is not None:
            self.destination= destination
        self.state_filename= os.path.basename(filename) + ".state"
        self.phase3= collections.deque()
        try:
            self.read_last_attempt_state()
        except IOError: # state file does not exist
            self.phase1= self.chunks_to_read()
            self.phase2= collections.deque()

    def chunks_to_read(self):
        result= collections.deque()
        filesize= os.path.getsize(self.filename)
        for offset in xrange(0, filesize, Chunk.BIG_UNIT):
            result.append(Chunk(
                offset, min(Chunk.BIG_UNIT, filesize-offset)))
        return result

    def record_state(self):
        if self.phase1 or self.phase2 or self.phase3:
            fpr= gzip.open(self.state_filename, "wb")
            Pickle.dump(self.phase1, fpr, -1)
            dummy_deque= collections.deque()
            dummy_deque.extend(self.phase2)
            dummy_deque.extend(self.phase3)
            Pickle.dump(dummy_deque, fpr, -1)
            fpr.close()
        else:
            try: os.remove(self.state_filename)
            except OSError: pass # ignore non-existent filename

    @staticmethod
    def copy_chunk(fpi, fpo, chunk):
        fpi.seek(chunk.offset)
        data= fpi.read(chunk.size)
        if data:
            fpo.seek(chunk.offset)
            fpo.write(data)
        return data

    def read_last_attempt_state(self):
        fpr= gzip.open(self.state_filename, "rb")
        self.phase1= Pickle.load(fpr)
        self.phase2= Pickle.load(fpr)
        fpr.close()

    # the report_* methods are meant to be overridden
    def report_attempt(self, chunk):
        "This is to be overridden with a way to report progress"
        pass

    def report_success(self, chunk):
        pass

    def report_failure(self, chunk):
        pass

    def phase_copy(self, fpi, fpo, phase_in, phase_out):
        "Copy chunks from fpi to fpo storing failures in phase_out"
        while phase_in:
            chunk= phase_in.popleft()
            try: # to make sure this chunk is not skipped, eg by KeyboardInterrupt
                self.report_attempt(chunk)
                try:
                    self.copy_chunk(fpi, fpo, chunk)
                except IOError, exc:
                    if read_failed(exc): # a genuine read error on this platform
                        for new_chunk in chunk.next_attempt():
                            phase_out.append(new_chunk)
                        # clear chunk *before* reporting, so the finally clause
                        # below does not push the failed chunk back onto phase_in
                        failed_chunk, chunk= chunk, None
                        self.report_failure(failed_chunk)
                    else:
                        raise
                else:
                    # likewise clear chunk before reporting success
                    read_chunk, chunk= chunk, None
                    self.report_success(read_chunk)
            finally:
                if chunk: phase_in.appendleft(chunk)
                
    def copy(self):
        "Copy the file to the local directory"
        fpi= open(self.filename, "rb")
        fpo_filename= os.path.join(
            self.destination, os.path.basename(self.filename))
        try:
            fpo= open(fpo_filename, "r+b")
        except IOError, exc:
            if exc.errno == errno.ENOENT:
                fpo= open(fpo_filename, "wb")
            else:
                raise
        try:
            self.phase_copy(fpi, fpo, self.phase1, self.phase2)
            self.phase_copy(fpi, fpo, self.phase2, self.phase3)
        finally:
            fpo.close()
            fpi.close()
            self.record_state()

if __name__=="__main__":
    class SuspectFileCmd(SuspectFile):
        def report_attempt(self, chunk):
            sys.stderr.write(chunk.description())
        def report_success(self, chunk):
            sys.stderr.write("\r")
        def report_failure(self, chunk):
            sys.stderr.write(" failed\n")
        def record_state(self):
            super(SuspectFileCmd, self).record_state()
            sys.stderr.write("** remaining %d bytes in fast reads\n" % sum(chunk.size for chunk in self.phase1))
            sys.stderr.write("and %d sectors in re-reads\n" % (len(self.phase2) + len(self.phase3)) )

    for filename in sys.argv[1:]:
        faulty_file= SuspectFileCmd(filename)
        sys.stderr.write("copying %s\n" % filename)
        faulty_file.copy()
        sys.stderr.write("\n")

This recipe can be used to recover files from damaged media. CD-ROMs and DVDs are specifically addressed, since UNIT is set to 2048, the byte size of a sector on these media.
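
The 2048-byte unit only affects how reads are grouped and retried, so if you want to try the same approach on media with a different sector size, a minimal sketch (my assumption, not part of the recipe) is to adjust the Chunk class attributes before calling copy(); the 512-byte value below is just an example:

    # sketch: use 512-byte sectors instead of the CD/DVD default
    # BIG_UNIT must be recomputed, since it is derived from UNIT at class-definition time
    Chunk.UNIT= 512
    Chunk.BIG_UNIT= 32*Chunk.UNIT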

The recipe copies batches of sectors (typically 32 at a time) from the faulty file to the current directory; if a batch read fails, its sectors are re-read later one by one. If some sectors still can't be read, the program exits after storing a state file (named after the original file with ".state" appended) that lists the unread sectors. Put the disk in another device and run the script again: as long as both the partially copied file and its .state file are present in the current directory, only the unread sectors are retried, and you have a chance of recovering more data.
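
The .state file itself is just a gzipped pickle of two deques of Chunk objects (big chunks still to read, then single sectors to re-read), so if you are curious how much is left between runs, a small sketch like the following can peek at it. The "damaged.iso.state" filename is only an example, and the bare-bones Chunk stand-in is there because, when the recipe is run as a script, the pickle refers to the class as __main__.Chunk:

    # sketch for inspecting a .state file left behind by the recipe
    import gzip
    import cPickle as Pickle

    class Chunk(object):
        # minimal stand-in so the pickled __main__.Chunk instances can be loaded
        def __setstate__(self, tpl):
            self.offset, self.size= tpl

    fpr= gzip.open("damaged.iso.state", "rb")   # example filename
    big_chunks= Pickle.load(fpr)    # big chunks not yet read (phase 1)
    sectors= Pickle.load(fpr)       # single sectors to re-read (phases 2 and 3)
    fpr.close()

    print "bytes left in big chunks:", sum(chunk.size for chunk in big_chunks)
    print "single sectors left:", len(sectors)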

You can stop the program at any point with Control-C. Wait a moment for the current read to finish or fail; the program will then exit, storing the .state file first.

This code is actually a quick hack which I just prettied up a little bit to share it here.

1 comment

Christos Georgiou (author) 17 years, 10 months ago

Best used in places with lots of reader devices. ...like your office :)

Use a shared folder (eg nfs, windows, samba) and try from several workstations.

Recover your files 100% or your money back!