Welcome, guest | Sign In | My Account | Store | Cart

This is my version of a "File Tail" class for Python 3 (it will not work on Python 2 without a couple of modifications). My original inspiration came from the Perl File::Tail module.

Transparently handles files that get rotated or truncated.

  • Does not take 100% CPU.
  • Does not take up much memory.
  • Is capable of handling any size log file.
  • Not tested on Windows

Example:

from filetail import FileTail
tail = FileTail("/var/log/syslog")
for line in tail:
    print(line, end="")
Python, 266 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
""" $Id: filetail.py 1512 2011-05-20 16:14:29Z morrissj $

Python3 module for tailing a file such as a system log that grows continuously.
Transparently handles files that get rotated or truncated.
Inspired by the Perl File::Tail module.

A simple algorithm is used to dynamically sleep when no new data is available in
the file. The longer the amount of time goes by w/o new data the longer the
sleep interval will be (up to "max_interval") and starts at "interval".

Example:
    from filetail import FileTail
    tail = FileTail("/var/log/syslog")
    for line in tail:
        # do something
        pass

"""

import os
import sys
from stat import *
from math import floor
from time import sleep, time

class FileTail(object):
    """
    Follow a text file as it grows, yielding one line at a time.

    Inspiration came from the Perl module File::Tail.

    The object is an endless iterator: when no new data is available it
    sleeps (see wait()) instead of raising StopIteration. After "max_wait"
    seconds without data the path is reopened so that a rotated file is
    picked up from its beginning.

    Parameters:
        file          filename to monitor
        start_pos     where to initially start reading from:
                        "end" / "tail"   - end of the file (default)
                        "start" / "head" - beginning of the file
                        None             - no seek (beginning of the file)
                        int >= 0         - absolute offset from the start
                        int < 0          - that many units back from the end
                      NOTE(review): numeric offsets are handed to a text-mode
                      seek(); they are treated as byte offsets here - an
                      offset that lands inside a multi-byte character may
                      fail to decode.
        interval      sleep time used while no data has ever been read;
                      capped at max_interval
        max_interval  upper bound for a single sleep
        max_wait      seconds without data before the file is reopened
        reopen_check  how reopen() decides whether the path now refers to a
                      different file: "inode" or "time" (original author's
                      note: inode does not work on win32). In "inode" mode a
                      file truncated in place keeps its inode and is
                      therefore treated as unchanged.
        encoding      text encoding used to open the file

    Raises whatever the builtin open() raises if the file cannot be opened;
    the caller should intercept it.

    The file can be released with close() or by using the object as a
    context manager.
    """

    def __init__(self,
                 file,                  # filename to monitor
                 start_pos="end",       # where to initially start reading from
                 interval=0.1,          # sleep time to wait if no data is present (dynamically changes)
                 max_interval=5,        # max sleep time
                 max_wait=60,           # max time to wait with no data before reopening file
                 reopen_check="inode",  # how to check if file is different (inode or time) - inode does not work on win32
                 encoding="utf-8"       # file encoding
                ):
        self.start_pos = start_pos
        self.reopen_check = reopen_check
        self.max_wait = max_wait
        self.max_interval = max_interval
        # the initial sleep interval may never exceed the maximum
        self.interval = min(interval, max_interval)
        self.encoding = encoding

        # internal state; set before open() so that wait() and close() are
        # safe to call no matter how the object ends up being used
        self.fh = None
        self._buffer = []       # unused; kept for backward compatibility
        self.last_time = time()
        self.last_count = 0     # 0 until the first line has been read
        self.wait_count = 0

        # will throw exception if it fails... caller should intercept
        self.open(file, start_pos=start_pos)

    def open(self, file, start_pos="head"):
        """
        Open the file to tail and initialize our state.

        Sets self.fh, self.pos, self.stat and self.file. "start_pos" accepts
        the same values as the constructor argument. A position beyond the
        end of the file is pulled back to the end of the file.

        Any previously stored handle is NOT closed here, because reopen()
        still needs it in order to compare old and new files.
        """
        fh = open(file, "r", encoding=self.encoding)
        try:
            # seek to the initial position in the file we want to start reading
            if start_pos == "end" or start_pos == "tail":
                fh.seek(0, os.SEEK_END)
            elif start_pos == "start" or start_pos == "head" or start_pos is None:
                pass    # a freshly opened file is already at the beginning
            elif start_pos >= 0:
                fh.seek(start_pos, os.SEEK_SET)
            else:
                # Text-mode files reject non-zero end-relative seeks, so turn
                # "N back from the end" into an absolute offset (never < 0).
                size = os.fstat(fh.fileno()).st_size
                fh.seek(max(size + start_pos, 0), os.SEEK_SET)

            # if we passed the end of the file rewind to the actual end.
            pos = fh.tell()
            info = os.fstat(fh.fileno())
            if pos > info.st_size:
                fh.seek(0, os.SEEK_END)
                pos = fh.tell()
        except Exception:
            # do not leak the descriptor when positioning fails
            fh.close()
            raise

        self.fh = fh
        self.pos = pos
        self.stat = info
        self.file = file

    def close(self):
        """Close the underlying file, if it is open. Safe to call twice."""
        fh = getattr(self, "fh", None)
        if fh is not None and not fh.closed:
            fh.close()

    def __enter__(self):
        """Context manager entry; returns the FileTail itself."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Context manager exit; closes the file, never swallows errors."""
        self.close()

    def reopen(self):
        """
        Attempt to reopen the current file. If it doesn't appear to have
        changed (been rotated) then the current file handle is not changed.

        Returns True when reading now continues from the beginning of a
        newly opened file, False when the existing handle is kept (file
        unchanged, or the path could not be opened).
        """
        # if we don't have an opened file already then try to open it now
        if not self.fh or self.fh.closed:
            try:
                self.open(self.file, start_pos="head")
            except IOError:
                return False
            return True

        # remember the current state; open() overwrites all three
        old_fh, old_pos, old_stat = self.fh, self.pos, self.stat

        try:
            self.open(self.file, "head")
        except IOError:
            # the path is (temporarily) gone; keep reading the old handle
            return False

        new_stat = self.stat
        same_inode = (self.reopen_check == 'inode'
                      and new_stat.st_ino == old_stat.st_ino)
        not_newer = (self.reopen_check == 'time'
                     and new_stat.st_mtime <= floor(self.last_time)
                     and new_stat.st_size == old_pos)

        if same_inode or not_newer:
            # file appears to be the same or older than our last read:
            # discard the probe handle and restore the previous state
            self.fh.close()
            self.fh, self.pos, self.stat = old_fh, old_pos, old_stat
            return False

        # a different file now lives at this path; the old one is finished
        old_fh.close()
        return True

    def __iter__(self):
        """
        Return iterator to support:
            for line in filetail:
                print(line, end="")
        """
        self.wait_count = 0
        return self

    def __next__(self):
        """Iterator "next" call."""
        return self.next()

    def next(self):
        """
        Return the next line from the file, blocking until one is available.

        The line is returned exactly as readline() produced it. Never raises
        StopIteration; at end of file it calls wait() and tries again.
        """
        self.wait_count = 0

        # readline() is used on purpose: per the original author's notes the
        # readlines()-based variant kept tell() from working, and wait()
        # relies on tell().
        while True:
            line = self.fh.readline()
            if line:
                # track the time we received new data and how much
                self.last_time = time()
                self.last_count = 1
                return line
            self.wait()

    def wait(self):
        """
        Sleep because no data was available, or switch to a rotated file.

        The sleep grows with the time elapsed since data was last seen, but
        is never more than max_interval. Until a first line has been read
        the fixed "interval" is used instead. Once max_wait seconds have
        passed without data the file is reopened; if that yields a new file
        the method returns immediately without sleeping.
        """
        if self.wait_count == 0:
            # first wait since the last line: record where we stopped
            self.pos = self.fh.tell()
            self.stat = os.fstat(self.fh.fileno())

        self.wait_count += 1
        elapsed = time() - self.last_time

        # if we've waited long enough try to reopen the file, if that returns
        # true then we're done here and we do not sleep.
        if elapsed >= self.max_wait:
            self.last_time = time()
            if self.reopen():
                return

        # determine delay value. Delay is longer based on total time passed
        # note: currently last_count is always 1 once data has been read.
        delay = elapsed if self.last_count else self.interval

        # don't delay too long; also never pass a negative value to sleep()
        # (time() can step backwards when the system clock is adjusted)
        delay = max(min(delay, self.max_interval), 0)
        sleep(delay)
    
# end of FileTail class


def main():
    """Script entry point. No self-test exists yet, so only report that."""
    notice = "No tests implemented."
    print(notice)


# Run main() only when executed as a script; its return value (None) becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

I created this class for a project of mine that has to tail a file that grows by 2000-4000 lines per second (firewall logs). My original daemon was written in Perl, but I have decided to move over to Python 3 for various reasons.

I could not find an adequate "File Tail" module for Python that worked, so here is mine.

2 comments

James Mills 12 years, 11 months ago  # | flag

Hi,

I'd be interested to see if circuits (1) and the tail (2) example would suit your needs and perform well enough in your application?

If you care to try, poke me on FreeNode.

cheers

James Mills / prologic

  1. http://circuitsframework.com/
  2. https://bitbucket.org/prologic/circuits/src/49131fd3c2cc/examples/tail.py
Kai Xia 12 years, 11 months ago  # | flag

IMHO circuit is too heavy for this...