Here's a quick test to see whether a file or string contains text or binary data. The difference between text and binary is ill-defined, so this duplicates the definition used by Perl's -T flag, which is:

The first block or so of the file is examined for odd characters such as strange control codes or characters with the high bit set. If too many strange characters (>30%) are found, it's a -B file; otherwise it's a -T file. Also, any file containing a null byte in the first block is considered a binary file.
#!/usr/bin/env python
import string, sys
# Characters counted as ordinary text: printable ASCII (space through
# tilde) followed by newline, carriage return, tab and backspace.
_printable_ascii = [chr(code) for code in range(32, 127)]
text_characters = "".join(_printable_ascii + ["\n", "\r", "\t", "\b"])

# 256-character identity table (character i maps to itself).  Passing it
# to str.translate() leaves the kept characters unchanged, so only the
# "characters to delete" argument has any effect.
_null_trans = "".join([chr(code) for code in range(256)])
def istextfile(filename, blocksize = 512):
    """Return 1 if the file looks like text, 0 if it looks binary.

    Only the first `blocksize` bytes of `filename` are read; the
    classification itself is done by istext().

    The file is opened in binary mode so that the raw bytes are examined
    (text mode on Windows translates line endings and treats Ctrl-Z as
    end-of-file, which would distort the sample), and the handle is
    always closed, even if the read fails.

    Raises IOError if the file cannot be opened or read, e.g. when
    `filename` is a directory.
    """
    f = open(filename, "rb")
    try:
        return istext(f.read(blocksize))
    finally:
        f.close()
def istext(s):
    """Classify the string `s` as text (returns 1) or binary (returns 0).

    Rules, applied in order:
      * any NUL character makes the string binary;
      * an empty string is considered text;
      * otherwise the string is binary when more than 30% of its
        characters are outside `text_characters`.
    """
    if "\0" in s:
        return 0

    if not s:  # Empty files are considered text
        return 1

    # Get the non-text characters: the identity table maps every
    # character to itself, and the second argument deletes the text
    # characters, so only the non-text ones remain.
    t = s.translate(_null_trans, text_characters)

    # float() forces true division.  With plain int / int (the Python 2
    # default) the ratio truncates to 0 unless *every* character is
    # non-text, so the 30% threshold would never take effect.
    if float(len(t)) / len(s) > 0.30:
        return 0
    return 1
def main(argv):
    """Command-line driver: label every file in the given directories.

    `argv` is the full argument vector (program name first).  If any
    option is given (-h/--help) or option parsing fails, a usage message
    is written to stdout and the process exits with status 0.  With no
    directory arguments the current directory is scanned.

    For each directory entry one line "<binary|text> <path>" is written
    to stdout.  A directory that cannot be listed is reported on stderr
    and skipped; entries that raise IOError when read are skipped
    silently.
    """
    import getopt
    import os

    def emit(line):
        # Same bytes as a print statement: the text plus a newline.
        sys.stdout.write(line + "\n")

    try:
        options, dirnames = getopt.getopt(argv[1:], "h", ["help"])
        show_usage = bool(options)
    except getopt.error:
        show_usage = True

    if show_usage:
        emit("Usage: %s <directory> [<directory> ...]" % (argv[0],))
        emit(" Shows which files in a directory are text and which are binary")
        sys.exit(0)

    labels = {0: "binary", 1: "text"}

    for dirname in dirnames or ["."]:
        try:
            entries = os.listdir(dirname)
        except OSError:
            # Report the listing failure and move on to the next directory.
            sys.stderr.write("%s\n" % (sys.exc_info()[1],))
            continue

        for entry in entries:
            fullname = os.path.join(dirname, entry)
            try:
                label = labels[istextfile(fullname)]
                emit("%s %s" % (label, repr(fullname)[1:-1]))
            except IOError:  # eg, this is a directory
                pass
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main(argv=sys.argv)
|
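A bug makes this script unreliable. As posted, the threshold check at the end of istext() is:

    if len(t)/len(s) > 0.30:

This will always indicate that the string is text, unless len(t) == len(s), because it uses the default integer division (which truncates the ratio to 0 or 1), while the threshold is a float with 0 <= threshold <= 1. If you change that line to

    if float(len(t))/len(s) > 0.30:

or (if you prefer -- tidier, but less obvious):

    if len(t) > len(s) * 0.30:

the ratio is compared against the threshold correctly.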