#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# pyminifier.py
#
# Copyright 2009 Dan McDougall <YouKnowWho@YouKnowWhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; Version 3 of the License
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, the license can be downloaded here:
#
# http://www.gnu.org/licenses/gpl.html
# Meta
# Module metadata. __version__ (string) and __version_info__ (tuple) describe
# the same release and must be changed together.
__version__ = '1.4.1'
__license__ = "GNU General Public License (GPL) Version 3"
__version_info__ = (1, 4, 1)
__author__ = 'Dan McDougall <YouKnowWho@YouKnowWhat.com>'
"""
**Python Minifier:** Reduces the size of (minifies) Python code for use on
embedded platforms.
Performs the following:
- Removes docstrings.
- Removes comments.
- Minimizes code indentation.
- Joins multiline pairs of parentheses, braces, and brackets (and removes extraneous whitespace within).
- Preserves shebangs and encoding info (e.g. "# -*- coding: utf-8 -*-").
Various examples and edge cases are sprinkled throughout the pyminifier code so
that it can be tested by minifying itself. The way to test is thus:
.. code-block:: bash
$ python pyminifier.py pyminifier.py > minified_pyminifier.py
$ python minified_pyminifier.py pyminifier.py > this_should_be_identical.py
$ diff minified_pyminifier.py this_should_be_identical.py
$
If you get an error executing minified_pyminifier.py or
'this_should_be_identical.py' isn't identical to minified_pyminifier.py then
something is broken.
"""
import sys, re, cStringIO, tokenize
from optparse import OptionParser
# Compile our regular expressions for speed.
# All patterns are raw strings so that regex escapes such as \s and \w are not
# (mis)interpreted as Python string escapes.
# Either triple-quote delimiter (''' or """):
multiline_quoted_string = re.compile(r'(\'\'\'|\"\"\")')
# A triple quote that merely appears inside an ordinary quoted string,
# e.g. "'''" or '"""':
not_quoted_string = re.compile(r'(\".*\'\'\'.*\"|\'.*\"\"\".*\')')
# Two consecutive newlines (used to collapse doubled line breaks):
trailing_newlines = re.compile(r'\n\n')
# A shebang line such as "#!/usr/bin/env python":
shebang = re.compile(r'^#!.*$')
# A source-encoding declaration; group 1 is the encoding name:
encoding = re.compile(r".*coding[:=]\s*([-\w.]+)")
# A backslash line continuation (optionally followed by a comment) plus the
# newline that ends it:
multiline_indicator = re.compile(r'\\(\s*#.*)?\n')
# The above also removes trailing comments: "test = 'blah \ # comment here"
# These aren't used but they're a pretty good reference:
double_quoted_string = re.compile(r'((?<!\\)".*?(?<!\\)")')
single_quoted_string = re.compile(r"((?<!\\)'.*?(?<!\\)')")
single_line_single_quoted_string = re.compile(r"((?<!\\)'''.*?(?<!\\)''')")
# Fixed: this used to be a verbatim copy of the single-quote pattern above.
single_line_double_quoted_string = re.compile(r'((?<!\\)""".*?(?<!\\)""")')
def remove_comments_and_docstrings(source):
    """
    Returns 'source' minus comments and docstrings.

    **Note**: Uses Python's built-in tokenize module to great effect.

    Example:

    .. code-block:: python

        def noop(): # This is a comment
            '''
            Does nothing.
            '''
            pass # Don't do anything

    Will become (modulo leftover whitespace, see below):

    .. code-block:: python

        def noop():
            pass

    'source' is a string of Python code and the result is a string.  The
    whitespace that preceded a removed token is kept, so removed docstrings
    leave whitespace-only lines and removed comments leave trailing spaces
    behind.  Any error raised by the tokenize module for malformed input is
    not caught here.

    A string is treated as a docstring (and dropped) when it is the first
    meaningful token of a statement, i.e. when the previous token -- ignoring
    comments and non-logical newlines -- is NEWLINE, INDENT or DEDENT, or when
    it is the very first token of 'source'.  Known limitation: a statement
    that merely *starts* with a string literal (e.g. ``"".join(x)``) loses
    that literal too.
    """
    io_obj = cStringIO.StringIO(source)
    pieces = []
    # Token types that can only appear at the start of a logical line:
    statement_starters = (tokenize.NEWLINE, tokenize.INDENT, tokenize.DEDENT)
    # Token types that never change whether we are at a statement start.
    # Note regarding NEWLINE vs NL: NEWLINE ends a logical line (statement);
    # NL is a line break that does not, such as one inside parens, brackets
    # or curly braces, or one that ends a blank or comment-only line.
    insignificant = (tokenize.COMMENT, tokenize.NL)
    # Pretending an INDENT came first makes a leading module docstring count
    # as being at the start of a statement.
    prev_significant = tokenize.INDENT
    last_lineno = -1
    last_col = 0
    for tok in tokenize.generate_tokens(io_obj.readline):
        token_type = tok[0]
        token_string = tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        # The following two conditionals preserve the original spacing.
        # This is necessary because we're not using tokenize.untokenize()
        # (because it spits out code with copious amounts of oddly-placed
        # whitespace).
        if start_line > last_lineno:
            last_col = 0
        if start_col > last_col:
            pieces.append(" " * (start_col - last_col))
        if token_type == tokenize.COMMENT:
            pass  # Remove comments
        elif token_type == tokenize.STRING:
            # Tracking the previous *significant* token (instead of looking
            # at the string's column) keeps strings that sit at column 0
            # inside an operator, e.g. the items of a multi-line list:
            #     test = [
            #     "this string is data, not a docstring"
            #     ]
            if prev_significant not in statement_starters:
                pieces.append(token_string)
        else:
            pieces.append(token_string)
        if token_type not in insignificant:
            prev_significant = token_type
        last_col = end_col
        last_lineno = end_line
    return "".join(pieces)
def reduce_operators(source):
    """
    Strips the single spaces that surround operators in 'source' and returns
    the resulting string.

    Example:

    .. code-block:: python

        def foo(foo, bar, blah):
            test = "This is a %s" % foo

    Will become:

    .. code-block:: python

        def foo(foo,bar,blah):
            test="This is a %s"%foo

    An operator that is the first token on its line (e.g. the @ of a decorator
    or an opening paren) is left alone so the line's indentation survives.
    """
    # An operator directly preceded by one of these token types begins a line:
    line_starters = (tokenize.NEWLINE, tokenize.DEDENT, tokenize.INDENT)
    reader = cStringIO.StringIO(source)
    finished_lines = []
    current = ""  # The physical line being rebuilt
    doomed_columns = []  # Original columns where a space may be deleted
    previous_type = tokenize.INDENT
    previous_row = -1
    previous_col = 0
    for token in tokenize.generate_tokens(reader.readline):
        kind, text = token[0], token[1]
        row, col = token[2]
        end_row, end_col = token[3]
        if row > previous_row:
            previous_col = 0
        gap = col - previous_col
        if gap > 0:
            # Re-create the whitespace the tokenizer skipped over
            current += " " * gap
        if kind == tokenize.OP and previous_type not in line_starters:
            # Candidate spaces: one before and one after the operator
            doomed_columns.extend((col, end_col + 1))
        current += text
        if text.endswith('\n'):
            # The line is complete; delete the candidate spaces.  'shift'
            # converts an original column into an index of 'current' and
            # grows with every deletion to keep later columns aligned.
            shift = 1
            for column in doomed_columns:
                index = column - shift
                try:
                    candidate = current[index]
                except IndexError:  # Past the end of the line, no biggie
                    continue
                if candidate == " ":  # Only ever delete a space
                    current = current[:index] + current[index + 1:]
                    shift += 1
            finished_lines.append(current)
            current = ""
            doomed_columns = []
        previous_type = kind
        previous_col = end_col
        previous_row = end_row
    # Capture the last line if it doesn't end in a newline:
    finished_lines.append(current)
    return "".join(finished_lines)
# NOTE: This isn't used anymore... Just here for reference in case someone
# searches the internet looking for a way to remove similarly-styled end-of-line
# comments from non-python code. It also acts as an edge case of sorts with
# that raw triple quoted string inside the "quoted_string" assignment.
def remove_comment(single_line):
    """
    Returns 'single_line' without its end-of-line comment (if it has one) and
    without trailing whitespace.  A '#' inside a single- or double-quoted
    string is not treated as the start of a comment.
    """
    quoted = re.compile(
        r'''((?<!\\)".*?(?<!\\)")|((?<!\\)'.*?(?<!\\)')'''
    )
    kept = []
    # Splitting on a pattern with capture groups also yields the quoted
    # strings themselves (plus empty/None entries, which are skipped).
    for chunk in quoted.split(single_line):
        if not chunk:
            continue
        if chunk.startswith(("'", '"')):
            # A quoted string; leave it alone
            kept.append(chunk)
            continue
        code, hash_sign, _ = chunk.partition('#')
        kept.append(code)
        if hash_sign:
            # A '#' outside of quotes: everything after it is comment
            break
    return "".join(kept).rstrip()
def join_multiline_pairs(text, pair="()"):
    """
    Finds and removes newlines in multiline matching pairs of characters in
    'text', i.e. an opener and its closer that sit on different lines.
    By default it joins parens () but it will join any two characters given via
    the 'pair' variable (first character opens, second character closes).

    **Note:** Doesn't remove extraneous whitespace that ends up between the pair.
    Use reduce_operators() for that.

    Example:

    .. code-block:: python

        test = (
            "This is inside a multi-line pair of parentheses"
        )

    Will become:

    .. code-block:: python

        test = ( "This is inside a multi-line pair of parentheses" )

    Lines that belong to a multi-line triple-quoted string are copied
    verbatim; so is any line on which such a string opens or closes.  Runs of
    spaces inside an open pair (and outside of quotes) are collapsed to one
    space.  Returns the processed text with doubled newlines collapsed.
    """
    # Readability variables
    opener = pair[0]
    closer = pair[1]
    # Tracking variables
    inside_pair = False
    open_quote = None  # The ' or " that started the current string, if any
    string_delimiter = None  # Triple quote of the open multi-line string
    openers = 0
    closers = 0
    output = ""
    for line in text.split('\n'):
        escaped = False
        # First we rule out multi-line strings
        if string_delimiter is not None:
            # Inside a multi-line string; it is closed by an odd number of
            # occurrences of the delimiter that opened it.
            output += line + '\n'
            if line.count(string_delimiter) % 2:
                string_delimiter = None
            continue
        triple_match = multiline_quoted_string.search(line)
        if triple_match and not not_quoted_string.search(line):
            output += line + '\n'
            # An even count means every triple-quoted string on this line is
            # also closed on it; an odd count leaves one open.
            delimiter = triple_match.group(1)
            if line.count(delimiter) % 2:
                string_delimiter = delimiter
            continue
        # Now let's focus on the lines containing our opener and/or closer:
        if not (opener in line or closer in line or inside_pair):
            output += line + '\n'
            continue
        for character in line:
            if character == opener:
                if not escaped and not open_quote:
                    openers += 1
                    inside_pair = True
                escaped = False
                output += character
            elif character == closer:
                if not escaped and not open_quote:
                    if openers and openers == (closers + 1):
                        # This closer balances the outermost opener
                        closers = 0
                        openers = 0
                        inside_pair = False
                    else:
                        closers += 1
                escaped = False
                output += character
            elif character == '\\':
                # A backslash escapes the next character unless it is itself
                # escaped
                escaped = not escaped
                output += character
            elif character in ('"', "'"):
                if escaped:
                    escaped = False
                elif open_quote is None:
                    open_quote = character
                elif character == open_quote:
                    open_quote = None
                # Otherwise it's the other kind of quote inside a string
                output += character
            elif character == ' ' and inside_pair and not open_quote:
                # Collapse runs of spaces (and drop those right after the
                # opener)
                if output[-1] not in (' ', opener):
                    output += ' '
            else:
                escaped = False
                output += character
        if not inside_pair:
            output += '\n'
    # Clean up
    output = trailing_newlines.sub('\n', output)
    return output
def dedent(source):
    """
    Minimizes indentation to save precious bytes: every indentation level is
    reduced to a single space and every run of whitespace between two tokens
    on the same line is reduced to one space.

    Example:

    .. code-block:: python

        def foo(bar):
            test = "This is a test"

    Will become:

    .. code-block:: python

        def foo(bar):
         test = "This is a test"

    'source' is a string of Python code and the result is a string.  Any
    error raised by the tokenize module for malformed input is not caught.
    """
    io_obj = cStringIO.StringIO(source)
    pieces = []
    prev_end_line = 0  # Line on which the previously emitted token ended
    last_col = 0
    indentation_level = 0
    for tok in tokenize.generate_tokens(io_obj.readline):
        token_type = tok[0]
        token_string = tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        # INDENT/DEDENT only adjust the level; the indentation itself is
        # re-created in front of the next real token.
        if token_type == tokenize.INDENT:
            indentation_level += 1
            continue
        if token_type == tokenize.DEDENT:
            indentation_level -= 1
            continue
        if start_line > prev_end_line:
            # First token on a new line.  Comparing against the line where
            # the previous token *ended* keeps a token that follows the
            # closing quotes of a multi-line string from being indented.
            pieces.append(" " * indentation_level + token_string)
        elif start_col > last_col:
            pieces.append(" " + token_string)
        else:
            pieces.append(token_string)
        prev_end_line = end_line
        last_col = end_col
    return "".join(pieces)
def fix_empty_methods(source):
    """
    Appends 'pass' to empty methods/functions (i.e. where there was nothing but
    a docstring before we removed it =).

    Example:

    .. code-block:: python

        # Note: This triple-single-quote inside a triple-double-quote is also a
        # pyminifier self-test
        def myfunc():
            '''This is just a placeholder function.'''

    Will become:

    .. code-block:: python

        def myfunc(): pass

    A 'def' line counts as empty when the next non-blank line is indented no
    deeper than the 'def' itself, or when no non-blank line follows at all.
    Only headers that fit on one line and have nothing after the colon are
    considered, so ``def f(): return 1`` is left untouched.

    Every line of the result is terminated with a newline and
    whitespace-only lines come back as empty lines.
    """
    header = re.compile(
        r'^\s*(?:async\s+)?def\s+\w+\s*\(.*\)\s*(?:->.*)?:\s*$')
    fixed = []
    # (index into 'fixed', indentation) of a def still waiting for its body:
    pending = None
    for line in source.split('\n'):
        if not line.strip():  # Don't look at blank lines
            fixed.append("")
            continue
        indentation = len(line) - len(line.lstrip())
        if pending is not None:
            index, def_indentation = pending
            if indentation <= def_indentation:
                # No body followed the def; insert a 'pass' statement
                fixed[index] = fixed[index].rstrip() + " pass"
            pending = None
        if header.match(line):
            pending = (len(fixed), indentation)
        fixed.append(line)
    if pending is not None:
        # The input ended right after a def line
        index = pending[0]
        fixed[index] = fixed[index].rstrip() + " pass"
    return "".join(entry + "\n" for entry in fixed)
def remove_blank_lines(source):
    """
    Returns 'source' without its blank (empty or whitespace-only) lines.

    Example:

    .. code-block:: python

        test = "foo"

        test2 = "bar"

    Will become:

    .. code-block:: python

        test = "foo"
        test2 = "bar"
    """
    kept = []
    for text_line in cStringIO.StringIO(source).readlines():
        if text_line.strip():
            kept.append(text_line)
    return "".join(kept)
def minify(source):
    """
    Removes all docstrings, comments and blank lines from 'source', joins
    multi-line pairs, strips the spaces around operators and minimizes code
    indentation, then returns the result as a string (nothing is printed).

    A shebang on the first line and an encoding declaration on the first or
    second line are preserved verbatim at the top of the result.
    """
    preserved_shebang = None
    preserved_encoding = None
    # This is for things like shebangs that must be precisely preserved
    for index, line in enumerate(source.split('\n')[0:2]):
        # Save the first line if it is a shebang
        # (e.g. '#!/usr/bin/env python') <--also a self test!
        if index == 0 and shebang.match(line):  # Must be first line
            preserved_shebang = line
            continue
        # Save the encoding string (must be first or second line in file).
        # It has to be a comment; otherwise code that merely contains
        # something like 'encoding="utf-8"' would be duplicated at the top.
        if (preserved_encoding is None and line.lstrip().startswith('#')
                and encoding.match(line)):
            preserved_encoding = line
    # Remove multilines (e.g. lines that end with '\' followed by a newline)
    source = multiline_indicator.sub('', source)
    # Remove docstrings (Note: Must run before fix_empty_methods())
    source = remove_comments_and_docstrings(source)
    # Give functions/methods that are now empty a 'pass' statement
    source = fix_empty_methods(source)
    # Join multiline pairs of parens, brackets, and braces
    source = join_multiline_pairs(source)
    source = join_multiline_pairs(source, '[]')
    source = join_multiline_pairs(source, '{}')
    # Remove whitespace between operators:
    source = reduce_operators(source)
    # Minimize indentation
    source = dedent(source)
    # Re-add preserved items (they were stripped along with the comments)
    if preserved_encoding:
        source = preserved_encoding + "\n" + source
    if preserved_shebang:
        source = preserved_shebang + "\n" + source
    # Remove blank lines
    source = remove_blank_lines(source).rstrip('\n')  # Stubborn last newline
    return source
def bz2_pack(source):
    """
    Returns 'source' as a bzip2-compressed, self-extracting python script.

    The script imports bz2 and base64, then decodes, decompresses and
    executes the embedded payload.  It uses the call form of exec, which is
    valid on both Python 2 and Python 3.  Text that is not already a byte
    string is encoded as UTF-8 before it is compressed.
    """
    import bz2
    import base64
    if not isinstance(source, bytes):
        source = source.encode('utf-8')
    payload = base64.b64encode(bz2.compress(source))
    if not isinstance(payload, str):
        # Python 3 hands back bytes; base64 output is always plain ASCII
        payload = payload.decode('ascii')
    out = 'import bz2, base64\n'
    out += "exec(bz2.decompress(base64.b64decode('"
    out += payload
    out += "')))\n"
    return out
def gz_pack(source):
    """
    Returns 'source' as a gzip-compressed, self-extracting python script.

    The payload is a zlib stream (zlib.compress), not a gzip file.  The
    script imports zlib and base64, then decodes, decompresses and executes
    the embedded payload.  It uses the call form of exec, which is valid on
    both Python 2 and Python 3.  Text that is not already a byte string is
    encoded as UTF-8 before it is compressed.
    """
    import zlib
    import base64
    if not isinstance(source, bytes):
        source = source.encode('utf-8')
    payload = base64.b64encode(zlib.compress(source))
    if not isinstance(payload, str):
        # Python 3 hands back bytes; base64 output is always plain ASCII
        payload = payload.decode('ascii')
    out = 'import zlib, base64\n'
    out += "exec(zlib.decompress(base64.b64decode('"
    out += payload
    out += "')))\n"
    return out
# The test.+() functions below are for testing pyminifier...
def test_decorator(f):
    """
    Decorator that does nothing: it returns 'f' unchanged.

    Self-test fixture; it is applied to test_class.foo so that minifying this
    file exercises an indented decorator line.
    """
    return f
def test_reduce_operators():
    """
    Test the case where an operator such as an open paren starts a line.

    Self-test fixture: the body exists to be minified, not to compute
    anything (it returns None).
    """
    (a, b) = 1, 2 # The indentation level should be preserved
    pass
def test_empty_functions():
    """
    This is a test method.
    This should be replaced with 'def empty_method: pass'

    Self-test fixture: the body is deliberately nothing but this docstring,
    so removing docstrings leaves an empty function that must get a 'pass'.
    """
class test_class(object):
    "Testing indented decorators"
    # Self-test fixture: the decorator line below is indented, so its '@' is
    # an operator that starts a line inside a class body.
    @test_decorator
    def foo(self):
        pass
def test_function():
    """
    This function encapsulates the edge cases to prevent them from invading the
    global namespace.

    Self-test fixture: the statements below are input for the minifier.
    NOTE(review): imaginary_function is not defined anywhere in the visible
    source, so actually calling test_function() would presumably raise a
    NameError -- confirm before invoking it.
    """
    # A '#' inside a string literal next to real trailing comments:
    foo = ("The # character in this string should " # This comment
           "not result in a syntax error") # ...and this one should go away
    # Multi-line bracket and brace pairs:
    test_multi_line_list = [
        'item1',
        'item2',
        'item3'
    ]
    test_multi_line_dict = {
        'item1': 1,
        'item2': 2,
        'item3': 3
    }
    # It may seem strange but the code below tests our docstring removal code.
    test_string_inside_operators = imaginary_function(
        "This string was indented but the tokenizer won't see it that way."
    ) # To understand how this could mess up docstring removal code see the
    # remove_comments_and_docstrings() function starting at this line:
    # "elif token_type == tokenize.STRING:"
    # This tests remove_extraneous_spaces():
    this_line_has_leading_indentation = '''<--That extraneous space should be
removed''' # But not these spaces
def main():
    """
    Command-line entry point: minifies the file named by the first positional
    argument and writes the result to --outfile, or prints it to stdout when
    no output file was given.

    With --bzip2 or --gzip the minified code is additionally wrapped in a
    self-extracting script (--bzip2 wins if both are given).  Exits with
    status 2, after printing the usage help, if no input file was given or
    it cannot be read.
    """
    usage = '%prog [options] "<input file>"'
    parser = OptionParser(usage=usage, version=__version__)
    parser.disable_interspersed_args()
    parser.add_option(
        "-o", "--outfile",
        dest="outfile",
        default=None,
        help="Save output to the given file.",
        metavar="<file path>"
    )
    parser.add_option(
        "--bzip2",
        action="store_true",
        dest="bzip2",
        default=False,
        help="bzip2-compress the result into a self-executing python script."
    )
    parser.add_option(
        "--gzip",
        action="store_true",
        dest="gzip",
        default=False,
        help="gzip-compress the result into a self-executing python script."
    )
    options, args = parser.parse_args()
    if not args:
        # No input file given; show the usage instead of an IndexError message
        parser.print_help()
        sys.exit(2)
    try:
        with open(args[0]) as infile:
            source = infile.read()
    except Exception as e:
        # Top-level boundary: report whatever went wrong and bail out
        print(e)
        parser.print_help()
        sys.exit(2)
    # Minify our input script
    result = minify(source)
    # Compress it if we were asked to do so
    if options.bzip2:
        result = bz2_pack(result)
    elif options.gzip:
        result = gz_pack(result)
    # Either save the result to the output file or print it to stdout
    if options.outfile:
        with open(options.outfile, 'w') as outfile:
            outfile.write(result)
    else:
        print(result)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Diff to Previous Revision
--- revision 15 2010-09-18 19:04:30
+++ revision 16 2014-05-25 16:23:55
@@ -20,9 +20,9 @@
# http://www.gnu.org/licenses/gpl.html
# Meta
-__version__ = '1.4'
+__version__ = '1.4.1'
__license__ = "GNU General Public License (GPL) Version 3"
-__version_info__ = (1, 4)
+__version_info__ = (1, 4, 1)
__author__ = 'Dan McDougall <YouKnowWho@YouKnowWhat.com>'
"""
@@ -182,8 +182,14 @@
if start_col > last_col:
out_line += (" " * (start_col - last_col))
if token_type == tokenize.OP:
- remove_columns.append(start_col) # Before OP
- remove_columns.append(end_col+1) # After OP
+ # Operators that begin a line such as @ or open parens should be
+ # left alone
+ start_of_line_types = [ # These indicate we're starting a new line
+ tokenize.NEWLINE, tokenize.DEDENT, tokenize.INDENT]
+ if prev_toktype not in start_of_line_types:
+ # This is just a regular operator; remove spaces
+ remove_columns.append(start_col) # Before OP
+ remove_columns.append(end_col+1) # After OP
if token_string.endswith('\n'):
out_line += token_string
if remove_columns:
@@ -210,6 +216,7 @@
last_lineno = end_line
# This makes sure to capture the last line if it doesn't end in a newline:
out += out_line
+ # The tokenize module doesn't recognize @ sign before a decorator
return out
# NOTE: This isn't used anymore... Just here for reference in case someone
@@ -574,11 +581,27 @@
return out
# The test.+() functions below are for testing pyminifer...
+def test_decorator(f):
+ """Decorator that does nothing"""
+ return f
+
+def test_reduce_operators():
+ """Test the case where an operator such as an open paren starts a line"""
+ (a, b) = 1, 2 # The indentation level should be preserved
+ pass
+
def test_empty_functions():
"""
This is a test method.
This should be replaced with 'def empty_method: pass'
"""
+
+class test_class(object):
+ "Testing indented decorators"
+
+ @test_decorator
+ def foo(self):
+ pass
def test_function():
"""