#!/usr/bin/env python # -*- coding: utf-8 -*- # # pyminifier.py # # Copyright 2009 Dan McDougall # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; Version 3 of the License # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, the license can be downloaded here: # # http://www.gnu.org/licenses/gpl.html # Meta __version__ = '1.4.1' __license__ = "GNU General Public License (GPL) Version 3" __version_info__ = (1, 4, 1) __author__ = 'Dan McDougall ' """ **Python Minifier:** Reduces the size of (minifies) Python code for use on embedded platforms. Performs the following: - Removes docstrings. - Removes comments. - Minimizes code indentation. - Joins multiline pairs of parentheses, braces, and brackets (and removes extraneous whitespace within). - Preserves shebangs and encoding info (e.g. "# -- coding: utf-8 --"). Various examples and edge cases are sprinkled throughout the pyminifier code so that it can be tested by minifying itself. The way to test is thus: .. code-block:: bash $ python pyminifier.py pyminifier.py > minified_pyminifier.py $ python minified_pyminifier.py pyminifier.py > this_should_be_identical.py $ diff minified_pyminifier.py this_should_be_identical.py $ If you get an error executing minified_pyminifier.py or 'this_should_be_identical.py' isn't identical to minified_pyminifier.py then something is broken. """ import sys, re, cStringIO, tokenize from optparse import OptionParser # Compile our regular expressions for speed multiline_quoted_string = re.compile(r'(\'\'\'|\"\"\")') not_quoted_string = re.compile(r'(\".*\'\'\'.*\"|\'.*\"\"\".*\')') trailing_newlines = re.compile(r'\n\n') shebang = re.compile('^#\!.*$') encoding = re.compile(".*coding[:=]\s*([-\w.]+)") multiline_indicator = re.compile('\\\\(\s*#.*)?\n') # The above also removes trailing comments: "test = 'blah \ # comment here" # These aren't used but they're a pretty good reference: double_quoted_string = re.compile(r'((? last_lineno: last_col = 0 if start_col > last_col: out += (" " * (start_col - last_col)) # Remove comments: if token_type == tokenize.COMMENT: pass # This series of conditionals removes docstrings: elif token_type == tokenize.STRING: if prev_toktype != tokenize.INDENT: # This is likely a docstring; double-check we're not inside an operator: if prev_toktype != tokenize.NEWLINE: # Note regarding NEWLINE vs NL: The tokenize module # differentiates between newlines that start a new statement # and newlines inside of operators such as parens, brackes, # and curly braces. Newlines inside of operators are # NEWLINE and newlines that start new code are NL. # Catch whole-module docstrings: if start_col > 0: # Unlabelled indentation means we're inside an operator out += token_string # Note regarding the INDENT token: The tokenize module does # not label indentation inside of an operator (parens, # brackets, and curly braces) as actual indentation. # For example: # def foo(): # "The spaces before this docstring are tokenize.INDENT" # test = [ # "The spaces before this string do not get a token" # ] else: out += token_string prev_toktype = token_type last_col = end_col last_lineno = end_line return out def reduce_operators(source): """ Remove spaces between operators in 'source' and returns the result. Example: .. code-block:: python def foo(foo, bar, blah): test = "This is a %s" % foo Will become: .. code-block:: python def foo(foo,bar,blah): test="This is a %s"%foo """ io_obj = cStringIO.StringIO(source) remove_columns = [] out = "" out_line = "" prev_toktype = tokenize.INDENT prev_tok = None last_lineno = -1 last_col = 0 lshift = 1 for tok in tokenize.generate_tokens(io_obj.readline): token_type = tok[0] token_string = tok[1] start_line, start_col = tok[2] end_line, end_col = tok[3] ltext = tok[4] if start_line > last_lineno: last_col = 0 if start_col > last_col: out_line += (" " * (start_col - last_col)) if token_type == tokenize.OP: # Operators that begin a line such as @ or open parens should be # left alone start_of_line_types = [ # These indicate we're starting a new line tokenize.NEWLINE, tokenize.DEDENT, tokenize.INDENT] if prev_toktype not in start_of_line_types: # This is just a regular operator; remove spaces remove_columns.append(start_col) # Before OP remove_columns.append(end_col+1) # After OP if token_string.endswith('\n'): out_line += token_string if remove_columns: for col in remove_columns: col = col - lshift try: # This was really handy for debugging (looks nice, worth saving): #print out_line + (" " * col) + "^" # The above points to the character we're looking at if out_line[col] == " ": # Only if it is a space out_line = out_line[:col] + out_line[col+1:] lshift += 1 # To re-align future changes on this line except IndexError: # Reached and end of line, no biggie pass out += out_line remove_columns = [] out_line = "" lshift = 1 else: out_line += token_string prev_toktype = token_type prev_token = tok last_col = end_col last_lineno = end_line # This makes sure to capture the last line if it doesn't end in a newline: out += out_line # The tokenize module doesn't recognize @ sign before a decorator return out # NOTE: This isn't used anymore... Just here for reference in case someone # searches the internet looking for a way to remove similarly-styled end-of-line # comments from non-python code. It also acts as an edge case of sorts with # that raw triple quoted string inside the "quoted_string" assignment. def remove_comment(single_line): """ Removes the comment at the end of the line (if any) and returns the result. """ quoted_string = re.compile( r'''((? 1 or len(line.split("'''")): # This is a single line that uses the triple quotes twice # Treat it as if it were just a regular line: output += line + '\n' quoted_string = False else: output += line + '\n' quoted_string = True elif quoted_string and multiline_quoted_string.search(line): output += line + '\n' quoted_string = False # Now let's focus on the lines containing our opener and/or closer: elif not quoted_string: if opener_regex.search(line) or closer_regex.search(line) or inside_pair: for character in line: if character == opener: if not escaped and not inside_quotes: openers += 1 inside_pair = True output += character else: escaped = False output += character elif character == closer: if not escaped and not inside_quotes: if openers and openers == (closers + 1): closers = 0 openers = 0 inside_pair = False output += character else: closers += 1 output += character else: escaped = False output += character elif character == '\\': if escaped: escaped = False output += character else: escaped = True output += character elif character == '"' and escaped: output += character escaped = False elif character == "'" and escaped: output += character escaped = False elif character == '"' and inside_quotes: if inside_single_quotes: output += character else: inside_quotes = False inside_double_quotes = False output += character elif character == "'" and inside_quotes: if inside_double_quotes: output += character else: inside_quotes = False inside_single_quotes = False output += character elif character == '"' and not inside_quotes: inside_quotes = True inside_double_quotes = True output += character elif character == "'" and not inside_quotes: inside_quotes = True inside_single_quotes = True output += character elif character == ' ' and inside_pair and not inside_quotes: if not output[-1] in [' ', opener]: output += ' ' else: if escaped: escaped = False output += character if inside_pair == False: output += '\n' else: output += line + '\n' else: output += line + '\n' # Clean up output = trailing_newlines.sub('\n', output) return output def dedent(source): """ Minimizes indentation to save precious bytes Example: .. code-block:: python def foo(bar): test = "This is a test" Will become: .. code-block:: python def foo(bar): test = "This is a test" """ io_obj = cStringIO.StringIO(source) out = "" last_lineno = -1 last_col = 0 prev_start_line = 0 indentation = "" indentation_level = 0 for i,tok in enumerate(tokenize.generate_tokens(io_obj.readline)): token_type = tok[0] token_string = tok[1] start_line, start_col = tok[2] end_line, end_col = tok[3] if start_line > last_lineno: last_col = 0 if token_type == tokenize.INDENT: indentation_level += 1 continue if token_type == tokenize.DEDENT: indentation_level -= 1 continue indentation = " " * indentation_level if start_line > prev_start_line: out += indentation + token_string elif start_col > last_col: out += " " + token_string else: out += token_string prev_start_line = start_line last_col = end_col last_lineno = end_line return out def fix_empty_methods(source): """ Appends 'pass' to empty methods/functions (i.e. where there was nothing but a docstring before we removed it =). Example: .. code-block:: python # Note: This triple-single-quote inside a triple-double-quote is also a # pyminifier self-test def myfunc(): '''This is just a placeholder function.''' Will become: .. code-block:: python def myfunc(): pass """ def_indentation_level = 0 output = "" just_matched = False previous_line = None method = re.compile(r'^\s*def\s*.*$.*$:.*$') for line in source.split('\n'): if len(line.strip()) > 0: # Don't look at blank lines if just_matched == True: this_indentation_level = len(line.rstrip()) - len(line.strip()) if def_indentation_level == this_indentation_level: # This method is empty, insert a 'pass' statement output += "%s pass\n%s\n" % (previous_line, line) else: output += "%s\n%s\n" % (previous_line, line) just_matched = False elif method.match(line): def_indentation_level = len(line) - len(line.strip()) # A commment just_matched = True previous_line = line else: output += "%s\n" % line # Another self-test else: output += "\n" return output def remove_blank_lines(source): """ Removes blank lines from 'source' and returns the result. Example: .. code-block:: python test = "foo" test2 = "bar" Will become: .. code-block:: python test = "foo" test2 = "bar" """ io_obj = cStringIO.StringIO(source) source = [a for a in io_obj.readlines() if a.strip()] return "".join(source) def minify(source): """ Remove all docstrings, comments, blank lines, and minimize code indentation from 'source' then prints the result. """ preserved_shebang = None preserved_encoding = None # This is for things like shebangs that must be precisely preserved for line in source.split('\n')[0:2]: # Save the first comment line if it starts with a shebang # (e.g. '#!/usr/bin/env python') <--also a self test! if shebang.match(line): # Must be first line preserved_shebang = line continue # Save the encoding string (must be first or second line in file) if encoding.match(line): preserved_encoding = line # Remove multilines (e.g. lines that end with '\' followed by a newline) source = multiline_indicator.sub('', source) # Remove docstrings (Note: Must run before fix_empty_methods()) source = remove_comments_and_docstrings(source) # Remove empty (i.e. single line) methods/functions source = fix_empty_methods(source) # Join multiline pairs of parens, brackets, and braces source = join_multiline_pairs(source) source = join_multiline_pairs(source, '[]') source = join_multiline_pairs(source, '{}') # Remove whitespace between operators: source = reduce_operators(source) # Minimize indentation source = dedent(source) # Re-add preseved items if preserved_encoding: source = preserved_encoding + "\n" + source if preserved_shebang: source = preserved_shebang + "\n" + source # Remove blank lines source = remove_blank_lines(source).rstrip('\n') # Stubborn last newline return source def bz2_pack(source): "Returns 'source' as a bzip2-compressed, self-extracting python script." import bz2, base64 out = "" compressed_source = bz2.compress(source) out += 'import bz2, base64\n' out += "exec bz2.decompress(base64.b64decode('" out += base64.b64encode((compressed_source)) out += "'))\n" return out def gz_pack(source): "Returns 'source' as a gzip-compressed, self-extracting python script." import zlib, base64 out = "" compressed_source = zlib.compress(source) out += 'import zlib, base64\n' out += "exec zlib.decompress(base64.b64decode('" out += base64.b64encode((compressed_source)) out += "'))\n" return out # The test.+() functions below are for testing pyminifer... def test_decorator(f): """Decorator that does nothing""" return f def test_reduce_operators(): """Test the case where an operator such as an open paren starts a line""" (a, b) = 1, 2 # The indentation level should be preserved pass def test_empty_functions(): """ This is a test method. This should be replaced with 'def empty_method: pass' """ class test_class(object): "Testing indented decorators" @test_decorator def foo(self): pass def test_function(): """ This function encapsulates the edge cases to prevent them from invading the global namespace. """ foo = ("The # character in this string should " # This comment "not result in a syntax error") # ...and this one should go away test_multi_line_list = [ 'item1', 'item2', 'item3' ] test_multi_line_dict = { 'item1': 1, 'item2': 2, 'item3': 3 } # It may seem strange but the code below tests our docstring removal code. test_string_inside_operators = imaginary_function( "This string was indented but the tokenizer won't see it that way." ) # To understand how this could mess up docstring removal code see the # remove_comments_and_docstrings() function starting at this line: # "elif token_type == tokenize.STRING:" # This tests remove_extraneous_spaces(): this_line_has_leading_indentation = '''<--That extraneous space should be removed''' # But not these spaces def main(): usage = '%prog [options] ""' parser = OptionParser(usage=usage, version=__version__) parser.disable_interspersed_args() parser.add_option( "-o", "--outfile", dest="outfile", default=None, help="Save output to the given file.", metavar="" ) parser.add_option( "--bzip2", action="store_true", dest="bzip2", default=False, help="bzip2-compress the result into a self-executing python script." ) parser.add_option( "--gzip", action="store_true", dest="gzip", default=False, help="gzip-compress the result into a self-executing python script." ) options, args = parser.parse_args() try: source = open(args[0]).read() except Exception, e: print e parser.print_help() sys.exit(2) # Minify our input script result = minify(source) # Compress it if we were asked to do so if options.bzip2: result = bz2_pack(result) elif options.gzip: result = gz_pack(result) # Either save the result to the output file or print it to stdout if options.outfile: f = open(options.outfile, 'w') f.write(result) f.close() else: print result if __name__ == "__main__": main()