Welcome, guest | Sign In | My Account | Store | Cart

Often I get data to import in MS Excel files, and the column headers are not very useful for column names. They contain spaces, punctuation, and special characters that make a simple import difficult.

This script opens the file in Excel, and applies some simple formatting rules to the first row and addresses duplicate names. Then when the file is imported (SQL Server DTS in my case), the column names are somewhat usable.

Python, 76 lines
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# XLfields.py - M.Keranen (mksql@yahoo.com) [10/29/2004]
# -------------------------------------------------------------------------------
# Formats first row cells of an MS Excel (tm) sheet to allow for proper importing
# into databases. Assumes field names in top row, beginning with leftmost column.

import os, string, sys
from win32com.client import Dispatch

namefmt = '0'
namelen = 128

if len(sys.argv)<2:
	print "\nUsage: XLfields.py path/infile.xls [Field name format] [Max field name width]"
	print "\nFormat: 0 = TitleCasedWords"
	print "        1 = Titlecased_Words_Underscored"
	print "        2 = lowercase_words_underscored"
	print "        3 = Words_underscored_only (leave case as in source)"
	sys.exit()
else:
	if len(sys.argv)==2: dummy, xlfile, = sys.argv
	elif len(sys.argv)==3: dummy, xlfile, namefmt = sys.argv
	else: dummy, xlfile, namefmt, namelen = sys.argv

namefmt = int(namefmt)
namelen = int(namelen)

if not os.path.exists(xlfile):
	print "\nFile %s not found." % (xlfile)

# Create string translation tables
allowed = ' _01234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
delchars = ''
for i in range(255):
	if chr(i) not in allowed: delchars = delchars + chr(i)
deltable = string.maketrans(' ','_')

# Open file in Excel
xl = Dispatch("Excel.Application")
xl.Visible = 1
xl.Workbooks.Open(xlfile,0)
xls = xl.Sheets(1)
nc = xls.UsedRange.Columns.Count + 1

# Format column names
cols = []
for c in range(1,nc):
	# Format column name to remove unwanted chars
	col = string.strip(str(xls.Cells(1,c).Value))
	col = col[:namelen]
	col = col.translate(deltable,delchars)
	fmtcol = col
	if namefmt < 3:
		# Title case individual words, leaving original upper chars in place
		fmtcol = ''
		for i in range(len(col)):
			if col.title()[i].isupper(): fmtcol = fmtcol + col[i].upper()
			else: fmtcol = fmtcol + col[i]
	if namefmt == 2: fmtcol = col.lower()
	if namefmt == 0: fmtcol = string.translate(fmtcol,deltable,'_')   # Remove underscores

	d = 0
	dupcol = fmtcol	
	while dupcol in cols:
		d = d + 1
		dupcol = fmtcol + '_' + str(d)
	cols.append(dupcol)
	print "%s -> %s" % (xls.Cells(1,c).Value,dupcol)
	xls.Cells(1,c).Value = dupcol

# Save file under new name. Comment out to leave file open in Excel
#xlname = os.path.basename(xlfile).split('.')[0]
#outfile = os.path.dirname(xlfile) + '\\' + xlname + '.new.xls'
#xl.ActiveWorkbook.SaveAs(Filename=outfile)
#xl.Quit()

sys.exit()