When loading text files into database tables (MSSQL in this example), the source columns often do not match the table definition. This script was written to find the maximum length of each column in a delimited text file, then modify the a table create DDL file to make each character column wide enough, so truncation errors do not occur.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 | # SQLColumnWidth.py - M.Keranen (mksql@yahoo.com) - 09/22/2006
# ------------------------------------------------------------
# Find max width of each character column in a delimited text file,
# and modify DDL file to max width of respective source columns.
import sys
if len(sys.argv)<2:
print "\nUsage: %s datafile tableddl.sql" % (sys.argv[0])
sys.exit()
else:
dummy, datafile, sqlfile = sys.argv
print
f = open(datafile,'r')
cw = {}
c = f.readline()
cw = cw.fromkeys(c.split(','),0)
r = 0
l = f.readline()
while l != '':
r += 1
col = l.split('|')
if len(col) != len(cw): print 'Length error in row %s' % (r)
i = 0
for c in cw.iterkeys():
if len(col[i]) > cw[c]: cw[c] = len(col[i])
i += 1
l = f.readline()
#print cw
f.close()
# Match cw dict to CREATE TABLE statement for VARCHAR columns,
# and replace width where data is present larger than column definition
sql = open(sqlfile,'r').read()
lsql = sql.lower() # Copy of SQL to eliminate case in searches
for c in cw.iterkeys():
dt = lsql.find(c)+len(c)+1
if lsql[dt:dt+7] == 'varchar' or lsql[dt:dt+4] == 'char':
lp = dt + lsql[dt:].find('(')+1
rp = dt + lsql[dt:].find(')')
# Original values
ow = int(lsql[lp:rp])
osql = sql[lsql.find(c):rp+1]
if ow < cw[c]:
# Update both lower cased SQL and original to maintain positions
lsql = lsql[:lp] + str(cw[c]) + lsql[rp:]
sql = sql[:lp] + str(cw[c]) + sql[rp:]
print "%s ->%s" % (osql, sql[lsql.find(c)+len(c):rp+1])
open(sqlfile+'.new','w').write(sql)
print "\n%s written.\n" % (sqlfile+'.new')
|
Tags: database