Python script to import ARFF data files (http://www.cs.waikato.ac.nz/~ml/weka/arff.html) into any database.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 | '''
ARFF2DB.py - ARFF to Database importer
Author: Pradeep Kishore Gowda <pradeep _at_ btbytes (dot) com>
Revision: 1.2
Changelog:
1.2 : rewritten from scratch using Pyparsing module and SQLAlchemy for db independence
Dependencies: SQLAlchemy, Pyparsing
'''
from pyparsing import *
from sqlalchemy import *
def ARFF2DB(data, connection, tblname):
#parse the data read from the ARFF file
arffFormat = Forward()
E = CaselessLiteral("E")
comment = '%' + restOfLine
relationToken = Keyword('@RELATION', caseless=True)
dataToken = Keyword('@DATA', caseless=True)
attribToken = Keyword('@ATTRIBUTE', caseless=True)
ident = Word( alphas, alphanums + '_-' ).setName('identifier')
relation = Suppress(relationToken) \
+ ident.setResultsName('relation')
classDomain = Suppress('{') \
+ Group(delimitedList(ident.setResultsName('domain'))).setResultsName('domains') + Suppress('}')
attribute = Group(Suppress(attribToken)
+ Word(alphas).setResultsName('attrname')+restOfLine).setResultsName('attribute')
continuous = Group( attribute
+ Word(alphas).setResultsName('type'))
discrete = Group(attribute + classDomain)
arithSign = Word("+-",exact=1)
realNum = Combine( Optional(arithSign)
+ (Word( nums ) + "." + Optional( Word(nums) )|( "." + Word(nums) ))
+ Optional( E + Optional(arithSign) + Word(nums) ))
dataList = Group(delimitedList(realNum|ident)).setResultsName('record')
arffFormat << ( relation
+ OneOrMore(attribute).setResultsName('attributes')
+ dataToken
+ OneOrMore(dataList).setResultsName('records')).setResultsName('arffdata')
simpleARFF = arffFormat
simpleARFF.ignore(comment)
tokens = simpleARFF.parseString(data)
#write the data into the database
db = create_engine(connection)
metadata = BoundMetaData(db)
if tblname:
table_name = tblname
else:
table_name = tokens.relation
tbl = Table(table_name, metadata)
for attr in tokens.attributes:
if (attr.type=='REAL'):
col = Column(attr.attrname, Numeric)
else:
col = Column(attr.attrname, String(50))
tbl.columns.add(col)
tbl.create()
ins = tbl.insert()
for record in tokens.records:
dict = {}
i = 0
for attr in tokens.attributes:
dict.update({attr.attrname:record[i]})
i +=1
ins.execute(dict)
if __name__ == "__main__":
from optparse import OptionParser
parser = OptionParser()
parser.add_option('-f', '--file',
action='store', type='string', dest='filename',
help='the arff file name', metavar="FILE")
parser.add_option('-c', '--connection',
action='store', type='string', dest='connection',
help='Connection String Eg:mysql://user:pass@localhost:port/dbname>')
parser.add_option('-t', '--table',
action='store', type='string', dest='table',
help='Table name')
(options, args) = parser.parse_args()
if options.filename and options.connection:
text = ''.join(open(options.filename, 'r').readlines())
ARFF2DB(text, options.connection, options.table)
else:
parser.print_help()
|
I wanted a script to import ARFF format files(mainly used in the WEKA machine learning toolkit) into database. Hence this.
Uses pyparsing module to parse the ARFF format Uses SQLAlchemy module so that the data can be inserted into any Database without worrying about db quirks.
Further updates can be had from http://btbytes.com/techbytes/arff-to-sql-database-importer
-Pradeep
Link http://btbytes.com/techbytes/arff-to-sql-database-importer is broken. I tried http://btbytes.com/techbytes/arff-to-sql-database-importer and got 404 error.