Welcome, guest | Sign In | My Account | Store | Cart

Python script to import ARFF data files (http://www.cs.waikato.ac.nz/~ml/weka/arff.html) into any database.

Python, 96 lines
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
'''
ARFF2DB.py - ARFF to Database importer
Author: Pradeep Kishore Gowda <pradeep _at_ btbytes (dot) com>
Revision: 1.2
Changelog:
1.2 : rewritten from scratch using Pyparsing module and SQLAlchemy for db independence

Dependencies: SQLAlchemy, Pyparsing
'''
from pyparsing import *
from sqlalchemy import *


def ARFF2DB(data, connection, tblname): 
    #parse the data read from the ARFF file    
    arffFormat = Forward()
    E = CaselessLiteral("E")
    comment = '%' + restOfLine    
    relationToken = Keyword('@RELATION', caseless=True)
    dataToken = Keyword('@DATA', caseless=True)
    attribToken = Keyword('@ATTRIBUTE', caseless=True)
    ident = Word( alphas, alphanums + '_-' ).setName('identifier')    
    relation = Suppress(relationToken) \
               + ident.setResultsName('relation')    
    classDomain = Suppress('{') \
                  + Group(delimitedList(ident.setResultsName('domain'))).setResultsName('domains') + Suppress('}')
    attribute = Group(Suppress(attribToken) 
                + Word(alphas).setResultsName('attrname')+restOfLine).setResultsName('attribute')
    continuous = Group( attribute 
                 + Word(alphas).setResultsName('type'))
    discrete = Group(attribute + classDomain)                
    arithSign = Word("+-",exact=1)            
    realNum = Combine( Optional(arithSign) 
              + (Word( nums ) + "." + Optional( Word(nums) )|( "." + Word(nums) )) 
              + Optional( E + Optional(arithSign) + Word(nums) ))            
    dataList = Group(delimitedList(realNum|ident)).setResultsName('record') 
    arffFormat << ( relation
                   + OneOrMore(attribute).setResultsName('attributes')                  
                   + dataToken
                   + OneOrMore(dataList).setResultsName('records')).setResultsName('arffdata')

    simpleARFF = arffFormat
    simpleARFF.ignore(comment)
    tokens =  simpleARFF.parseString(data)    
    
    #write the data into the database
    db = create_engine(connection)
    metadata = BoundMetaData(db)
    
    if tblname:        
        table_name = tblname
    else:
        table_name = tokens.relation
        
    tbl = Table(table_name, metadata)
    for attr in tokens.attributes:
        if (attr.type=='REAL'):
            col = Column(attr.attrname, Numeric)
        else:
            col = Column(attr.attrname, String(50))
        tbl.columns.add(col)
    tbl.create()

    ins = tbl.insert()
    for record in tokens.records:
        dict = {}
        i = 0 
        for attr in tokens.attributes:
            dict.update({attr.attrname:record[i]})
            i +=1
        ins.execute(dict)
    
    
if __name__ == "__main__":
    from optparse import OptionParser
    parser = OptionParser()
    
    parser.add_option('-f', '--file',
                      action='store', type='string', dest='filename',
                      help='the arff file name', metavar="FILE")

    parser.add_option('-c', '--connection',
                      action='store', type='string', dest='connection',
                      help='Connection String Eg:mysql://user:pass@localhost:port/dbname>')
                      
    parser.add_option('-t', '--table',
                      action='store', type='string', dest='table',
                      help='Table name')
    (options, args) = parser.parse_args()          
    
    
    if options.filename and options.connection:
        text = ''.join(open(options.filename, 'r').readlines())
        ARFF2DB(text, options.connection, options.table)
    else:
        parser.print_help()

I wanted a script to import ARFF format files(mainly used in the WEKA machine learning toolkit) into database. Hence this.

Uses pyparsing module to parse the ARFF format Uses SQLAlchemy module so that the data can be inserted into any Database without worrying about db quirks.

Further updates can be had from http://btbytes.com/techbytes/arff-to-sql-database-importer

-Pradeep

1 comment