Welcome, guest | Sign In | My Account | Store | Cart

Extract air quality data for Santiago, Chile, into a CSV file

Python, 120 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python 
# -*- coding: utf-8 -*-
#
#       aire-csv.py
#       
#       Copyright 2010 Javier Rovegno Campos <tatadeluxe<at>gmail.com>
#       
#       This program is free software; you can redistribute it and/or modify
#       it under the terms of the GNU General Public License as published by
#       the Free Software Foundation; either version 2 of the License, or
#       (at your option) any later version.
#       
#       This program is distributed in the hope that it will be useful,
#       but WITHOUT ANY WARRANTY; without even the implied warranty of
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#       GNU General Public License for more details.
#       
#       You should have received a copy of the GNU General Public License
#       along with this program; if not, write to the Free Software
#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#       MA 02110-1301, USA.
#
'''
Usage: aire-csv.py [options] arg

Options:
  --version             show program's version number and exit
  -h, --help            show this help message and exit
  -f FILE, --file=FILE  csv FILE database
'''
import os
import csv
import re
import urllib
from optparse import OptionParser

def main():
    '''
    Parse the command line and hand the result over to process().

    The only program-specific option is -f/--file, the path of the csv
    database (default 'db-aire.csv').
    '''
    cli = OptionParser(usage="usage: %prog [options] arg",
                       version="%prog 1.0")
    cli.add_option("-f", "--file",
                   dest="csv_file",
                   metavar="FILE",
                   default='db-aire.csv',
                   help="csv FILE database")
    opts, positional = cli.parse_args()
    # Run the program
    process(opts, positional)

def process(options, args):
    '''
    Download the air quality page and save its readings in a csv file.

    The rows extracted from the page are stamped with the date and time
    found on the page and written to a temporary file (csv_file + '~'),
    followed by the data rows of the existing csv file, if any.  The
    temporary file then takes the place of the csv file, so the newest
    readings always sit right below the header.

    options -- parsed command line options; only options.csv_file is read
    args    -- positional command line arguments (not used)

    Raises ValueError when no table row can be extracted from the page.
    If anything fails while the temporary file is being written, the
    existing csv file is left untouched.
    '''
    url = "http://www.seremisaludrm.cl/sitio/pag/aire/indexjs3aireindices-prueba.asp"
    sock = urllib.urlopen(url)
    try:
        htmlSource = sock.read()
    finally:
        # Release the connection even when the download is interrupted.
        sock.close()
    csv_file = options.csv_file
    tmp_file = csv_file + '~'
    # Pull everything out of the page before any file is opened, so a
    # parsing failure cannot leave a half-written file behind.
    air_data = extract_data(htmlSource)
    encabezado = next(air_data, None)  # first extracted row is the header
    if encabezado is None:
        raise ValueError('no table rows found in %s' % url)
    encabezado.append('DATE')
    encabezado.append('TIME')
    date = extract_date(htmlSource)
    time = extract_time(htmlSource)
    csv_exists = os.path.exists(csv_file)
    # Binary modes are what the Python 2 csv module expects.
    f_out = open(tmp_file, 'wb')
    try:
        writer = csv.writer(f_out, dialect=csv.excel)
        # TODO check encabezado == header of the existing csv file
        writer.writerow(encabezado)
        # Append sensors data
        for row in air_data:
            row.append(date)
            row.append(time)
            writer.writerow(row)
        if csv_exists:
            f_in = open(csv_file, 'rb')
            try:
                reader = csv.reader(f_in, dialect=csv.excel)
                # Skip the old header; the default keeps an existing but
                # empty file from raising StopIteration.
                next(reader, None)
                for row in reader:
                    writer.writerow(row)
            finally:
                f_in.close()
    finally:
        f_out.close()
    # Swap the files only after the new one is completely written and
    # closed.  os.rename() cannot overwrite an existing file on Windows,
    # hence the explicit removal first.
    if csv_exists:
        os.remove(csv_file)
    os.rename(tmp_file, csv_file)

def extract_date(htmlSource, start=5974, length=100):
    '''
    Return the first date-like token (digits/digits/digits) found in a
    fixed window of the page.

    htmlSource -- page contents as a string
    start      -- offset at which the searched window begins; the default
                  is the offset this script was written against
                  (presumably the page's "Informacion" section)
    length     -- size of the window in characters

    Raises IndexError when the window contains no such token.
    '''
    window = htmlSource[start:start + length]
    match = re.search(r'\d+/\d+/\d+', window)
    if match is None:
        # IndexError is kept as the failure type so existing callers see
        # the same exception class as before, now with an explanation.
        raise IndexError('no date found in htmlSource[%d:%d]'
                         % (start, start + length))
    return match.group(0)

def extract_time(htmlSource, start=5974, length=100):
    '''
    Return the first time-like token (digits:digits) found in a fixed
    window of the page.

    htmlSource -- page contents as a string
    start      -- offset at which the searched window begins; the default
                  is the offset this script was written against
                  (presumably the page's "Informacion" section)
    length     -- size of the window in characters

    Raises IndexError when the window contains no such token.
    '''
    window = htmlSource[start:start + length]
    match = re.search(r'\d+:\d+', window)
    if match is None:
        # IndexError is kept as the failure type so existing callers see
        # the same exception class as before, now with an explanation.
        raise IndexError('no time found in htmlSource[%d:%d]'
                         % (start, start + length))
    return match.group(0)

def extract_data(htmlSource, start=6644, end=12988):
    '''
    Yield the table rows found in a slice of the page, four cells each.

    Every <span ...">text</span> element inside htmlSource[start:end] is
    one cell; cells are grouped four at a time into a row (a list of
    strings).  Per-column cleanup:
      column 0    -- '&nbsp;' becomes a plain space
      column 1    -- left as found
      columns 2-3 -- '<br>' and every space are removed

    htmlSource -- page contents as a string
    start, end -- bounds of the slice holding the table; the defaults are
                  the offsets this script was written against

    Cells left over at the end that do not fill a whole row are dropped.
    '''
    source = htmlSource[start:end]  # Table with the data
    cells = re.findall(r'(?:<span\s.*">)(.*)(?:</span>)', source)
    row = []
    for cell in cells:
        column = len(row)
        if column == 0:
            cell = cell.replace('&nbsp;', ' ')
        elif column >= 2:
            cell = cell.replace('<br>', ' ').replace(' ', '')
        row.append(cell)
        if len(row) == 4:
            # Start a fresh list before yielding so the consumer can
            # freely extend the row it receives.
            complete, row = row, []
            yield complete

if __name__ == '__main__':
    # Run whatever doctests the module's docstrings contain, then start
    # the program itself.
    from doctest import testmod
    testmod()
    main()