summaryrefslogtreecommitdiff
path: root/scrape_trucks.py
blob: 2ba1d203a0b33c689fc2abf0219f18f93e3ed783 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python
# coding=UTF-8
'''
Copyright © 2014 Nathan Kinkade

This program is free software; it is released into the public domain under a
CC0 waiver: http://creativecommons.org/publicdomain/zero/1.0/
'''

from datetime import datetime
import json
import urllib2
import cgi
from BeautifulSoup import BeautifulSoup

# Source page listing all known DC food trucks grouped by location.
url = "http://foodtruckfiesta.com/dc-food-truck-list/"

# Human-readable scrape timestamp, captured once at import time,
# e.g. "Monday, July 04, 2014 13:05:22".
timestamp = datetime.now().strftime('%A, %B %d, %Y %H:%M:%S')

# Maps a short alias (usable as a ?limit= query value) to the list of
# location headings it covers.
limits = {
    'oti': [
        u'DC - Metro Center',
        u'DC - Franklin Square',
        u'DC - Farragut Square',
    ],
}

# Skeleton of the JSON payload: main() fills in 'locations' (and
# 'error' on failure); the CGI driver serializes it to stdout.
response = {
    'error': '',
    'date': timestamp,
    'locations': {},
}

def main(limit):
    '''Scrape the food-truck page and populate the module-level response.

    The page is a sequence of <h2> location headings, each followed by
    sibling <div> elements (one per truck) up to the next <h2>.

    limit -- key into ``limits``; when truthy and known, only the aliased
             locations are copied into response['locations'].  A falsy or
             unknown value returns every scraped location.

    Returns nothing.  On fetch/parse failure the error text is stored in
    response['error'] and response['locations'] is left empty.
    '''
    try:
        res = urllib2.urlopen(url)
        page = res.read()
        soup = BeautifulSoup(page)
    except Exception as e:
        # str(e) rather than the deprecated BaseException.message attribute,
        # which is empty for many exception types.
        response['error'] = str(e)
        return

    locations = {}
    # The first <h2> on the page is not a location heading; skip it.
    for h2 in soup.findAll('h2')[1:]:
        next_sib = h2
        trucks = []
        while True:
            truck = {}
            next_sib = next_sib.nextSibling
            if not next_sib:
                # End of document: record the final section's trucks too.
                # (Previously this break silently dropped the last location.)
                locations[h2.text] = trucks
                break
            try:
                if next_sib.name == 'div':
                    # Truck entries are the relatively-positioned divs.
                    if next_sib['style'] == u'position:relative;':
                        truck['name'] = next_sib.find('span').text
                        truck['url'] = next_sib.find('a')['href']
                        trucks.append(truck)
                if next_sib.name == 'h2':
                    # Next location heading reached; save and move on.
                    locations[h2.text] = trucks
                    break
            except (AttributeError, KeyError, TypeError):
                # Text nodes lack .name, and some divs lack the expected
                # style/span/a structure; skip them, don't abort the scrape.
                continue

    if limit and limit in limits:
        # Copy only the aliased locations that were actually scraped.
        for location in limits[limit]:
            if location in locations:
                response['locations'][location] = locations[location]
    else:
        # No (or unknown) alias: expose everything we found.
        response['locations'] = locations

if __name__ == '__main__':
    query_params = cgi.FieldStorage()

    limit = ''
    try:
        if 'limit' in query_params:
            limit = query_params['limit'].value
    except:
        pass

    main(limit)
    print 'Content-type: application/json; charset=UTF-8\n'
    print json.dumps(response, sort_keys=True, indent=4, separators=(',', ': '))