summary | refs | log | tree | commit | diff
path: root/miami-dade_clerk_courts.py
diff options
context:
space:
mode:
author: Nathan Kinkade <nkinkade@creativecommons.org> 2014-05-19 18:18:52 -0400
committer: Nathan Kinkade <nkinkade@creativecommons.org> 2014-05-19 18:18:52 -0400
commit: fea92fb73fc066701a4e5e578edee7d737045a41 (patch)
tree: 8475a74fb159756896e90a802eca32e2545ed86d /miami-dade_clerk_courts.py
parent: 03a01ac31a16cf7f44e827db15b7483b0ec330cd (diff)
parent: c1ea47789989b08e919645d8b8133cfea0ad97c9 (diff)
Manually deleted file to fix merge conflict. [HEAD, master]
Diffstat (limited to 'miami-dade_clerk_courts.py')
-rwxr-xr-x miami-dade_clerk_courts.py 393
1 files changed, 0 insertions, 393 deletions
diff --git a/miami-dade_clerk_courts.py b/miami-dade_clerk_courts.py
deleted file mode 100755
index 744c4c4..0000000
--- a/miami-dade_clerk_courts.py
+++ /dev/null
@@ -1,393 +0,0 @@
-#!/usr/bin/env python
-
import os
import re
import string
import sys
import urllib
import urllib2
from datetime import datetime

from BeautifulSoup import BeautifulSoup
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Text, Date, Numeric
from sqlalchemy.orm import sessionmaker
-
# Declarative base class that every ORM model in this file inherits from.
Base = declarative_base()

# Database connection string.  It can be overridden through the
# CLERK_COURTS_DB_URL environment variable so that credentials do not have
# to live in source control; the historical value stays as the fallback so
# existing setups keep working unchanged.
# NOTE(review): the fallback still embeds a password -- consider dropping it
# once every deployment sets the environment variable.
db_url = os.environ.get(
    'CLERK_COURTS_DB_URL',
    'mysql://clerk:Clerk.@localhost/clerk_courts',
)
db_engine = create_engine(db_url, echo=False)
Session = sessionmaker(bind=db_engine)
db_session = Session()

# Case search form that every request in this script is sent to.
url = 'https://www2.miami-dadeclerk.com/CJIS/CaseSearch.aspx'

# Case years to scan, posted verbatim in the form's year field.
years = ['13']

# main() stops scanning a year once the number of consecutive NOT FOUND
# responses exceeds this value.
max_case_not_found_count = 3
-
# POST field names of the search form, keyed by a short alias.  The four
# 'case_*' entries are name prefixes: main() appends
# field_values['field_suffix'] to them before posting.
field_names = dict(
    case_type='ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo1_',
    case_year='ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo2_',
    case_seq='ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo3_',
    case_defendant='ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo4_',
    viewstate='__VIEWSTATE',
    eventval='__EVENTVALIDATION',
    button='ctl00$cphPage$tcSearchMenu$tpCaseSearch$btnCaseSearch',
    active_tab='ctl00_cphPage_tcSearchMenu_ClientState',
)
-
# Values posted with the search form.  'viewstate', 'eventval' and
# 'field_suffix' start out empty and are filled in by set_asp_vars(); the
# remaining entries are constants.
field_values = dict(
    case_type='F',
    button='Submit',
    active_tab='{"ActiveTabIndex":0,"TabState":[true,true,true,true,true]}',
    viewstate='',
    eventval='',
    field_suffix='',
)
-
class Case(Base):
    """ORM model for one scraped court case (table ``cases``).

    parse_case() inserts the row from the main case page; parse_addinfo()
    later fills in the probation, jail/release and bond columns.
    """
    __tablename__ = 'cases'

    id = Column(Integer, primary_key=True)

    # --- Populated by parse_case() ---
    court_case_no = Column(String(30))
    state_case_no = Column(String(30))
    name = Column(String(100))
    date_birth = Column(Date)
    date_filed = Column(Date)
    date_closed = Column(Date)
    warrant_type = Column(String(25))
    hearing_date = Column(Date)
    # parse_case() stores the stripped page text here, or None when empty.
    hearing_time = Column(Integer)
    hearing_type = Column(String(25))
    defense_attorney = Column(String(100))

    # --- Populated by parse_addinfo() ---
    prob_start_date = Column(Date)
    prob_end_date = Column(Date)
    prob_length = Column(String(25))
    in_jail = Column(String(10))
    released_to = Column(String(100))
    bond_amount = Column(Numeric(7,2))
    bond_status = Column(String(50))
    bond_type = Column(String(50))
    bond_date = Column(Date)
-
class Charge(Base):
    """ORM model for one row of a case's charges table (table ``charges``).

    Rows are created by parse_charges().
    """
    __tablename__ = 'charges'

    id = Column(Integer, primary_key=True)
    # Holds the id of the owning Case; declared as a plain integer, without
    # a foreign-key constraint.
    case_id = Column(Integer)
    seq = Column(Integer)
    charge = Column(String(200))
    type = Column(String(50))
    disposition = Column(String(256))
-
class Docket(Base):
    """ORM model for one docket entry of a case (table ``dockets``).

    Rows are created by parse_dockets().
    """
    __tablename__ = 'dockets'

    id = Column(Integer, primary_key=True)
    # Holds the id of the owning Case; declared as a plain integer, without
    # a foreign-key constraint.
    case_id = Column(Integer)
    seq = Column(Integer)
    date = Column(Date)
    docket = Column(Text)
-
class Akas(Base):
    """ORM model for one alias ("AKA") listed for a case (table ``akas``).

    Rows are created by parse_akas().
    """
    __tablename__ = 'akas'

    id = Column(Integer, primary_key=True)
    # Holds the id of the owning Case; declared as a plain integer, without
    # a foreign-key constraint.
    case_id = Column(Integer)
    last_name = Column(String(50))
    first_name = Column(String(50))
    middle_name = Column(String(50))
    race = Column(String(25))
    sex = Column(String(10))
-
def set_asp_vars():
    """Load the search page and record its per-page form values.

    The site is an ASP application, so a form submission has to echo back
    hidden values that change from page to page.  This stores three items
    in the module-level ``field_values`` dict:

    * ``eventval``     -- value of the ``__EVENTVALIDATION`` element
    * ``viewstate``    -- value of the ``__VIEWSTATE`` element
    * ``field_suffix`` -- text after the last underscore in the id of the
      element whose title is "YY"
    """
    landing_html = urllib2.urlopen(url).read()
    form_soup = BeautifulSoup(landing_html)

    hidden_fields = (
        ('eventval', '__EVENTVALIDATION'),
        ('viewstate', '__VIEWSTATE'),
    )
    for key, element_id in hidden_fields:
        field_values[key] = form_soup.find(id=element_id)['value']

    year_input_id = form_soup.find(title="YY")['id']
    field_values['field_suffix'] = year_input_id.rsplit('_', 1)[-1]
-
def fetch_page(post_fields):
    """POST ``post_fields`` to the search URL and return the response body.

    ``post_fields`` is a mapping of form field names to values; it is
    URL-encoded and sent as the request body to the module-level ``url``.
    """
    request = urllib2.Request(url, urllib.urlencode(post_fields))
    return urllib2.urlopen(request).read()
-
def check_case_not_found(case_soup):
    """Return True when the page reports that the case was NOT FOUND.

    ``case_soup`` is the parsed HTML of a search response.  The page's
    search-error label (id ``ctl00_cphPage_lblSearchError``) is inspected;
    a missing label counts as "found" and yields False.

    Only the "label is absent" case is tolerated.  Any other failure now
    propagates instead of being hidden by a bare ``except``.
    """
    error_label = case_soup.find(id='ctl00_cphPage_lblSearchError')
    if error_label is None:
        return False
    return 'NOT FOUND' in error_label.text
-
def _fetch_postback(case_soup, event_target):
    """Post back to the search URL for ``event_target`` and parse the reply.

    The request carries ``__EVENTTARGET`` set to ``event_target`` together
    with the ``__VIEWSTATE`` and ``__EVENTVALIDATION`` values read from
    ``case_soup``.  Returns the response parsed with BeautifulSoup.
    """
    post_fields = {
        '__EVENTTARGET' : event_target,
        '__VIEWSTATE' : case_soup.find(id='__VIEWSTATE')['value'],
        '__EVENTVALIDATION' : case_soup.find(id='__EVENTVALIDATION')['value'],
    }
    return BeautifulSoup(fetch_page(post_fields))


def process_case(case_soup):
    """Store every part of a case whose main page is ``case_soup``.

    Once a case is known to exist, this saves the case itself, then its
    AKAs (only when the page has an AKA link), its charges, its dockets and
    finally the additional case info.  The AKA, docket and additional-info
    pages are each fetched with a separate post-back.
    """
    case_id = parse_case(case_soup)

    # Get any AKAs, if they exist
    if case_soup.find(id='ctl00_cphPage_lnkAKA'):
        akas_soup = _fetch_postback(case_soup, 'ctl00$cphPage$lnkAKA')
        parse_akas(akas_soup, case_id)

    # Charges are on the main case page itself; no extra request needed
    parse_charges(case_soup, case_id)

    # Get case dockets
    dockets_soup = _fetch_postback(case_soup, 'ctl00$cphPage$lnkDockets')
    parse_dockets(dockets_soup, case_id)

    # Get additional case info
    addinfo_soup = _fetch_postback(case_soup, 'ctl00$cphPage$lnkAdditionalInfo')
    parse_addinfo(addinfo_soup)
-
def parse_case(case_soup):
    """Insert a Case row built from the main case page and return its id.

    Each column is read from the text of a labelled element on the page.
    Date fields go through format_date(); an empty hearing time is stored
    as None to avoid needless MySQL warnings.  The row is committed before
    returning.
    """
    def text_of(element_id):
        return case_soup.find(id=element_id).text

    record = Case(
        court_case_no=text_of('ctl00_cphPage_lblCaseNumber'),
        state_case_no=text_of('ctl00_cphPage_lblStateCaseNo'),
        name=text_of('ctl00_cphPage_lblName'),
        date_birth=format_date(text_of('ctl00_cphPage_lblDoB')),
        date_filed=format_date(text_of('ctl00_cphPage_lblDateFiled')),
        date_closed=format_date(text_of('ctl00_cphPage_lblDateClosed')),
        warrant_type=text_of('ctl00_cphPage_lblWarrantType'),
        hearing_date=format_date(text_of('ctl00_cphPage_lblHearingDate')),
        hearing_time=text_of('ctl00_cphPage_lblHearingTime').strip() or None,
        hearing_type=text_of('ctl00_cphPage_lblHearingType'),
        defense_attorney=text_of('ctl00_cphPage_lblDefenseAttorney'),
    )

    db_session.add(record)
    db_session.commit()

    return record.id
-
def parse_charges(case_soup, case_id):
    """Store the charges listed on the main case page.

    Reads every ``<tr>`` inside the charges panel (id
    ``ctl00_cphPage_pnlCharges``), skips the first row (table headers) and
    maps the direct ``<td>`` children, in order, onto the columns
    seq / charge / type / disposition.  One Charge row is added per table
    row and everything is committed at the end.

    Tolerated irregularities (each used to raise):
    * no charges panel on the page -> nothing is stored
    * a table without rows         -> nothing is stored
    * a row without any cells      -> skipped
    * a row with missing cells     -> missing columns stored as None
    * a row with extra cells       -> extra cells ignored
    """
    charge_fields = ('seq', 'charge', 'type', 'disposition')

    charges_section = case_soup.find(id="ctl00_cphPage_pnlCharges")
    if charges_section is None:
        return

    # We don't want the table headers; slicing is safe on an empty list
    charge_rows = charges_section.findAll('tr')[1:]

    # Parse all rows before touching the session so that a parsing error
    # cannot leave a partial set of charges pending.
    charges = []
    for row in charge_rows:
        cells = row.findAll('td', recursive=False)
        if not cells:
            continue
        charge_data = dict.fromkeys(charge_fields)
        for field_name, cell in zip(charge_fields, cells):
            # Tsk-tsk. Why the non-breaking spaces!
            charge_data[field_name] = cell.text.replace('&nbsp;', ' ')
        charges.append(charge_data)

    for charge in charges:
        db_session.add(Charge(
            case_id=case_id,
            seq=charge['seq'],
            charge=charge['charge'],
            type=charge['type'],
            disposition=charge['disposition'],
        ))

    # Commit charges to database
    db_session.commit()
-
def parse_dockets(dockets_soup, case_id):
    """Store the docket entries found on the dockets page.

    The page offers no useful ids, names or attributes to single out the
    dockets table, so rows are selected purely by their CSS class
    (``RowBody`` / ``RowBodyAlt``), which is probably good enough.  The
    direct ``<td>`` children of each row map, in order, onto
    seq / date / book-page / docket; book-page is read but not stored.
    """
    column_names = ['seq', 'date', 'book-page', 'docket']
    row_filter = {'class':['RowBody','RowBodyAlt']}

    # First pass: turn every matching row into a dict of column -> text
    parsed_rows = []
    for row in dockets_soup.findAll('tr', attrs=row_filter):
        cells = row.findAll('td', recursive=False)
        parsed_rows.append(dict(
            # Tsk-tsk. Why the non-breaking spaces!
            (column_names[position], re.sub('&nbsp;', ' ', cell.text))
            for position, cell in enumerate(cells)
        ))

    # Second pass: queue one Docket per parsed row
    for entry in parsed_rows:
        db_session.add(Docket(
            case_id=case_id,
            seq=entry['seq'],
            date=format_date(entry['date']),
            docket=entry['docket'],
        ))

    # Commit dockets to database
    db_session.commit()
-
-
def parse_addinfo(addinfo_soup):
    """Copy the additional-info page onto the matching, existing Case row.

    The case is looked up by the court case number shown on the page;
    ``.one()`` raises if there is no such row or more than one.  Probation,
    jail/release and bond fields are then updated and committed.

    The bond amount is cleaned before it is assigned: surrounding
    whitespace, a leading ``$`` and thousands separators are removed, and
    an empty result is stored as None rather than an empty string, matching
    how parse_case() treats an empty hearing time.
    """
    def text_of(element_id):
        return addinfo_soup.find(id=element_id).text

    court_case_no = text_of('ctl00_cphPage_lblCaseNumber')
    case = db_session.query(Case).filter_by(court_case_no=court_case_no).one()

    # Assign values to existing case
    case.prob_start_date = format_date(text_of('ctl00_cphPage_lblProbationStartDate'))
    case.prob_end_date = format_date(text_of('ctl00_cphPage_lblProbationEndDate'))
    case.prob_length = text_of('ctl00_cphPage_lblProbationLength')
    case.in_jail = text_of('ctl00_cphPage_lblDefendantinJail')
    case.released_to = text_of('ctl00_cphPage_lblDefendantReleaseTo')

    # Fix format of Bond Amount
    bond_amount = text_of('ctl00_cphPage_lblBondAmount').strip()
    bond_amount = bond_amount.lstrip('$').replace(',', '')
    case.bond_amount = bond_amount or None

    case.bond_status = text_of('ctl00_cphPage_lblBondStatus')
    case.bond_type = text_of('ctl00_cphPage_lblBondType')
    case.bond_date = format_date(text_of('ctl00_cphPage_lblBondIssueDate'))

    db_session.commit()
-
def parse_akas(akas_soup, case_id):
    """Store every alias listed on the AKA page of a case.

    The number of aliases is the leading integer of the text of the
    ``ctl00_cphPage_lblDefendants`` element; when that text does not start
    with a number nothing is stored.  For each alias the last / first /
    middle name, race and sex are read from elements whose ids embed the
    item index, and one Akas row is added.  All rows are committed together
    at the end.

    Fixes over the previous version:
    * ``middle_name`` is now taken from the middle-name element (it used to
      be a copy of ``first_name``);
    * the item index is formatted as ``ctl%02d`` so that items 10 and up
      yield ``ctl10`` rather than ``ctl010``.
      NOTE(review): assumes the site numbers repeater items with at least
      two digits (ctl00 ... ctl09, ctl10) -- confirm against a page with
      more than ten aliases.
    """
    id_template = 'ctl00_cphPage_rptDefendants_ctl%02d_lnkDefendant%s'
    # (Akas column, id suffix) pairs, in the order they are read
    aka_columns = (
        ('last_name', 'LastName'),
        ('first_name', 'FirstName'),
        ('middle_name', 'MiddleName'),
        ('race', 'Race'),
        ('sex', 'Sex'),
    )

    akas_count_text = akas_soup.find(id='ctl00_cphPage_lblDefendants').text
    count_match = re.match(r'\d+', akas_count_text)
    if count_match is None:
        return

    for num in range(int(count_match.group(0))):
        values = dict(
            (column, akas_soup.find(id=id_template % (num, suffix)).text)
            for column, suffix in aka_columns
        )
        db_session.add(Akas(case_id=case_id, **values))

    db_session.commit()
-
def format_date(date):
    """Convert an ``MM/DD/YYYY`` string to ``YYYY-MM-DD``, or return None.

    Surrounding whitespace is ignored.  None is returned for an empty or
    missing value, for a non-string value, and for text that is not a valid
    date in the expected format.
    """
    if not date:
        return None
    try:
        parsed = datetime.strptime(date.strip(), '%m/%d/%Y')
        return parsed.strftime('%Y-%m-%d')
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a string; ValueError: not a date in
        # the expected format (or not representable by strftime).
        return None
-
def main():
    """Scan case numbers year by year and store every case that exists.

    For each case number the defendant letters a-z are tried in turn:

    * a hit is stored and the next letter is tried;
    * a miss after at least one hit for this case number means the list of
      defendants is exhausted, so the next case number is tried;
    * a miss on the very first letter triggers one more request without a
      defendant letter; a hit there is stored, a miss counts towards the
      consecutive NOT FOUND total.

    Scanning of a year stops once the number of consecutive NOT FOUND case
    numbers exceeds ``max_case_not_found_count``.
    """
    # Set ASP-specific form field variables
    set_asp_vars()

    suffix = field_values['field_suffix']
    defendant_field = field_names['case_defendant'] + suffix

    for year in years:
        # Keep track of how many NOT FOUND errors we get.
        case_not_found_count = 0

        # NOTE(review): the range of case numbers is hard-coded to 37..38.
        for seq in range(37, 39):
            if case_not_found_count > max_case_not_found_count:
                # If our not_found_count exceeds the maximum set, then
                # assume that we have reached the end of all cases for the
                # given year and move on to the next year
                break

            # When a case has multiple defendants and we run past the last
            # one, we must not retry without the defendant field, since
            # that would return a duplicate of the A defendant.  The flag
            # is reset for every case number: it used to leak from one
            # case number to the next, which made a single-defendant case
            # that followed a multi-defendant one get skipped.
            has_multiple_defendants = False

            # Iterate through a-z possible defendants.  ascii_lowercase is
            # locale-independent, unlike string.lowercase.
            for defendant in string.ascii_lowercase:
                # Get main case information
                post_fields = {
                    field_names['case_type'] + suffix : field_values['case_type'],
                    field_names['case_year'] + suffix : year,
                    field_names['case_seq'] + suffix : seq,
                    defendant_field : defendant,
                    field_names['viewstate'] : field_values['viewstate'],
                    field_names['eventval'] : field_values['eventval'],
                    field_names['button'] : field_values['button'],
                    field_names['active_tab'] : field_values['active_tab'],
                }

                case_soup = BeautifulSoup(fetch_page(post_fields))
                if not check_case_not_found(case_soup):
                    has_multiple_defendants = True
                    # Reset the not_found_count and process case
                    case_not_found_count = 0
                    process_case(case_soup)
                    continue

                if has_multiple_defendants:
                    # Ran past the last defendant of this case
                    break

                # Try again with no defendant sequence specification
                post_fields[defendant_field] = ''
                case_soup = BeautifulSoup(fetch_page(post_fields))
                if check_case_not_found(case_soup):
                    # If the case wasn't found even without the defendant
                    # number then it really must not exist
                    case_not_found_count += 1
                else:
                    # Reset the not_found_count and process case
                    case_not_found_count = 0
                    process_case(case_soup)
                break


if __name__ == '__main__':
    main()