diff options
| author | Nathan Kinkade <nkinkade@creativecommons.org> | 2014-05-19 18:18:52 -0400 |
|---|---|---|
| committer | Nathan Kinkade <nkinkade@creativecommons.org> | 2014-05-19 18:18:52 -0400 |
| commit | fea92fb73fc066701a4e5e578edee7d737045a41 (patch) | |
| tree | 8475a74fb159756896e90a802eca32e2545ed86d /miami_dade_clerk_courts.py | |
| parent | 03a01ac31a16cf7f44e827db15b7483b0ec330cd (diff) | |
| parent | c1ea47789989b08e919645d8b8133cfea0ad97c9 (diff) | |
Diffstat (limited to 'miami_dade_clerk_courts.py')
| -rwxr-xr-x | miami_dade_clerk_courts.py | 612 |
1 files changed, 612 insertions, 0 deletions
diff --git a/miami_dade_clerk_courts.py b/miami_dade_clerk_courts.py new file mode 100755 index 0000000..5ec4353 --- /dev/null +++ b/miami_dade_clerk_courts.py @@ -0,0 +1,612 @@ +#!/usr/bin/env python + +import sys +from datetime import datetime +import urllib +import urllib2 +import re +import string +import time +import logging +from BeautifulSoup import BeautifulSoup +from sqlalchemy import create_engine +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy import Column, Integer, String, Text, Date, Numeric +from sqlalchemy.orm import sessionmaker + +logging.basicConfig( filename='mdcc.log', level=logging.ERROR ) + +Base = declarative_base() +db_engine = create_engine('mysql://mdcc:Mdcc.@localhost/mdcc', echo=False) +Session = sessionmaker(bind=db_engine) +db_session = Session() + +# Global variables +url = 'https://www2.miami-dadeclerk.com/CJIS/CaseSearch.aspx' +max_case_not_found_count = 50 + + +# Form field names +field_names = { + 'case_type' : 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo1_', + 'case_year' : 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo2_', + 'case_seq' : 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo3_', + 'case_defendant' : 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo4_', + 'viewstate' : '__VIEWSTATE', + 'eventval' : '__EVENTVALIDATION', + 'button' : 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$btnCaseSearch', + 'active_tab' : 'ctl00_cphPage_tcSearchMenu_ClientState', +} + +# Fixed form field values +field_values = { + 'case_type' : 'F', + 'button' : 'Submit', + 'active_tab' : '{"ActiveTabIndex":0,"TabState":[true,true,true,true,true]}', + 'viewstate' : '', + 'eventval' : '', + 'field_suffix' : '', +} + +# Case ORM class +class Case(Base): + __tablename__ = 'cases' + id = Column(Integer, primary_key=True) + court_case_no = Column(String(30)) + state_case_no = Column(String(30)) + name = Column(String(100)) + first_name = Column(String(50)) + last_name = Column(String(50)) + date_birth = Column(Date) + date_filed = Column(Date) + date_closed = Column(Date) + warrant_type = Column(String(25)) + hearing_date = Column(Date) + hearing_time = Column(Integer) + hearing_type = Column(String(25)) + defense_attorney = Column(String(100)) + prob_start_date = Column(Date) + prob_end_date = Column(Date) + prob_length = Column(String(25)) + in_jail = Column(String(10)) + released_to = Column(String(100)) + bond_amount = Column(Numeric(7,2)) + bond_status = Column(String(50)) + bond_type = Column(String(50)) + bond_date = Column(Date) + +# Charge ORM class +class Charge(Base): + __tablename__ = 'charges' + id = Column(Integer, primary_key=True) + case_id = Column(Integer) + seq = Column(Integer) + charge = Column(String(200)) + type = Column(String(50)) + disposition = Column(String(256)) + +# Docket ORM class +class Docket(Base): + __tablename__ = 'dockets' + id = Column(Integer, primary_key=True) + case_id = Column(Integer) + seq = Column(Integer) + date = Column(Date) + docket = Column(Text) + +class Akas(Base): + __tablename__ = 'akas' + id = Column(Integer, primary_key=True) + case_id = Column(Integer) + last_name = Column(String(50)) + first_name = Column(String(50)) + middle_name = Column(String(50)) + race = Column(String(25)) + sex = Column(String(10)) + + +# This is an ASP site, so we have some variables that must be passed with a +# form submission that change form page to page +def set_asp_vars(): + + global field_values + + try: + response = urllib2.urlopen(url) + page = response.read() + soup = BeautifulSoup(page) + field_values['eventval'] = soup.find(id="__EVENTVALIDATION")['value'] + field_values['viewstate'] = soup.find(id="__VIEWSTATE")['value'] + field_suffix_string = soup.find(title="YY")['id'] + field_values['field_suffix'] = field_suffix_string.split('_')[-1] + except KeyboardInterrupt: + raise + except: + logging.error('Failed to get initial ASP variables. %s' % sys.exc_info()[0]) + return False + + +# Fetches the actual page from the site +def fetch_page(post_fields): + + data = urllib.urlencode(post_fields) + + attempt = 0 + # Try to fetch the page 3 times before giving up + while attempt < 3: + try: + req = urllib2.Request(url, data) + response = urllib2.urlopen(req, timeout=30) + page = response.read() + return page + except (urllib2.HTTPError, urllib2.URLError) as e: + attempt = attempt + 1 + if hasattr(e, 'reason'): + logging.error('%s: urllib2 error reason: %s' % (current_case, e.reason)) + if hasattr(e, 'code'): + logging.error('%s: urllib2 error code: %s' % (current_case, e.code)) + if e.code == 500: + sleeptime = 2 ** attempt * 10 + time.sleep(sleeptime) + logging.error('%s: Trying again after sleeping for %d seconds' % (current_case, sleeptime)) + # Reset base ASP vars just in case + set_asp_vars() + logging.error('%s: Also resetting ASP vars just in case.' % (current_case)) + except KeyboardInterrupt: + raise + except: + logging.error('%s: fetch_page() exception %s - %s' + % (current_case, sys.exc_info()[0], sys.exc_info()[1])) + + # If we reach this point then fetching the page totally failed + logging.error('%s: Failed to fetch page contents. %s' + % (current_case, sys.exc_info()[0])) + return False + + +# Checks whether we have some sort of error condition in the page +def check_case_not_found(case_soup): + + try: + error_text = case_soup.find(id='ctl00_cphPage_lblSearchError').text + except KeyboardInterrupt: + raise + except: + return False + else: + if 'NOT FOUND' in error_text: + return True + elif 'Unable to retrieve case information' in error_text: + return True + elif 'CASE INFORMATION NOT ON FILE' in error_text: + return True + else: + return False + +# Checks for a "No dockets found" error condition +def check_dockets_not_found(dockets_soup): + + try: + error_text = dockets_soup.find(id='ctl00_cphPage_lblCaseError').text + except KeyboardInterrupt: + raise + except: + return False + else: + if 'No dockets were found' in error_text: + return True + else: + return False + + +# Once we determine that a given case exists, process all the parts +def process_case(case_soup): + + case_id = parse_case(case_soup) + + if not case_id: + return False + + # Get any AKAs, if they exist + has_akas = case_soup.find(id='ctl00_cphPage_lnkAKA') + if has_akas: + post_fields = { + '__EVENTTARGET' : 'ctl00$cphPage$lnkAKA', + '__VIEWSTATE' : case_soup.find(id='__VIEWSTATE')['value'], + '__EVENTVALIDATION' : case_soup.find(id='__EVENTVALIDATION')['value'], + } + akas = fetch_page(post_fields) + if akas: + akas_soup = BeautifulSoup(akas) + parse_akas(akas_soup, case_id) + + # Get charges + parse_charges(case_soup, case_id) + + # Get case dockets + post_fields = { + '__EVENTTARGET' : 'ctl00$cphPage$lnkDockets', + '__VIEWSTATE' : case_soup.find(id='__VIEWSTATE')['value'], + '__EVENTVALIDATION' : case_soup.find(id='__EVENTVALIDATION')['value'], + } + dockets = fetch_page(post_fields) + if dockets: + dockets_soup = BeautifulSoup(dockets) + no_dockets = check_dockets_not_found(dockets_soup) + if not no_dockets: + parse_dockets(dockets_soup, case_id) + + # Get additional case info + post_fields = { + '__EVENTTARGET' : 'ctl00$cphPage$lnkAdditionalInfo', + '__VIEWSTATE' : case_soup.find(id='__VIEWSTATE')['value'], + '__EVENTVALIDATION' : case_soup.find(id='__EVENTVALIDATION')['value'], + } + addinfo = fetch_page(post_fields) + if addinfo: + addinfo_soup = BeautifulSoup(addinfo) + parse_addinfo(addinfo_soup) + + +# Parses the main case information +def parse_case(case_soup): + + # A dictionary of local variable names mapped to page ids + case_fields = { + 'court_case_no' : 'ctl00_cphPage_lblCaseNumber', + 'state_case_no' : 'ctl00_cphPage_lblStateCaseNo', + 'name' : 'ctl00_cphPage_lblName', + 'date_birth' : 'ctl00_cphPage_lblDoB', + 'date_filed' : 'ctl00_cphPage_lblDateFiled', + 'date_closed' : 'ctl00_cphPage_lblDateClosed', + 'warrant_type' : 'ctl00_cphPage_lblWarrantType', + 'hearing_date' : 'ctl00_cphPage_lblHearingDate', + 'hearing_time' : 'ctl00_cphPage_lblHearingTime', + 'hearing_type' : 'ctl00_cphPage_lblHearingType', + 'defense_attorney' : 'ctl00_cphPage_lblDefenseAttorney', + } + + case_values = {} + for var, field_id in case_fields.iteritems(): + try: + case_values[var] = case_soup.find(id=field_id).text + except KeyboardInterrupt: + raise + except: + # warrant_type won't always exist, so don't bail just because we + # don't find it, but otherwise do log and bail + if var == 'warrant_type': + pass + else: + logging.error('%s: Failed to find field %s in soup. %s' % (current_case, var, sys.exc_info()[0])) + logging.error('%s: Something is wrong with the soup. Here it is: %s' % (current_case, case_soup)) + continue + + # Try to clean up the data bit to avoid needless MySQL warnings + if not case_values['hearing_time'].strip(): + case_values['hearing_time'] = None + + # Before we try to add this case, be sure that for some reason + # it doesn't already exist + case_exists = db_session.query(Case).filter_by(court_case_no=case_values['court_case_no']).first() + + if not case_exists: + new_case = Case( + court_case_no = case_values['court_case_no'], + state_case_no = case_values['state_case_no'], + name = case_values['name'], + first_name = case_values['name'].split(',')[1].strip(), + last_name = case_values['name'].split(',')[0].strip(), + date_birth = format_date(case_values['date_birth']), + date_filed = format_date(case_values['date_filed']), + date_closed = format_date(case_values['date_closed']), + hearing_date = format_date(case_values['hearing_date']), + hearing_time = case_values['hearing_time'], + hearing_type = case_values['hearing_type'], + defense_attorney = case_values['defense_attorney'], + ) + + # warrant_type won't always exist, so check before trying to assign it + try: + new_case.warrant_type = case_values['warrant_type'] + except KeyboardInterrupt: + raise + except: + pass + + db_session.add(new_case) + db_session.commit() + + return new_case.id + else: + return False + + +# Parses the case soup for the list of charges agains the defendant +def parse_charges(case_soup, case_id): + + try: + charges_section = case_soup.find(id="ctl00_cphPage_pnlCharges") + except KeyboardInterrupt: + raise + except: + logging.error('%s: Failed to find charges for case ID %s. %s' % (current_case, sys.exc_info()[0])) + return False + + charge_rows = charges_section.findAll('tr') + + if charge_rows: + charge_rows.pop(0) # We don't want the table headers + charge_fields = ['seq','charge','type','disposition'] + charges = [] + + for ridx,fields in enumerate(charge_rows): + charge_data = {} + for fidx,field in enumerate(fields.findAll('td', recursive=False)): + # Why the non-breaking spaces? Remove them. + charge_data[charge_fields[fidx]] = re.sub(' ', ' ', field.text) + charges.append(charge_data) + + for charge in charges: + new_charge = Charge( + case_id = case_id, + seq = charge['seq'], + charge = charge['charge'], + type = charge['type'], + disposition = charge['disposition'], + ) + db_session.add(new_charge) + + # Commit charges to database + db_session.commit() + else: + logging.error('%s: No charges found.' % current_case) + + + +# Parses any dockets associated with the case +def parse_dockets(dockets_soup, case_id): + + # This is pretty lame, but they don't have any useful ids, names or + # attributes to accurately identify the right table. These classes are + # probably good enough, though. + docket_rows = dockets_soup.findAll('tr', attrs={'class':['RowBody','RowBodyAlt']}) + docket_fields = ['seq', 'date', 'book-page', 'docket'] + dockets = [] + if docket_rows: + for ridx,fields in enumerate(docket_rows): + docket_data = {} + for fidx,field in enumerate(fields.findAll('td', recursive=False)): + # Why the non-breaking spaces? Remove them. + docket_data[docket_fields[fidx]] = re.sub(' ', ' ', field.text) + dockets.append(docket_data) + for docket in dockets: + new_docket = Docket( + case_id = case_id, + seq = docket['seq'], + date = format_date(docket['date']), + docket = docket['docket'], + ) + db_session.add(new_docket) + + # Commit dockets to database + db_session.commit() + else: + logging.error('%s: No dockets found.' % current_case) + + +# Parses the soup for the Additional Information section for the case +def parse_addinfo(addinfo_soup): + + try: + court_case_no = addinfo_soup.find(id='ctl00_cphPage_lblCaseNumber').text + case = db_session.query(Case).filter_by(court_case_no=court_case_no).one() + except KeyboardInterrupt: + raise + except: + logging.error('%s: Error while looking for existing case in addinfo. %s' % (current_case, sys.exc_info()[0])) + return False + + # Map db field names to corresponding page ids + case_fields = { + 'prob_start_date' : 'ctl00_cphPage_lblProbationStartDate', + 'prob_end_date' : 'ctl00_cphPage_lblProbationEndDate', + 'prob_length' : 'ctl00_cphPage_lblProbationLength', + 'in_jail' : 'ctl00_cphPage_lblDefendantinJail', + 'released_to' : 'ctl00_cphPage_lblDefendantReleaseTo', + 'bond_amount' : 'ctl00_cphPage_lblBondAmount', + 'bond_status' : 'ctl00_cphPage_lblBondStatus', + 'bond_type' : 'ctl00_cphPage_lblBondType', + 'bond_date' : 'ctl00_cphPage_lblBondIssueDate', + } + + case_values = {} + for var, field_id in case_fields.iteritems(): + try: + case_values[var] = addinfo_soup.find(id=field_id).text + except KeyboardInterrupt: + raise + except: + logging.error('%s: Failed to find a value for the addinfo field %s. %s' % (current_case, sys.exc_info()[0])) + continue + + # Assign values to existing case + case.prob_start_date = format_date(case_values['prob_start_date']) + case.prob_end_date = format_date(case_values['prob_end_date']) + case.prob_length = case_values['prob_length'] + case.in_jail = case_values['in_jail'] + case.released_to = case_values['released_to'] + case.bond_amount = case_values['bond_amount'] + case.bond_status = case_values['bond_status'] + case.bond_type = case_values['bond_type'] + case.bond_date = format_date(case_values['bond_date']) + + # Fix format of Bond Amount + case.bond_amount = case.bond_amount.lstrip('$') + case.bond_amount = case.bond_amount.replace(',', '') + + db_session.commit() + + +# Parses page listing any AKAs for the defendant +def parse_akas(akas_soup, case_id): + try: + akas_count_text = akas_soup.find(id='ctl00_cphPage_lblDefendants').text + akas_count = re.match('^\d+', akas_count_text).group(0) + except KeyboardInterrupt: + raise + except: + logging.error('%s: Failed to determine akas_count %s. %s' % (current_case, sys.exc_info()[0])) + return False + + for num in range(0, int(akas_count)): + + # We have to pad numbers less than 10 + if num < 10: + num = '0%s' % num + + # For readability, get field ids into variables + last_name_id = 'ctl00_cphPage_rptDefendants_ctl%s_lnkDefendantLastName' % num + first_name_id = 'ctl00_cphPage_rptDefendants_ctl%s_lnkDefendantFirstName' % num + middle_name_id = 'ctl00_cphPage_rptDefendants_ctl%s_lnkDefendantMiddleName' % num + race_id = 'ctl00_cphPage_rptDefendants_ctl%s_lnkDefendantRace' % num + sex_id = 'ctl00_cphPage_rptDefendants_ctl%s_lnkDefendantSex' % num + + # Map db fields to coresponding page id + akas_fields = { + 'last_name' : last_name_id, + 'first_name' : first_name_id, + 'middle_name' : middle_name_id, + 'race' : race_id, + 'sex' : sex_id, + } + + akas_values = {} + for var, field_id in akas_fields.iteritems(): + try: + akas_values[var] = akas_soup.find(id=field_id).text + except KeyboardInterrupt: + raise + except: + logging.error('%s: Failed to find a value for akas field %s. %s' % (current_case, var, sys.exc_info()[0])) + continue + + new_aka = Akas( + case_id = case_id, + last_name = akas_values['last_name'], + first_name = akas_values['first_name'], + middle_name = akas_values['first_name'], + race = akas_values['race'], + sex = akas_values['sex'], + ) + + db_session.add(new_aka) + + db_session.commit() + + +# Format the date to something acceptable for the database +def format_date(date): + + try: + if date.strip(): + date_obj = datetime.strptime(date, '%m/%d/%Y') + if date_obj: + return date_obj.strftime('%Y-%m-%d') + else: + return None + except KeyboardInterrupt: + raise + except: + logging.error('%s: Failed to create a date object for date %s. %s' % (current_case, date, sys.exc_info()[0])) + return None + + +def main(): + + # We set this shortly so that our logging facility can associate + # a case number with errors being logged. + global current_case + + # Set ASP-specific form field variables + set_asp_vars() + + for year in years: + # Keep track of how many NOT FOUND errors we get. + case_not_found_count = 0 + + for seq in range(seq_start, seq_end): + + if case_not_found_count >= max_case_not_found_count: + # If our not_found_count exceeds the maximum set, + # then assume that we have reached the end of all + # cases for the given year and move on to the next + # year + logging.error('%s: Case not found count exceeded %s, quitting' % (current_case, case_not_found_count)) + break + + # Iterate through A-Z possible defendants + for defendant in string.lowercase: + + current_case = 'F-%s-%s-%s' % (year, seq, defendant) + + # Get main case information + post_fields = { + field_names['case_type'] + field_values['field_suffix'] : field_values['case_type'], + field_names['case_year'] + field_values['field_suffix'] : year, + field_names['case_seq'] + field_values['field_suffix'] : seq, + field_names['case_defendant'] + field_values['field_suffix'] : defendant, + field_names['viewstate'] : field_values['viewstate'], + field_names['eventval'] : field_values['eventval'], + field_names['button'] : field_values['button'], + field_names['active_tab'] : field_values['active_tab'], + } + + # Try to fetch a given case 3 times before + # giving up and moving to the next sequence + case = fetch_page(post_fields) + if case: + case_soup = BeautifulSoup(case) + not_found = check_case_not_found(case_soup) + if not_found: + current_case = 'F-%s-%s' % (year, seq) + # We only want to try stripping the defendant letter on 'a', + # since for anything after we will have already tried + # stripping it, and don't want to do it again. + if defendant != 'a': + break + # Try again with no defendant sequence specification + post_fields[ field_names['case_defendant'] + field_values['field_suffix'] ] = '' + case = fetch_page(post_fields) + if case: + case_soup = BeautifulSoup(case) + not_found = check_case_not_found(case_soup) + if not_found: + # If the case wasn't found even without the defendant + # number then it really must not exist + case_not_found_count = case_not_found_count + 1 + else: + # Reset the not_found_count and process case + case_not_found_count = 0 + process_case(case_soup) + + break + else: + current_case = 'F-%s-%s-%s' % (year, seq, defendant) + # Reset the not_found_count and process case + case_not_found_count = 0 + process_case(case_soup) + else: + break + + +if __name__ == '__main__': + # Get starting sequence number and years from command line arguments + seq_start = int(sys.argv[1]) + if int(sys.argv[2]) == 0: + seq_end = 100000 + else: + seq_end = int(sys.argv[2]) + 1 + years = sys.argv[3:] + + main() |
