diff options
| author | Nathan Kinkade <nkinkade@creativecommons.org> | 2014-05-19 18:18:52 -0400 |
|---|---|---|
| committer | Nathan Kinkade <nkinkade@creativecommons.org> | 2014-05-19 18:18:52 -0400 |
| commit | fea92fb73fc066701a4e5e578edee7d737045a41 (patch) | |
| tree | 8475a74fb159756896e90a802eca32e2545ed86d /miami-dade_clerk_courts.py | |
| parent | 03a01ac31a16cf7f44e827db15b7483b0ec330cd (diff) | |
| parent | c1ea47789989b08e919645d8b8133cfea0ad97c9 (diff) | |
Diffstat (limited to 'miami-dade_clerk_courts.py')
| -rwxr-xr-x | miami-dade_clerk_courts.py | 393 |
1 files changed, 0 insertions, 393 deletions
#!/usr/bin/env python
"""Scraper for the Miami-Dade Clerk of Courts CJIS case search.

Originally ``miami-dade_clerk_courts.py``.  For every configured year the
script walks case sequence numbers, submits the ASP.NET search form for
each possible defendant letter, and stores the case, its charges, dockets,
AKAs and additional info in a MySQL database through SQLAlchemy.

This is Python 2 code (``urllib2``, BeautifulSoup 3).
"""

import os
import re
import string
import sys
import urllib
import urllib2
from contextlib import closing
from datetime import datetime

from BeautifulSoup import BeautifulSoup
from sqlalchemy import Column, Date, Integer, Numeric, String, Text
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()

# The connection URL can be overridden from the environment so credentials
# do not have to live in the source; the default is the historical value.
db_url = os.environ.get(
    'CLERK_COURTS_DB_URL',
    'mysql://clerk:Clerk.@localhost/clerk_courts',
)
db_engine = create_engine(db_url, echo=False)
Session = sessionmaker(bind=db_engine)
db_session = Session()

url = 'https://www2.miami-dadeclerk.com/CJIS/CaseSearch.aspx'
years = ['13']
max_case_not_found_count = 3

# Form field names
field_names = {
    'case_type': 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo1_',
    'case_year': 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo2_',
    'case_seq': 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo3_',
    'case_defendant': 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo4_',
    'viewstate': '__VIEWSTATE',
    'eventval': '__EVENTVALIDATION',
    'button': 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$btnCaseSearch',
    'active_tab': 'ctl00_cphPage_tcSearchMenu_ClientState',
}

# Fixed form field values.  'viewstate', 'eventval' and 'field_suffix' are
# filled in at runtime by set_asp_vars().
field_values = {
    'case_type': 'F',
    'button': 'Submit',
    'active_tab': '{"ActiveTabIndex":0,"TabState":[true,true,true,true,true]}',
    'viewstate': '',
    'eventval': '',
    'field_suffix': '',
}


# Case ORM class
class Case(Base):
    __tablename__ = 'cases'
    id = Column(Integer, primary_key=True)
    court_case_no = Column(String(30))
    state_case_no = Column(String(30))
    name = Column(String(100))
    date_birth = Column(Date)
    date_filed = Column(Date)
    date_closed = Column(Date)
    warrant_type = Column(String(25))
    hearing_date = Column(Date)
    hearing_time = Column(Integer)
    hearing_type = Column(String(25))
    defense_attorney = Column(String(100))
    # The columns below are filled in later by parse_addinfo().
    prob_start_date = Column(Date)
    prob_end_date = Column(Date)
    prob_length = Column(String(25))
    in_jail = Column(String(10))
    released_to = Column(String(100))
    bond_amount = Column(Numeric(7, 2))
    bond_status = Column(String(50))
    bond_type = Column(String(50))
    bond_date = Column(Date)


# Charge ORM class
class Charge(Base):
    __tablename__ = 'charges'
    id = Column(Integer, primary_key=True)
    case_id = Column(Integer)
    seq = Column(Integer)
    charge = Column(String(200))
    type = Column(String(50))
    disposition = Column(String(256))


# Docket ORM class
class Docket(Base):
    __tablename__ = 'dockets'
    id = Column(Integer, primary_key=True)
    case_id = Column(Integer)
    seq = Column(Integer)
    date = Column(Date)
    docket = Column(Text)


# AKA (alias) ORM class
class Akas(Base):
    __tablename__ = 'akas'
    id = Column(Integer, primary_key=True)
    case_id = Column(Integer)
    last_name = Column(String(50))
    first_name = Column(String(50))
    middle_name = Column(String(50))
    race = Column(String(25))
    sex = Column(String(10))


def _clean_text(text):
    """Return *text* with non-breaking spaces turned into plain spaces.

    Both the literal ``&nbsp;`` entity and the decoded U+00A0 character are
    handled, since which one shows up depends on how the page was parsed.
    """
    return text.replace(u'&nbsp;', u' ').replace(u'\xa0', u' ')


def _text_of(soup, element_id):
    """Return the text of the element with the given id.

    Raises AttributeError when the element is missing, so that an unexpected
    page layout is noticed instead of silently producing empty records.
    """
    return soup.find(id=element_id).text


def _rows_to_dicts(rows, keys):
    """Map the direct <td> cells of each table row onto *keys*.

    Rows without any direct <td> children are skipped; extra cells beyond
    the known keys are ignored.
    """
    records = []
    for row in rows:
        cells = row.findAll('td', recursive=False)
        record = dict(
            (key, _clean_text(cell.text)) for key, cell in zip(keys, cells)
        )
        if record:
            records.append(record)
    return records


def _fetch_postback(case_soup, event_target):
    """Trigger an ASP.NET postback link on a case page and parse the result.

    The view state and event validation tokens are taken from *case_soup*,
    the page on which the link appears.
    """
    post_fields = {
        '__EVENTTARGET': event_target,
        '__VIEWSTATE': case_soup.find(id='__VIEWSTATE')['value'],
        '__EVENTVALIDATION': case_soup.find(id='__EVENTVALIDATION')['value'],
    }
    return BeautifulSoup(fetch_page(post_fields))


# This is an ASP site, so we have some variables that must be passed with a
# form submission that change from page to page
def set_asp_vars():
    """Load the search page and record its ASP.NET tokens in field_values."""
    with closing(urllib2.urlopen(url)) as response:
        page = response.read()
    soup = BeautifulSoup(page)
    field_values['eventval'] = soup.find(id="__EVENTVALIDATION")['value']
    field_values['viewstate'] = soup.find(id="__VIEWSTATE")['value']
    # The search inputs carry a suffix after the last underscore of their
    # id; it is read from the input whose title is "YY".
    field_suffix_string = soup.find(title="YY")['id']
    field_values['field_suffix'] = field_suffix_string.split('_')[-1]


def fetch_page(post_fields):
    """POST *post_fields* to the search URL and return the response body."""
    data = urllib.urlencode(post_fields)
    req = urllib2.Request(url, data)
    with closing(urllib2.urlopen(req)) as response:
        return response.read()


# Checks for a "NOT FOUND" error in the returned HTML for a case
def check_case_not_found(case_soup):
    """Return True when the page's search-error label contains NOT FOUND."""
    error_label = case_soup.find(id='ctl00_cphPage_lblSearchError')
    if error_label is None:
        return False
    return 'NOT FOUND' in error_label.text


# Once we determine that a given case exists, process all the parts
def process_case(case_soup):
    """Store a case and everything reachable from its page."""
    case_id = parse_case(case_soup)

    # Get any AKAs, if they exist
    if case_soup.find(id='ctl00_cphPage_lnkAKA'):
        parse_akas(_fetch_postback(case_soup, 'ctl00$cphPage$lnkAKA'), case_id)

    # Get charges
    parse_charges(case_soup, case_id)

    # Get case dockets
    parse_dockets(
        _fetch_postback(case_soup, 'ctl00$cphPage$lnkDockets'), case_id)

    # Get additional case info
    parse_addinfo(
        _fetch_postback(case_soup, 'ctl00$cphPage$lnkAdditionalInfo'),
        case_id)


def parse_case(case_soup):
    """Insert the main case record and return its primary key."""
    # Try to clean up the data a bit to avoid needless MySQL warnings
    hearing_time = _text_of(case_soup, 'ctl00_cphPage_lblHearingTime').strip()
    if not hearing_time:
        hearing_time = None

    new_case = Case(
        court_case_no=_text_of(case_soup, 'ctl00_cphPage_lblCaseNumber'),
        state_case_no=_text_of(case_soup, 'ctl00_cphPage_lblStateCaseNo'),
        name=_text_of(case_soup, 'ctl00_cphPage_lblName'),
        date_birth=format_date(_text_of(case_soup, 'ctl00_cphPage_lblDoB')),
        date_filed=format_date(
            _text_of(case_soup, 'ctl00_cphPage_lblDateFiled')),
        date_closed=format_date(
            _text_of(case_soup, 'ctl00_cphPage_lblDateClosed')),
        warrant_type=_text_of(case_soup, 'ctl00_cphPage_lblWarrantType'),
        hearing_date=format_date(
            _text_of(case_soup, 'ctl00_cphPage_lblHearingDate')),
        hearing_time=hearing_time,
        hearing_type=_text_of(case_soup, 'ctl00_cphPage_lblHearingType'),
        defense_attorney=_text_of(
            case_soup, 'ctl00_cphPage_lblDefenseAttorney'),
    )

    db_session.add(new_case)
    # The commit assigns the autoincrement id that is returned below.
    db_session.commit()

    return new_case.id


def parse_charges(case_soup, case_id):
    """Store every row of the charges panel for the given case."""
    charges_section = case_soup.find(id="ctl00_cphPage_pnlCharges")
    if charges_section is None:
        return

    # We don't want the table headers
    charge_rows = charges_section.findAll('tr')[1:]
    charge_fields = ['seq', 'charge', 'type', 'disposition']

    for charge in _rows_to_dicts(charge_rows, charge_fields):
        db_session.add(Charge(
            case_id=case_id,
            seq=charge.get('seq'),
            charge=charge.get('charge'),
            type=charge.get('type'),
            disposition=charge.get('disposition'),
        ))

    # Commit charges to database
    db_session.commit()


def parse_dockets(dockets_soup, case_id):
    """Store every docket row of the dockets page for the given case."""
    # This is pretty lame, but they don't have any useful ids, names or
    # attributes to accurately identify the right table.  These classes are
    # probably good enough, though.
    docket_rows = dockets_soup.findAll(
        'tr', attrs={'class': ['RowBody', 'RowBodyAlt']})
    # 'book-page' is read from the table but has no column in Docket.
    docket_fields = ['seq', 'date', 'book-page', 'docket']

    for docket in _rows_to_dicts(docket_rows, docket_fields):
        db_session.add(Docket(
            case_id=case_id,
            seq=docket.get('seq'),
            date=format_date(docket.get('date')),
            docket=docket.get('docket'),
        ))

    # Commit dockets to database
    db_session.commit()


def parse_addinfo(addinfo_soup, case_id=None):
    """Copy probation, custody and bond details onto an existing case.

    When *case_id* is given the case is loaded by primary key.  Without it
    the case is looked up by the court case number shown on the page, which
    raises if that number is not stored exactly once.
    """
    if case_id is not None:
        case = db_session.query(Case).get(case_id)
    else:
        court_case_no = _text_of(addinfo_soup, 'ctl00_cphPage_lblCaseNumber')
        case = db_session.query(Case).filter_by(
            court_case_no=court_case_no).one()

    # Assign values to existing case
    case.prob_start_date = format_date(
        _text_of(addinfo_soup, 'ctl00_cphPage_lblProbationStartDate'))
    case.prob_end_date = format_date(
        _text_of(addinfo_soup, 'ctl00_cphPage_lblProbationEndDate'))
    case.prob_length = _text_of(
        addinfo_soup, 'ctl00_cphPage_lblProbationLength')
    case.in_jail = _text_of(addinfo_soup, 'ctl00_cphPage_lblDefendantinJail')
    case.released_to = _text_of(
        addinfo_soup, 'ctl00_cphPage_lblDefendantReleaseTo')
    case.bond_status = _text_of(addinfo_soup, 'ctl00_cphPage_lblBondStatus')
    case.bond_type = _text_of(addinfo_soup, 'ctl00_cphPage_lblBondType')
    case.bond_date = format_date(
        _text_of(addinfo_soup, 'ctl00_cphPage_lblBondIssueDate'))

    # Fix format of Bond Amount: drop the currency sign and thousands
    # separators, and store NULL rather than an empty string in the
    # Numeric column.
    bond_amount = _text_of(addinfo_soup, 'ctl00_cphPage_lblBondAmount')
    bond_amount = bond_amount.strip().lstrip('$').replace(',', '')
    case.bond_amount = bond_amount or None

    db_session.commit()


def parse_akas(akas_soup, case_id):
    """Store every AKA listed on the AKA page for the given case."""
    akas_count_text = _text_of(akas_soup, 'ctl00_cphPage_lblDefendants')
    count_match = re.match(r'\d+', akas_count_text)
    if count_match is None:
        # No leading count on the label, so there is nothing to read.
        return

    for num in range(int(count_match.group(0))):
        # Repeater items are numbered ctl00, ctl01, ...; zero-padding to two
        # digits keeps the id correct from the tenth entry onwards too.
        id_prefix = 'ctl00_cphPage_rptDefendants_ctl%02d_lnkDefendant' % num

        db_session.add(Akas(
            case_id=case_id,
            last_name=_text_of(akas_soup, id_prefix + 'LastName'),
            first_name=_text_of(akas_soup, id_prefix + 'FirstName'),
            middle_name=_text_of(akas_soup, id_prefix + 'MiddleName'),
            race=_text_of(akas_soup, id_prefix + 'Race'),
            sex=_text_of(akas_soup, id_prefix + 'Sex'),
        ))

    db_session.commit()


def format_date(date):
    """Convert an ``MM/DD/YYYY`` string to ``YYYY-MM-DD``.

    Returns None when *date* is not a string or does not parse.
    """
    try:
        return datetime.strptime(date, '%m/%d/%Y').strftime('%Y-%m-%d')
    except (TypeError, ValueError):
        return None


def main():
    """Walk the configured years and case sequences and store what exists."""
    # Set ASP-specific form field variables
    set_asp_vars()
    suffix = field_values['field_suffix']

    for year in years:
        # Keep track of how many NOT FOUND errors we get.
        case_not_found_count = 0

        # NOTE(review): this narrow hard-coded range looks like a debugging
        # leftover; confirm the intended sequence range.
        for seq in range(37, 39):
            if case_not_found_count > max_case_not_found_count:
                # If our not_found_count exceeds the maximum set,
                # then assume that we have reached the end of all
                # cases for the given year and move on to the next
                # year
                break

            # In the cases where we have multiple defendants, when
            # we reach the end of the defendants we do not want the
            # code to try to make the request without the defendant
            # field since this will return a duplicate of the A
            # defendant.  This flag signals that.  It is reset for every
            # case so that it cannot leak into the next sequence number.
            has_multiple_defendants = False

            # Iterate through A-Z possible defendants
            for defendant in string.ascii_lowercase:
                # Get main case information
                post_fields = {
                    field_names['case_type'] + suffix:
                        field_values['case_type'],
                    field_names['case_year'] + suffix: year,
                    field_names['case_seq'] + suffix: seq,
                    field_names['case_defendant'] + suffix: defendant,
                    field_names['viewstate']: field_values['viewstate'],
                    field_names['eventval']: field_values['eventval'],
                    field_names['button']: field_values['button'],
                    field_names['active_tab']: field_values['active_tab'],
                }

                case_soup = BeautifulSoup(fetch_page(post_fields))
                if not check_case_not_found(case_soup):
                    has_multiple_defendants = True
                    # Reset the not_found_count and process case
                    case_not_found_count = 0
                    process_case(case_soup)
                    continue

                if has_multiple_defendants:
                    # We ran past the last defendant of this case.
                    break

                # Try again with no defendant sequence specification
                post_fields[field_names['case_defendant'] + suffix] = ''
                case_soup = BeautifulSoup(fetch_page(post_fields))
                if check_case_not_found(case_soup):
                    # If the case wasn't found even without the defendant
                    # number then it really must not exist
                    case_not_found_count += 1
                else:
                    # Reset the not_found_count and process case
                    case_not_found_count = 0
                    process_case(case_soup)
                break


if __name__ == '__main__':
    main()
