summaryrefslogtreecommitdiff
path: root/miami_dade_clerk_courts.py
diff options
context:
space:
mode:
Diffstat (limited to 'miami_dade_clerk_courts.py')
-rwxr-xr-xmiami_dade_clerk_courts.py612
1 files changed, 612 insertions, 0 deletions
diff --git a/miami_dade_clerk_courts.py b/miami_dade_clerk_courts.py
new file mode 100755
index 0000000..5ec4353
--- /dev/null
+++ b/miami_dade_clerk_courts.py
@@ -0,0 +1,612 @@
+#!/usr/bin/env python
+
+import sys
+from datetime import datetime
+import urllib
+import urllib2
+import re
+import string
+import time
+import logging
+from BeautifulSoup import BeautifulSoup
+from sqlalchemy import create_engine
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import Column, Integer, String, Text, Date, Numeric
+from sqlalchemy.orm import sessionmaker
+
+logging.basicConfig( filename='mdcc.log', level=logging.ERROR )
+
+Base = declarative_base()
+db_engine = create_engine('mysql://mdcc:Mdcc.@localhost/mdcc', echo=False)
+Session = sessionmaker(bind=db_engine)
+db_session = Session()
+
+# Global variables
+url = 'https://www2.miami-dadeclerk.com/CJIS/CaseSearch.aspx'
+max_case_not_found_count = 50
+
+
+# Form field names
+field_names = {
+ 'case_type' : 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo1_',
+ 'case_year' : 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo2_',
+ 'case_seq' : 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo3_',
+ 'case_defendant' : 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo4_',
+ 'viewstate' : '__VIEWSTATE',
+ 'eventval' : '__EVENTVALIDATION',
+ 'button' : 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$btnCaseSearch',
+ 'active_tab' : 'ctl00_cphPage_tcSearchMenu_ClientState',
+}
+
+# Fixed form field values
+field_values = {
+ 'case_type' : 'F',
+ 'button' : 'Submit',
+ 'active_tab' : '{"ActiveTabIndex":0,"TabState":[true,true,true,true,true]}',
+ 'viewstate' : '',
+ 'eventval' : '',
+ 'field_suffix' : '',
+}
+
+# Case ORM class
+class Case(Base):
+ __tablename__ = 'cases'
+ id = Column(Integer, primary_key=True)
+ court_case_no = Column(String(30))
+ state_case_no = Column(String(30))
+ name = Column(String(100))
+ first_name = Column(String(50))
+ last_name = Column(String(50))
+ date_birth = Column(Date)
+ date_filed = Column(Date)
+ date_closed = Column(Date)
+ warrant_type = Column(String(25))
+ hearing_date = Column(Date)
+ hearing_time = Column(Integer)
+ hearing_type = Column(String(25))
+ defense_attorney = Column(String(100))
+ prob_start_date = Column(Date)
+ prob_end_date = Column(Date)
+ prob_length = Column(String(25))
+ in_jail = Column(String(10))
+ released_to = Column(String(100))
+ bond_amount = Column(Numeric(7,2))
+ bond_status = Column(String(50))
+ bond_type = Column(String(50))
+ bond_date = Column(Date)
+
+# Charge ORM class
+class Charge(Base):
+ __tablename__ = 'charges'
+ id = Column(Integer, primary_key=True)
+ case_id = Column(Integer)
+ seq = Column(Integer)
+ charge = Column(String(200))
+ type = Column(String(50))
+ disposition = Column(String(256))
+
+# Docket ORM class
+class Docket(Base):
+ __tablename__ = 'dockets'
+ id = Column(Integer, primary_key=True)
+ case_id = Column(Integer)
+ seq = Column(Integer)
+ date = Column(Date)
+ docket = Column(Text)
+
+class Akas(Base):
+ __tablename__ = 'akas'
+ id = Column(Integer, primary_key=True)
+ case_id = Column(Integer)
+ last_name = Column(String(50))
+ first_name = Column(String(50))
+ middle_name = Column(String(50))
+ race = Column(String(25))
+ sex = Column(String(10))
+
+
+# This is an ASP site, so we have some variables that must be passed with a
+# form submission that change form page to page
+def set_asp_vars():
+
+ global field_values
+
+ try:
+ response = urllib2.urlopen(url)
+ page = response.read()
+ soup = BeautifulSoup(page)
+ field_values['eventval'] = soup.find(id="__EVENTVALIDATION")['value']
+ field_values['viewstate'] = soup.find(id="__VIEWSTATE")['value']
+ field_suffix_string = soup.find(title="YY")['id']
+ field_values['field_suffix'] = field_suffix_string.split('_')[-1]
+ except KeyboardInterrupt:
+ raise
+ except:
+ logging.error('Failed to get initial ASP variables. %s' % sys.exc_info()[0])
+ return False
+
+
+# Fetches the actual page from the site
+def fetch_page(post_fields):
+
+ data = urllib.urlencode(post_fields)
+
+ attempt = 0
+ # Try to fetch the page 3 times before giving up
+ while attempt < 3:
+ try:
+ req = urllib2.Request(url, data)
+ response = urllib2.urlopen(req, timeout=30)
+ page = response.read()
+ return page
+ except (urllib2.HTTPError, urllib2.URLError) as e:
+ attempt = attempt + 1
+ if hasattr(e, 'reason'):
+ logging.error('%s: urllib2 error reason: %s' % (current_case, e.reason))
+ if hasattr(e, 'code'):
+ logging.error('%s: urllib2 error code: %s' % (current_case, e.code))
+ if e.code == 500:
+ sleeptime = 2 ** attempt * 10
+ time.sleep(sleeptime)
+ logging.error('%s: Trying again after sleeping for %d seconds' % (current_case, sleeptime))
+ # Reset base ASP vars just in case
+ set_asp_vars()
+ logging.error('%s: Also resetting ASP vars just in case.' % (current_case))
+ except KeyboardInterrupt:
+ raise
+ except:
+ logging.error('%s: fetch_page() exception %s - %s'
+ % (current_case, sys.exc_info()[0], sys.exc_info()[1]))
+
+ # If we reach this point then fetching the page totally failed
+ logging.error('%s: Failed to fetch page contents. %s'
+ % (current_case, sys.exc_info()[0]))
+ return False
+
+
+# Checks whether we have some sort of error condition in the page
+def check_case_not_found(case_soup):
+
+ try:
+ error_text = case_soup.find(id='ctl00_cphPage_lblSearchError').text
+ except KeyboardInterrupt:
+ raise
+ except:
+ return False
+ else:
+ if 'NOT FOUND' in error_text:
+ return True
+ elif 'Unable to retrieve case information' in error_text:
+ return True
+ elif 'CASE INFORMATION NOT ON FILE' in error_text:
+ return True
+ else:
+ return False
+
+# Checks for a "No dockets found" error condition
+def check_dockets_not_found(dockets_soup):
+
+ try:
+ error_text = dockets_soup.find(id='ctl00_cphPage_lblCaseError').text
+ except KeyboardInterrupt:
+ raise
+ except:
+ return False
+ else:
+ if 'No dockets were found' in error_text:
+ return True
+ else:
+ return False
+
+
+# Once we determine that a given case exists, process all the parts
+def process_case(case_soup):
+
+ case_id = parse_case(case_soup)
+
+ if not case_id:
+ return False
+
+ # Get any AKAs, if they exist
+ has_akas = case_soup.find(id='ctl00_cphPage_lnkAKA')
+ if has_akas:
+ post_fields = {
+ '__EVENTTARGET' : 'ctl00$cphPage$lnkAKA',
+ '__VIEWSTATE' : case_soup.find(id='__VIEWSTATE')['value'],
+ '__EVENTVALIDATION' : case_soup.find(id='__EVENTVALIDATION')['value'],
+ }
+ akas = fetch_page(post_fields)
+ if akas:
+ akas_soup = BeautifulSoup(akas)
+ parse_akas(akas_soup, case_id)
+
+ # Get charges
+ parse_charges(case_soup, case_id)
+
+ # Get case dockets
+ post_fields = {
+ '__EVENTTARGET' : 'ctl00$cphPage$lnkDockets',
+ '__VIEWSTATE' : case_soup.find(id='__VIEWSTATE')['value'],
+ '__EVENTVALIDATION' : case_soup.find(id='__EVENTVALIDATION')['value'],
+ }
+ dockets = fetch_page(post_fields)
+ if dockets:
+ dockets_soup = BeautifulSoup(dockets)
+ no_dockets = check_dockets_not_found(dockets_soup)
+ if not no_dockets:
+ parse_dockets(dockets_soup, case_id)
+
+ # Get additional case info
+ post_fields = {
+ '__EVENTTARGET' : 'ctl00$cphPage$lnkAdditionalInfo',
+ '__VIEWSTATE' : case_soup.find(id='__VIEWSTATE')['value'],
+ '__EVENTVALIDATION' : case_soup.find(id='__EVENTVALIDATION')['value'],
+ }
+ addinfo = fetch_page(post_fields)
+ if addinfo:
+ addinfo_soup = BeautifulSoup(addinfo)
+ parse_addinfo(addinfo_soup)
+
+
+# Parses the main case information
+def parse_case(case_soup):
+
+ # A dictionary of local variable names mapped to page ids
+ case_fields = {
+ 'court_case_no' : 'ctl00_cphPage_lblCaseNumber',
+ 'state_case_no' : 'ctl00_cphPage_lblStateCaseNo',
+ 'name' : 'ctl00_cphPage_lblName',
+ 'date_birth' : 'ctl00_cphPage_lblDoB',
+ 'date_filed' : 'ctl00_cphPage_lblDateFiled',
+ 'date_closed' : 'ctl00_cphPage_lblDateClosed',
+ 'warrant_type' : 'ctl00_cphPage_lblWarrantType',
+ 'hearing_date' : 'ctl00_cphPage_lblHearingDate',
+ 'hearing_time' : 'ctl00_cphPage_lblHearingTime',
+ 'hearing_type' : 'ctl00_cphPage_lblHearingType',
+ 'defense_attorney' : 'ctl00_cphPage_lblDefenseAttorney',
+ }
+
+ case_values = {}
+ for var, field_id in case_fields.iteritems():
+ try:
+ case_values[var] = case_soup.find(id=field_id).text
+ except KeyboardInterrupt:
+ raise
+ except:
+ # warrant_type won't always exist, so don't bail just because we
+ # don't find it, but otherwise do log and bail
+ if var == 'warrant_type':
+ pass
+ else:
+ logging.error('%s: Failed to find field %s in soup. %s' % (current_case, var, sys.exc_info()[0]))
+ logging.error('%s: Something is wrong with the soup. Here it is: %s' % (current_case, case_soup))
+ continue
+
+ # Try to clean up the data bit to avoid needless MySQL warnings
+ if not case_values['hearing_time'].strip():
+ case_values['hearing_time'] = None
+
+ # Before we try to add this case, be sure that for some reason
+ # it doesn't already exist
+ case_exists = db_session.query(Case).filter_by(court_case_no=case_values['court_case_no']).first()
+
+ if not case_exists:
+ new_case = Case(
+ court_case_no = case_values['court_case_no'],
+ state_case_no = case_values['state_case_no'],
+ name = case_values['name'],
+ first_name = case_values['name'].split(',')[1].strip(),
+ last_name = case_values['name'].split(',')[0].strip(),
+ date_birth = format_date(case_values['date_birth']),
+ date_filed = format_date(case_values['date_filed']),
+ date_closed = format_date(case_values['date_closed']),
+ hearing_date = format_date(case_values['hearing_date']),
+ hearing_time = case_values['hearing_time'],
+ hearing_type = case_values['hearing_type'],
+ defense_attorney = case_values['defense_attorney'],
+ )
+
+ # warrant_type won't always exist, so check before trying to assign it
+ try:
+ new_case.warrant_type = case_values['warrant_type']
+ except KeyboardInterrupt:
+ raise
+ except:
+ pass
+
+ db_session.add(new_case)
+ db_session.commit()
+
+ return new_case.id
+ else:
+ return False
+
+
+# Parses the case soup for the list of charges agains the defendant
+def parse_charges(case_soup, case_id):
+
+ try:
+ charges_section = case_soup.find(id="ctl00_cphPage_pnlCharges")
+ except KeyboardInterrupt:
+ raise
+ except:
+ logging.error('%s: Failed to find charges for case ID %s. %s' % (current_case, sys.exc_info()[0]))
+ return False
+
+ charge_rows = charges_section.findAll('tr')
+
+ if charge_rows:
+ charge_rows.pop(0) # We don't want the table headers
+ charge_fields = ['seq','charge','type','disposition']
+ charges = []
+
+ for ridx,fields in enumerate(charge_rows):
+ charge_data = {}
+ for fidx,field in enumerate(fields.findAll('td', recursive=False)):
+ # Why the non-breaking spaces? Remove them.
+ charge_data[charge_fields[fidx]] = re.sub('&nbsp;', ' ', field.text)
+ charges.append(charge_data)
+
+ for charge in charges:
+ new_charge = Charge(
+ case_id = case_id,
+ seq = charge['seq'],
+ charge = charge['charge'],
+ type = charge['type'],
+ disposition = charge['disposition'],
+ )
+ db_session.add(new_charge)
+
+ # Commit charges to database
+ db_session.commit()
+ else:
+ logging.error('%s: No charges found.' % current_case)
+
+
+
+# Parses any dockets associated with the case
+def parse_dockets(dockets_soup, case_id):
+
+ # This is pretty lame, but they don't have any useful ids, names or
+ # attributes to accurately identify the right table. These classes are
+ # probably good enough, though.
+ docket_rows = dockets_soup.findAll('tr', attrs={'class':['RowBody','RowBodyAlt']})
+ docket_fields = ['seq', 'date', 'book-page', 'docket']
+ dockets = []
+ if docket_rows:
+ for ridx,fields in enumerate(docket_rows):
+ docket_data = {}
+ for fidx,field in enumerate(fields.findAll('td', recursive=False)):
+ # Why the non-breaking spaces? Remove them.
+ docket_data[docket_fields[fidx]] = re.sub('&nbsp;', ' ', field.text)
+ dockets.append(docket_data)
+ for docket in dockets:
+ new_docket = Docket(
+ case_id = case_id,
+ seq = docket['seq'],
+ date = format_date(docket['date']),
+ docket = docket['docket'],
+ )
+ db_session.add(new_docket)
+
+ # Commit dockets to database
+ db_session.commit()
+ else:
+ logging.error('%s: No dockets found.' % current_case)
+
+
+# Parses the soup for the Additional Information section for the case
+def parse_addinfo(addinfo_soup):
+
+ try:
+ court_case_no = addinfo_soup.find(id='ctl00_cphPage_lblCaseNumber').text
+ case = db_session.query(Case).filter_by(court_case_no=court_case_no).one()
+ except KeyboardInterrupt:
+ raise
+ except:
+ logging.error('%s: Error while looking for existing case in addinfo. %s' % (current_case, sys.exc_info()[0]))
+ return False
+
+ # Map db field names to corresponding page ids
+ case_fields = {
+ 'prob_start_date' : 'ctl00_cphPage_lblProbationStartDate',
+ 'prob_end_date' : 'ctl00_cphPage_lblProbationEndDate',
+ 'prob_length' : 'ctl00_cphPage_lblProbationLength',
+ 'in_jail' : 'ctl00_cphPage_lblDefendantinJail',
+ 'released_to' : 'ctl00_cphPage_lblDefendantReleaseTo',
+ 'bond_amount' : 'ctl00_cphPage_lblBondAmount',
+ 'bond_status' : 'ctl00_cphPage_lblBondStatus',
+ 'bond_type' : 'ctl00_cphPage_lblBondType',
+ 'bond_date' : 'ctl00_cphPage_lblBondIssueDate',
+ }
+
+ case_values = {}
+ for var, field_id in case_fields.iteritems():
+ try:
+ case_values[var] = addinfo_soup.find(id=field_id).text
+ except KeyboardInterrupt:
+ raise
+ except:
+ logging.error('%s: Failed to find a value for the addinfo field %s. %s' % (current_case, sys.exc_info()[0]))
+ continue
+
+ # Assign values to existing case
+ case.prob_start_date = format_date(case_values['prob_start_date'])
+ case.prob_end_date = format_date(case_values['prob_end_date'])
+ case.prob_length = case_values['prob_length']
+ case.in_jail = case_values['in_jail']
+ case.released_to = case_values['released_to']
+ case.bond_amount = case_values['bond_amount']
+ case.bond_status = case_values['bond_status']
+ case.bond_type = case_values['bond_type']
+ case.bond_date = format_date(case_values['bond_date'])
+
+ # Fix format of Bond Amount
+ case.bond_amount = case.bond_amount.lstrip('$')
+ case.bond_amount = case.bond_amount.replace(',', '')
+
+ db_session.commit()
+
+
+# Parses page listing any AKAs for the defendant
+def parse_akas(akas_soup, case_id):
+ try:
+ akas_count_text = akas_soup.find(id='ctl00_cphPage_lblDefendants').text
+ akas_count = re.match('^\d+', akas_count_text).group(0)
+ except KeyboardInterrupt:
+ raise
+ except:
+ logging.error('%s: Failed to determine akas_count %s. %s' % (current_case, sys.exc_info()[0]))
+ return False
+
+ for num in range(0, int(akas_count)):
+
+ # We have to pad numbers less than 10
+ if num < 10:
+ num = '0%s' % num
+
+ # For readability, get field ids into variables
+ last_name_id = 'ctl00_cphPage_rptDefendants_ctl%s_lnkDefendantLastName' % num
+ first_name_id = 'ctl00_cphPage_rptDefendants_ctl%s_lnkDefendantFirstName' % num
+ middle_name_id = 'ctl00_cphPage_rptDefendants_ctl%s_lnkDefendantMiddleName' % num
+ race_id = 'ctl00_cphPage_rptDefendants_ctl%s_lnkDefendantRace' % num
+ sex_id = 'ctl00_cphPage_rptDefendants_ctl%s_lnkDefendantSex' % num
+
+ # Map db fields to coresponding page id
+ akas_fields = {
+ 'last_name' : last_name_id,
+ 'first_name' : first_name_id,
+ 'middle_name' : middle_name_id,
+ 'race' : race_id,
+ 'sex' : sex_id,
+ }
+
+ akas_values = {}
+ for var, field_id in akas_fields.iteritems():
+ try:
+ akas_values[var] = akas_soup.find(id=field_id).text
+ except KeyboardInterrupt:
+ raise
+ except:
+ logging.error('%s: Failed to find a value for akas field %s. %s' % (current_case, var, sys.exc_info()[0]))
+ continue
+
+ new_aka = Akas(
+ case_id = case_id,
+ last_name = akas_values['last_name'],
+ first_name = akas_values['first_name'],
+ middle_name = akas_values['first_name'],
+ race = akas_values['race'],
+ sex = akas_values['sex'],
+ )
+
+ db_session.add(new_aka)
+
+ db_session.commit()
+
+
+# Format the date to something acceptable for the database
+def format_date(date):
+
+ try:
+ if date.strip():
+ date_obj = datetime.strptime(date, '%m/%d/%Y')
+ if date_obj:
+ return date_obj.strftime('%Y-%m-%d')
+ else:
+ return None
+ except KeyboardInterrupt:
+ raise
+ except:
+ logging.error('%s: Failed to create a date object for date %s. %s' % (current_case, date, sys.exc_info()[0]))
+ return None
+
+
+def main():
+
+ # We set this shortly so that our logging facility can associate
+ # a case number with errors being logged.
+ global current_case
+
+ # Set ASP-specific form field variables
+ set_asp_vars()
+
+ for year in years:
+ # Keep track of how many NOT FOUND errors we get.
+ case_not_found_count = 0
+
+ for seq in range(seq_start, seq_end):
+
+ if case_not_found_count >= max_case_not_found_count:
+ # If our not_found_count exceeds the maximum set,
+ # then assume that we have reached the end of all
+ # cases for the given year and move on to the next
+ # year
+ logging.error('%s: Case not found count exceeded %s, quitting' % (current_case, case_not_found_count))
+ break
+
+ # Iterate through A-Z possible defendants
+ for defendant in string.lowercase:
+
+ current_case = 'F-%s-%s-%s' % (year, seq, defendant)
+
+ # Get main case information
+ post_fields = {
+ field_names['case_type'] + field_values['field_suffix'] : field_values['case_type'],
+ field_names['case_year'] + field_values['field_suffix'] : year,
+ field_names['case_seq'] + field_values['field_suffix'] : seq,
+ field_names['case_defendant'] + field_values['field_suffix'] : defendant,
+ field_names['viewstate'] : field_values['viewstate'],
+ field_names['eventval'] : field_values['eventval'],
+ field_names['button'] : field_values['button'],
+ field_names['active_tab'] : field_values['active_tab'],
+ }
+
+ # Try to fetch a given case 3 times before
+ # giving up and moving to the next sequence
+ case = fetch_page(post_fields)
+ if case:
+ case_soup = BeautifulSoup(case)
+ not_found = check_case_not_found(case_soup)
+ if not_found:
+ current_case = 'F-%s-%s' % (year, seq)
+ # We only want to try stripping the defendant letter on 'a',
+ # since for anything after we will have already tried
+ # stripping it, and don't want to do it again.
+ if defendant != 'a':
+ break
+ # Try again with no defendant sequence specification
+ post_fields[ field_names['case_defendant'] + field_values['field_suffix'] ] = ''
+ case = fetch_page(post_fields)
+ if case:
+ case_soup = BeautifulSoup(case)
+ not_found = check_case_not_found(case_soup)
+ if not_found:
+ # If the case wasn't found even without the defendant
+ # number then it really must not exist
+ case_not_found_count = case_not_found_count + 1
+ else:
+ # Reset the not_found_count and process case
+ case_not_found_count = 0
+ process_case(case_soup)
+
+ break
+ else:
+ current_case = 'F-%s-%s-%s' % (year, seq, defendant)
+ # Reset the not_found_count and process case
+ case_not_found_count = 0
+ process_case(case_soup)
+ else:
+ break
+
+
+if __name__ == '__main__':
+ # Get starting sequence number and years from command line arguments
+ seq_start = int(sys.argv[1])
+ if int(sys.argv[2]) == 0:
+ seq_end = 100000
+ else:
+ seq_end = int(sys.argv[2]) + 1
+ years = sys.argv[3:]
+
+ main()