#!/usr/bin/env python import sys from datetime import datetime import urllib import urllib2 import re import string import time import logging from BeautifulSoup import BeautifulSoup from sqlalchemy import create_engine from sqlalchemy.ext.declarative import declarative_base from sqlalchemy import Column, Integer, String, Text, Date, Numeric from sqlalchemy.orm import sessionmaker logging.basicConfig( filename='mdcc.log', level=logging.ERROR ) Base = declarative_base() db_engine = create_engine('mysql://mdcc:Mdcc.@localhost/mdcc', echo=False) Session = sessionmaker(bind=db_engine) db_session = Session() # Global variables url = 'https://www2.miami-dadeclerk.com/CJIS/CaseSearch.aspx' max_case_not_found_count = 50 # Form field names field_names = { 'case_type' : 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo1_', 'case_year' : 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo2_', 'case_seq' : 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo3_', 'case_defendant' : 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$txtCaseNo4_', 'viewstate' : '__VIEWSTATE', 'eventval' : '__EVENTVALIDATION', 'button' : 'ctl00$cphPage$tcSearchMenu$tpCaseSearch$btnCaseSearch', 'active_tab' : 'ctl00_cphPage_tcSearchMenu_ClientState', } # Fixed form field values field_values = { 'case_type' : 'F', 'button' : 'Submit', 'active_tab' : '{"ActiveTabIndex":0,"TabState":[true,true,true,true,true]}', 'viewstate' : '', 'eventval' : '', 'field_suffix' : '', } # Case ORM class class Case(Base): __tablename__ = 'cases' id = Column(Integer, primary_key=True) court_case_no = Column(String(30)) state_case_no = Column(String(30)) name = Column(String(100)) first_name = Column(String(50)) last_name = Column(String(50)) date_birth = Column(Date) date_filed = Column(Date) date_closed = Column(Date) warrant_type = Column(String(25)) hearing_date = Column(Date) hearing_time = Column(Integer) hearing_type = Column(String(25)) defense_attorney = Column(String(100)) prob_start_date = Column(Date) prob_end_date = Column(Date) prob_length = Column(String(25)) in_jail = Column(String(10)) released_to = Column(String(100)) bond_amount = Column(Numeric(7,2)) bond_status = Column(String(50)) bond_type = Column(String(50)) bond_date = Column(Date) # Charge ORM class class Charge(Base): __tablename__ = 'charges' id = Column(Integer, primary_key=True) case_id = Column(Integer) seq = Column(Integer) charge = Column(String(200)) type = Column(String(50)) disposition = Column(String(256)) # Docket ORM class class Docket(Base): __tablename__ = 'dockets' id = Column(Integer, primary_key=True) case_id = Column(Integer) seq = Column(Integer) date = Column(Date) docket = Column(Text) class Akas(Base): __tablename__ = 'akas' id = Column(Integer, primary_key=True) case_id = Column(Integer) last_name = Column(String(50)) first_name = Column(String(50)) middle_name = Column(String(50)) race = Column(String(25)) sex = Column(String(10)) # This is an ASP site, so we have some variables that must be passed with a # form submission that change form page to page def set_asp_vars(): global field_values try: response = urllib2.urlopen(url) page = response.read() soup = BeautifulSoup(page) field_values['eventval'] = soup.find(id="__EVENTVALIDATION")['value'] field_values['viewstate'] = soup.find(id="__VIEWSTATE")['value'] field_suffix_string = soup.find(title="YY")['id'] field_values['field_suffix'] = field_suffix_string.split('_')[-1] except KeyboardInterrupt: raise except: logging.error('Failed to get initial ASP variables. %s' % sys.exc_info()[0]) return False # Fetches the actual page from the site def fetch_page(post_fields): data = urllib.urlencode(post_fields) attempt = 0 # Try to fetch the page 3 times before giving up while attempt < 3: try: req = urllib2.Request(url, data) response = urllib2.urlopen(req, timeout=30) page = response.read() return page except (urllib2.HTTPError, urllib2.URLError) as e: attempt = attempt + 1 if hasattr(e, 'reason'): logging.error('%s: urllib2 error reason: %s' % (current_case, e.reason)) if hasattr(e, 'code'): logging.error('%s: urllib2 error code: %s' % (current_case, e.code)) if e.code == 500: sleeptime = 2 ** attempt * 10 time.sleep(sleeptime) logging.error('%s: Trying again after sleeping for %d seconds' % (current_case, sleeptime)) # Reset base ASP vars just in case set_asp_vars() logging.error('%s: Also resetting ASP vars just in case.' % (current_case)) except KeyboardInterrupt: raise except: logging.error('%s: fetch_page() exception %s - %s' % (current_case, sys.exc_info()[0], sys.exc_info()[1])) # If we reach this point then fetching the page totally failed logging.error('%s: Failed to fetch page contents. %s' % (current_case, sys.exc_info()[0])) return False # Checks whether we have some sort of error condition in the page def check_case_not_found(case_soup): try: error_text = case_soup.find(id='ctl00_cphPage_lblSearchError').text except KeyboardInterrupt: raise except: return False else: if 'NOT FOUND' in error_text: return True elif 'Unable to retrieve case information' in error_text: return True elif 'CASE INFORMATION NOT ON FILE' in error_text: return True else: return False # Checks for a "No dockets found" error condition def check_dockets_not_found(dockets_soup): try: error_text = dockets_soup.find(id='ctl00_cphPage_lblCaseError').text except KeyboardInterrupt: raise except: return False else: if 'No dockets were found' in error_text: return True else: return False # Once we determine that a given case exists, process all the parts def process_case(case_soup): case_id = parse_case(case_soup) if not case_id: return False # Get any AKAs, if they exist has_akas = case_soup.find(id='ctl00_cphPage_lnkAKA') if has_akas: post_fields = { '__EVENTTARGET' : 'ctl00$cphPage$lnkAKA', '__VIEWSTATE' : case_soup.find(id='__VIEWSTATE')['value'], '__EVENTVALIDATION' : case_soup.find(id='__EVENTVALIDATION')['value'], } akas = fetch_page(post_fields) if akas: akas_soup = BeautifulSoup(akas) parse_akas(akas_soup, case_id) # Get charges parse_charges(case_soup, case_id) # Get case dockets post_fields = { '__EVENTTARGET' : 'ctl00$cphPage$lnkDockets', '__VIEWSTATE' : case_soup.find(id='__VIEWSTATE')['value'], '__EVENTVALIDATION' : case_soup.find(id='__EVENTVALIDATION')['value'], } dockets = fetch_page(post_fields) if dockets: dockets_soup = BeautifulSoup(dockets) no_dockets = check_dockets_not_found(dockets_soup) if not no_dockets: parse_dockets(dockets_soup, case_id) # Get additional case info post_fields = { '__EVENTTARGET' : 'ctl00$cphPage$lnkAdditionalInfo', '__VIEWSTATE' : case_soup.find(id='__VIEWSTATE')['value'], '__EVENTVALIDATION' : case_soup.find(id='__EVENTVALIDATION')['value'], } addinfo = fetch_page(post_fields) if addinfo: addinfo_soup = BeautifulSoup(addinfo) parse_addinfo(addinfo_soup) # Parses the main case information def parse_case(case_soup): # A dictionary of local variable names mapped to page ids case_fields = { 'court_case_no' : 'ctl00_cphPage_lblCaseNumber', 'state_case_no' : 'ctl00_cphPage_lblStateCaseNo', 'name' : 'ctl00_cphPage_lblName', 'date_birth' : 'ctl00_cphPage_lblDoB', 'date_filed' : 'ctl00_cphPage_lblDateFiled', 'date_closed' : 'ctl00_cphPage_lblDateClosed', 'warrant_type' : 'ctl00_cphPage_lblWarrantType', 'hearing_date' : 'ctl00_cphPage_lblHearingDate', 'hearing_time' : 'ctl00_cphPage_lblHearingTime', 'hearing_type' : 'ctl00_cphPage_lblHearingType', 'defense_attorney' : 'ctl00_cphPage_lblDefenseAttorney', } case_values = {} for var, field_id in case_fields.iteritems(): try: case_values[var] = case_soup.find(id=field_id).text except KeyboardInterrupt: raise except: # warrant_type won't always exist, so don't bail just because we # don't find it, but otherwise do log and bail if var == 'warrant_type': pass else: logging.error('%s: Failed to find field %s in soup. %s' % (current_case, var, sys.exc_info()[0])) logging.error('%s: Something is wrong with the soup. Here it is: %s' % (current_case, case_soup)) continue # Try to clean up the data bit to avoid needless MySQL warnings if not case_values['hearing_time'].strip(): case_values['hearing_time'] = None # Before we try to add this case, be sure that for some reason # it doesn't already exist case_exists = db_session.query(Case).filter_by(court_case_no=case_values['court_case_no']).first() if not case_exists: new_case = Case( court_case_no = case_values['court_case_no'], state_case_no = case_values['state_case_no'], name = case_values['name'], first_name = case_values['name'].split(',')[1].strip(), last_name = case_values['name'].split(',')[0].strip(), date_birth = format_date(case_values['date_birth']), date_filed = format_date(case_values['date_filed']), date_closed = format_date(case_values['date_closed']), hearing_date = format_date(case_values['hearing_date']), hearing_time = case_values['hearing_time'], hearing_type = case_values['hearing_type'], defense_attorney = case_values['defense_attorney'], ) # warrant_type won't always exist, so check before trying to assign it try: new_case.warrant_type = case_values['warrant_type'] except KeyboardInterrupt: raise except: pass db_session.add(new_case) db_session.commit() return new_case.id else: return False # Parses the case soup for the list of charges agains the defendant def parse_charges(case_soup, case_id): try: charges_section = case_soup.find(id="ctl00_cphPage_pnlCharges") except KeyboardInterrupt: raise except: logging.error('%s: Failed to find charges for case ID %s. %s' % (current_case, sys.exc_info()[0])) return False charge_rows = charges_section.findAll('tr') if charge_rows: charge_rows.pop(0) # We don't want the table headers charge_fields = ['seq','charge','type','disposition'] charges = [] for ridx,fields in enumerate(charge_rows): charge_data = {} for fidx,field in enumerate(fields.findAll('td', recursive=False)): # Why the non-breaking spaces? Remove them. charge_data[charge_fields[fidx]] = re.sub(' ', ' ', field.text) charges.append(charge_data) for charge in charges: new_charge = Charge( case_id = case_id, seq = charge['seq'], charge = charge['charge'], type = charge['type'], disposition = charge['disposition'], ) db_session.add(new_charge) # Commit charges to database db_session.commit() else: logging.error('%s: No charges found.' % current_case) # Parses any dockets associated with the case def parse_dockets(dockets_soup, case_id): # This is pretty lame, but they don't have any useful ids, names or # attributes to accurately identify the right table. These classes are # probably good enough, though. docket_rows = dockets_soup.findAll('tr', attrs={'class':['RowBody','RowBodyAlt']}) docket_fields = ['seq', 'date', 'book-page', 'docket'] dockets = [] if docket_rows: for ridx,fields in enumerate(docket_rows): docket_data = {} for fidx,field in enumerate(fields.findAll('td', recursive=False)): # Why the non-breaking spaces? Remove them. docket_data[docket_fields[fidx]] = re.sub(' ', ' ', field.text) dockets.append(docket_data) for docket in dockets: new_docket = Docket( case_id = case_id, seq = docket['seq'], date = format_date(docket['date']), docket = docket['docket'], ) db_session.add(new_docket) # Commit dockets to database db_session.commit() else: logging.error('%s: No dockets found.' % current_case) # Parses the soup for the Additional Information section for the case def parse_addinfo(addinfo_soup): try: court_case_no = addinfo_soup.find(id='ctl00_cphPage_lblCaseNumber').text case = db_session.query(Case).filter_by(court_case_no=court_case_no).one() except KeyboardInterrupt: raise except: logging.error('%s: Error while looking for existing case in addinfo. %s' % (current_case, sys.exc_info()[0])) return False # Map db field names to corresponding page ids case_fields = { 'prob_start_date' : 'ctl00_cphPage_lblProbationStartDate', 'prob_end_date' : 'ctl00_cphPage_lblProbationEndDate', 'prob_length' : 'ctl00_cphPage_lblProbationLength', 'in_jail' : 'ctl00_cphPage_lblDefendantinJail', 'released_to' : 'ctl00_cphPage_lblDefendantReleaseTo', 'bond_amount' : 'ctl00_cphPage_lblBondAmount', 'bond_status' : 'ctl00_cphPage_lblBondStatus', 'bond_type' : 'ctl00_cphPage_lblBondType', 'bond_date' : 'ctl00_cphPage_lblBondIssueDate', } case_values = {} for var, field_id in case_fields.iteritems(): try: case_values[var] = addinfo_soup.find(id=field_id).text except KeyboardInterrupt: raise except: logging.error('%s: Failed to find a value for the addinfo field %s. %s' % (current_case, sys.exc_info()[0])) continue # Assign values to existing case case.prob_start_date = format_date(case_values['prob_start_date']) case.prob_end_date = format_date(case_values['prob_end_date']) case.prob_length = case_values['prob_length'] case.in_jail = case_values['in_jail'] case.released_to = case_values['released_to'] case.bond_amount = case_values['bond_amount'] case.bond_status = case_values['bond_status'] case.bond_type = case_values['bond_type'] case.bond_date = format_date(case_values['bond_date']) # Fix format of Bond Amount case.bond_amount = case.bond_amount.lstrip('$') case.bond_amount = case.bond_amount.replace(',', '') db_session.commit() # Parses page listing any AKAs for the defendant def parse_akas(akas_soup, case_id): try: akas_count_text = akas_soup.find(id='ctl00_cphPage_lblDefendants').text akas_count = re.match('^\d+', akas_count_text).group(0) except KeyboardInterrupt: raise except: logging.error('%s: Failed to determine akas_count %s. %s' % (current_case, sys.exc_info()[0])) return False for num in range(0, int(akas_count)): # We have to pad numbers less than 10 if num < 10: num = '0%s' % num # For readability, get field ids into variables last_name_id = 'ctl00_cphPage_rptDefendants_ctl%s_lnkDefendantLastName' % num first_name_id = 'ctl00_cphPage_rptDefendants_ctl%s_lnkDefendantFirstName' % num middle_name_id = 'ctl00_cphPage_rptDefendants_ctl%s_lnkDefendantMiddleName' % num race_id = 'ctl00_cphPage_rptDefendants_ctl%s_lnkDefendantRace' % num sex_id = 'ctl00_cphPage_rptDefendants_ctl%s_lnkDefendantSex' % num # Map db fields to coresponding page id akas_fields = { 'last_name' : last_name_id, 'first_name' : first_name_id, 'middle_name' : middle_name_id, 'race' : race_id, 'sex' : sex_id, } akas_values = {} for var, field_id in akas_fields.iteritems(): try: akas_values[var] = akas_soup.find(id=field_id).text except KeyboardInterrupt: raise except: logging.error('%s: Failed to find a value for akas field %s. %s' % (current_case, var, sys.exc_info()[0])) continue new_aka = Akas( case_id = case_id, last_name = akas_values['last_name'], first_name = akas_values['first_name'], middle_name = akas_values['first_name'], race = akas_values['race'], sex = akas_values['sex'], ) db_session.add(new_aka) db_session.commit() # Format the date to something acceptable for the database def format_date(date): try: if date.strip(): date_obj = datetime.strptime(date, '%m/%d/%Y') if date_obj: return date_obj.strftime('%Y-%m-%d') else: return None except KeyboardInterrupt: raise except: logging.error('%s: Failed to create a date object for date %s. %s' % (current_case, date, sys.exc_info()[0])) return None def main(): # We set this shortly so that our logging facility can associate # a case number with errors being logged. global current_case # Set ASP-specific form field variables set_asp_vars() for year in years: # Keep track of how many NOT FOUND errors we get. case_not_found_count = 0 for seq in range(seq_start, seq_end): if case_not_found_count >= max_case_not_found_count: # If our not_found_count exceeds the maximum set, # then assume that we have reached the end of all # cases for the given year and move on to the next # year logging.error('%s: Case not found count exceeded %s, quitting' % (current_case, case_not_found_count)) break # Iterate through A-Z possible defendants for defendant in string.lowercase: current_case = 'F-%s-%s-%s' % (year, seq, defendant) # Get main case information post_fields = { field_names['case_type'] + field_values['field_suffix'] : field_values['case_type'], field_names['case_year'] + field_values['field_suffix'] : year, field_names['case_seq'] + field_values['field_suffix'] : seq, field_names['case_defendant'] + field_values['field_suffix'] : defendant, field_names['viewstate'] : field_values['viewstate'], field_names['eventval'] : field_values['eventval'], field_names['button'] : field_values['button'], field_names['active_tab'] : field_values['active_tab'], } # Try to fetch a given case 3 times before # giving up and moving to the next sequence case = fetch_page(post_fields) if case: case_soup = BeautifulSoup(case) not_found = check_case_not_found(case_soup) if not_found: current_case = 'F-%s-%s' % (year, seq) # We only want to try stripping the defendant letter on 'a', # since for anything after we will have already tried # stripping it, and don't want to do it again. if defendant != 'a': break # Try again with no defendant sequence specification post_fields[ field_names['case_defendant'] + field_values['field_suffix'] ] = '' case = fetch_page(post_fields) if case: case_soup = BeautifulSoup(case) not_found = check_case_not_found(case_soup) if not_found: # If the case wasn't found even without the defendant # number then it really must not exist case_not_found_count = case_not_found_count + 1 else: # Reset the not_found_count and process case case_not_found_count = 0 process_case(case_soup) break else: current_case = 'F-%s-%s-%s' % (year, seq, defendant) # Reset the not_found_count and process case case_not_found_count = 0 process_case(case_soup) else: break if __name__ == '__main__': # Get starting sequence number and years from command line arguments seq_start = int(sys.argv[1]) if int(sys.argv[2]) == 0: seq_end = 100000 else: seq_end = int(sys.argv[2]) + 1 years = sys.argv[3:] main()