Selenium Web Scraping

Reminder

Please use all code samples responsibly - these are samples and likely require adjustments to work correctly for your specific needs. Read through the documentation and comments to understand any caveats or limitations of the code and/or data, and follow up with the code author or Code Library admins (code_library@urban.org) if you have questions on how to adapt the sample to your specific use case.

Purpose: This code uses Selenium to scrape an interactive website that relies on JavaScript. It selects options from dropdown menus and clicks buttons in order to download hundreds of CSV files.

Data: The URL we scraped data from is: https://data.cms.gov/tools/mapping-medicare-disparities-by-population.

Author: Judah Axelrod and Clayton Seraphin (December 2022)

Responsible Web Scraping Guidelines

Check the robots.txt file (for example): https://www.urban.org/robots.txt
Consult Urban’s Automated Data Collection Guidelines.
Use Headers

# Description: This script scrapes CSV files from https://data.cms.gov/tools/mapping-medicare-disparities-by-population.
# Original Authors: Judah Axelrod and Clayton Seraphin
# Date of Creation: 12/29/2022


# NOTE 1: Anyone using this code should first be using Urban's SiteMonitor library to ensure 
# they are not overwhelming the website with requests. See this resource:
# https://urban-institute.medium.com/sitemonitor-a-tool-for-responsible-web-scraping-e759042e296a

# NOTE 2: The user should create a folder called 'output-data' in the same location as this script.

import glob
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
import time
from webdriver_manager.firefox import GeckoDriverManager

def select_dropdown(driver, id, value, index=None):
    '''
    This function selects an option from a dropdown menu on the site https://data.cms.gov/tools/mapping-medicare-disparities-by-population.
    It first waits (up to 20 seconds) until the dropdown becomes clickable, then selects the requested option,
    either by its HTML value or by its position in the list of options.

    Inputs:
        driver: The Selenium WebDriver used to locate the dropdown menu.
        id (string): This is the HTML 'id' of the dropdown menu to be selected, 
            found through inspecting the web page.
        value (string): The value to select from the dropdown menu.
        index (int): If index is not None, function assumes we want to select an option by its index instead of by specific value. 
            In this case, should specify that value = None.
    
    Returns:
        A boolean value, depending on whether the dropdown was successfully selected.
        False is returned (after printing a message) if the dropdown did not become clickable within
        20 seconds, or if selecting by index failed (e.g. because that option is disabled).

    Note:
        Selecting by value is deliberately not guarded: if the value cannot be selected, the Selenium
        exception propagates to the caller rather than being reported as False.
    '''

    try:
        element_clickable = EC.element_to_be_clickable((By.ID, id))
        element = WebDriverWait(driver, timeout=20).until(element_clickable)
    except Exception:
        # Best effort: any failure while waiting is reported as a page-load problem
        if index is None:
            print(f'Page took too long to load for id {id}, value {value}')
        else:
            print(f'Page took too long to load for id {id}, index {index}')
        return False

    if index is None:
        Select(element).select_by_value(value)
    else:
        try:
            Select(element).select_by_index(index)
        except Exception: # The index is disabled (Exception, not bare except, so Ctrl+C still stops the script)
            print('Dropdown disabled')
            return False

    print('Successfully selected dropdown')
    return True


def data_download(measure, index, dual, sex, age, race, counter=0, threshold=None, url='https://data.cms.gov/tools/mapping-medicare-disparities-by-population'):
    '''
    This function downloads a CSV file from data.cms.gov (https://data.cms.gov/tools/mapping-medicare-disparities-by-population)
    and names it according to the values taken by its parameters (i.e. characteristics).
    The browser saves the file as '~/Downloads/mmd_data.csv'; it is then moved to
    'output-data/{measure}_{index}_{dual}_{sex}_{age}_{race}.csv' (relative to the current working directory).

    Inputs:
        measure, index, dual, sex, age, and race (String) are all characteristics 
            which can be broken out based on the dropdown menus on the website.
            Each time this function is called below, it is for a different cut of the data.
            index is the position of the option to pick in the 'condition' dropdown; dual is only
            selected on the page when measure == 'e'.
        counter (int): Counts how many times this function has been called in the script below
        threshold (int): Optionally set, it will skip all download iterations until the counter reaches the threshold.
            This is so that if webdriver stops behaving in the middle of a run or you hit a rate limit,
            you can pick up right where you leave off, rather than starting over.
            To use this, set threshold equal to # files that were successfully downloaded + 1
        url (string): The website to scrape

    Returns:
        The function returns both the current value of the counter and a boolean value based on 
            whether or not the data was successfully downloaded. A skipped iteration
            (counter < threshold) is reported as successful.

    Raises:
        Selenium exceptions that are not handled here (for example, the download button never becoming
        clickable) propagate to the caller; the browser is closed first in every case.
    '''
    is_download_successful = False
    print(f'Measure={measure}, index={index}, dual={dual}, sex={sex}, age={age}, race={race}')
    counter += 1
    print(f'Counter is at {counter}')
    if threshold is not None and counter < threshold: # Skips the download if counter has not yet reached threshold
        is_download_successful = True
        return (counter, is_download_successful)

    download_path = os.path.expanduser('~/Downloads/mmd_data.csv')
    output_path = f'output-data/{measure}_{index}_{dual}_{sex}_{age}_{race}.csv'

    # Set up selenium driver 
    service = Service(executable_path=GeckoDriverManager().install())
    driver = webdriver.Firefox(service=service)

    # The try/finally guarantees the browser is closed on every exit path, including unexpected exceptions
    try:
        # Navigate to page (honors the url argument rather than a hard-coded address)
        driver.get(url)

        # All of the dropdown options we want are within an iframe object, so switch the view
        try:
            WebDriverWait(driver, 20).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, '//iframe[@title="population"]')))
        except Exception:
            return (counter, is_download_successful)

        # These options do not change
        is_page_load_successful = select_dropdown(driver=driver, id='year', value='9') # '9' --> Year = 2019
        if not is_page_load_successful:
            return (counter, is_download_successful)
        select_dropdown(driver=driver, id='geography', value='s') # 's' --> Geography = State/Territory
        select_dropdown(driver=driver, id='measure', value=measure)
        select_dropdown(driver=driver, id='adjust', value='3') # '3' --> Adjustment = smoothed actual
        is_condition_successful = select_dropdown(driver=driver, id='condition', value=None, index=index)
        if not is_condition_successful: # Disabled dropdown option, report failure to the caller
            return (counter, is_download_successful)
        if measure == 'e':
            select_dropdown(driver=driver, id='dual', value=dual)
        select_dropdown(driver=driver, id='sex_code', value=sex)
        select_dropdown(driver=driver, id='age_group', value=age)
        select_dropdown(driver=driver, id='race_code', value=race)

        # Wait until the button is clickable before downloading the file
        data_button_clickable = EC.element_to_be_clickable((By.ID, 'data_download'))
        data_button = WebDriverWait(driver, timeout=20).until(data_button_clickable)

        # A file left over from an earlier run (or from a duplicate click) would otherwise be
        # mistaken for this download and saved under the wrong name, so remove it first
        if os.path.exists(download_path):
            os.remove(download_path)

        print('Found data download button now clicking it')

        driver.execute_script("arguments[0].click();", data_button)

        time.sleep(1)
        # If the file hasn't yet been downloaded or is still 0 bytes, wait a second and try again
        # There can sometimes be a lag, which necessitates the 'while' loop below
        timer = 0
        while not os.path.exists(download_path) or os.stat(download_path).st_size == 0:
            time.sleep(1)
            driver.execute_script("arguments[0].click();", data_button)

            if timer > 15:
                print('Took too long, closing driver')
                return (counter, is_download_successful)
            print('Waiting for download')
            time.sleep(1)
            timer += 1

        print('Now moving file')
        os.makedirs('output-data', exist_ok=True) # Avoids a FileNotFoundError if the folder was never created
        os.rename(download_path, output_path)
        print('Moved and renamed file')
        time.sleep(1)
        is_download_successful = True
        return (counter, is_download_successful)
    finally:
        driver.quit()


def run_entire_loop():
    '''
    This function downloads every cut of the data by calling data_download() once for each combination of
    measure, condition index, dual eligibility status (measure 'e' only), and a single demographic
    breakout (one of sex, age, or race, with the other two left at 'null').

    The number of CSV files already present in 'output-data' is used to set the threshold passed to
    data_download(), so a re-run skips the cuts that were already downloaded. This relies on the
    cuts always being visited in the same order.

    Returns:
        None. The loop stops early (returns) the first time data_download() reports a failure.
    '''
    # Set THRESHOLD at number of files you already have downloaded + 1
    THRESHOLD = len(glob.glob(os.path.expanduser('output-data/*.csv'))) + 1 
    measure_values = ['e', 'q', 'r']
    emergency_indices = [i for i in range(30) if i not in (1, 25)] # Indices 1 and 25 disabled
    pqi_indices = list(range(12))
    readmissions_indices = [k for k in range(4) if k != 1] # Index 1 disabled
    condition_indices = (emergency_indices, pqi_indices, readmissions_indices)

    sex_values = ['null', '1', '2']
    age_values = ['null', '0', '1', '2', '3']
    dual_values = ['null', '0', '1']
    race_values = ['null', '1', '2', '4', '5', '6']

    # Each download breaks the data out by exactly one of sex, age, or race (in that order);
    # the first entry of each list ('null') means "not broken out" and is therefore skipped
    breakouts = (
        [(sex, 'null', 'null') for sex in sex_values[1:]]
        + [('null', age, 'null') for age in age_values[1:]]
        + [('null', 'null', race) for race in race_values[1:]]
    )

    counter = 0
    for measure, indices in zip(measure_values, condition_indices):
        # Dual eligibility can only be selected for the first measure ('e'); for the other measures
        # pass 'null' explicitly instead of reusing whatever value the previous measure's loop left behind
        duals = dual_values[1:] if measure == 'e' else ['null']
        for index in indices:
            for dual in duals:
                for sex, age, race in breakouts:
                    counter, is_download_successful = data_download(measure=measure, index=index, dual=dual, sex=sex, age=age, race=race, counter=counter, threshold=THRESHOLD)
                    if not is_download_successful:
                        return

if __name__ == "__main__":
    # Make a single pass over every cut of the data; the pass ends early at the first failed download.
    run_entire_loop()

    # Optional retry wrapper (disabled): uncommenting the 3 lines below forces the scraper to keep
    # re-running the whole script until the expected number of CSV files is present, which helps if a
    # run fails without an error message. In practice this was not needed, but it could be depending
    # on the application.

    # TOTAL_FILES = 781
    # while len(glob.glob(os.path.expanduser('output-data/*.csv'))) < TOTAL_FILES:
    #    run_entire_loop()