# Description: This script scrapes CSV files from https://data.cms.gov/tools/mapping-medicare-disparities-by-population.
# Original Authors: Judah Axelrod and Clayton Seraphin
# Date of Creation: 12/29/2022
# NOTE 1: Anyone using this code should first be using Urban's SiteMonitor library to ensure
# they are not overwhelming the website with requests. See this resource:
# https://urban-institute.medium.com/sitemonitor-a-tool-for-responsible-web-scraping-e759042e296a
# NOTE 2: The user should create a folder called 'output-data' in the directory this script is run from (output paths are relative to the current working directory).
import glob
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
import time
from webdriver_manager.firefox import GeckoDriverManager
def select_dropdown(driver, id, value, index=None):
    '''
    Select one option from a dropdown menu on
    https://data.cms.gov/tools/mapping-medicare-disparities-by-population.

    The function waits up to 20 seconds for the dropdown to become clickable and
    then selects the requested option, either by its value or by its position.

    Inputs:
        driver: The Selenium webdriver that currently has the page loaded.
        id (string): The HTML 'id' attribute of the dropdown menu to be selected,
            found through inspecting the web page. (The name shadows the builtin
            `id`, but callers pass it by keyword, so it is kept as is.)
        value (string): The HTML 'value' of the option to select. Not used when
            index is given.
        index (int): If index is not None, the option is selected by its index
            instead of by value. In this case, callers should pass value=None.
    Returns:
        True if the option was selected. False if the dropdown did not become
        clickable within the timeout, or if selecting by index failed
        (for example because that option is disabled).
        Any exception raised while selecting by value propagates to the caller.
    '''
    try:
        element_clickable = EC.element_to_be_clickable((By.ID, id))
        element = WebDriverWait(driver, timeout=20).until(element_clickable)
    except Exception:
        # Best effort: any failure while waiting is reported as a failed page load.
        if index is None:
            print(f'Page took too long to load for id {id}, value {value}')
        else:
            print(f'Page took too long to load for id {id}, index {index}')
        return False

    if index is None:
        Select(element).select_by_value(value)
    else:
        try:
            Select(element).select_by_index(index)
        except Exception:  # The index is disabled
            # `except Exception` (not a bare except) so Ctrl-C / SystemExit still stop the run.
            print('Dropdown disabled')
            return False

    print('Successfully selected dropdown')
    return True
def data_download(measure, index, dual, sex, age, race, counter=0, threshold=None, url='https://data.cms.gov/tools/mapping-medicare-disparities-by-population'):
    '''
    Download one CSV file from data.cms.gov
    (https://data.cms.gov/tools/mapping-medicare-disparities-by-population)
    and name it according to the values taken by its parameters (i.e. characteristics).

    Inputs:
        measure, dual, sex, age, race (string): Option values for the matching
            dropdown menus on the website. `dual` is only applied on the page
            when measure == 'e', but it is always part of the output file name.
        index (int): Position of the option to pick in the 'condition' dropdown.
        counter (int): Counts how many times this function has been called by the caller's loop.
        threshold (int): Optionally set, it will skip all download iterations until the counter
            reaches the threshold. This is so that if webdriver stops behaving in the middle of
            a run or you hit a rate limit, you can pick up right where you left off, rather than
            starting over. To use this, set threshold equal to # files that were successfully
            downloaded + 1.
        url (string): The website to scrape.
    Returns:
        A tuple (counter, is_download_successful): the incremented counter and a boolean
        that is True when the file was downloaded and moved, or when the iteration was
        skipped because counter < threshold.
    Notes:
        Exceptions that are not handled below (for example a timeout while waiting for the
        download button, or a failing rename) propagate to the caller, but the browser is
        always closed first.
    '''
    is_download_successful = False
    print(f'Measure={measure}, index={index}, dual={dual}, sex={sex}, age={age}, race={race}')
    counter += 1
    print(f'Counter is at {counter}')
    if threshold is not None and counter < threshold:
        # Skips the download if counter has not yet reached threshold
        return (counter, True)

    download_path = os.path.expanduser('~/Downloads/mmd_data.csv')
    output_path = os.path.expanduser(f'output-data/{measure}_{index}_{dual}_{sex}_{age}_{race}.csv')

    # A leftover file from an earlier, failed attempt would otherwise be picked up
    # immediately below and saved under this iteration's name.
    if os.path.exists(download_path):
        os.remove(download_path)

    # Set up selenium driver
    service = Service(executable_path=GeckoDriverManager().install())
    driver = webdriver.Firefox(service=service)
    try:
        # Navigate to page
        driver.get(url)

        # All of the dropdown options we want are within an iframe object, so switch the view
        try:
            WebDriverWait(driver, 20).until(
                EC.frame_to_be_available_and_switch_to_it((By.XPATH, '//iframe[@title="population"]'))
            )
        except Exception:
            return (counter, is_download_successful)

        # These options do not change
        is_page_load_successful = select_dropdown(driver=driver, id='year', value='9')  # '9' --> Year = 2019
        if not is_page_load_successful:
            return (counter, is_download_successful)
        select_dropdown(driver=driver, id='geography', value='s')  # 's' --> Geography = State/Territory
        select_dropdown(driver=driver, id='measure', value=measure)
        select_dropdown(driver=driver, id='adjust', value='3')  # '3' --> Adjustment = smoothed actual

        is_condition_successful = select_dropdown(driver=driver, id='condition', value=None, index=index)
        if not is_condition_successful:
            # Disabled dropdown option: report the failure to the caller
            return (counter, is_download_successful)

        if measure == 'e':
            select_dropdown(driver=driver, id='dual', value=dual)
        select_dropdown(driver=driver, id='sex_code', value=sex)
        select_dropdown(driver=driver, id='age_group', value=age)
        select_dropdown(driver=driver, id='race_code', value=race)

        # Wait until the button is clickable before downloading the file
        data_button_clickable = EC.element_to_be_clickable((By.ID, 'data_download'))
        data_button = WebDriverWait(driver, timeout=20).until(data_button_clickable)
        print('Found data download button now clicking it')
        driver.execute_script("arguments[0].click();", data_button)
        time.sleep(1)

        # If the file hasn't yet been downloaded or is still 0 bytes, wait a second and try again
        # There can sometimes be a lag, which necessitates the 'while' loop below
        timer = 0
        while not os.path.exists(download_path) or os.stat(download_path).st_size == 0:
            time.sleep(1)
            driver.execute_script("arguments[0].click();", data_button)
            if timer > 15:
                print('Took too long, closing driver')
                return (counter, is_download_successful)
            print('Waiting for download')
            time.sleep(1)
            timer += 1

        print('Now moving file')
        os.rename(download_path, output_path)
        print('Moved and renamed file')
        time.sleep(1)
        is_download_successful = True
        return (counter, is_download_successful)
    finally:
        # Close the browser on every exit path, including unexpected exceptions
        driver.quit()
def run_entire_loop():
    '''
    Download every cut of the data by calling data_download once per combination of
    measure, condition index, dual eligibility (first measure only) and a single
    sex, age or race breakout.

    The number of CSV files already present in 'output-data' is used as a resume point:
    iterations before that point are skipped by data_download. The call order is therefore
    significant and must stay stable between runs.

    The function returns (without raising) as soon as one download is reported as failed.
    '''
    # Set threshold at number of files you already have downloaded + 1
    threshold = len(glob.glob(os.path.expanduser('output-data/*.csv'))) + 1

    measure_values = ['e', 'q', 'r']
    emergency_indices = [i for i in range(30) if i not in (1, 25)]  # Indices 1 and 25 disabled
    pqi_indices = list(range(12))
    readmissions_indices = [k for k in range(4) if k != 1]  # Index 1 disabled
    condition_indices = (emergency_indices, pqi_indices, readmissions_indices)

    sex_values = ['null', '1', '2']
    age_values = ['null', '0', '1', '2', '3']
    dual_values = ['null', '0', '1']
    race_values = ['null', '1', '2', '4', '5', '6']

    # Each download breaks the data out by exactly one of sex, age or race;
    # the other two stay at 'null'. Order: all sexes, then all ages, then all races.
    breakouts = (
        [(sex, 'null', 'null') for sex in sex_values[1:]]
        + [('null', age, 'null') for age in age_values[1:]]
        + [('null', 'null', race) for race in race_values[1:]]
    )

    counter = 0
    for measure, indices in zip(measure_values, condition_indices):
        # Dual eligibility can only be selected for the first measure. For the other
        # measures pass 'null' explicitly instead of relying on a loop variable left
        # over from the 'e' iterations (which put a misleading value in the file name).
        duals = dual_values[1:] if measure == 'e' else ['null']
        for index in indices:
            for dual in duals:
                for sex, age, race in breakouts:
                    counter, is_download_successful = data_download(
                        measure=measure, index=index, dual=dual, sex=sex, age=age, race=race,
                        counter=counter, threshold=threshold,
                    )
                    if not is_download_successful:
                        return
if __name__ == "__main__":
    run_entire_loop()
    # Optional retry wrapper: replace the single call above with the three lines below
    # to keep re-running the whole loop when it stops without raising an error.
    # In practice this was not needed, but it may be depending on the application.
    # TOTAL_FILES = 781
    # while len(glob.glob(os.path.expanduser('output-data/*.csv'))) < TOTAL_FILES:
    #     run_entire_loop()
Selenium Web Scraping
Please use all code samples responsibly - these are samples and likely require adjustments to work correctly for your specific needs. Read through the documentation and comments to understand any caveats or limitations of the code and/or data, and follow up with the code author or Code Library admins (code_library@urban.org) if you have questions on how to adapt the sample to your specific use case.
Purpose: This code uses Selenium to scrape an interactive website that relies on JavaScript. It can select dropdown menus and click buttons in order to download hundreds of CSV files.
Data: The URL we scraped data from is: https://data.cms.gov/tools/mapping-medicare-disparities-by-population.
Authors: Judah Axelrod and Clayton Seraphin (December 2022)
Check the robots.txt file (for example): https://www.urban.org/robots.txt
Consult Urban’s Automated Data Collection Guidelines.
Use Headers