import requests
from bs4 import BeautifulSoup
import pandas as pd
from dateutil.relativedelta import relativedelta
def fetch_gas_prices(state_abbreviations):
    """
    Fetches and processes gas price data for multiple states and their respective cities.

    This function scrapes the AAA Gas Prices website to collect gas prices for different fuel grades
    (Regular, Mid-Grade, Premium, Diesel) in various cities within the specified states. The collected
    data is then structured into a pandas DataFrame.

    Parameters:
    -----------
    state_abbreviations : dict
        A dictionary mapping state names to their respective abbreviations, e.g.,
        {"California": "CA", "Texas": "TX"}.

    Returns:
    --------
    pd.DataFrame
        A DataFrame containing gas prices with columns:
        ['Date', 'State', 'City', 'Regular', 'Mid-Grade', 'Premium', 'Diesel']
    """
    # Browser-like headers so the request is not rejected as a bot
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    }

    today = pd.Timestamp.today()

    # Map AAA's standardized date labels to the actual dates they refer to
    time_mapping = {
        "Current Avg.": lambda: today,
        "Yesterday Avg.": lambda: today - pd.Timedelta(days=1),
        "Week Ago Avg.": lambda: today - pd.Timedelta(weeks=1),
        "Month Ago Avg.": lambda: today - relativedelta(months=1),
        "Year Ago Avg.": lambda: today - relativedelta(years=1),
    }

    def extract_gas_prices(row, time_mapping, today, state, city_name):
        """
        Extracts and processes gas price data from a single row in the gas price table.

        Parameters:
        -----------
        row : bs4.element.Tag
            A BeautifulSoup object representing a table row (<tr>) containing gas prices.
        time_mapping : dict
            A dictionary mapping AAA's time labels (e.g., "Current Avg.") to corresponding dates.
        today : pd.Timestamp
            The current date, used as a fallback if no valid time label is found.
        state : str
            The full name of the state (e.g., "California").
        city_name : str
            The name of the city corresponding to the extracted gas prices.

        Returns:
        --------
        list
            A list containing extracted data in the format:
            [date (str), state (str), city (str), regular (str), mid-grade (str), premium (str), diesel (str)]
        """
        cells = row.find_all("td")
        date_text = cells[0].get_text(strip=True)

        # Get the corresponding date using time_mapping, defaulting to today
        date = time_mapping.get(date_text, lambda: today)().strftime("%Y-%m-%d")

        # Extract prices, removing the dollar sign
        prices = [cell.get_text(strip=True).replace("$", "") for cell in cells[1:]]

        return [date, state, city_name] + prices

    def process_city_data(city, time_mapping, today, state):
        """
        Extracts gas price data for a specific city by locating its corresponding table.

        Parameters:
        -----------
        city : bs4.element.Tag
            A BeautifulSoup object representing a city's heading element.
        time_mapping : dict
            A dictionary mapping time labels to actual date values.
        today : pd.Timestamp
            The current date used for fallback mapping.
        state : str
            The full name of the state.

        Returns:
        --------
        list
            A list of lists, where each inner list contains gas price data for a specific date in the format:
            [date, state, city, regular, mid-grade, premium, diesel]
        """
        city_name = city.get_text(strip=True)
        rows = city.find_next("table").select("tbody tr")

        return [
            extract_gas_prices(row, time_mapping, today, state, city_name)
            for row in rows
        ]

    def process_states(state_abbreviations, headers, time_mapping, today):
        """
        Iterates through all states, sending requests to fetch and process gas price data.

        Parameters:
        -----------
        state_abbreviations : dict
            A dictionary mapping state names to their respective abbreviations.
        headers : dict
            HTTP request headers to mimic a real browser and avoid request blocking.
        time_mapping : dict
            A dictionary mapping AAA's date labels to actual date values.
        today : pd.Timestamp
            The current date used for fallback mapping.

        Returns:
        --------
        list
            A list of lists containing processed gas price data for all states in the format:
            [date, state, city, regular, mid-grade, premium, diesel]
        """
        all_data = []
        for state, abbreviation in state_abbreviations.items():
            params = {"state": abbreviation}
            response = requests.get(
                "https://gasprices.aaa.com/", params=params, headers=headers
            )
            if response.status_code != 200:
                print(
                    f"Error fetching data for {state}. Status code: {response.status_code}"
                )
                continue

            soup = BeautifulSoup(response.content, "html.parser")

            # Extract city sections
            cities = soup.select(".accordion-prices.metros-js > h3[data-title]")

            all_data.extend(
                [
                    row_data
                    for city in cities
                    for row_data in process_city_data(city, time_mapping, today, state)
                ]
            )
        return all_data

    # Fetch and process data for all states
    all_data = process_states(state_abbreviations, headers, time_mapping, today)

    # Convert list of extracted data into a pandas DataFrame
    all_data_df = pd.DataFrame(
        all_data,
        columns=["Date", "State", "City", "Regular", "Mid-Grade", "Premium", "Diesel"],
    )

    # Convert 'Date' to datetime format
    all_data_df["Date"] = pd.to_datetime(all_data_df["Date"], format="%Y-%m-%d")

    # Sort by 'State', 'City', and 'Date' for better organization
    all_data_df = all_data_df.sort_values(by=["State", "City", "Date"]).reset_index(
        drop=True
    )

    return all_data_df
Data Science for Policy Analysts: A Simple Introduction to Web Scraping
policy analysis, causal inference, data science, econometrics
In academia, most of the datasets we work with arrive as tidy CSV files. That's convenient, but in modern policy data science the data we seek are often unstructured, in the sense that they do not come packaged as a .csv file or a Stata dataset. Such data must be web-scraped: gathered from web pages with a reproducible script.
This post presents a simple use case of web scraping of the kind an applied policy scientist might encounter. Say we wish to gather the price of gas from AAA, across each metro area in the nation. In the old days, we'd need to ask AAA and pay thousands of dollars for an extended time series… but now, at least for this case, we don't. With Python's basic tooling, we can collect these data ourselves by exploiting the publicly available information AAA posts.
The Scrape
The code above (saved as cityutils.py) does the scrape. It bundles a handful of helpers into one function we call; this is the nitty-gritty that requests and collects the data of interest.
We first define AAA's website as the URL of interest. Then, we inspect the URL for each state, where the actual data for each city/metro live. For Massachusetts, the URL is "https://gasprices.aaa.com/?state=MA". For Florida, it is "https://gasprices.aaa.com/?state=FL". See the pattern? There's a common prefix, and the only thing that changes is the query suffix, which is the abbreviation of the state.
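This is exactly the URL that the params argument of requests constructs for us. As a quick sanity check (a minimal sketch; the state is chosen arbitrarily), you can build the request without sending it and inspect the URL:

import requests

# Build, but don't send, the request to see the URL requests would hit
req = requests.Request(
    "GET", "https://gasprices.aaa.com/", params={"state": "MA"}
).prepare()
print(req.url)  # https://gasprices.aaa.com/?state=MA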
The master function is fetch_gas_prices. It accepts a dictionary whose keys are the state names and whose values are the abbreviations we query over. Within this function, we define a common time mapping that translates AAA's standardized date labels ("Current Avg.", "Yesterday Avg.", and so on) into actual dates, and we record the current date.
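For instance, pulling just two states rather than the whole country would look something like this (a minimal sketch; the two states are arbitrary):

from cityutils import fetch_gas_prices

# Keys are state names, values are the abbreviations AAA uses in its URLs
sample_df = fetch_gas_prices({"Massachusetts": "MA", "Florida": "FL"})
print(sample_df.head())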
Next, I use Python's requests library to query each state (there are plenty of ways to do this step, but requests is generally the quickest for simple and even fairly big jobs). I then process each state with the process_states function. This accepts the dictionary of states and abbreviations, the headers, the common time mapping, and the current date. We query each state and clean the accordion-style tables that appear towards the bottom of the page, handling each city/metro area with the process_city_data function. The rows for each metro are appended to a running list, and we move on to the next state once we've exhausted all metros for that state. Finally, everything is assembled into a single dataframe, with the date, the prices of gas, the state, and the city name all in one place.
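To see what the accordion step is doing in isolation, here is a minimal sketch that fetches a single state page and prints the metro headings the scraper keys on. The selector is the same one used in process_states; the user-agent here is a bare stand-in, and in practice you would reuse the fuller browser string from the main function.

import requests
from bs4 import BeautifulSoup

headers = {"user-agent": "Mozilla/5.0"}  # stand-in; use a full browser string in practice
response = requests.get(
    "https://gasprices.aaa.com/", params={"state": "MA"}, headers=headers
)
soup = BeautifulSoup(response.content, "html.parser")

# Each metro is an <h3 data-title=...> heading inside the accordion of price tables
cities = soup.select(".accordion-prices.metros-js > h3[data-title]")
print([city.get_text(strip=True) for city in cities])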
The Call
We may then call this whole script with a few lines of Python code.
import pandas as pd
from datetime import datetime
from cityutils import fetch_gas_prices
import os
# We just need the state abbreviations since
# AAA indexes their states by the abbreviation.
= "https://raw.githubusercontent.com/jasonong/List-of-US-States/refs/heads/master/states.csv"
url
# We read the csv into a df
= pd.read_csv(url)
states_df
# Here is the main function that does the scrape.
= fetch_gas_prices(dict(zip(states_df['State'], states_df['Abbreviation'])))
df
# Format the date for the filename
= datetime.now().strftime("%Y-%m-%d")
date_str
# Ensure the output directory exists
= "./City Scrape/Data"
output_dir =True)
os.makedirs(output_dir, exist_ok
# Save the DataFrame as "/City Scrape/Data/City_{date}.csv"
= f"{output_dir}/City_{date_str}.csv"
output_path =False) df.to_csv(output_path, index
This pulls everything together into a single dataframe and saves it as a CSV file. Notice that the data cover today, yesterday, a week ago, a month ago, and a year ago. Meaning, in theory, we'd have at least a year's worth of prices, assuming we ran this collection daily for a year.
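If we do run it daily, stitching the saved snapshots back into one long panel is straightforward. Here is a minimal sketch, assuming the CSVs live in ./City Scrape/Data as above; since each run re-reports yesterday, last week, last month, and last year, we de-duplicate on date, state, and city:

import glob
import pandas as pd

# Read every daily snapshot and stack them into one long panel
files = sorted(glob.glob("./City Scrape/Data/City_*.csv"))
panel = pd.concat(
    (pd.read_csv(f, parse_dates=["Date"]) for f in files), ignore_index=True
)

# Overlapping snapshots report the same (Date, State, City) more than once
panel = panel.drop_duplicates(subset=["Date", "State", "City"]).sort_values(
    ["State", "City", "Date"]
)
print(panel.shape)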
This is a simple case of web scraping. I've done more complicated scrapes, such as scraping the prices of goods across every Whole Foods in the country, across five different departments per store. The scale of the job and the size of the data are much bigger, but the principles are the same. In the future, I will write more posts demonstrating different applications of web scraping and how policy scientists may use them. In particular, I will cover how scraping goes hand in hand with GitHub Actions. In my experience so far as a PhD student, these two alone (scraping and GitHub Actions) have made my life as a researcher far easier than it would otherwise be.