import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from scipy import stats
import matplotlib.pyplot as plt
import altair as alt
import warnings
warnings.filterwarnings('ignore')
Does free delivery increase order value?
Food Delivery App | Regression Discontinuity Design (RDD)
The Problem
Right now, our company gives free delivery to anyone living 5.0 km or closer to our warehouse. The Finance team wants to change this rule so that only people within 3.0 km get the deal, which would save us money on gas and drivers. However, the Marketing team thinks this is a mistake. They believe “Free Delivery” is a huge incentive that pushes people to put more items in their cart. They worry that if we start charging for delivery for customers living between 3.0 km and 5.0 km, those customers will end up spending much less money overall.
The Question
It is hard to know who is right. We already know that people living close to the warehouse spend more money than people living far away. But we need to figure out the “why.” Do they spend more because of the free delivery deal, or is it just a coincidence?
For example, maybe the neighborhoods near our warehouse are just wealthier, or maybe those families are just larger and buy more food.
# Load the dataset used for the RDD analysis.
# Per the df.info() output shown further down, rdd.csv has five columns:
#   distance_from_hub_km   - distance between customer and hub (running variable)
#   free_delivery_eligible - 1 = free delivery, 0 = paid delivery (treatment)
#   order_value_usd        - order value (outcome)
#   items_in_wishlist, is_returning_customer - additional customer attributes
df = pd.read_csv('rdd.csv')
# Define global constants
CUTOFF = 5.0 # Threshold for free delivery
df.head()
| | distance_from_hub_km | free_delivery_eligible | order_value_usd | items_in_wishlist | is_returning_customer |
|---|---|---|---|---|---|
| 0 | 6.2 | 0 | 53.01 | 0 | 1 |
| 1 | 4.7 | 1 | 52.84 | 3 | 0 |
| 2 | 6.6 | 0 | 43.44 | 9 | 0 |
| 3 | 8.8 | 0 | 40.85 | 5 | 1 |
| 4 | 4.4 | 1 | 54.21 | 6 | 1 |
Data Understanding
df.shape
(3500, 5)
# Short Info of data
df.info()
<class 'pandas.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 distance_from_hub_km 3500 non-null float64
1 free_delivery_eligible 3500 non-null int64
2 order_value_usd 3500 non-null float64
3 items_in_wishlist 3500 non-null int64
4 is_returning_customer 3500 non-null int64
dtypes: float64(2), int64(3)
memory usage: 136.8 KB
# Summary statistics
df.describe()
| | distance_from_hub_km | free_delivery_eligible | order_value_usd | items_in_wishlist | is_returning_customer |
|---|---|---|---|---|---|
| count | 3500.000000 | 3500.000000 | 3500.000000 | 3500.000000 | 3500.000000 |
| mean | 5.083771 | 0.499143 | 50.833360 | 4.047143 | 0.391143 |
| std | 2.434786 | 0.500071 | 11.967617 | 2.042085 | 0.488076 |
| min | 0.100000 | 0.000000 | 16.880000 | 0.000000 | 0.000000 |
| 25% | 3.400000 | 0.000000 | 41.895000 | 3.000000 | 0.000000 |
| 50% | 5.100000 | 0.000000 | 50.750000 | 4.000000 | 0.000000 |
| 75% | 6.700000 | 1.000000 | 59.675000 | 5.000000 | 1.000000 |
| max | 14.800000 | 1.000000 | 88.130000 | 13.000000 | 1.000000 |
Code
# --- Data Preparation & Sampling ---
# Each delivery group is capped at `sample_size` rows so the scatter keeps its
# density "cloud" look instead of becoming a solid blob.
sample_size = 1000
random_state = 42

treated_df = df.loc[df['free_delivery_eligible'] == 1]
control_df = df.loc[df['free_delivery_eligible'] == 0]

# Down-sample only a group that exceeds the cap; a smaller group is kept whole
if len(treated_df) > sample_size:
    treated_sample = treated_df.sample(n=sample_size, random_state=random_state)
else:
    treated_sample = treated_df

if len(control_df) > sample_size:
    control_sample = control_df.sample(n=sample_size, random_state=random_state)
else:
    control_sample = control_df

# Stack both groups and attach a readable label that drives the point color
plot_df = pd.concat([treated_sample, control_sample])
plot_df['Delivery Type'] = np.where(
    plot_df['free_delivery_eligible'] == 1,
    'Free Delivery',
    'Paid Delivery',
)
# --- Brand & Styling Configuration ---
canvas_bg_color = '#ffffff'
free_color = "#1F2E7A"  # Blue: treated group (free delivery)
paid_color = "#E3120B"  # Red: control group (paid delivery)
cutoff_line_color = "#c0c0c0"
cutoff_marker_color = "#6b6b6b"
cutoff_text_color = "#7d7d7d"

# --- Dynamic Layout & Padding Calculations ---
# Y-axis ceiling: the largest plotted order value rounded up to the next
# multiple of 10
y_max = plot_df['order_value_usd'].max()
clean_y_max = int(np.ceil(y_max / 10.0)) * 10

# X-axis domain: widen the observed distance range by 5% on each side so that
# points do not touch the axis walls
distances = plot_df['distance_from_hub_km']
x_min = distances.min()
x_max = distances.max()
x_range = x_max - x_min
x_padding = 0.05 * x_range

# Ticks every 2 km, derived from the data range rather than hard-coded limits
tick_step = 2
first_tick = int(np.floor(x_min))
last_tick = int(np.ceil(x_max))
x_ticks = list(range(first_tick, last_tick + 1, tick_step))

# nice=False stops the scale from rounding the padded domain outwards
x_domain = [x_min - x_padding, x_max + x_padding]
x_scale = alt.Scale(domain=x_domain, nice=False)
# The core scatter plot
# Small circles (size=20) with no stroke and 0.3 opacity overlap into "misty"
# density clouds.
x_encoding = alt.X(
    'distance_from_hub_km:Q',
    title="Distance from Hub (km)",
    scale=x_scale,
    axis=alt.Axis(
        grid=False,
        domainColor='black',
        tickColor='black',
        values=x_ticks,
        titlePadding=15,
    ),
)
y_encoding = alt.Y(
    'order_value_usd:Q',
    title=None,
    scale=alt.Scale(domain=[0, clean_y_max]),
    axis=alt.Axis(grid=True, domain=False, ticks=False),
)
# Rows labelled 'Free Delivery' get the treated color; all others the control color
point_color = alt.condition(
    alt.datum['Delivery Type'] == 'Free Delivery',
    alt.value(free_color),
    alt.value(paid_color),
)

scatter = (
    alt.Chart(plot_df)
    .mark_circle(size=20, strokeWidth=0, opacity=0.3)
    .encode(x=x_encoding, y=y_encoding, color=point_color)
)
# --- Direct Labeling (Anchored to X-Axis and Cutoff) ---
# Both labels are placed at (CUTOFF, 0) in data space, i.e. on the X-axis at the
# cutoff line, and differ only in text, color and the side they are nudged to.
label_text_style = dict(
    baseline='bottom',
    dy=-4,  # Nudged closer to the axis line
    fontSize=14,
    fontWeight='bold',
)

# Highlight label (Free Delivery): right-aligned, so it ends just LEFT of the cutoff line
free_label_df = pd.DataFrame({'x': [CUTOFF], 'y': [0], 'label': ['Free Delivery']})
labels_highlight = (
    alt.Chart(free_label_df)
    .mark_text(align='right', dx=-10, color=free_color, **label_text_style)
    .encode(x='x:Q', y='y:Q', text='label:N')
)

# Background label (Paid Delivery): left-aligned, so it starts just RIGHT of the cutoff line
paid_label_df = pd.DataFrame({'x': [CUTOFF], 'y': [0], 'label': ['Paid Delivery']})
labels_bg = (
    alt.Chart(paid_label_df)
    .mark_text(align='left', dx=10, color=paid_color, **label_text_style)
    .encode(x='x:Q', y='y:Q', text='label:N')
)
# Construct the Lollipop marker for the cutoff threshold
# The stick is positioned in pixels (alt.value), so its bottom end must equal
# the chart height. Both numbers come from the same variable to keep the stick
# attached to the X-axis if the chart is ever resized.
chart_width = 700
chart_height = 400
lollipop_top_px = -25  # Negative pixel offset: the cap sits above the chart ceiling

marker_df = pd.DataFrame({
    'x': [CUTOFF],
    'label': [f'Cutoff = {CUTOFF}km']
})

lollipop_stick = alt.Chart(marker_df).mark_rule(color=cutoff_line_color, strokeWidth=2).encode(
    x='x:Q',
    y=alt.value(chart_height),     # Anchors to the bottom boundary of the chart
    y2=alt.value(lollipop_top_px)  # Extends above the chart ceiling
)
lollipop_cap = alt.Chart(marker_df).mark_point(color=cutoff_marker_color, size=60, filled=True).encode(
    x='x:Q', y=alt.value(lollipop_top_px)
)
lollipop_label = alt.Chart(marker_df).mark_text(
    align='center', baseline='bottom', dy=-10, color=cutoff_text_color, fontSize=12, fontWeight='bold'
).encode(x='x:Q', y=alt.value(lollipop_top_px), text='label:N')

# Composite all layers and apply the rigid brand formatting
# NOTE(review): the scatter layer appears to plot one point per order rather
# than averages -- confirm whether the "Average order value" subtitle is intended.
chart = (scatter + labels_highlight + labels_bg + lollipop_stick + lollipop_cap + lollipop_label).properties(
    title=alt.TitleParams(
        text="Impact of free delivery on purchase behavior",
        subtitle="Average order value by delivery zone, $",
        offset=40  # Pushes the plot down for more breathing room below the subtitle
    ),
    width=chart_width,
    height=chart_height
).configure(
    background=canvas_bg_color  # Canvas color from the styling section (currently white)
).configure_title(
    fontSize=20,
    subtitleFontSize=14,
    subtitleColor='#666666',
    anchor='start', color='black'
).configure_axis(
    labelFontSize=12, titleFontSize=14
).configure_view(
    strokeWidth=0  # No border around the plotting area
)
chart.display()
RDD Analysis
We use a statistical model called Regression Discontinuity Design of the form:
\[
\Large Y = \underbrace{\beta_0}_{\text{Intercept}} + \underbrace{\beta_1 D}_{\text{Treatment}} + \underbrace{\beta_2(X - c)}_{\text{Trend}} + \underbrace{\beta_3 D(X - c)}_{\text{Interaction}} + \varepsilon
\]
Model Components
Y (Outcome Variable)
The primary metric being tracked. In this analysis, it represents the total amount of money each customer spent on their order.
X (Forcing Variable)
The numerical value that determines the rule for treatment. Here, it is the distance in km between the customer and the warehouse.
c (The Cutoff)
The specific threshold where the policy changes. This is the 5 km line where the free delivery offer either starts or stops.
D (Treatment Status)
A binary switch (0 or 1). It is assigned 1 if the customer received free delivery and 0 if they had to pay for it.
ε (Stochastic Error)
This accounts for all the random variation and noise in the data, such as a customer’s personal mood, income levels, or specific needs that aren’t captured by the distance or the delivery deal.
(X - c) (Centered Running Variable)
This mathematical shift moves the data so that the 5 km mark becomes the zero point. It allows for a direct comparison of people just above and just below the threshold.
Interpretation
β₀ (The Intercept)
This represents the spending amount for someone living exactly 5 km away who still has to pay for delivery. It serves as our baseline measurement.
β₁ (The Treatment Effect)
This is the most important coefficient. It captures the immediate jump in spending that occurs the moment delivery becomes free. This discontinuity is the core finding of the entire analysis.
β₂ (The Control Slope)
This shows how spending naturally changes with distance for customers paying for delivery. It reveals whether people spend less when they live further away due to higher shipping costs.
β₃ (The Interaction Effect)
This tells us whether free delivery changes the relationship between distance and spending. It reveals if customers with free delivery become less sensitive to living far away from the warehouse.
Code
# Step 1: Define Our Variables
# CUTOFF is defined at the top of the document.
# The short column names mirror the symbols in the model equation.
df['X'] = df['distance_from_hub_km']    # X = running variable (distance)
df['X_minus_c'] = df['X'] - CUTOFF      # (X - c) = distance centered on the cutoff
df['D'] = df['free_delivery_eligible']  # D = treatment (1=free delivery, 0=no)
df['Y'] = df['order_value_usd']         # Y = outcome (order value)

# Step 2: Select Bandwidth (local window around cutoff)
# Keep only customers within `bandwidth` km on either side of the cutoff (both
# edges included); everything outside the window is dropped from the fit.
bandwidth = 2.0
inside_window = df['X_minus_c'].between(-bandwidth, bandwidth)
df_local = df.loc[inside_window].copy()

print(f"Total observations: {len(df)}")
print(f"Observations within ±{bandwidth}km of cutoff: {len(df_local)}")

# Step 3: Run the Regression (matching textbook formula)
# Y = β₀ + β₁(D) + β₂(X - c) + β₃(D)(X - c) + ε
rdd_formula = 'Y ~ D + X_minus_c + D:X_minus_c'
model = smf.ols(rdd_formula, data=df_local).fit()
print(model.summary())
Total observations: 3500
Observations within ±2.0km of cutoff: 2072
OLS Regression Results
==============================================================================
Dep. Variable: Y R-squared: 0.453
Model: OLS Adj. R-squared: 0.452
Method: Least Squares F-statistic: 570.4
Date: Thu, 09 Apr 2026 Prob (F-statistic): 4.29e-270
Time: 19:01:26 Log-Likelihood: -7286.5
No. Observations: 2072 AIC: 1.458e+04
Df Residuals: 2068 BIC: 1.460e+04
Df Model: 3
Covariance Type: nonrobust
===============================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------
Intercept 45.3368 0.528 85.868 0.000 44.301 46.372
D 10.7479 0.707 15.205 0.000 9.362 12.134
X_minus_c -1.8198 0.462 -3.941 0.000 -2.725 -0.914
D:X_minus_c -0.3808 0.626 -0.609 0.543 -1.608 0.846
==============================================================================
Omnibus: 0.824 Durbin-Watson: 2.010
Prob(Omnibus): 0.662 Jarque-Bera (JB): 0.767
Skew: 0.044 Prob(JB): 0.681
Kurtosis: 3.032 Cond. No. 7.96
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Code
# --- Data Preparation & Sampling ---
# Works on the bandwidth-restricted frame, split by treatment status.
sample_size = 3000
random_state = 42

treated_df = df_local.loc[df_local['D'] == 1].copy()
control_df = df_local.loc[df_local['D'] == 0].copy()

# Start from the full groups and thin one out only if it exceeds the
# per-group cap, which keeps the density effect balanced
treated_sample = treated_df
if len(treated_df) > sample_size:
    treated_sample = treated_df.sample(n=sample_size, random_state=random_state)

control_sample = control_df
if len(control_df) > sample_size:
    control_sample = control_df.sample(n=sample_size, random_state=random_state)

# Combine the sampled data for the scatter layer and label each row
plot_df = pd.concat([treated_sample, control_sample])
plot_df['Delivery Type'] = np.where(
    plot_df['D'] == 1,
    'Free Delivery',
    'Paid Delivery',
)
# --- Regression Line Data Preparation ---
# Evaluate the fitted equation on a grid on each side of the cutoff so Altair
# can draw the two line segments directly.
coef = model.params
grid_points = 100

# Treated side (X - c from -bandwidth to 0): D = 1, so the intercept shifts by
# the D coefficient and the slope by the interaction coefficient
x_range_treatment = np.linspace(-bandwidth, 0, grid_points)
y_treatment_line = (
    coef['Intercept'] + coef['D']
    + (coef['X_minus_c'] + coef['D:X_minus_c']) * x_range_treatment
)

# Control side (X - c from 0 to +bandwidth): D = 0, baseline intercept and slope
x_range_control = np.linspace(0, bandwidth, grid_points)
y_control_line = coef['Intercept'] + coef['X_minus_c'] * x_range_control

fit_df_treatment = pd.DataFrame({
    'X_minus_c': x_range_treatment,
    'Y': y_treatment_line,
    'Delivery Type': 'Free Delivery',
})
fit_df_control = pd.DataFrame({
    'X_minus_c': x_range_control,
    'Y': y_control_line,
    'Delivery Type': 'Paid Delivery',
})
fit_df = pd.concat([fit_df_treatment, fit_df_control])

# --- Metrics for Dynamic Subtitle ---
# The jump at the cutoff, in dollars and relative to the control-side intercept
y_control_at_cutoff = coef["Intercept"]
treatment_effect = coef["D"]
pct_increase = treatment_effect / y_control_at_cutoff * 100
dynamic_subtitle = f"Treatment Effect: ${treatment_effect:.2f} ({pct_increase:.1f}% jump) at the cutoff threshold"
# --- Brand & Styling Configuration ---
canvas_bg_color = '#ffffff'
free_color = "#1F2E7A"  # treated group (free delivery)
paid_color = "#E3120B"  # control group (paid delivery)
cutoff_line_color = "#c0c0c0"
cutoff_marker_color = "#6b6b6b"
cutoff_text_color = "#7d7d7d"

# Explicit label -> color mapping, so that layers encoding 'Delivery Type' with
# this scale split into the same two series with matching colors
delivery_labels = ['Free Delivery', 'Paid Delivery']
delivery_colors = [free_color, paid_color]
color_scale = alt.Scale(domain=delivery_labels, range=delivery_colors)

# Y-axis ceiling: the largest plotted outcome rounded up to the next multiple of 10
y_max = plot_df['Y'].max()
clean_y_max = int(np.ceil(y_max / 10.0)) * 10

# X-axis ticks in 0.5 km steps; the + 0.1 pushes the exclusive stop past
# `bandwidth` so the right-hand edge gets a tick as well
x_ticks = np.arange(-bandwidth, bandwidth + 0.1, 0.5).tolist()

# Pad the visual domain by 5% of the bandwidth on each side so points are not
# clipped at the edges of the plot
x_padding = 0.05 * bandwidth
x_domain = [-bandwidth - x_padding, bandwidth + x_padding]
x_scale = alt.Scale(domain=x_domain, nice=False)
# The core scatter plot layer (Density Clouds)
# Circles of size 30 with no stroke and 0.2 opacity.
scatter_x = alt.X(
    'X_minus_c:Q',
    title="Distance from Cutoff (km)",
    scale=x_scale,
    axis=alt.Axis(
        grid=False,
        domainColor='black',
        tickColor='black',
        values=x_ticks,
        titlePadding=15,
    ),
)
scatter_y = alt.Y(
    'Y:Q',
    title=None,
    scale=alt.Scale(domain=[0, clean_y_max]),
    axis=alt.Axis(grid=True, domain=False, ticks=False),
)
scatter = (
    alt.Chart(plot_df)
    .mark_circle(size=30, strokeWidth=0, opacity=0.2)
    .encode(
        x=scatter_x,
        y=scatter_y,
        color=alt.Color('Delivery Type:N', scale=color_scale, legend=None),
    )
)

# The regression line layer: one line per delivery type, colored with the same
# scale as the points
lines = (
    alt.Chart(fit_df)
    .mark_line(strokeWidth=3.5)
    .encode(
        x='X_minus_c:Q',
        y='Y:Q',
        color=alt.Color('Delivery Type:N', scale=color_scale, legend=None),
    )
)
# --- Direct Labeling (Anchored to X-Axis and Cutoff = 0) ---
# On the centered axis the cutoff is at x = 0, so both labels are placed at
# (0, 0) and pushed apart horizontally by their alignment and dx offset.
shared_label_style = dict(baseline='bottom', dy=-4, fontSize=14, fontWeight='bold')

# Highlight label (Free Delivery) sits LEFT of the cutoff (negative distance)
free_label_df = pd.DataFrame({'x': [0], 'y': [0], 'label': ['Free Delivery']})
labels_highlight = (
    alt.Chart(free_label_df)
    .mark_text(align='right', dx=-10, color=free_color, **shared_label_style)
    .encode(x='x:Q', y='y:Q', text='label:N')
)

# Background label (Paid Delivery) sits RIGHT of the cutoff (positive distance)
paid_label_df = pd.DataFrame({'x': [0], 'y': [0], 'label': ['Paid Delivery']})
labels_bg = (
    alt.Chart(paid_label_df)
    .mark_text(align='left', dx=10, color=paid_color, **shared_label_style)
    .encode(x='x:Q', y='y:Q', text='label:N')
)
# Construct the Lollipop marker for the cutoff threshold at 0
# The stick is positioned in pixels (alt.value), so its bottom end must equal
# the chart height. Both numbers come from the same variable to keep the stick
# attached to the X-axis if the chart is ever resized.
chart_width = 700
chart_height = 400
lollipop_top_px = -25  # Negative pixel offset: the cap sits above the chart ceiling

marker_df = pd.DataFrame({
    'x': [0],
    'label': ['Cutoff']
})

lollipop_stick = alt.Chart(marker_df).mark_rule(color=cutoff_line_color, strokeWidth=2).encode(
    x='x:Q',
    y=alt.value(chart_height),     # Anchors to the bottom boundary of the chart
    y2=alt.value(lollipop_top_px)  # Extends above the chart ceiling
)
lollipop_cap = alt.Chart(marker_df).mark_point(color=cutoff_marker_color, size=60, filled=True).encode(
    x='x:Q', y=alt.value(lollipop_top_px)
)
lollipop_label = alt.Chart(marker_df).mark_text(
    align='center', baseline='bottom', dy=-10, color=cutoff_text_color, fontSize=12, fontWeight='bold'
).encode(x='x:Q', y=alt.value(lollipop_top_px), text='label:N')

# Composite all layers and apply the rigid brand formatting
chart = (scatter + lines + labels_highlight + labels_bg + lollipop_stick + lollipop_cap + lollipop_label).properties(
    title=alt.TitleParams(
        text="RDD: Effect of Free Delivery on Order Value",
        subtitle=dynamic_subtitle,  # Built from the fitted coefficients
        offset=40
    ),
    width=chart_width,
    height=chart_height
).configure(
    background=canvas_bg_color  # Apply the background
).configure_title(
    fontSize=20,
    subtitleFontSize=14,
    subtitleColor='#666666',
    anchor='start', color='black'
).configure_axis(
    labelFontSize=12, titleFontSize=14
).configure_view(
    strokeWidth=0  # No border around the plotting area
)
chart.display()
Model Results Interpretation
Intercept (45.34, p < 0.001): This is the baseline spending at the 5km cutoff point. Customers just beyond the threshold who pay for delivery spend $45.34 on average per order. This is statistically significant.
D: Free Delivery Effect (10.75, p < 0.001): At the cutoff, free delivery increases spending by $10.75 per order. Customers just inside the 5 km threshold spend $56.09 on average ($45.34 + $10.75). The percentage increase is:
($10.75 / $45.34) × 100 ≈ 23.7%, or roughly 24%. This is statistically significant.
X_minus_c: Distance Effect (-1.82, p < 0.001): For customers who pay for delivery (those beyond 5 km), each extra kilometer from the hub reduces spending by $1.82. A paying customer at 6 km spends about $1.82 less than one just beyond 5 km. This is statistically significant.
D:X_minus_c: Interaction Effect (-0.38, p = 0.543): This tests whether free delivery changes how distance affects spending. We find no evidence that it does: the estimate is not statistically significant (p = 0.543), and its 95% confidence interval (-1.61 to 0.85) includes zero. In other words, we cannot distinguish the distance slope for free delivery customers from the slope for paid delivery customers. Within the ±2 km window, both groups spend less as they get farther from the hub at approximately the same rate.
Business Takeaway
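Free delivery increases order values by about 24% ($10.75 per order) for customers near the 5 km cutoff. Distance from the hub reduces spending at approximately the same rate for both groups; we find no evidence that the rate differs by delivery type. Because this estimate is local to the 5 km boundary, applying it to customers living between 3 km and 5 km assumes that they respond to free delivery in the same way.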