import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('cleaned_data.csv')
TessX | Exploring the Data Through Questions
Pricing Analytics
Introduction
After completing Data Preparation, the next step is Exploratory Data Analysis (EDA). In this notebook, I explore the data further by asking key questions to uncover patterns, trends, and insights. This helps in understanding the data’s structure and guiding the next steps in the analysis.
If you’d like to review the previous notebook on Data Preparation, click the link below:
View Data Preparation Notebook
Load libraries
What’s the Most Common Car Price?
Code
# Kernel density estimate (KDE) of MSRP, drawn as a filled red area chart.
msrp_values = df['MSRP'].dropna().values  # Missing prices are dropped before estimating the density

# Let figure_factory estimate the density. The histogram and rug are switched
# off, and the curve's coordinates are taken from the first trace of the result.
density = ff.create_distplot([msrp_values], ['MSRP'], show_hist=False, show_rug=False)
kde_x = density.data[0]['x']
kde_y = density.data[0]['y']

# Redraw the curve in a fresh figure so that all styling is set in one place.
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=kde_x,
    y=kde_y,
    fill='tozeroy',  # Shade the area between the curve and y = 0
    mode='lines',
    line=dict(color='#E3120B', width=2),  # Line color
    fillcolor='#E3120B',
    hoverinfo='x+y'
))

# Customize layout
fig.update_layout(
    title="<span style='color:black'>Car Price Distribution</span>",  # Title in black
    xaxis=dict(
        title="",  # Blank x-axis title
        range=[0, 400000],  # Fixed window; prices above $400,000 fall outside the view
        dtick=100000,  # Tick marks every 100,000
        tickformat=",d",  # Full number format (e.g., 100,000 instead of 100k)
        tickprefix="$",  # Add dollar sign ($100,000)
        showgrid=False,
        zeroline=True,
        zerolinewidth=2,
        zerolinecolor='#f7f7f7'  # Same hex value as the plot background below
    ),
    yaxis=dict(
        title="",  # Blank y-axis title
        showticklabels=False,  # Hide the density values on the y-axis
        showgrid=False
    ),
    plot_bgcolor='#F7F7F7',
    paper_bgcolor='#F7F7F7',
    font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", size=14, color='#000000'),
    margin=dict(l=50, r=50, t=100, b=50)
)
# Show plot
fig.show()
Most cars are affordable, and only a few are luxury models. 84.5% of cars are priced under $91K, showing that luxury is the exception, not the norm.
What’s the Relationship Between Horsepower and MSRP?
Code
# Scatter of MSRP against Horsepower: semi-transparent red markers, blank axis titles.
scatter_font_family = "Hiragino Kaku Gothic Pro, sans-serif"
scatter_background = '#F5F4EF'

fig = px.scatter(
    df,
    x='Horsepower',
    y='MSRP',
    opacity=0.5,  # Half-transparent markers so overlapping points stay visible
    color_discrete_sequence=['#E3120B'],  # Single red color for every point
    title="<span style='color:black'>MSRP vs. Horsepower</span>",
    labels={'Horsepower': '', 'MSRP': ''},  # Empty labels blank out the axis titles
)

# X-axis: no axis line or gridlines, black outside ticks.
scatter_xaxis = {
    'showline': False,
    'showgrid': False,
    'tickmode': 'auto',
    'tickcolor': 'black',
    'tickwidth': 2,
    'ticks': "outside",
    'tickfont': {'family': scatter_font_family, 'color': 'black'},
}

# Y-axis: drawn on the right-hand side, no gridlines and no zero line.
scatter_yaxis = {
    'side': "right",
    'showgrid': False,
    'tickfont': {'family': scatter_font_family},
    'zeroline': False,
}

fig.update_layout(
    plot_bgcolor=scatter_background,
    paper_bgcolor=scatter_background,
    xaxis=scatter_xaxis,
    yaxis=scatter_yaxis,
    title={'font': {'family': scatter_font_family, 'color': 'black'}},
    font={'family': scatter_font_family},
    margin={'l': 50, 'r': 50, 't': 100, 'b': 50},
)
# Show plot
fig.show()
More Horsepower Means Higher Prices. There is a clear trend—cars with more horsepower tend to be more expensive. High-performance engines require advanced engineering, premium materials, and specialized components, driving up the cost. This makes sense, as luxury and sports cars prioritize power and speed.
How Do Car Prices Vary by Brand?
Code
# Median MSRP per make, sorted from cheapest to most expensive, as a red bar chart.
msrp_median_per_make = df.groupby('Make')['MSRP'].median()
median_msrp_by_make = msrp_median_per_make.sort_values(ascending=True).reset_index()

make_font_family = "Hiragino Kaku Gothic Pro, sans-serif"
make_background = '#F5F4EF'
make_count = len(median_msrp_by_make["Make"])

fig = px.bar(
    median_msrp_by_make,
    x='Make',
    y='MSRP',
    title="<span style='color:black'>MSRP by Car Make</span>",
    labels={'Make': '', 'MSRP': ''},  # Empty labels blank out the axis titles
    color_discrete_sequence=['#E3120B'],  # Single red color for every bar
)

# X-axis: black axis line with outside ticks. The range starts slightly left
# of the first bar and ends at the number of makes.
make_xaxis = {
    'showline': True,
    'linecolor': 'black',
    'tickmode': 'auto',
    'tickcolor': 'black',
    'tickwidth': 2,
    'ticks': "outside",
    'tickfont': {'family': make_font_family, 'color': 'black'},
    'range': [-0.7, make_count],
}

# Y-axis: on the right, thin solid light-gray gridlines, no zero line.
make_yaxis = {
    'side': "right",
    'tickfont': {'family': make_font_family},
    'gridcolor': 'lightgray',
    'showticklabels': True,
    'gridwidth': 0.5,
    'griddash': 'solid',
    'zeroline': False,
}

fig.update_layout(
    plot_bgcolor=make_background,
    paper_bgcolor=make_background,
    xaxis=make_xaxis,
    yaxis=make_yaxis,
    title={'font': {'family': make_font_family, 'color': 'black'}},
    font={'family': make_font_family},
    margin={'l': 50, 'r': 50, 't': 100, 'b': 50},
)
# Show plot
fig.show()
Luxury brands like BMW and Mercedes-Benz command higher prices, but ultra-luxury brands such as Bentley and Aston Martin stand in a league of their own. Their exclusivity, craftsmanship, and brand prestige push their prices significantly higher compared to mainstream luxury cars.
Which Body Sizes Have the Highest and Lowest Prices?
Code
# Box plot of MSRP per Body Size, with categories ordered by ascending median MSRP.
median_values = df.groupby('Body Size')['MSRP'].median().sort_values(ascending=True)

# Store the column as an ordered categorical that follows the median-based order.
df['Body Size'] = pd.Categorical(
    df['Body Size'],
    categories=median_values.index,
    ordered=True,
)

body_font_family = "Hiragino Kaku Gothic Pro, sans-serif"
body_background = "#F5F4EF"

fig = px.box(
    df,
    x='Body Size',
    y='MSRP',
    title="<span style='color:black'>MSRP by Body Size</span>",
    category_orders={'Body Size': median_values.index},  # Lowest median first
    color_discrete_sequence=['#E3120B'],  # Single red color for every box
)

# Y-axis: light-gray gridlines, tick labels kept, no axis line, zero line or title.
body_yaxis = {
    'showgrid': True,
    'gridcolor': "lightgray",
    'gridwidth': 1,
    'showline': False,
    'zeroline': False,
    'showticklabels': True,
    'tickfont': {'family': body_font_family, 'size': 12, 'color': "black"},
    'title': None,
}

# X-axis: outside ticks only; the category order is pinned to the median ranking.
body_xaxis = {
    'showgrid': False,
    'showline': False,
    'ticks': "outside",
    'tickfont': {'family': body_font_family, 'size': 14, 'color': "black"},
    'categoryorder': 'array',
    'categoryarray': median_values.index,
    'title': None,
}

fig.update_layout(
    showlegend=False,
    paper_bgcolor=body_background,
    plot_bgcolor=body_background,
    font={'family': body_font_family, 'color': "black"},
    margin={'l': 50, 'r': 50, 't': 100, 'b': 50},
    yaxis=body_yaxis,
    xaxis=body_xaxis,
)
# Show the plot
fig.show()
Smaller vehicles, such as compact sedans and hatchbacks, are typically the most affordable. Midsize cars show more variability in pricing, often depending on features and trim levels. Large vehicles, including SUVs and luxury sedans, sit at the higher end due to their size, materials, and additional features.
Which Drivetrain Offers the Best Value for Its Price?
Code
# Median MSRP per drivetrain, sorted from lowest to highest, as a red bar chart.
median_msrp = (
    df.groupby("Drivetrain")["MSRP"]
    .median()
    .reset_index()
    .sort_values("MSRP", ascending=True)
)

drivetrain_font_family = "Hiragino Kaku Gothic Pro, sans-serif"
drivetrain_background = '#F5F4EF'
drivetrain_count = len(median_msrp["Drivetrain"])

fig = px.bar(
    median_msrp,
    x="Drivetrain",
    y="MSRP",
    title="<span style='color:black'>MSRP by Drivetrain</span>",
    labels={'Drivetrain': '', 'MSRP': ''},  # Empty labels blank out the axis titles
    color_discrete_sequence=['#E3120B'],  # Single red color for every bar
)

# X-axis: black axis line with outside ticks. The range starts slightly left
# of the first bar and ends at the number of drivetrains.
drivetrain_xaxis = {
    'showline': True,
    'linecolor': 'black',
    'tickmode': 'auto',
    'tickcolor': 'black',
    'tickwidth': 2,
    'ticks': "outside",
    'tickfont': {'family': drivetrain_font_family, 'color': 'black'},
    'range': [-0.7, drivetrain_count],
}

# Y-axis: on the right, thin solid light-gray gridlines, no zero line.
drivetrain_yaxis = {
    'side': "right",
    'tickfont': {'family': drivetrain_font_family, 'color': 'black'},
    'gridcolor': 'lightgray',
    'showticklabels': True,
    'gridwidth': 0.5,
    'griddash': 'solid',
    'zeroline': False,
}

fig.update_layout(
    plot_bgcolor=drivetrain_background,
    paper_bgcolor=drivetrain_background,
    xaxis=drivetrain_xaxis,
    yaxis=drivetrain_yaxis,
    title={'font': {'family': drivetrain_font_family}},
    font={'family': drivetrain_font_family},
    margin={'l': 50, 'r': 50, 't': 100, 'b': 50},
)
# Show plot
fig.show()
All-wheel drive (AWD) and four-wheel drive (4WD) systems increase vehicle costs due to added components and engineering. In contrast, front-wheel drive (FWD) is more affordable and common in budget-friendly vehicles. The trade-off often involves performance and capability in different road conditions.
Is There a Link Between Transmission and Engine Power?
Code
# Box plot of Horsepower per Transmission type, one color per type.
# NOTE(review): the color map keys are lowercase 'manual' / 'automatic' —
# confirm they match the values stored in the 'Transmission' column.
transmission_colors = {'manual': '#E3120B', 'automatic': '#0057B8'}
transmission_background = "#F5F4EF"
transmission_tickfont = {'size': 14, 'color': "black"}

fig = px.box(
    df,
    x='Transmission',
    y='Horsepower',
    color='Transmission',
    color_discrete_map=transmission_colors,
    title="<span style='color:black'>Horsepower by Transmission Type</span>",
)

# X-axis: no title or axis line, outside ticks.
transmission_xaxis = {
    'title': None,
    'showline': False,
    'ticks': "outside",
    'tickfont': dict(transmission_tickfont),
}

# Y-axis: no title, axis line or zero line; tick labels and light-gray gridlines kept.
transmission_yaxis = {
    'title': None,
    'showline': False,
    'zeroline': False,
    'showticklabels': True,
    'tickfont': dict(transmission_tickfont),
    'gridcolor': "lightgray",
    'gridwidth': 1,
}

fig.update_layout(
    paper_bgcolor=transmission_background,
    plot_bgcolor=transmission_background,
    font={'family': "Hiragino Kaku Gothic Pro, sans-serif", 'color': "black"},
    xaxis=transmission_xaxis,
    yaxis=transmission_yaxis,
    showlegend=False,  # The x-axis already names each transmission type
    margin={'l': 50, 'r': 50, 't': 100, 'b': 50},
)
# Show the plot
fig.show()
Automatic transmissions dominate the market, providing a broad spectrum of horsepower levels. Manual transmissions, often favored by enthusiasts, tend to be paired with mid-to-high horsepower engines, offering a more engaging driving experience.
Which Cylinder Type Delivers the Most Torque?
Code
# Median torque per cylinder configuration as a bar chart, with W12 highlighted.

# Display order of the cylinder configurations along the x-axis.
natural_order = ['I3', 'I4', 'I5', 'I6', 'V6', 'V8', 'V10', 'V12', 'W12']

# Convert 'Cylinders' to an ordered categorical with this order.
# NOTE: pd.Categorical replaces any value that is not listed in `categories`
# with NaN, so cylinder types outside natural_order drop out of the chart.
df['Cylinders'] = pd.Categorical(df['Cylinders'], categories=natural_order, ordered=True)

# Aggregate median torque per cylinder type. `observed=False` is spelled out
# because grouping on a categorical column without it relies on a pandas
# default that is deprecated and scheduled to change; pinning it keeps one row
# per entry of natural_order (NaN median for types absent from the data).
median_torque_by_cylinders = (
    df.groupby('Cylinders', observed=False)['Torque'].median().reset_index()
)

# Every cylinder type shares the lighter red; W12 gets the stronger red.
custom_colors = {cylinder: '#EB6E64' for cylinder in natural_order}
custom_colors['W12'] = '#E3120B'

# Create bar chart with Plotly
fig = px.bar(
    median_torque_by_cylinders,
    x='Cylinders',
    y='Torque',
    title="Torque by Cylinder Type",
    labels={'Cylinders': 'Cylinders', 'Torque': 'Median Torque (Nm)'},
    color='Cylinders',  # Map colors to cylinder types
    color_discrete_map=custom_colors  # Use custom hex color codes
)

# Update layout to match the other charts
fig.update_layout(
    plot_bgcolor='#F5F4EF',
    paper_bgcolor='#F5F4EF',
    xaxis=dict(
        title="Cylinders",
        showline=True,
        linecolor='black',
        tickmode='array',
        tickvals=list(range(len(natural_order))),  # One tick per position in natural_order
        ticktext=natural_order,  # Label each position with its cylinder type
        tickcolor='black',
        tickwidth=2,
        ticks="outside",
        tickfont=dict(family="Hiragino Kaku Gothic Pro, sans-serif", color='black'),
        range=[-0.5, len(natural_order) - 0.5]  # Half a slot of padding on each side
    ),
    yaxis=dict(
        title="",  # Blank y-axis title
        side="right",
        tickfont=dict(family="Hiragino Kaku Gothic Pro, sans-serif", color='black'),
        gridcolor='lightgray',
        showticklabels=True,  # Show the torque values
        gridwidth=0.5,
        griddash='solid',
        zeroline=False
    ),
    title=dict(
        font=dict(family="Hiragino Kaku Gothic Pro, sans-serif", color='black')
    ),
    font=dict(family="Hiragino Kaku Gothic Pro, sans-serif"),
    margin=dict(l=50, r=50, t=100, b=50),
    showlegend=False  # Remove the legend
)
# Show plot
fig.show()
While larger engines generally produce higher torque, the data reveals that the increase in torque isn’t consistent across all engine types. For instance, the median torque starts at 225 for the I3 and gradually increases, reaching 369 for the I6. However, the jump in torque is more noticeable in the larger engines, with the V8 producing 485, the V12 reaching 609.5, and the W12 exceeding both with 664. This suggests that engine design and configuration, such as cylinder arrangement, play a crucial role in determining torque, rather than the number of cylinders alone.
Is Engine Aspiration the Key to Better Fuel Economy?
Code
# Median highway fuel economy per engine aspiration, sorted ascending, as a red bar chart.
median_values = (
    df.groupby('Engine Aspiration')['Highway Fuel Economy']
    .median()
    .reset_index()
    .sort_values('Highway Fuel Economy', ascending=True)
)

aspiration_font_family = "Hiragino Kaku Gothic Pro, sans-serif"
aspiration_background = '#F5F4EF'
aspiration_count = len(median_values["Engine Aspiration"])

fig = px.bar(
    median_values,
    x='Engine Aspiration',
    y='Highway Fuel Economy',
    title="<span style='color:black'>Highway Fuel Economy by Engine Aspiration</span>",
    labels={'Engine Aspiration': '', 'Highway Fuel Economy': ''},  # Empty labels blank out the axis titles
    color_discrete_sequence=['#E3120B'],  # Single red color for every bar
)

# X-axis: black axis line with outside ticks. The range starts slightly left
# of the first bar and ends at the number of aspiration categories.
aspiration_xaxis = {
    'showline': True,
    'linecolor': 'black',
    'tickmode': 'auto',
    'tickcolor': 'black',
    'tickwidth': 2,
    'ticks': "outside",
    'tickfont': {'family': aspiration_font_family, 'color': 'black'},
    'range': [-0.7, aspiration_count],
}

# Y-axis: on the right, thin solid light-gray gridlines, no zero line.
aspiration_yaxis = {
    'side': "right",
    'tickfont': {'family': aspiration_font_family, 'color': 'black'},
    'gridcolor': 'lightgray',
    'showticklabels': True,
    'gridwidth': 0.5,
    'griddash': 'solid',
    'zeroline': False,
}

fig.update_layout(
    plot_bgcolor=aspiration_background,
    paper_bgcolor=aspiration_background,
    xaxis=aspiration_xaxis,
    yaxis=aspiration_yaxis,
    title={'font': {'family': aspiration_font_family}},
    font={'family': aspiration_font_family},
    margin={'l': 50, 'r': 50, 't': 100, 'b': 50},
)
# Show plot
fig.show()
Turbocharged and naturally aspirated engines show differences in fuel economy, but one insight stands out—electric vehicles outperform all traditional engines in efficiency. Their ability to maximize energy use makes them the best choice for reducing fuel costs and emissions.
Next Steps
Now, you can move to the next notebook, where I discuss Feature Engineering by exploring the data further, examining the methods I will use in my data pipeline, and building a baseline model.