🎲 Probability Distributions: The Mathematics of Uncertainty
Imagine you're a weather forecaster 🌤️. You can't say exactly how much it will rain tomorrow, but you can describe the probability of different amounts. That's a probability distribution! It's like having a crystal ball that shows all possible outcomes and their likelihoods. From predicting customer behavior to modeling stock prices, probability distributions are the foundation of data science decision-making.
Understanding Probability Distributions 🎯
A probability distribution is like a recipe that tells you how likely different outcomes are. Just as a recipe tells you the proportions of ingredients, a distribution tells you the proportions of probabilities across all possible values.
Yes/No outcomes"] E --> N["Events per time
Customer arrivals"] H --> O["Natural phenomena
Heights, IQ scores"] I --> P["Time between events
Waiting times"] style A fill:#667eea style B fill:#3b82f6 style C fill:#10b981
The Vending Machine Analogy 🎰
Think of probability distributions like different vending machines. A uniform distribution vending machine gives each snack equal probability - completely fair! A normal distribution machine mostly gives medium-sized snacks, rarely tiny or huge ones. A Poisson distribution machine counts how many snacks drop per button press. Each distribution is a different "randomness machine" with its own personality!
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Set style for beautiful visualizations
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
class ProbabilityDistributions:
"""
Your comprehensive guide to probability distributions!
Master the art of modeling uncertainty 🎯
"""
def __init__(self):
np.random.seed(42)
def discrete_distributions(self):
"""
Explore discrete probability distributions:
When your outcomes are countable!
"""
print("🎯 DISCRETE PROBABILITY DISTRIBUTIONS")
print("=" * 60)
# 1. Binomial Distribution
print("\n1️⃣ BINOMIAL DISTRIBUTION")
print("-" * 40)
print("📊 Scenario: Flipping a coin, pass/fail tests")
# Example: Quality control - testing 20 products
n_trials = 20 # number of products tested
p_success = 0.95 # probability each product passes
binomial_data = np.random.binomial(n_trials, p_success, 10000)
print(f"Parameters: n={n_trials}, p={p_success}")
print(f"Expected value: {n_trials * p_success}")
print(f"Variance: {n_trials * p_success * (1-p_success):.3f}")
# Calculate probabilities
print("\nProbability of outcomes:")
for k in [18, 19, 20]:
prob = stats.binom.pmf(k, n_trials, p_success)
print(f" P(X = {k}): {prob:.4f} ({prob*100:.2f}%)")
# Cumulative probability
prob_at_least_19 = 1 - stats.binom.cdf(18, n_trials, p_success)
print(f" P(X ≥ 19): {prob_at_least_19:.4f} ({prob_at_least_19*100:.2f}%)")
# 2. Poisson Distribution
print("\n2️⃣ POISSON DISTRIBUTION")
print("-" * 40)
print("📊 Scenario: Events per time interval")
# Example: Customer arrivals per hour
lambda_rate = 5 # average customers per hour
poisson_data = np.random.poisson(lambda_rate, 10000)
print(f"Parameter: λ={lambda_rate}")
print(f"Expected value: {lambda_rate}")
print(f"Variance: {lambda_rate}")
print("\nProbability of customer arrivals:")
for k in range(8):
prob = stats.poisson.pmf(k, lambda_rate)
print(f" P(X = {k}): {prob:.4f} ({prob*100:.2f}%)")
# Probability of unusual events
prob_more_than_10 = 1 - stats.poisson.cdf(10, lambda_rate)
print(f"\nP(X > 10): {prob_more_than_10:.4f} (unusual event!)")
# 3. Geometric Distribution
print("\n3️⃣ GEOMETRIC DISTRIBUTION")
print("-" * 40)
print("📊 Scenario: Trials until first success")
# Example: Sales calls until first sale
p_sale = 0.1 # probability of making a sale
geometric_data = np.random.geometric(p_sale, 10000)
print(f"Parameter: p={p_sale}")
print(f"Expected trials until success: {1/p_sale:.1f}")
print(f"Variance: {(1-p_sale)/(p_sale**2):.1f}")
print("\nProbability of first sale on call k:")
for k in [1, 5, 10, 20]:
prob = stats.geom.pmf(k, p_sale)
print(f" P(X = {k}): {prob:.4f}")
# Probability of needing more than 10 calls
prob_more_than_10 = 1 - stats.geom.cdf(10, p_sale)
print(f"\nP(X > 10 calls): {prob_more_than_10:.4f}")
def continuous_distributions(self):
"""
Explore continuous probability distributions:
When your outcomes are measurements!
"""
print("\n🌊 CONTINUOUS PROBABILITY DISTRIBUTIONS")
print("=" * 60)
# 1. Normal Distribution
print("\n1️⃣ NORMAL (GAUSSIAN) DISTRIBUTION")
print("-" * 40)
print("📊 The bell curve - nature's favorite distribution!")
# Example: Human heights
mu_height = 170 # mean height in cm
sigma_height = 10 # standard deviation
normal_data = np.random.normal(mu_height, sigma_height, 10000)
print(f"Parameters: μ={mu_height}, σ={sigma_height}")
print(f"Expected value: {mu_height}")
print(f"Variance: {sigma_height**2}")
# 68-95-99.7 Rule
print("\n68-95-99.7 Rule:")
within_1_std = np.sum(np.abs(normal_data - mu_height) <= sigma_height) / len(normal_data)
within_2_std = np.sum(np.abs(normal_data - mu_height) <= 2*sigma_height) / len(normal_data)
within_3_std = np.sum(np.abs(normal_data - mu_height) <= 3*sigma_height) / len(normal_data)
print(f" Within 1σ: {within_1_std*100:.1f}% (theory: 68.3%)")
print(f" Within 2σ: {within_2_std*100:.1f}% (theory: 95.4%)")
print(f" Within 3σ: {within_3_std*100:.1f}% (theory: 99.7%)")
# Z-scores and probabilities
print("\nProbability calculations:")
heights = [150, 180, 190]
for h in heights:
z_score = (h - mu_height) / sigma_height
prob_less = stats.norm.cdf(h, mu_height, sigma_height)
print(f" P(Height < {h}cm): {prob_less:.4f} (z={z_score:.2f})")
# 2. Exponential Distribution
print("\n2️⃣ EXPONENTIAL DISTRIBUTION")
print("-" * 40)
print("📊 Time between events, waiting times")
# Example: Time between customer service calls (minutes)
lambda_exp = 0.2 # rate parameter (1/mean)
mean_time = 1/lambda_exp # average time between calls
exponential_data = np.random.exponential(mean_time, 10000)
print(f"Parameter: λ={lambda_exp} (rate)")
print(f"Expected value (mean time): {mean_time} minutes")
print(f"Variance: {mean_time**2}")
# Memoryless property
print("\nMemoryless property demonstration:")
wait_times = [2, 5, 10]
for t in wait_times:
prob = stats.expon.cdf(t, scale=mean_time)
print(f" P(Wait < {t} min): {prob:.4f}")
# 3. Uniform Distribution
print("\n3️⃣ UNIFORM DISTRIBUTION")
print("-" * 40)
print("📊 All outcomes equally likely in a range")
# Example: Random number generator
a, b = 0, 100 # range [a, b]
uniform_data = np.random.uniform(a, b, 10000)
print(f"Parameters: a={a}, b={b}")
print(f"Expected value: {(a+b)/2}")
print(f"Variance: {((b-a)**2)/12:.2f}")
print("\nProbability calculations:")
ranges = [(20, 40), (0, 50), (75, 100)]
for r in ranges:
prob = (r[1] - r[0]) / (b - a)
print(f" P({r[0]} < X < {r[1]}): {prob:.4f}")
# 4. Beta Distribution
print("\n4️⃣ BETA DISTRIBUTION")
print("-" * 40)
print("📊 Probabilities, proportions, rates")
# Example: Conversion rate modeling
alpha, beta_param = 30, 70 # shape parameters
beta_data = np.random.beta(alpha, beta_param, 10000)
mean_beta = alpha / (alpha + beta_param)
var_beta = (alpha * beta_param) / ((alpha + beta_param)**2 * (alpha + beta_param + 1))
print(f"Parameters: α={alpha}, β={beta_param}")
print(f"Expected value: {mean_beta:.3f}")
print(f"Variance: {var_beta:.6f}")
print("\nConversion rate probabilities:")
rates = [0.2, 0.3, 0.4]
for rate in rates:
prob = stats.beta.cdf(rate, alpha, beta_param)
print(f" P(Rate < {rate}): {prob:.4f}")
# Run demonstrations
dist = ProbabilityDistributions()
dist.discrete_distributions()
dist.continuous_distributions()
Choosing the Right Distribution 🎯
Selecting the appropriate probability distribution is like choosing the right tool for a job. Use this decision tree to guide your choice!
Real-World Applications 🌍
📦 Manufacturing: Binomial
- Quality control testing
- Defect rate analysis
- Pass/fail inspections
🏪 Retail: Poisson
- Customer arrivals per hour
- Website clicks per minute
- Call center volume
📊 Finance: Normal
- Stock returns modeling
- Risk assessment
- Portfolio optimization
⏱️ Operations: Exponential
- Equipment failure times
- Service wait times
- Time between events
🎮 Gaming: Uniform
- Random number generation
- Fair dice rolls
- Random sampling
🎯 Marketing: Beta
- Conversion rate modeling
- Click-through rates
- A/B test analysis
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
class PracticalApplications:
"""
Real-world applications of probability distributions
See statistics in action! 🚀
"""
def customer_service_scenario(self):
"""
Call center planning using Poisson and Exponential distributions
"""
print("📞 CALL CENTER CAPACITY PLANNING")
print("=" * 60)
# Average calls per hour during peak time
calls_per_hour = 30
lambda_rate = calls_per_hour / 60 # calls per minute
# Simulate one hour of operation
np.random.seed(42)
n_simulations = 1000
print(f"Average calls per hour: {calls_per_hour}")
print(f"Average time between calls: {60/calls_per_hour:.1f} minutes")
# Question 1: How many agents needed?
print("\n🤔 How many agents do we need?")
# Simulate call arrivals in different hours
hourly_calls = np.random.poisson(calls_per_hour, n_simulations)
print(f"Minimum calls in an hour: {hourly_calls.min()}")
print(f"Maximum calls in an hour: {hourly_calls.max()}")
print(f"95th percentile: {np.percentile(hourly_calls, 95):.0f} calls")
# Service level: Want to handle 95% of peak hours
agents_needed = np.percentile(hourly_calls, 95) / 60 * 5 # 5 min per call
print(f"\n✅ Recommendation: {np.ceil(agents_needed):.0f} agents")
print(" (To handle 95% of peak hours with 5-minute average call)")
# Question 2: Customer wait time distribution
print("\n⏱️ Customer Wait Time Analysis")
# If arrival rate > service rate, queues form
service_rate = 0.4 # calls per minute per agent (2.5 min per call)
n_agents = 3
total_service_rate = service_rate * n_agents
if lambda_rate < total_service_rate:
utilization = lambda_rate / total_service_rate
print(f"System utilization: {utilization:.1%}")
# Average wait time (M/M/c queue formula - simplified)
avg_wait = utilization / (total_service_rate - lambda_rate)
print(f"Average wait time: {avg_wait:.1f} minutes")
# Probability of waiting
prob_wait = utilization
print(f"Probability customer waits: {prob_wait:.1%}")
else:
print("⚠️ System overloaded! Need more agents.")
def quality_control_scenario(self):
"""
Manufacturing quality control using Binomial distribution
"""
print("\n🏭 MANUFACTURING QUALITY CONTROL")
print("=" * 60)
# Production parameters
defect_rate = 0.02 # 2% defect rate
batch_size = 100
sample_size = 10
print(f"Production defect rate: {defect_rate:.1%}")
print(f"Batch size: {batch_size}")
print(f"Sample size for testing: {sample_size}")
# Question: Accept or reject batch?
print("\n📊 Acceptance Sampling Plan")
# Accept if 0 or 1 defects in sample
accept_threshold = 1
# Probability of accepting good batch (2% defects)
prob_accept_good = stats.binom.cdf(accept_threshold, sample_size, defect_rate)
print(f"P(Accept | 2% defects): {prob_accept_good:.3f}")
# Probability of accepting bad batch (5% defects)
bad_defect_rate = 0.05
prob_accept_bad = stats.binom.cdf(accept_threshold, sample_size, bad_defect_rate)
print(f"P(Accept | 5% defects): {prob_accept_bad:.3f}")
# Operating Characteristic Curve
print("\n📈 Operating Characteristic Curve")
defect_rates = np.linspace(0, 0.1, 11)
for rate in defect_rates:
prob_accept = stats.binom.cdf(accept_threshold, sample_size, rate)
risk_level = "✅" if prob_accept > 0.95 else "⚠️" if prob_accept > 0.5 else "❌"
print(f" Defect rate {rate:.1%}: P(Accept)={prob_accept:.3f} {risk_level}")
def stock_returns_scenario(self):
"""
Stock market returns using Normal and Log-normal distributions
"""
print("\n📈 STOCK MARKET RETURNS ANALYSIS")
print("=" * 60)
# Daily returns parameters (historical S&P 500)
daily_mean = 0.0003 # 0.03% daily return
daily_std = 0.01 # 1% daily volatility
print(f"Daily mean return: {daily_mean:.2%}")
print(f"Daily volatility: {daily_std:.1%}")
# Annual projections (252 trading days)
annual_mean = daily_mean * 252
annual_std = daily_std * np.sqrt(252)
print(f"\nAnnualized return: {annual_mean:.1%}")
print(f"Annualized volatility: {annual_std:.1%}")
# Probability calculations
print("\n🎲 Probability Analysis")
# Probability of loss in a day
prob_daily_loss = stats.norm.cdf(0, daily_mean, daily_std)
print(f"P(Daily loss): {prob_daily_loss:.1%}")
# Probability of >2% gain in a day
prob_big_gain = 1 - stats.norm.cdf(0.02, daily_mean, daily_std)
print(f"P(Daily gain > 2%): {prob_big_gain:.2%}")
# Value at Risk (VaR)
confidence_levels = [0.95, 0.99]
print("\n💰 Value at Risk (VaR) - $1M portfolio")
for confidence in confidence_levels:
var_daily = stats.norm.ppf(1-confidence, daily_mean, daily_std)
var_annual = stats.norm.ppf(1-confidence, annual_mean, annual_std)
print(f"\n{confidence:.0%} confidence level:")
print(f" Daily VaR: ${-var_daily * 1000000:,.0f}")
print(f" Annual VaR: ${-var_annual * 1000000:,.0f}")
# Black Swan events
print("\n🦢 Black Swan Events (>3σ moves)")
black_swan_threshold = 3 * daily_std
prob_black_swan = 2 * (1 - stats.norm.cdf(black_swan_threshold, 0, daily_std))
print(f"Probability of >3σ move: {prob_black_swan:.4%}")
print(f"Expected frequency: Once every {1/prob_black_swan:.0f} days")
print(f"Or about {252*prob_black_swan:.1f} times per year")
def website_traffic_scenario(self):
"""
Website traffic modeling using multiple distributions
"""
print("\n🌐 WEBSITE TRAFFIC ANALYSIS")
print("=" * 60)
# Traffic parameters
avg_visitors_per_hour = 500
avg_page_views_per_visitor = 4
conversion_rate = 0.02
print(f"Average visitors/hour: {avg_visitors_per_hour}")
print(f"Average pages/visitor: {avg_page_views_per_visitor}")
print(f"Conversion rate: {conversion_rate:.1%}")
# Simulate one day (24 hours)
print("\n📊 Daily Traffic Simulation")
hourly_visitors = []
for hour in range(24):
# Traffic varies by hour (peak at noon and evening)
if 9 <= hour <= 11 or 19 <= hour <= 21:
hour_rate = avg_visitors_per_hour * 1.5 # Peak hours
elif 0 <= hour <= 6:
hour_rate = avg_visitors_per_hour * 0.2 # Night hours
else:
hour_rate = avg_visitors_per_hour
visitors = np.random.poisson(hour_rate)
hourly_visitors.append(visitors)
total_visitors = sum(hourly_visitors)
print(f"Total daily visitors: {total_visitors:,}")
# Page views (Negative Binomial - overdispersed count)
total_page_views = sum(
np.random.negative_binomial(avg_page_views_per_visitor, 0.5)
for _ in range(total_visitors)
)
print(f"Total page views: {total_page_views:,}")
print(f"Pages per visitor: {total_page_views/total_visitors:.1f}")
# Conversions (Binomial)
conversions = np.random.binomial(total_visitors, conversion_rate)
print(f"Conversions: {conversions}")
print(f"Actual rate: {conversions/total_visitors:.2%}")
# Server capacity planning
print("\n🖥️ Server Capacity Planning")
peak_hour_visitors = max(hourly_visitors)
print(f"Peak hour visitors: {peak_hour_visitors}")
# Concurrent users (roughly 10% of hourly at any moment)
concurrent_factor = 0.1
peak_concurrent = int(peak_hour_visitors * concurrent_factor)
# Add safety margin (99th percentile)
safety_concurrent = stats.poisson.ppf(0.99, peak_concurrent)
print(f"Expected peak concurrent users: {peak_concurrent}")
print(f"99th percentile concurrent: {safety_concurrent:.0f}")
print(f"\n✅ Recommendation: Size servers for {safety_concurrent:.0f} concurrent users")
# Run all scenarios
app = PracticalApplications()
app.customer_service_scenario()
app.quality_control_scenario()
app.stock_returns_scenario()
app.website_traffic_scenario()
Interactive Probability Calculator 🧮
Normal Distribution Calculator
Common Pitfalls and Best Practices 🚧
⚠️ Common Mistakes to Avoid
- Assuming Normal: Not everything follows a normal distribution! Test your assumptions.
- Ignoring Sample Size: Small samples may not represent the true distribution.
- Confusing Parameters: Mean ≠ Median ≠ Mode (except in normal distribution)
- Misusing Poisson: Events must be independent with constant rate
- Forgetting Bounds: Can't have negative heights or probabilities > 1
Summary: Your Distribution Toolkit ✅
🎯 Discrete Distributions
When to use: Counting things
- Binomial: Fixed trials, yes/no
- Poisson: Rate over time/space
- Geometric: Trials until success
- Negative Binomial: Overdispersed counts
🌊 Continuous Distributions
When to use: Measuring things
- Normal: Natural phenomena, CLT
- Exponential: Time between events
- Uniform: Equal probability
- Beta: Probabilities, rates
# Quick Reference Card - Probability Distributions
import numpy as np
from scipy import stats
# DISCRETE DISTRIBUTIONS
# ---------------------
# Binomial: n trials, probability p
n, p = 10, 0.3
binomial_sample = np.random.binomial(n, p, size=1000)
binomial_pmf = stats.binom.pmf(k=3, n=n, p=p) # P(X=3)
binomial_cdf = stats.binom.cdf(k=3, n=n, p=p) # P(X≤3)
# Poisson: average rate λ
lambda_rate = 5
poisson_sample = np.random.poisson(lambda_rate, size=1000)
poisson_pmf = stats.poisson.pmf(k=3, mu=lambda_rate)
# Geometric: probability p
p_success = 0.2
geometric_sample = np.random.geometric(p_success, size=1000)
geometric_pmf = stats.geom.pmf(k=5, p=p_success)
# CONTINUOUS DISTRIBUTIONS
# -----------------------
# Normal: mean μ, std σ
mu, sigma = 0, 1
normal_sample = np.random.normal(mu, sigma, size=1000)
normal_pdf = stats.norm.pdf(x=1.5, loc=mu, scale=sigma)
normal_cdf = stats.norm.cdf(x=1.5, loc=mu, scale=sigma)
z_score = (1.5 - mu) / sigma
# Exponential: rate λ
lambda_exp = 2
exponential_sample = np.random.exponential(1/lambda_exp, size=1000)
exponential_pdf = stats.expon.pdf(x=0.5, scale=1/lambda_exp)
# Uniform: range [a, b]
a, b = 0, 10
uniform_sample = np.random.uniform(a, b, size=1000)
uniform_pdf = stats.uniform.pdf(x=5, loc=a, scale=b-a)
# Beta: shape parameters α, β
alpha, beta_param = 2, 5
beta_sample = np.random.beta(alpha, beta_param, size=1000)
beta_pdf = stats.beta.pdf(x=0.3, a=alpha, b=beta_param)
# KEY FUNCTIONS
# ------------
# .rvs() - Random variates (sampling)
# .pmf() - Probability mass function (discrete)
# .pdf() - Probability density function (continuous)
# .cdf() - Cumulative distribution function
# .ppf() - Percent point function (inverse CDF)
# .mean() - Expected value
# .var() - Variance
# .std() - Standard deviation
print("🎲 Master probability distributions for better predictions!")