Master the fundamentals of A/B testing, from hypothesis formulation to result interpretation. Learn how to design, run, and analyze experiments to make data-driven decisions with confidence.
Conversion Rate
n = 1000 users
Conversion Rate
n = 1000 users
A/B testing (also known as split testing) is a method of comparing two versions of something to determine which performs better.
Total: 7,684 participants
Estimated Duration: 8 days (at 1000 visitors/day)
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.power import tt_solve_power
from statsmodels.stats.proportion import proportions_ztest
class ABTest:
    """
    Complete A/B Testing Framework

    Compares binary (0/1) outcomes of a control and a treatment group with a
    two-proportion z-test, and provides helpers for sample-size planning,
    power calculation and plotting.
    """

    def __init__(self, control_data, treatment_data, alpha=0.05):
        """
        Initialize A/B test with data

        Parameters:
        -----------
        control_data : array-like
            Binary outcomes for control group (0 or 1)
        treatment_data : array-like
            Binary outcomes for treatment group (0 or 1)
        alpha : float
            Significance level (default: 0.05)

        Raises:
        -------
        ValueError : If either group contains no observations
        """
        self.control = np.array(control_data)
        self.treatment = np.array(treatment_data)
        self.alpha = alpha

        # Every statistic below divides by the group sizes, so empty groups
        # would only yield NaNs; fail early with a clear message instead.
        if self.control.size == 0 or self.treatment.size == 0:
            raise ValueError("control_data and treatment_data must be non-empty")

        # Calculate basic statistics
        self.n_control = len(self.control)
        self.n_treatment = len(self.treatment)
        self.p_control = np.mean(self.control)
        self.p_treatment = np.mean(self.treatment)

    def calculate_sample_size(self, baseline_rate, mde, alpha=0.05, power=0.8):
        """
        Calculate required sample size for desired power

        Uses the normal approximation for comparing two independent
        proportions with equal group sizes, based on Cohen's h:
        n = 2 * ((z_{1 - alpha/2} + z_{power}) / h) ** 2 per group.

        Parameters:
        -----------
        baseline_rate : float
            Expected conversion rate in control (0-1)
        mde : float
            Minimum detectable effect (relative change)
        alpha : float
            Significance level
        power : float
            Statistical power (1 - Type II error rate)

        Returns:
        --------
        int : Required sample size per group

        Raises:
        -------
        ValueError : If alpha or power is not strictly between 0 and 1, if
            either conversion rate falls outside [0, 1], or if the effect
            size is zero (no finite sample size can detect it)
        """
        if not 0 < alpha < 1:
            raise ValueError("alpha must be strictly between 0 and 1")
        if not 0 < power < 1:
            raise ValueError("power must be strictly between 0 and 1")

        p1 = baseline_rate
        p2 = baseline_rate * (1 + mde)
        if not (0 <= p1 <= 1 and 0 <= p2 <= 1):
            raise ValueError(
                "baseline_rate and baseline_rate * (1 + mde) must lie in [0, 1]"
            )

        # Cohen's h = 2 * arcsin(√p2) - 2 * arcsin(√p1)
        h = 2 * (np.arcsin(np.sqrt(p2)) - np.arcsin(np.sqrt(p1)))
        if h == 0:
            raise ValueError("mde must produce a non-zero effect size")

        # Two-sided test: the sign of h does not matter because it is squared.
        z_alpha = stats.norm.ppf(1 - alpha / 2)
        z_power = stats.norm.ppf(power)
        n = 2 * ((z_alpha + z_power) / h) ** 2
        return int(np.ceil(n))

    def run_test(self):
        """
        Run the A/B test and return results

        Returns:
        --------
        dict : Test results with keys 'control_rate', 'treatment_rate',
            'relative_uplift' (percent; NaN when the control rate is 0),
            'absolute_difference', 'p_value', 'z_statistic',
            'is_significant', 'ci_control', 'ci_treatment' and
            'sample_sizes'
        """
        # Perform z-test for proportions. Control is passed first, so the
        # z-statistic is based on (control rate - treatment rate).
        successes = [self.control.sum(), self.treatment.sum()]
        samples = [self.n_control, self.n_treatment]
        z_stat, p_value = proportions_ztest(successes, samples)

        # Calculate confidence intervals (normal approximation per group)
        se_control = np.sqrt(self.p_control * (1 - self.p_control) / self.n_control)
        se_treatment = np.sqrt(
            self.p_treatment * (1 - self.p_treatment) / self.n_treatment
        )
        z_critical = stats.norm.ppf(1 - self.alpha / 2)
        ci_control = [
            self.p_control - z_critical * se_control,
            self.p_control + z_critical * se_control,
        ]
        ci_treatment = [
            self.p_treatment - z_critical * se_treatment,
            self.p_treatment + z_critical * se_treatment,
        ]

        # Calculate relative uplift; undefined when the control never converts
        absolute_difference = self.p_treatment - self.p_control
        if self.p_control > 0:
            uplift = absolute_difference / self.p_control * 100
        else:
            uplift = float('nan')

        # Determine statistical significance
        is_significant = p_value < self.alpha

        results = {
            'control_rate': self.p_control,
            'treatment_rate': self.p_treatment,
            'relative_uplift': uplift,
            'absolute_difference': absolute_difference,
            'p_value': p_value,
            'z_statistic': z_stat,
            'is_significant': is_significant,
            'ci_control': ci_control,
            'ci_treatment': ci_treatment,
            'sample_sizes': {
                'control': self.n_control,
                'treatment': self.n_treatment,
            },
        }
        return results

    def calculate_power(self):
        """
        Calculate the statistical power of the test

        The effect size is Cohen's h computed from the observed conversion
        rates, so this is the power for an effect of the observed size.

        Returns:
        --------
        float : Statistical power (probability of detecting true effect)
        """
        # Calculate effect size from observed data
        h = 2 * (np.arcsin(np.sqrt(self.p_treatment)) -
                 np.arcsin(np.sqrt(self.p_control)))

        # Calculate power
        from statsmodels.stats.power import zt_ind_solve_power
        power = zt_ind_solve_power(
            effect_size=h,
            nobs1=self.n_control,
            alpha=self.alpha,
            ratio=self.n_treatment / self.n_control,
            alternative='two-sided'
        )
        return power

    def plot_results(self, figsize=(12, 6)):
        """
        Visualize A/B test results

        Draws three panels: conversion rates with confidence intervals, a
        bootstrap distribution of the difference in rates, and the p-value
        recomputed on growing prefixes of the data. Calls plt.show().

        Parameters:
        -----------
        figsize : tuple
            Figure size passed to plt.subplots

        Returns:
        --------
        matplotlib Figure : The figure holding the three panels
        """
        results = self.run_test()
        fig, axes = plt.subplots(1, 3, figsize=figsize)

        # 1. Conversion rates with confidence intervals
        ax1 = axes[0]
        groups = ['Control', 'Treatment']
        rates = [self.p_control, self.p_treatment]
        ci_lower = [results['ci_control'][0], results['ci_treatment'][0]]
        ci_upper = [results['ci_control'][1], results['ci_treatment'][1]]
        # yerr wants distances below/above each bar, not absolute bounds
        errors = [
            [rate - low for rate, low in zip(rates, ci_lower)],
            [high - rate for rate, high in zip(rates, ci_upper)],
        ]
        bars = ax1.bar(groups, rates, yerr=errors, capsize=10,
                       color=['#4CAF50', '#2196F3'], alpha=0.7)
        ax1.set_ylabel('Conversion Rate')
        # Label the interval with the level actually used (from self.alpha)
        confidence_pct = 100 * (1 - self.alpha)
        ax1.set_title(f'Conversion Rates with {confidence_pct:g}% CI')
        ax1.set_ylim(0, max(ci_upper) * 1.2)

        # Add value labels
        for bar, rate in zip(bars, rates):
            ax1.text(bar.get_x() + bar.get_width() / 2, rate + 0.01,
                     f'{rate:.3f}', ha='center', va='bottom')

        # 2. Distribution of differences (bootstrap)
        ax2 = axes[1]
        n_bootstrap = 10000
        bootstrap_diffs = []
        for _ in range(n_bootstrap):
            control_sample = np.random.choice(
                self.control, size=self.n_control, replace=True)
            treatment_sample = np.random.choice(
                self.treatment, size=self.n_treatment, replace=True)
            bootstrap_diffs.append(treatment_sample.mean() - control_sample.mean())
        ax2.hist(bootstrap_diffs, bins=50, alpha=0.7, color='purple',
                 edgecolor='black')
        ax2.axvline(0, color='red', linestyle='--', label='No difference')
        ax2.axvline(self.p_treatment - self.p_control, color='green',
                    linestyle='-', linewidth=2, label='Observed difference')
        ax2.set_xlabel('Difference in Conversion Rate')
        ax2.set_ylabel('Frequency')
        ax2.set_title('Bootstrap Distribution of Differences')
        ax2.legend()

        # 3. Sequential analysis / p-value over time
        ax3 = axes[2]
        # Prefixes can only be as long as the smaller group, so build the
        # grid from that size; np.unique drops duplicates for tiny samples.
        n_shared = min(self.n_control, self.n_treatment)
        sample_sizes = np.unique(
            np.linspace(min(100, n_shared), n_shared, 20).astype(int))
        p_values = []
        for n in sample_sizes:
            temp_test = ABTest(self.control[:n], self.treatment[:n], self.alpha)
            p_values.append(temp_test.run_test()['p_value'])
        ax3.plot(sample_sizes, p_values, 'b-', linewidth=2)
        ax3.axhline(self.alpha, color='red', linestyle='--',
                    label=f'α = {self.alpha}')
        ax3.set_xlabel('Sample Size')
        ax3.set_ylabel('P-value')
        ax3.set_title('P-value Evolution')
        ax3.legend()
        ax3.set_ylim(0, 1)

        plt.tight_layout()
        plt.show()
        return fig
# Example usage: simulate one experiment and analyse it end to end.
np.random.seed(42)  # fixed seed so the simulated outcomes are reproducible

# Simulation settings
n_control = 1000
n_treatment = 1000
p_control_true = 0.10  # 10% conversion rate
p_treatment_true = 0.12  # 12% conversion rate (20% lift)

# Draw one Bernoulli outcome per user (control first, then treatment)
control_data = np.random.binomial(1, p_control_true, n_control)
treatment_data = np.random.binomial(1, p_treatment_true, n_treatment)

# Analyse the simulated experiment
ab_test = ABTest(control_data, treatment_data)
results = ab_test.run_test()

# Report the headline numbers followed by the per-group intervals
print(
    "A/B Test Results",
    "=" * 50,
    f"Control Conversion Rate: {results['control_rate']:.3%}",
    f"Treatment Conversion Rate: {results['treatment_rate']:.3%}",
    f"Relative Uplift: {results['relative_uplift']:.1f}%",
    f"P-value: {results['p_value']:.4f}",
    f"Statistically Significant: {results['is_significant']}",
    "\nConfidence Intervals (95%):",
    "Control: [{:.3%}, {:.3%}]".format(*results['ci_control']),
    "Treatment: [{:.3%}, {:.3%}]".format(*results['ci_treatment']),
    sep="\n",
)

# Sample size needed per group to detect a 20% relative lift on a 10% baseline
required_n = ab_test.calculate_sample_size(
    baseline_rate=0.10,
    mde=0.20,  # 20% minimum detectable effect
    alpha=0.05,
    power=0.80,
)
print(f"\nRequired sample size: {required_n} per group")

# Show the three-panel summary figure
ab_test.plot_results()
Monitoring tests over time requires careful consideration to avoid false positives.
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
class SequentialABTest:
    """
    Sequential testing with alpha spending function

    Evaluates a two-proportion z-test at a series of interim looks and
    compares the statistic against O'Brien-Fleming-shaped and Pocock
    critical values.
    """

    # Two-sided Pocock critical values for an overall alpha of 0.05, keyed by
    # the total number of planned looks K. They do not adapt to self.alpha.
    _POCOCK_CONSTANTS = {
        1: 1.960, 2: 2.178, 3: 2.289, 4: 2.361, 5: 2.413,
        6: 2.453, 7: 2.485, 8: 2.512, 9: 2.535, 10: 2.555,
        20: 2.672,
    }

    def __init__(self, alpha=0.05, max_samples=10000):
        """
        Parameters:
        -----------
        alpha : float
            Overall two-sided significance level
        max_samples : int
            Planned maximum sample size per group; the information fraction
            at a look is computed relative to this value
        """
        self.alpha = alpha
        self.max_samples = max_samples
        # Not written to by any method of this class.
        self.results_history = []

    def obrien_fleming_boundary(self, information_fraction):
        """
        O'Brien-Fleming alpha spending function
        More conservative early, preserves Type I error

        Returns the fixed-sample two-sided critical value divided by the
        square root of the information fraction, so the boundary equals the
        fixed-sample value at information_fraction = 1 and grows as the
        fraction shrinks.

        Parameters:
        -----------
        information_fraction : float
            Fraction of the planned sample observed so far (expected > 0)

        Returns:
        --------
        float : Critical |z| value at this look
        """
        z_alpha = stats.norm.ppf(1 - self.alpha / 2)
        return z_alpha / np.sqrt(information_fraction)

    def pocock_boundary(self, k, K):
        """
        Pocock boundary - constant boundary
        k: current look, K: total planned looks

        The value depends only on K (k is accepted for symmetry with the
        look index). Values are tabulated for alpha = 0.05; for a K that is
        not in the table, 2.5 is returned as a rough fallback.

        Returns:
        --------
        float : Critical |z| value, identical at every look
        """
        return self._POCOCK_CONSTANTS.get(K, 2.5)

    def run_sequential_test(self, control_stream, treatment_stream,
                            check_points=(500, 1000, 2000, 5000, 10000)):
        """
        Run sequential test with specified check points

        Parameters:
        -----------
        control_stream : array-like
            Binary outcomes for control, in arrival order
        treatment_stream : array-like
            Binary outcomes for treatment, in arrival order
        check_points : sequence of int
            Per-group sample sizes at which to analyse the data. A check
            point beyond the available data is analysed on all available
            data and ends the run.

        Returns:
        --------
        list of dict : One entry per look performed, with keys 'n',
            'z_stat', 'p_value', 'z_crit_of', 'z_crit_pocock', 'reject_of',
            'reject_pocock', 'p_control' and 'p_treatment'. Stops after the
            first look that crosses the O'Brien-Fleming boundary.

        Raises:
        -------
        ValueError : If a stream is empty or a check point is not positive
        """
        # Accept plain lists as well as arrays (.sum() is used below)
        control_stream = np.asarray(control_stream)
        treatment_stream = np.asarray(treatment_stream)
        n_available = min(len(control_stream), len(treatment_stream))
        if n_available == 0:
            raise ValueError(
                "control_stream and treatment_stream must be non-empty")

        total_looks = len(check_points)
        results = []
        for look, requested_n in enumerate(check_points, start=1):
            if requested_n <= 0:
                raise ValueError("check points must be positive")

            # Use the number of observations actually present so the pooled
            # rate and standard error are not computed with a too-large n.
            n = min(requested_n, n_available)

            # Get data up to checkpoint
            control_data = control_stream[:n]
            treatment_data = treatment_stream[:n]

            # Calculate test statistics
            p_c = np.mean(control_data)
            p_t = np.mean(treatment_data)

            # Pooled standard error
            p_pooled = (control_data.sum() + treatment_data.sum()) / (2 * n)
            se = np.sqrt(p_pooled * (1 - p_pooled) * (2 / n))

            # Z-statistic; with zero pooled variance both rates are equal,
            # so report no difference rather than 0/0.
            z_stat = 0.0 if se == 0 else (p_t - p_c) / se

            # Information fraction, capped at 1 so the boundary never drops
            # below the fixed-sample critical value
            info_frac = min(1.0, n / self.max_samples)

            # Critical values
            z_crit_of = self.obrien_fleming_boundary(info_frac)
            z_crit_pocock = self.pocock_boundary(look, total_looks)

            # P-value (two-sided); sf keeps precision for large |z|
            p_value = 2 * stats.norm.sf(abs(z_stat))

            results.append({
                'n': n,
                'z_stat': z_stat,
                'p_value': p_value,
                'z_crit_of': z_crit_of,
                'z_crit_pocock': z_crit_pocock,
                'reject_of': bool(abs(z_stat) > z_crit_of),
                'reject_pocock': bool(abs(z_stat) > z_crit_pocock),
                'p_control': p_c,
                'p_treatment': p_t
            })

            # Early stopping
            if results[-1]['reject_of']:
                print(f"Early stopping at n={n} (O'Brien-Fleming)")
                break

            # All data has been analysed; later looks could add nothing
            if requested_n > n_available:
                break
        return results

    def plot_sequential_results(self, results):
        """
        Visualize sequential testing boundaries and results

        Parameters:
        -----------
        results : list of dict
            Output of run_sequential_test (must contain at least one look)

        Returns:
        --------
        matplotlib Figure : The figure holding both panels

        Raises:
        -------
        ValueError : If results is empty
        """
        if not results:
            raise ValueError("results must contain at least one look")

        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))

        # Extract data
        n_values = [r['n'] for r in results]
        z_stats = [r['z_stat'] for r in results]
        z_crit_of = [r['z_crit_of'] for r in results]
        z_crit_pocock = [r['z_crit_pocock'] for r in results]
        neg_crit_of = [-z for z in z_crit_of]
        neg_crit_pocock = [-z for z in z_crit_pocock]

        # Plot 1: Z-statistics with boundaries
        ax1.plot(n_values, z_stats, 'b-', marker='o', label='Z-statistic',
                 linewidth=2)
        ax1.plot(n_values, z_crit_of, 'r--', label="O'Brien-Fleming",
                 linewidth=2)
        ax1.plot(n_values, neg_crit_of, 'r--', linewidth=2)
        ax1.plot(n_values, z_crit_pocock, 'g--', label='Pocock', linewidth=2)
        ax1.plot(n_values, neg_crit_pocock, 'g--', linewidth=2)
        ax1.axhline(y=0, color='gray', linestyle='-', alpha=0.3)
        ax1.fill_between(n_values, z_crit_of, neg_crit_of,
                         alpha=0.1, color='red')
        ax1.set_xlabel('Sample Size')
        ax1.set_ylabel('Z-statistic')
        ax1.set_title('Sequential Testing Boundaries')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Plot 2: P-values over time, against the configured alpha
        p_values = [r['p_value'] for r in results]
        ax2.plot(n_values, p_values, 'b-', marker='o', linewidth=2)
        ax2.axhline(y=self.alpha, color='red', linestyle='--',
                    label=f'α = {self.alpha}', linewidth=2)
        ax2.set_xlabel('Sample Size')
        ax2.set_ylabel('P-value')
        ax2.set_title('P-value Evolution')
        ax2.legend()
        ax2.grid(True, alpha=0.3)
        ax2.set_ylim(0, min(1, max(p_values) * 1.2))

        plt.tight_layout()
        plt.show()
        return fig
# Example: simulate a small true effect and analyse it at interim looks.
np.random.seed(42)  # fixed seed so the simulated streams are reproducible

# Simulated per-user outcomes, in arrival order
n_total = 10000
p_control = 0.10
p_treatment = 0.11  # Small effect
control_stream = np.random.binomial(1, p_control, n_total)
treatment_stream = np.random.binomial(1, p_treatment, n_total)

# Analyse at seven planned looks, up to the full sample
seq_test = SequentialABTest(alpha=0.05, max_samples=n_total)
seq_results = seq_test.run_sequential_test(
    control_stream,
    treatment_stream,
    check_points=[500, 1000, 2000, 3000, 5000, 7000, 10000],
)

# One summary line per look that was actually performed
print("\nSequential Testing Results:", "=" * 70, sep="\n")
for r in seq_results:
    print(
        "n={n:5d}: z={z_stat:6.3f}, p={p_value:.4f}, "
        "OF boundary={z_crit_of:.3f}, ".format(**r)
        + f"Reject={'Yes' if r['reject_of'] else 'No'}"
    )

# Boundaries and p-values across the looks
seq_test.plot_sequential_results(seq_results)
Continuously checking results until significance is found increases the false positive rate.
Solution: Pre-specify the analysis plan and use sequential testing methods.
Running tests without an adequate sample size leads to false negatives.
Solution: Calculate the required sample size before starting the test.
Non-random assignment compromises the validity of results.
Solution: Use proper randomization and check for balanced groups.
Testing multiple metrics increases the chance of false discoveries.
Solution: Apply a Bonferroni correction or control the FDR.
Initial positive results may fade as users adapt to changes.
Solution: Run tests longer and monitor for stability.
Treatment affects the control group through network effects.
Solution: Use cluster randomization or geo-experiments.
You're testing whether changing the "Add to Cart" button from green (control) to orange (treatment) increases conversion rates.
scipy.stats.ttest_ind() - Two-sample t-test
statsmodels.stats.proportion.proportions_ztest() - Proportions test
statsmodels.stats.power.tt_solve_power() - Power analysis (one-sample / paired t-test)
scipy.stats.chi2_contingency() - Chi-square test
scipy.stats.fisher_exact() - Fisher's exact test