Skip to main content

Common Pitfalls and Best Practices 🚧

# Horizontal rule printed under both section headings.
_SECTION_RULE = "=" * 60

# One entry per pitfall, in display order: (headline, what goes wrong, remedy).
# Each headline carries its own leading newline so pitfalls are separated by a
# blank line in the output.
_PITFALLS = (
    (
        "\n❌ Pitfall 1: Assuming Linear Relationships",
        "Pearson correlation only captures LINEAR relationships",
        "✅ Solution: Use Spearman for monotonic, visualize first",
    ),
    (
        "\n❌ Pitfall 2: Outlier Influence",
        "Single outliers can drastically change correlation",
        "✅ Solution: Check scatter plots, use robust methods",
    ),
    (
        "\n❌ Pitfall 3: Restriction of Range",
        "Selecting subset of data can hide correlations",
        "✅ Solution: Analyze full range of data",
    ),
    (
        "\n❌ Pitfall 4: Small Sample Sizes",
        "Correlations unstable with n < 30",
        "✅ Solution: Report confidence intervals, use bootstrap",
    ),
    (
        "\n❌ Pitfall 5: Multiple Comparisons",
        "Testing many correlations increases false positives",
        "✅ Solution: Apply Bonferroni correction",
    ),
)

# --- Section 1: common pitfalls -------------------------------------------
print("⚠️ COMMON CORRELATION PITFALLS")
print(_SECTION_RULE)

for headline, problem, remedy in _PITFALLS:
    # One call per pitfall; sep="\n" yields the same three output lines as
    # three separate print() calls would.
    print(headline, problem, remedy, sep="\n")

# --- Section 2: best practices --------------------------------------------
print("\n✨ CORRELATION BEST PRACTICES")
print(_SECTION_RULE)

# Numbered checklist; the numbering is part of each string.
best_practices = [
    "1. ALWAYS visualize first (scatter plots)",
    "2. Check assumptions (linearity, normality for Pearson)",
    "3. Report confidence intervals, not just point estimates",
    "4. Consider both Pearson and Spearman",
    "5. Look for non-linear patterns",
    "6. Check for outliers and influential points",
    "7. Be cautious about causation claims",
    "8. Consider confounding variables",
    "9. Report effect sizes (R²) not just significance",
    "10. Validate findings with different datasets",
]

# Each item is indented by three spaces and printed on its own line.
print(*(f"   {entry}" for entry in best_practices), sep="\n")

Method Comparison Table 📊

| Method | Best For | Assumptions | Pros | Cons |
|---|---|---|---|---|
| Pearson's r | Linear relationships | Normal distribution, linear | Interpretable, widely used | Misses non-linear patterns |
| Spearman's ρ | Monotonic relationships | Ordinal scale sufficient | Robust to outliers | Less power than Pearson |
| Kendall's τ | Small samples, ties | Ordinal data | Handles ties well | Computationally intensive |
| Point-Biserial | Binary vs continuous | One dichotomous variable | Tests group differences | Limited to binary |
| Partial Correlation | Controlling confounders | Linear relationships | Removes confounding | Complex interpretation |

Summary: Your Correlation Toolkit ✅

🎯 Key Takeaways:

Quick Formulas:
Pearson: r = Σ[(xi - x̄)(yi - ȳ)] / √[Σ(xi - x̄)² × Σ(yi - ȳ)²]
R² = r² (proportion of variance explained)
t = r√(n-2) / √(1-r²) (significance test; t-distribution with n-2 degrees of freedom)
# Quick Reference Card - Correlation Analysis
# NOTE: this card is a collection of independent snippets rather than a
# runnable script. The names x, y, df, binary_var and continuous_var are not
# defined anywhere in this card and must be supplied by the reader.
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

# PEARSON CORRELATION
from scipy.stats import pearsonr
r, p_value = pearsonr(x, y)
# CI using Fisher transformation
# Steps: map r to z-space with arctanh, build a normal-theory interval there
# using standard error 1/sqrt(n - 3), then map both endpoints back with tanh.
z = np.arctanh(r)
se = 1/np.sqrt(len(x)-3)
# 1.96 is the two-sided 95% critical value of the standard normal.
ci = np.tanh([z - 1.96*se, z + 1.96*se])

# SPEARMAN CORRELATION
from scipy.stats import spearmanr
# Note: p_value is rebound here (and again below); keep a copy of the Pearson
# p-value first if it is still needed.
rho, p_value = spearmanr(x, y)

# KENDALL'S TAU
from scipy.stats import kendalltau
tau, p_value = kendalltau(x, y)

# CORRELATION MATRIX
corr_matrix = df.corr(method='pearson')  # or 'spearman', 'kendall'

# PARTIAL CORRELATION
# Using pingouin library
import pingouin as pg
# 'x', 'y' and 'z' are column names in df; 'z' is the covariate passed as
# covar.
partial_corr = pg.partial_corr(data=df, x='x', y='y', covar='z')

# VISUALIZATION
# Scatter plot matrix
pd.plotting.scatter_matrix(df, figsize=(10, 10))

# Correlation heatmap
plt.figure(figsize=(10, 8))
# vmin/vmax/center pin the colour scale to the full [-1, 1] correlation range
# with 0 at the midpoint of the diverging colormap.
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', 
            vmin=-1, vmax=1, center=0, square=True)

# Pair plot
sns.pairplot(df, kind='scatter', diag_kind='kde')

# POINT-BISERIAL (binary vs continuous)
from scipy.stats import pointbiserialr
# Note: this rebinds r and p_value, overwriting the Pearson results above.
r, p_value = pointbiserialr(binary_var, continuous_var)

# MULTIPLE TESTING CORRECTION
from statsmodels.stats.multitest import multipletests
# The [...] below is a placeholder (a list holding the Ellipsis object);
# replace it with the actual p-values before calling multipletests.
p_values = [...]  # List of p-values from multiple tests
# Only the first two of the four returned values are kept: the per-test
# reject flags and the adjusted p-values.
reject, p_adjusted, _, _ = multipletests(p_values, method='bonferroni')

# BOOTSTRAP CONFIDENCE INTERVAL
def bootstrap_correlation(x, y, n_bootstrap=1000, ci=95, *, random_state=None):
    """Return a percentile-bootstrap confidence interval for Pearson's r.

    The paired observations are resampled with replacement ``n_bootstrap``
    times, Pearson's r is computed on every resample, and the central ``ci``
    percent of those values is returned.

    Args:
        x: One-dimensional sequence of numeric observations. Anything
            ``np.asarray`` accepts works (list, tuple, NumPy array,
            pandas Series, ...).
        y: Observations paired with ``x``; must have the same length.
        n_bootstrap: Number of bootstrap resamples to draw (at least 1).
        ci: Confidence level in percent, between 0 and 100.
        random_state: Optional integer seed for reproducible resampling.
            When ``None`` (the default) NumPy's global random state is used.

    Returns:
        A NumPy array ``[lower, upper]`` holding the ``(100 - ci) / 2`` and
        ``100 - (100 - ci) / 2`` percentiles of the resampled correlations.

    Raises:
        ValueError: If ``x`` and ``y`` are not one-dimensional sequences of
            equal length, if ``n_bootstrap`` is smaller than 1, or if ``ci``
            lies outside ``[0, 100]``.
    """
    # Coerce to plain arrays so that ``x[idx]`` below is always positional
    # fancy indexing: a list rejects an index array outright, and a pandas
    # Series would look the sampled positions up as index labels.
    x = np.asarray(x)
    y = np.asarray(y)
    if x.ndim != 1 or y.ndim != 1 or x.shape != y.shape:
        raise ValueError(
            "x and y must be one-dimensional sequences of equal length; "
            f"got shapes {x.shape} and {y.shape}"
        )
    if n_bootstrap < 1:
        raise ValueError(f"n_bootstrap must be at least 1, got {n_bootstrap}")
    if not 0 <= ci <= 100:
        raise ValueError(f"ci must be between 0 and 100, got {ci}")

    # Without a seed, draw from the module-level generator; with one, use a
    # private generator so the caller's global random state is left untouched.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    n = len(x)
    correlations = []
    for _ in range(n_bootstrap):
        # Resample whole (x, y) pairs: the same positions index both arrays.
        idx = rng.choice(n, n, replace=True)
        correlations.append(pearsonr(x[idx], y[idx])[0])

    tail = (100 - ci) / 2
    return np.percentile(correlations, [tail, 100 - tail])

# Remember:
# • Visualize before correlating
# • Check assumptions for each method
# • Report CI and effect size (R²)
# • Consider confounding variables
# • Correlation does not imply causation!

# Closing line shown at the end of the reference card.
_CLOSING_MESSAGE = "🔗 Master correlation analysis for better insights!"
print(_CLOSING_MESSAGE)