# Report five common correlation pitfalls. Each entry carries the three
# lines printed for one pitfall: headline, explanation, recommended remedy.
print("⚠️ COMMON CORRELATION PITFALLS")
print("=" * 60)

pitfalls = (
    # Pitfall 1: Assuming linearity
    (
        "\n❌ Pitfall 1: Assuming Linear Relationships",
        "Pearson correlation only captures LINEAR relationships",
        "✅ Solution: Use Spearman for monotonic, visualize first",
    ),
    # Pitfall 2: Outliers
    (
        "\n❌ Pitfall 2: Outlier Influence",
        "Single outliers can drastically change correlation",
        "✅ Solution: Check scatter plots, use robust methods",
    ),
    # Pitfall 3: Restriction of range
    (
        "\n❌ Pitfall 3: Restriction of Range",
        "Selecting subset of data can hide correlations",
        "✅ Solution: Analyze full range of data",
    ),
    # Pitfall 4: Sample size
    (
        "\n❌ Pitfall 4: Small Sample Sizes",
        "Correlations unstable with n < 30",
        "✅ Solution: Report confidence intervals, use bootstrap",
    ),
    # Pitfall 5: Multiple testing
    (
        "\n❌ Pitfall 5: Multiple Comparisons",
        "Testing many correlations increases false positives",
        "✅ Solution: Apply Bonferroni correction",
    ),
)

for pitfall_lines in pitfalls:
    # One print per pitfall; sep="\n" puts each of the three parts on its
    # own line, matching three consecutive print() calls.
    print(*pitfall_lines, sep="\n")
# Best Practices
# Print a banner, then the numbered checklist below, one entry per line,
# each prefixed with a single space.
print("\n✨ CORRELATION BEST PRACTICES")
print("=" * 60)
best_practices = [
    "1. ALWAYS visualize first (scatter plots)",
    "2. Check assumptions (linearity, normality for Pearson)",
    "3. Report confidence intervals, not just point estimates",
    "4. Consider both Pearson and Spearman",
    "5. Look for non-linear patterns",
    "6. Check for outliers and influential points",
    "7. Be cautious about causation claims",
    "8. Consider confounding variables",
    "9. Report effect sizes (R²) not just significance",
    "10. Validate findings with different datasets",
]
for practice in best_practices:
    print(f" {practice}")
# Quick Reference Card - Correlation Analysis
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
# PAIRWISE CORRELATION COEFFICIENTS
# Each call below yields (statistic, p-value) for the paired samples x and y.
# `p_value` is rebound by every call, so after this section it holds the
# Kendall result; copy it elsewhere if an earlier one is needed.
from scipy.stats import kendalltau, pearsonr, spearmanr

# PEARSON CORRELATION
r, p_value = pearsonr(x, y)

# CI using Fisher transformation: map r onto the z scale with arctanh, step
# 1.96 standard errors (1 / sqrt(n - 3)) to either side, then map the two
# bounds back onto the correlation scale with tanh.
z = np.arctanh(r)
se = 1 / np.sqrt(len(x) - 3)
margin = 1.96 * se
ci = np.tanh([z - margin, z + margin])

# SPEARMAN CORRELATION
rho, p_value = spearmanr(x, y)

# KENDALL'S TAU
tau, p_value = kendalltau(x, y)
# CORRELATION MATRIX
# Correlations between the columns of df; `method` may also be "spearman"
# or "kendall".
corr_matrix = df.corr(method="pearson")

# PARTIAL CORRELATION
# Using pingouin library: columns "x" and "y" of df, with column "z" passed
# as the covariate.
import pingouin as pg
partial_corr = pg.partial_corr(
    data=df,
    x="x",
    y="y",
    covar="z",
)

# VISUALIZATION
# Scatter plot matrix
pd.plotting.scatter_matrix(df, figsize=(10, 10))

# Correlation heatmap: annotated cells, colour scale pinned to [-1, 1] and
# centred on 0, drawn on a new 10x8 figure.
plt.figure(figsize=(10, 8))
sns.heatmap(
    df.corr(),
    annot=True,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
)

# Pair plot: scatter panels off the diagonal, KDE panels on it.
sns.pairplot(df, kind="scatter", diag_kind="kde")

# POINT-BISERIAL (binary vs continuous)
# NOTE: this rebinds `r` and `p_value`.
from scipy.stats import pointbiserialr
r, p_value = pointbiserialr(binary_var, continuous_var)
# MULTIPLE TESTING CORRECTION
from statsmodels.stats.multitest import multipletests
# One p-value per hypothesis test being corrected together. The previous
# `[...]` placeholder was a list holding the Ellipsis object, which
# multipletests cannot compare against alpha. Here the three correlation
# tests from this reference card are corrected as one family; replace or
# extend the list with the p-values of your own tests.
p_values = [
    pearsonr(x, y)[1],
    spearmanr(x, y)[1],
    kendalltau(x, y)[1],
]
# reject: per-test boolean decisions after correction;
# p_adjusted: the Bonferroni-adjusted p-values.
# The two remaining return values are not used here.
reject, p_adjusted, _, _ = multipletests(p_values, method='bonferroni')
# BOOTSTRAP CONFIDENCE INTERVAL
def bootstrap_correlation(x, y, n_bootstrap=1000, ci=95):
    """Estimate a percentile-bootstrap confidence interval for Pearson's r.

    Paired observations are resampled with replacement ``n_bootstrap``
    times, Pearson's r is computed on every resample, and the bounds of the
    central ``ci`` percent of those values are returned.

    Args:
        x: First variable; a 1-D sequence (list, NumPy array, pandas
            Series, ...) of numeric values.
        y: Second variable, paired with ``x`` and of the same length.
        n_bootstrap: Number of bootstrap resamples to draw.
        ci: Confidence level in percent (e.g. ``95``).

    Returns:
        NumPy array ``[lower, upper]`` holding the percentile bounds.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same length.
    """
    # Resampling relies on positional fancy indexing. Plain lists do not
    # support it, and a pandas Series would look values up by label rather
    # than by position, so coerce both inputs to NumPy arrays first.
    x = np.asarray(x)
    y = np.asarray(y)
    n = len(x)
    if len(y) != n:
        # Without this check a longer y is silently truncated and a shorter
        # y fails later with an IndexError.
        raise ValueError(
            f"x and y must have the same length (got {n} and {len(y)})"
        )

    correlations = []
    for _ in range(n_bootstrap):
        # The same index array is applied to x and y so pairs stay intact.
        idx = np.random.choice(n, n, replace=True)
        correlations.append(pearsonr(x[idx], y[idx])[0])

    # Split the excluded probability mass evenly between the two tails.
    tail = (100 - ci) / 2
    return np.percentile(correlations, [tail, 100 - tail])
# Remember:
# • Visualize before correlating
# • Check assumptions for each method
# • Report CI and effect size (R²)
# • Consider confounding variables
# • Correlation does not imply causation!

# Sign-off message written to stdout.
print("🔗 Master correlation analysis for better insights!")