DataFrames and Series: Your Data's Best Friends
Imagine you're managing a bustling restaurant. You need to track orders (Series: what each table ordered) and manage your entire operation (DataFrame: all tables, orders, times, and payments). That's Pandas! 🍽️ It's like Excel on steroids, giving you the power to manipulate, analyze, and understand data with the elegance of Python. If NumPy is the engine, Pandas is the luxury car built on top of it.
The Pandas Universe: Where Data Comes to Life
Pandas transforms raw data into insights. It's the bridge between messy real-world data and clean analysis. Series handle one-dimensional data (like a single column), while DataFrames manage two-dimensional data (like a spreadsheet). Together, they're the dynamic duo of data analysis!
The Spreadsheet Evolution
Think of Excel as a bicycle and Pandas as a sports car. Both get you there, but Pandas does it with style, speed, and the ability to handle millions of rows without breaking a sweat. Series is like a single column with superpowers, while DataFrame is your entire spreadsheet on steroids!
import pandas as pd
import numpy as np
# Welcome to Pandas: Your Data Analysis Powerhouse!
def series_fundamentals():
    """
    Series: The building block of Pandas.
    Think of it as a supercharged list with an index!

    Walks through Series construction (from a list, a dictionary, and a
    NumPy array with a custom index), then prints the Series' attributes,
    basic aggregations, label- and position-based indexing, and vectorized
    operations. Everything is written to stdout; nothing is returned.
    """
    print("š· Pandas Series Fundamentals")
    print("=" * 60)

    # Creating Series - Multiple Ways
    print("Creating Series:")

    # From list: no index is given, so pandas supplies the default one
    fruits = pd.Series(['apple', 'banana', 'cherry', 'date'])
    print(f"\n1. From list:\n{fruits}")

    # From dictionary: the keys become the index labels
    sales = pd.Series({
        'Monday': 120,
        'Tuesday': 135,
        'Wednesday': 155,
        'Thursday': 142,
        'Friday': 198
    })
    print(f"\n2. From dictionary:\n{sales}")

    # From NumPy array with custom index.
    # The RNG is not seeded in this function, so these values may differ
    # from run to run.
    temperatures = pd.Series(
        np.random.normal(20, 5, 7),
        index=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    )
    print(f"\n3. From NumPy array:\n{temperatures.round(1)}")

    # Series attributes
    print("\nš Series Attributes:")
    print(f" Values: {sales.values}")
    print(f" Index: {sales.index.tolist()}")
    print(f" Shape: {sales.shape}")
    print(f" Size: {sales.size}")
    print(f" Data type: {sales.dtype}")

    # Basic operations
    print("\nš§ Basic Operations:")
    print(f" Sum: {sales.sum()}")
    print(f" Mean: {sales.mean():.2f}")
    print(f" Max: {sales.max()}")
    print(f" Best day: {sales.idxmax()}")  # idxmax returns the label, not the value

    # Indexing and slicing
    print("\nšÆ Indexing and Slicing:")
    print(f" sales['Monday']: {sales['Monday']}")
    # The label shows the expression that is actually evaluated: plain
    # sales[0] on a string-labelled Series is deprecated positional access,
    # so .iloc is the form readers should copy.
    print(f" sales.iloc[0]: {sales.iloc[0]}")  # Position-based
    # Label-based slices include BOTH endpoints (unlike positional slices)
    print(f" sales['Tuesday':'Thursday']:\n{sales['Tuesday':'Thursday']}")

    # Vectorized operations (like NumPy!)
    print("\nā” Vectorized Operations:")
    sales_doubled = sales * 2
    print(f" Sales doubled:\n{sales_doubled}")
    # Boolean mask: keep only the entries where the comparison is True
    above_150 = sales[sales > 150]
    print(f" Days with sales > 150:\n{above_150}")
def dataframe_fundamentals():
    """
    DataFrame: The star of the show!

    Builds three DataFrames -- from a dict of columns, from a list of row
    dicts, and from a NumPy array with explicit row/column labels -- and
    prints each one. It then reports the shape, columns, index, dtypes and
    memory usage of the first frame. Output goes to stdout; returns None.
    """
    print("\nš¶ DataFrame Fundamentals")
    print("=" * 60)

    print("Creating DataFrames:")

    # Route 1: mapping of column name -> list of column values
    catalog = pd.DataFrame({
        'Product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Headphones'],
        'Price': [999.99, 29.99, 79.99, 299.99, 149.99],
        'Stock': [25, 150, 85, 40, 60],
        'Category': ['Electronics', 'Accessories', 'Accessories', 'Electronics', 'Audio'],
    })
    print(f"\n1. From dictionary:\n{catalog}")

    # Route 2: one dict per row; the dict keys become the columns
    people = pd.DataFrame([
        {'name': 'Alice', 'age': 28, 'city': 'NYC'},
        {'name': 'Bob', 'age': 35, 'city': 'LA'},
        {'name': 'Charlie', 'age': 42, 'city': 'Chicago'},
    ])
    print(f"\n2. From list of dicts:\n{people}")

    # Route 3: raw 4x3 array plus explicit row and column labels
    noise = pd.DataFrame(
        np.random.randn(4, 3),
        index=['row1', 'row2', 'row3', 'row4'],
        columns=['A', 'B', 'C'],
    )
    print(f"\n3. From NumPy array:\n{noise.round(2)}")

    # Inspect the first frame
    print("\nš DataFrame Information:")
    overview = (
        ("Shape", catalog.shape),
        ("Columns", catalog.columns.tolist()),
        ("Index", catalog.index.tolist()),
    )
    for label, value in overview:
        print(f"{label}: {value}")

    print("\nData types:")
    print(catalog.dtypes)
    print("\nMemory usage:")
    print(catalog.memory_usage())
# Run both fundamentals demos in order: Series first, then DataFrame.
for _run_demo in (series_fundamentals, dataframe_fundamentals):
    _run_demo()
Series vs DataFrame: Understanding the Difference
Series
- 1-dimensional labeled array
- Single column of data
- Has index and values
- Like an ordered dictionary
- Homogeneous data type
DataFrame
- 2-dimensional labeled table
- Multiple columns (Series)
- Has index and columns
- Like a spreadsheet or SQL table
- Can have mixed data types
import pandas as pd
import numpy as np
class PandasDataStructures:
"""
Master class for understanding Pandas data structures.
Your guide to Series and DataFrames!
"""
def __init__(self):
np.random.seed(42)
def series_deep_dive(self):
    """
    Demonstrate the richer Series toolkit.

    Builds a 30-entry date-indexed Series of random integers and prints its
    head, summary statistics, 7-entry rolling mean and cumulative sum, then
    shows value counts and ``.str`` string operations on small example
    Series. Output goes to stdout; returns None.
    """
    print("š Series Deep Dive")
    print("=" * 60)

    # Date-indexed sales figures drawn from NumPy's global RNG
    sales_by_day = pd.Series(
        np.random.randint(100, 500, 30),
        index=pd.date_range('2024-01-01', periods=30),
        name='Daily Sales',
    )
    print("Time Series Example:")
    print(sales_by_day.head())

    print("\nš Powerful Series Methods:")

    # Summary statistics
    summary = sales_by_day.describe().round(2)
    print(f"\nDescribe:\n{summary}")

    # Rolling mean over a 7-entry window; show only the tail
    weekly_mean_tail = sales_by_day.rolling(window=7).mean().tail().round(2)
    print(f"\n7-day Rolling Average (last 5 days):\n{weekly_mean_tail}")

    # Running total
    running_total_tail = sales_by_day.cumsum().tail()
    print(f"\nCumulative Sales (last 5 days):\n{running_total_tail}")

    # Frequency of each distinct label
    letters = pd.Series(list('ABACBACCBA'))
    print(f"\nValue Counts:\n{letters.value_counts()}")

    # Vectorized string methods via the .str accessor
    people = pd.Series(['alice', 'bob', 'charlie', 'diana'])
    shouted = people.str.upper().tolist()
    lengths = people.str.len().tolist()
    has_a = people.str.contains('a').tolist()
    print("\nString Operations:")
    print(f" Uppercase: {shouted}")
    print(f" Length: {lengths}")
    print(f" Contains 'a': {has_a}")
def dataframe_operations(self):
"""Essential DataFrame operations"""
print("\nšÆ DataFrame Operations")
print("=" * 60)
# Create a sample DataFrame
df = pd.DataFrame({
'Date': pd.date_range('2024-01-01', periods=10),
'Product': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B', 'A', 'C'],
'Sales': np.random.randint(100, 1000, 10),
'Quantity': np.random.randint(1, 50, 10),
'Region': ['North', 'South', 'North', 'East', 'West',
'North', 'East', 'South', 'West', 'East']
})
print("Sample DataFrame:")
print(df)
# Column operations
print("\nš Column Operations:")
# Add new column
df['Revenue'] = df['Sales'] * df['Quantity']
print(f"Added 'Revenue' column")
# Select columns
print(f"\nSelecting columns:")
print(f" Single column (Series): df['Product']")
print(f" Multiple columns (DataFrame): df[['Product', 'Sales']]")
# Drop column
df_dropped = df.drop('Region', axis=1)
print(f" Dropped 'Region' column")
# Row operations
print("\nš Row Operations:")
# Filter rows
high_sales = df[df['Sales'] > 500]
print(f"Rows with Sales > 500:\n{high_sales}")
# Sort values
sorted_df = df.sort_values('Revenue', ascending=False)
print(f"\nSorted by Revenue (descending):\n{sorted_df.head(3)}")
# Unique