Skip to main content

📊 DataFrames and Series: Your Data's Best Friends

Imagine you're managing a bustling restaurant. You need to track orders (Series: what each table ordered) and manage your entire operation (DataFrame: all tables, orders, times, and payments). That's Pandas! 🍽️ It's like Excel on steroids, giving you the power to manipulate, analyze, and understand data with the elegance of Python. If NumPy is the engine, Pandas is the luxury car built on top of it.

The Pandas Universe: Where Data Comes to Life 🌟

Pandas transforms raw data into insights. It's the bridge between messy real-world data and clean analysis. Series handle one-dimensional data (like a single column), while DataFrames manage two-dimensional data (like a spreadsheet). Together, they're the dynamic duo of data analysis!

graph TB
    A[Pandas Data Structures] --> B[Series]
    A --> C[DataFrame]
    B --> D[1D Labeled Array]
    B --> E[Like a Column]
    B --> F[Index + Values]
    C --> G[2D Labeled Table]
    C --> H[Like a Spreadsheet]
    C --> I[Rows + Columns]
    D --> J[Time Series]
    E --> K[Single Variable]
    F --> L[Fast Lookups]
    G --> M[Datasets]
    H --> N[SQL Tables]
    I --> O[Multi-Variable Analysis]
    style A fill:#667eea
    style B fill:#4ecdc4
    style C fill:#ffd93d

The Spreadsheet Evolution 📈

Think of Excel as a bicycle and Pandas as a sports car. Both get you there, but Pandas does it with style, speed, and the ability to handle millions of rows without breaking a sweat. Series is like a single column with superpowers, while DataFrame is your entire spreadsheet on steroids!

import pandas as pd
import numpy as np

# Welcome to Pandas: Your Data Analysis Powerhouse! 🚀

def series_fundamentals():
    """
    Series: The building block of Pandas.
    Think of it as a supercharged list with an index!

    Prints a guided tour of ``pd.Series``: three ways to construct one
    (from a list, from a dict, and from a NumPy array with a custom
    index), its core attributes, a few aggregations, label- and
    position-based indexing, and vectorized arithmetic / boolean
    filtering.

    Returns:
        None. Everything is written to stdout. The temperature sample is
        drawn from NumPy's global random generator without seeding it
        here, so that part of the output can differ between runs.
    """
    print("šŸ”· Pandas Series Fundamentals")
    print("=" * 60)

    # Creating Series - Multiple Ways
    print("Creating Series:")

    # From list: no index given, so pandas supplies 0..n-1
    fruits = pd.Series(['apple', 'banana', 'cherry', 'date'])
    print(f"\n1. From list:\n{fruits}")

    # From dictionary: the keys become the index labels
    sales = pd.Series({
        'Monday': 120,
        'Tuesday': 135,
        'Wednesday': 155,
        'Thursday': 142,
        'Friday': 198
    })
    print(f"\n2. From dictionary:\n{sales}")

    # From NumPy array with custom index
    temperatures = pd.Series(
        np.random.normal(20, 5, 7),
        index=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    )
    print(f"\n3. From NumPy array:\n{temperatures.round(1)}")

    # Series attributes
    print("\nšŸ“Š Series Attributes:")
    print(f"  Values: {sales.values}")
    print(f"  Index: {sales.index.tolist()}")
    print(f"  Shape: {sales.shape}")
    print(f"  Size: {sales.size}")
    print(f"  Data type: {sales.dtype}")

    # Basic operations
    print("\nšŸ”§ Basic Operations:")
    print(f"  Sum: {sales.sum()}")
    print(f"  Mean: {sales.mean():.2f}")
    print(f"  Max: {sales.max()}")
    print(f"  Best day: {sales.idxmax()}")

    # Indexing and slicing
    print("\nšŸŽÆ Indexing and Slicing:")
    print(f"  sales['Monday']: {sales['Monday']}")
    # The label must name the expression that is actually evaluated.
    # Plain sales[0] on a string-labelled Series relies on a deprecated
    # positional fallback, so positional access goes through .iloc.
    print(f"  sales.iloc[0]: {sales.iloc[0]}")  # Position-based
    # Label slices include BOTH endpoints, unlike positional slices.
    print(f"  sales['Tuesday':'Thursday']:\n{sales['Tuesday':'Thursday']}")

    # Vectorized operations (like NumPy!)
    print("\n⚔ Vectorized Operations:")
    sales_doubled = sales * 2
    print(f"  Sales doubled:\n{sales_doubled}")

    # Boolean mask keeps only the entries where the condition holds
    above_150 = sales[sales > 150]
    print(f"  Days with sales > 150:\n{above_150}")

def dataframe_fundamentals():
    """
    Walk through building and inspecting a DataFrame.

    Constructs a DataFrame three ways -- from a dict of columns, from a
    list of row dicts, and from a NumPy array with explicit row/column
    labels -- printing each one, then reports the shape, column labels,
    index labels, dtypes and memory usage of the first frame.

    Returns:
        None. All output goes to stdout.
    """
    print("\nšŸ”¶ DataFrame Fundamentals")
    print("=" * 60)

    print("Creating DataFrames:")

    # 1. Dict of equal-length lists: every key becomes a column
    inventory = pd.DataFrame({
        'Product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Headphones'],
        'Price': [999.99, 29.99, 79.99, 299.99, 149.99],
        'Stock': [25, 150, 85, 40, 60],
        'Category': ['Electronics', 'Accessories', 'Accessories', 'Electronics', 'Audio'],
    })
    print(f"\n1. From dictionary:\n{inventory}")

    # 2. List of dicts: every dict becomes a row
    people = pd.DataFrame([
        {'name': 'Alice', 'age': 28, 'city': 'NYC'},
        {'name': 'Bob', 'age': 35, 'city': 'LA'},
        {'name': 'Charlie', 'age': 42, 'city': 'Chicago'},
    ])
    print(f"\n2. From list of dicts:\n{people}")

    # 3. 4x3 array of standard-normal draws with explicit labels
    labelled_noise = pd.DataFrame(
        np.random.randn(4, 3),
        index=['row1', 'row2', 'row3', 'row4'],
        columns=['A', 'B', 'C'],
    )
    rounded_noise = labelled_noise.round(2)
    print(f"\n3. From NumPy array:\n{rounded_noise}")

    # Structural summary of the first (inventory) frame
    print("\nšŸ“ˆ DataFrame Information:")
    dimensions = inventory.shape
    column_labels = inventory.columns.tolist()
    row_labels = inventory.index.tolist()
    print(f"Shape: {dimensions}")
    print(f"Columns: {column_labels}")
    print(f"Index: {row_labels}")

    print("\nData types:")
    print(inventory.dtypes)

    print("\nMemory usage:")
    print(inventory.memory_usage())

# Run fundamentals: both walkthroughs are called at module top level, so
# their printed output is produced as soon as this listing is executed.
series_fundamentals()
dataframe_fundamentals()

Series vs DataFrame: Understanding the Difference 🔍

šŸ“ Series

  • 1-dimensional labeled array
  • Single column of data
  • Has index and values
  • Like a dictionary with order
  • Homogeneous data type

📋 DataFrame

  • 2-dimensional labeled table
  • Multiple columns (Series)
  • Has index and columns
  • Like a spreadsheet or SQL table
  • Can have mixed data types
import pandas as pd
import numpy as np

class PandasDataStructures:
    """
    Master class for understanding Pandas data structures.
    Your guide to Series and DataFrames!
    """
    
    def __init__(self):
        np.random.seed(42)
    
    def series_deep_dive(self):
        """
        Showcase Series features on small sample data.

        Builds a 30-day, date-indexed Series of random integers and
        prints its head, a statistical summary, the tail of a 7-day
        rolling mean and of the running total; then demonstrates
        ``value_counts`` on a categorical sample and the ``.str``
        accessor on a Series of names. Returns nothing; all output goes
        to stdout.
        """
        print("šŸ” Series Deep Dive")
        print("=" * 60)

        # One random sales figure per day, starting 2024-01-01
        n_days = 30
        daily_sales = pd.Series(
            np.random.randint(100, 500, n_days),
            index=pd.date_range('2024-01-01', periods=n_days),
            name='Daily Sales',
        )

        print("Time Series Example:")
        print(daily_sales.head())

        print("\nšŸ“Š Powerful Series Methods:")

        # Statistical summary
        summary = daily_sales.describe().round(2)
        print(f"\nDescribe:\n{summary}")

        # 7-day rolling mean; only the last five entries are shown
        weekly_mean_tail = daily_sales.rolling(window=7).mean().tail().round(2)
        print(f"\n7-day Rolling Average (last 5 days):\n{weekly_mean_tail}")

        # Running total; only the last five entries are shown
        running_total_tail = daily_sales.cumsum().tail()
        print(f"\nCumulative Sales (last 5 days):\n{running_total_tail}")

        # Frequency of each distinct label
        labels = pd.Series(['A', 'B', 'A', 'C', 'B', 'A', 'C', 'C', 'B', 'A'])
        label_counts = labels.value_counts()
        print(f"\nValue Counts:\n{label_counts}")

        # Element-wise string methods through the .str accessor
        people = pd.Series(['alice', 'bob', 'charlie', 'diana'])
        shouted = people.str.upper().tolist()
        lengths = people.str.len().tolist()
        has_a = people.str.contains('a').tolist()
        print("\nString Operations:")
        print(f"  Uppercase: {shouted}")
        print(f"  Length: {lengths}")
        print(f"  Contains 'a': {has_a}")
    
    def dataframe_operations(self):
        """Essential DataFrame operations"""
        print("\nšŸŽÆ DataFrame Operations")
        print("=" * 60)
        
        # Create a sample DataFrame
        df = pd.DataFrame({
            'Date': pd.date_range('2024-01-01', periods=10),
            'Product': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B', 'A', 'C'],
            'Sales': np.random.randint(100, 1000, 10),
            'Quantity': np.random.randint(1, 50, 10),
            'Region': ['North', 'South', 'North', 'East', 'West', 
                      'North', 'East', 'South', 'West', 'East']
        })
        
        print("Sample DataFrame:")
        print(df)
        
        # Column operations
        print("\nšŸ“Š Column Operations:")
        
        # Add new column
        df['Revenue'] = df['Sales'] * df['Quantity']
        print(f"Added 'Revenue' column")
        
        # Select columns
        print(f"\nSelecting columns:")
        print(f"  Single column (Series): df['Product']")
        print(f"  Multiple columns (DataFrame): df[['Product', 'Sales']]")
        
        # Drop column
        df_dropped = df.drop('Region', axis=1)
        print(f"  Dropped 'Region' column")
        
        # Row operations
        print("\nšŸ“Š Row Operations:")
        
        # Filter rows
        high_sales = df[df['Sales'] > 500]
        print(f"Rows with Sales > 500:\n{high_sales}")
        
        # Sort values
        sorted_df = df.sort_values('Revenue', ascending=False)
        print(f"\nSorted by Revenue (descending):\n{sorted_df.head(3)}")
        
        # Unique