Data Science

NumPy - Numerical Computing

Array Creation

import numpy as np

# Create arrays
arr1d = np.array([1, 2, 3, 4, 5])
arr2d = np.array([[1, 2, 3], [4, 5, 6]])
arr3d = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])

# Common array creation functions
np.zeros(5)                     # [0. 0. 0. 0. 0.]
np.ones((3, 4))                 # 3x4 array of ones
np.full((2, 3), 7)              # 2x3 array filled with 7
np.eye(3)                       # 3x3 identity matrix
np.arange(0, 10, 2)             # [0, 2, 4, 6, 8]
np.linspace(0, 1, 5)            # [0. 0.25 0.5 0.75 1.]
np.random.random((3, 3))        # 3x3 random array
np.random.randint(0, 10, (2, 3)) # 2x3 random integers

Array Properties

arr = np.array([[1, 2, 3], [4, 5, 6]])

# Basic properties
arr.shape                       # (2, 3)
arr.size                        # 6
arr.ndim                        # 2
arr.dtype                       # int64
arr.itemsize                    # 8 bytes per element

# Reshape and resize
arr.reshape(3, 2)               # Reshape to 3x2
arr.flatten()                   # Flatten to 1D
arr.T                           # Transpose
arr.ravel()                     # Flatten (returns view if possible)

Array Indexing and Slicing

arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])

# Basic indexing
arr[0]                          # First row: [1, 2, 3, 4]
arr[0, 1]                       # Element at row 0, col 1: 2
arr[-1]                         # Last row: [9, 10, 11, 12]

# Slicing
arr[0:2]                        # First 2 rows
arr[:, 1:3]                     # All rows, columns 1-2
arr[1, :]                       # Row 1, all columns
arr[::2, ::2]                   # Every 2nd row and column

# Boolean indexing
arr > 5                         # Boolean array
arr[arr > 5]                    # Elements greater than 5
arr[arr % 2 == 0]               # Even elements

# Fancy indexing
arr[[0, 2], [1, 3]]             # Elements at (0,1) and (2,3)
arr[np.array([0, 1, 2]), np.array([1, 2, 3])]  # Multiple elements

Array Operations

arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])

# Arithmetic operations (element-wise)
arr1 + arr2                     # [5, 7, 9]
arr1 - arr2                     # [-3, -3, -3]
arr1 * arr2                     # [4, 10, 18]
arr1 / arr2                     # [0.25, 0.4, 0.5]
arr1 ** 2                       # [1, 4, 9]

# Mathematical functions
np.sqrt(arr1)                   # Square root
np.exp(arr1)                    # Exponential
np.log(arr1)                    # Natural logarithm
np.sin(arr1)                    # Sine
np.cos(arr1)                    # Cosine
np.abs(arr1)                    # Absolute value

# Statistical functions
np.mean(arr1)                   # Mean
np.median(arr1)                 # Median
np.std(arr1)                    # Standard deviation
np.var(arr1)                    # Variance
np.min(arr1)                    # Minimum
np.max(arr1)                    # Maximum
np.sum(arr1)                    # Sum
np.prod(arr1)                   # Product

Linear Algebra

# Matrix operations
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# Matrix multiplication
np.dot(A, B)                    # Matrix multiplication
A @ B                           # Alternative syntax
np.matmul(A, B)                 # Matrix multiplication

# Linear algebra functions
np.linalg.inv(A)                # Matrix inverse
np.linalg.det(A)                # Determinant
np.linalg.eig(A)                # Eigenvalues and eigenvectors
np.linalg.solve(A, B)           # Solve linear system Ax = B
np.linalg.norm(A)               # Matrix norm

Pandas - Data Manipulation

Series Creation and Operations

import pandas as pd

# Series basics: construction, attributes, and element access.

# Create Series
s1 = pd.Series([1, 2, 3, 4, 5])                                    # From a list (default integer index)
s2 = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])   # From a list with explicit labels
s3 = pd.Series({'a': 1, 'b': 2, 'c': 3})                           # From a dict (keys become the index)

# Series properties
s2.values                       # Underlying array
s2.index                        # Index
s2.dtype                        # Data type
s2.shape                        # Shape
s2.size                         # Size

# Series operations
s2['a']                         # Access by label
s2.iloc[0]                      # Access by position (s2[0] on a labelled Series is deprecated)
s2[['a', 'c']]                  # Multiple labels
s2[s2 > 2]                      # Boolean indexing
s2.head(3)                      # First 3 elements
s2.tail(2)                      # Last 2 elements

DataFrame Creation

# Create DataFrame
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})

# From nested dictionary
df2 = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['New York', 'London', 'Tokyo']
})

# From list of dictionaries
df3 = pd.DataFrame([
    {'name': 'Alice', 'age': 25, 'city': 'New York'},
    {'name': 'Bob', 'age': 30, 'city': 'London'}
])

# From CSV file
df = pd.read_csv('data.csv')
df = pd.read_csv('data.csv', sep=';', header=0, index_col=0)

# From other formats
df = pd.read_excel('data.xlsx')
df = pd.read_json('data.json')
df = pd.read_sql('SELECT * FROM table', connection)

DataFrame Properties and Info

df = pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': [5, 6, 7, 8],
    'C': ['x', 'y', 'z', 'w']
})

# Basic info
df.shape                        # (4, 3)
df.size                         # 12
df.columns                      # Column names
df.index                        # Row index
df.dtypes                       # Data types
df.info()                       # Detailed info
df.describe()                   # Statistical summary
df.head()                       # First 5 rows
df.tail()                       # Last 5 rows
df.sample(3)                    # Random 3 rows

DataFrame Indexing and Selection

# Column selection
df['A']                         # Single column (Series)
df[['A', 'B']]                  # Multiple columns (DataFrame)
df.A                            # Dot notation (if valid identifier)

# Row selection
df.loc[0]                       # By label
df.iloc[0]                      # By position
df.loc[0:2]                     # Label-based slicing
df.iloc[0:2]                    # Position-based slicing

# Boolean indexing
df[df['A'] > 2]                 # Rows where A > 2
df[df['C'].str.contains('x')]   # Rows where C contains 'x'
df[(df['A'] > 1) & (df['B'] < 8)]  # Multiple conditions

# Setting values
df.loc[0, 'A'] = 10             # Set single value
df.loc[df['A'] > 2, 'B'] = 99   # Set multiple values
df['D'] = df['A'] + df['B']     # Add new column

Data Cleaning and Transformation

# Handle missing data
# Every call below returns a new object; df itself is left unchanged.
df.isnull()                            # Check for missing values
df.notnull()                           # Check for non-missing values
df.isnull().sum()                      # Count missing values per column
df.dropna()                            # Drop rows with missing values
df.dropna(axis=1)                      # Drop columns with missing values
df.fillna(0)                           # Fill missing values with 0
df.fillna(df.mean(numeric_only=True))  # Fill numeric columns with their mean (skips string columns)
df.ffill()                             # Forward fill (carry the last valid value forward)
df.bfill()                             # Backward fill (use the next valid value)

# Remove duplicates
df.duplicated()                 # Check for duplicates
df.drop_duplicates()            # Remove duplicates
df.drop_duplicates(subset=['A'])  # Based on specific columns

# Data transformation
df['A'].astype(str)             # Change data type
df['A'].astype('category')      # Convert to category
pd.to_numeric(df['A'])          # Convert to numeric
pd.to_datetime(df['date'])      # Convert to datetime

DataFrame Operations

# Sorting
df.sort_values('A')             # Sort by column A
df.sort_values(['A', 'B'])      # Sort by multiple columns
df.sort_values('A', ascending=False)  # Descending order
df.sort_index()                 # Sort by index

# Grouping and aggregation
df.groupby('C').sum()           # Group by column C and sum
df.groupby('C').agg(['mean', 'std'])  # Multiple aggregations
df.groupby(['C', 'D']).mean()   # Group by multiple columns

# Apply functions
df['A'].apply(lambda x: x**2)   # Apply function to column
df.apply(lambda row: row['A'] + row['B'], axis=1)  # Apply to rows
# Element-wise apply: DataFrame.map replaces the deprecated applymap (pandas >= 2.1;
# on older versions call .applymap instead). Restricted to the numeric columns
# because squaring the strings in column C would raise TypeError.
df[['A', 'B']].map(lambda x: x**2)

# String operations
df['C'].str.upper()             # Convert to uppercase
df['C'].str.lower()             # Convert to lowercase
df['C'].str.len()               # String length
df['C'].str.contains('pattern') # Check if contains pattern
df['C'].str.replace('old', 'new')  # Replace strings

Merging and Joining

df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})

# Merge DataFrames
pd.merge(df1, df2, on='key')                    # Inner join
pd.merge(df1, df2, on='key', how='left')       # Left join
pd.merge(df1, df2, on='key', how='right')      # Right join
pd.merge(df1, df2, on='key', how='outer')      # Outer join

# Concatenate DataFrames
pd.concat([df1, df2])                          # Vertical concatenation
pd.concat([df1, df2], axis=1)                  # Horizontal concatenation
pd.concat([df1, df2], ignore_index=True)       # Reset index

# Join on index
df1.join(df2, lsuffix='_left', rsuffix='_right')

Pivot Tables and Reshaping

# Pivot table
df.pivot_table(values='value', index='row_col', columns='col_col', aggfunc='mean')

# Melt (wide to long)
pd.melt(df, id_vars=['id'], value_vars=['A', 'B'], var_name='variable', value_name='value')

# Stack and unstack
df.stack()                      # Wide to long
df.unstack()                    # Long to wide

# Transpose
df.T                            # Transpose DataFrame

Matplotlib - Data Visualization

Basic Plotting

import matplotlib.pyplot as plt
import numpy as np

# Line plot
x = np.linspace(0, 10, 100)
y = np.sin(x)
plt.plot(x, y)
plt.xlabel('X axis')
plt.ylabel('Y axis')
plt.title('Sine Wave')
plt.grid(True)
plt.show()

# Multiple lines
y1 = np.sin(x)
y2 = np.cos(x)
plt.plot(x, y1, label='sin(x)')
plt.plot(x, y2, label='cos(x)')
plt.legend()
plt.show()

# Scatter plot
x = np.random.randn(100)
y = np.random.randn(100)
plt.scatter(x, y, alpha=0.5)
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Scatter Plot')
plt.show()

Chart Types

# Bar chart
categories = ['A', 'B', 'C', 'D']
values = [23, 45, 56, 78]
plt.bar(categories, values)
plt.title('Bar Chart')
plt.show()

# Horizontal bar chart
plt.barh(categories, values)
plt.title('Horizontal Bar Chart')
plt.show()

# Histogram
data = np.random.normal(0, 1, 1000)
plt.hist(data, bins=30, alpha=0.7)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram')
plt.show()

# Pie chart
sizes = [15, 30, 45, 10]
labels = ['A', 'B', 'C', 'D']
plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.title('Pie Chart')
plt.show()

# Box plot
data = [np.random.normal(0, std, 100) for std in range(1, 4)]
plt.boxplot(data)
plt.title('Box Plot')
plt.show()

Subplots and Layouts

# Create subplots
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

# Plot in each subplot
axes[0, 0].plot(x, y1)
axes[0, 0].set_title('Plot 1')

axes[0, 1].scatter(x, y2)
axes[0, 1].set_title('Plot 2')

axes[1, 0].hist(data, bins=20)
axes[1, 0].set_title('Plot 3')

axes[1, 1].bar(categories, values)
axes[1, 1].set_title('Plot 4')

plt.tight_layout()
plt.show()

# Alternative syntax
plt.subplot(2, 2, 1)
plt.plot(x, y1)
plt.title('Plot 1')

plt.subplot(2, 2, 2)
plt.scatter(x, y2)
plt.title('Plot 2')

plt.show()

Customization

# Colors and styles
plt.plot(x, y, color='red', linestyle='--', linewidth=2, marker='o', markersize=5)

# Color options
plt.plot(x, y, 'r-')            # Red line
plt.plot(x, y, 'bo')            # Blue circles
plt.plot(x, y, 'g--')           # Green dashed line

# Style options
# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 and later removed;
# the bundled seaborn look is now published under the 'seaborn-v0_8' prefix.
plt.style.use('seaborn-v0_8')   # Use seaborn style
plt.style.use('ggplot')         # Use ggplot style

# Figure size and DPI
plt.figure(figsize=(10, 6), dpi=100)

# Axis limits
plt.xlim(0, 10)
plt.ylim(-2, 2)

# Axis labels and title
plt.xlabel('X Label', fontsize=12)
plt.ylabel('Y Label', fontsize=12)
plt.title('Title', fontsize=14, fontweight='bold')

# Grid
plt.grid(True, alpha=0.3)

# Save figure
plt.savefig('plot.png', dpi=300, bbox_inches='tight')

Seaborn - Statistical Visualization

Basic Seaborn Plots

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Load sample dataset
tips = sns.load_dataset('tips')
iris = sns.load_dataset('iris')

# Set style
sns.set_style('whitegrid')
sns.set_palette('husl')

# Distribution plots
sns.histplot(tips['total_bill'])
sns.histplot(tips['total_bill'], kde=True)
sns.kdeplot(tips['total_bill'])
sns.boxplot(x='day', y='total_bill', data=tips)
sns.violinplot(x='day', y='total_bill', data=tips)

# Relationship plots
sns.scatterplot(x='total_bill', y='tip', data=tips)
sns.lineplot(x='total_bill', y='tip', data=tips)
sns.regplot(x='total_bill', y='tip', data=tips)

# Categorical plots
sns.barplot(x='day', y='total_bill', data=tips)
sns.countplot(x='day', data=tips)
sns.pointplot(x='day', y='total_bill', data=tips)

Advanced Seaborn Plots

# Heatmap
# iris contains the string column 'species'; DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, so restrict the correlation to the numeric features.
correlation_matrix = iris.corr(numeric_only=True)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')

# Pair plot
sns.pairplot(iris, hue='species')

# Facet grid
g = sns.FacetGrid(tips, col='time', row='smoker')
g.map(sns.scatterplot, 'total_bill', 'tip')

# Joint plot
sns.jointplot(x='total_bill', y='tip', data=tips, kind='scatter')
sns.jointplot(x='total_bill', y='tip', data=tips, kind='reg')
sns.jointplot(x='total_bill', y='tip', data=tips, kind='hex')

# Cluster map (same numeric-only correlation matrix as the heatmap)
sns.clustermap(iris.corr(numeric_only=True), annot=True)

Customization in Seaborn

# Figure size
plt.figure(figsize=(10, 6))
sns.boxplot(x='day', y='total_bill', data=tips)

# Color palette
sns.boxplot(x='day', y='total_bill', data=tips, palette='Set2')

# Hue parameter
sns.boxplot(x='day', y='total_bill', hue='smoker', data=tips)

# Style parameters
sns.set_style('darkgrid')
sns.set_context('talk')  # poster, notebook, talk, paper

# Custom color palette
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
sns.set_palette(colors)

Jupyter Notebooks

Essential Jupyter Commands

# IPython magic commands
%timeit code_to_time            # Time execution
%time code_to_time              # Time single execution
%who                            # List variables
%whos                           # Detailed variable info
%reset                          # Clear namespace
%run script.py                  # Run external script
%load script.py                 # Load script content
%save filename 1-5              # Save cells to file
%history                        # Show command history

# Line vs cell magic
%matplotlib inline              # Line magic
%%writefile script.py           # Cell magic
%%time                          # Time entire cell
%%bash                          # Run bash commands
%%sql                           # Run SQL queries

Jupyter Display and Output

from IPython.display import display, HTML, Image, Audio, Video
import pandas as pd

# Display DataFrames nicely
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
display(df)

# Display HTML
display(HTML('<h1>Hello World</h1>'))

# Display images
display(Image('image.png'))

# Display multiple items
display(df, HTML('<hr>'), Image('plot.png'))

# Progress bars
from tqdm import tqdm
for i in tqdm(range(100)):
    # Do something
    pass

Jupyter Widgets

import ipywidgets as widgets
from IPython.display import display

# Interactive widgets
slider = widgets.IntSlider(value=7, min=0, max=10, step=1, description='Value:')
display(slider)

# Interact decorator
@widgets.interact
def f(x=10):
    """Return ``x`` squared; wrapped by ``widgets.interact`` so ``x`` is driven by a widget."""
    squared = x ** 2
    return squared

# Manual interact
widgets.interact(f, x=widgets.IntSlider(min=0, max=20, value=10))

# Text widget
text = widgets.Text(value='Hello World', description='String:')
display(text)

# Dropdown
dropdown = widgets.Dropdown(
    options=['Option 1', 'Option 2', 'Option 3'],
    value='Option 1',
    description='Choose:'
)
display(dropdown)

Scientific Computing Patterns

NumPy Broadcasting

# Broadcasting rules
a = np.array([1, 2, 3])        # (3,)
b = np.array([[1], [2], [3]])  # (3, 1)
c = a + b                      # (3, 3) - broadcasts to match

# Common broadcasting patterns
arr = np.array([[1, 2, 3], [4, 5, 6]])
arr + 10                       # Add scalar to all elements
arr * np.array([1, 2, 3])      # Multiply each row by different values
arr + np.array([[10], [20]])   # Add different values to each row

Vectorization

# Avoid loops with vectorization
# Bad: Using loops
def slow_function(arr):
    """Evaluate ``x**2 + 2*x + 1`` for every element using a Python-level loop.

    This is the intentionally slow variant: each element is fetched by index
    and processed one at a time before the results are packed into an array.

    Args:
        arr: Indexable sequence (list or NumPy array) of numbers.

    Returns:
        A NumPy array holding the polynomial value for each element.
    """
    per_element = [
        arr[idx] ** 2 + 2 * arr[idx] + 1
        for idx in range(len(arr))
    ]
    return np.array(per_element)

# Good: Vectorized
def fast_function(arr):
    """Evaluate ``x**2 + 2*x + 1`` on the whole input in one vectorized expression.

    Args:
        arr: NumPy array (or scalar) supporting ``**``, ``*`` and ``+``.

    Returns:
        The polynomial applied element-wise, with the same shape as ``arr``.
    """
    quadratic_term = arr ** 2
    linear_term = 2 * arr
    return quadratic_term + linear_term + 1

# Conditional operations
arr = np.array([1, 2, 3, 4, 5])
np.where(arr > 3, arr, 0)      # Replace values <= 3 with 0
np.select([arr < 2, arr > 4], [arr * 2, arr * 3], arr)  # Multiple conditions

Memory Efficiency

# Use views instead of copies when possible
arr = np.array([[1, 2, 3], [4, 5, 6]])
view = arr[0]                  # View (shares memory)
copy = arr[0].copy()           # Copy (new memory)

# Check if view or copy
view.base is arr               # True for view
copy.base is arr               # False for copy

# In-place operations
arr *= 2                       # In-place multiplication
np.add(arr, 1, out=arr)        # In-place addition

Data Analysis Workflows

Exploratory Data Analysis (EDA)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load and inspect data
df = pd.read_csv('data.csv')
print(df.head())
df.info()                       # info() prints its own report and returns None, so don't wrap it in print()
print(df.describe())
print(df.isnull().sum())

# Distribution analysis
df.hist(bins=20, figsize=(12, 8))
plt.show()

# Correlation analysis
# numeric_only=True: corr() raises on string/object columns in pandas >= 2.0.
correlation_matrix = df.corr(numeric_only=True)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()

# Outlier detection (1.5 * IQR rule)
# Quantiles and comparisons are only defined for numeric columns, so compute the
# fences on the numeric subset and use the resulting row mask on the full frame.
numeric_df = df.select_dtypes(include='number')
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1
outliers = df[((numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))).any(axis=1)]

Data Preprocessing Pipeline

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Preprocessing walkthrough: drop missing rows, encode a categorical column,
# scale two features, then split into train and test sets.

# Handle missing values
df = df.dropna()  # or df.ffill() to forward-fill gaps instead of dropping rows

# Handle categorical variables
# NOTE(review): scikit-learn documents LabelEncoder for target labels; for an
# input feature OrdinalEncoder is the usual choice - confirm intent.
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])

# One-hot encoding
# Produces a separate frame; df_encoded is not used by the steps below.
df_encoded = pd.get_dummies(df, columns=['category'], prefix='cat')

# Feature scaling
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# and df_scaled is not used by the split below - confirm this is illustrative.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[['feature1', 'feature2']])

# Train-test split
# X is built from df, so it still holds the raw 'category' column alongside
# 'category_encoded'. 20% of the rows are held out; random_state fixes the shuffle.
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Statistical Operations

# Descriptive statistics
df.describe()                   # Summary statistics
df.mean()                       # Mean
df.median()                     # Median
df.mode()                       # Mode
df.std()                        # Standard deviation
df.var()                        # Variance
df.skew()                       # Skewness
df.kurtosis()                   # Kurtosis

# Correlation and covariance
df.corr()                       # Correlation matrix
df.cov()                        # Covariance matrix
df['A'].corr(df['B'])           # Correlation between two columns

# Hypothesis testing
from scipy import stats

# Independent two-sample t-test
t_stat, p_value = stats.ttest_ind(group1, group2)

# Chi-square test of independence. chi2_contingency returns four values
# (statistic, p-value, degrees of freedom, expected frequencies), so all four
# must be unpacked. Note that p_value is overwritten by the chi-square p-value here.
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)

Machine Learning Basics

Scikit-learn Overview

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Basic ML pipeline
# 1. Load and prepare data
X = df.drop('target', axis=1)
y = df['target']

# 2. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# 5. Make predictions
y_pred = model.predict(X_test_scaled)

# 6. Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.3f}')
print(classification_report(y_test, y_pred))

Model Evaluation

from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Regression metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Classification metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# ROC curve
# roc_curve / roc_auc_score need continuous scores rather than hard class labels.
# Take the predicted probability of the positive class from the fitted model
# (column 1 of predict_proba - assumes a binary classification target).
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # Diagonal = performance of a random classifier
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Cross-validation
cv_scores = cross_val_score(model, X, y, cv=5)
print(f'CV Scores: {cv_scores}')
print(f'Mean CV Score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})')

Data Visualization Best Practices

Choosing the Right Chart Type

# For different data types and purposes:

# Continuous data distribution
sns.histplot(data['column'])            # Histogram
sns.kdeplot(data['column'])             # Density plot
sns.boxplot(y='column', data=data)      # Box plot

# Categorical data
sns.countplot(x='category', data=data)  # Count plot
sns.barplot(x='category', y='value', data=data)  # Bar plot

# Relationships
sns.scatterplot(x='x', y='y', data=data)  # Scatter plot
sns.lineplot(x='x', y='y', data=data)     # Line plot
sns.regplot(x='x', y='y', data=data)      # Regression plot

# Multiple variables
sns.pairplot(data)                      # Pair plot
sns.heatmap(data.corr())                # Correlation heatmap

Effective Visualization

# Good practices
plt.figure(figsize=(10, 6))
plt.title('Clear, Descriptive Title', fontsize=16)
plt.xlabel('X Axis Label', fontsize=12)
plt.ylabel('Y Axis Label', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()

# Color considerations
# Use colorblind-friendly palettes
sns.set_palette('colorblind')
# Or manually specify colors
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

# Avoid chart junk
sns.despine()  # Remove top and right spines
plt.grid(True, alpha=0.3)  # Subtle grid
# Keep it simple and focused

Common Patterns and Workflows

Data Loading and Exploration

# Standard data loading pattern
def load_and_explore(filename):
    """Load a CSV file and print a quick overview of its contents.

    Prints the shape, column names, dtypes, per-column missing-value counts and
    the ``describe()`` summary, then shows the first rows.

    Args:
        filename: Path or other source accepted by ``pd.read_csv``.

    Returns:
        The loaded DataFrame.
    """
    # `display` only exists as a builtin inside IPython/Jupyter sessions. Import
    # it explicitly and fall back to print so the function also works in a
    # plain Python script instead of raising NameError.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print

    # Load data
    df = pd.read_csv(filename)

    # Basic info
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"Data types:\n{df.dtypes}")
    print(f"Missing values:\n{df.isnull().sum()}")

    # Quick stats
    print(f"Numerical summary:\n{df.describe()}")

    # Display first few rows
    show(df.head())

    return df

# Usage
df = load_and_explore('data.csv')

Data Cleaning Pipeline

def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. Drop columns in which more than 50% of the values are missing.
      2. Fill remaining gaps in numeric columns with the column mean.
      3. Drop duplicate rows.
      4. Remove rows flagged by the 1.5 * IQR rule, one numeric column at a
         time (each column's fences are computed on the rows still remaining).

    Args:
        df: Input DataFrame; may mix numeric and non-numeric columns.

    Returns:
        A new, cleaned DataFrame.
    """
    # Make a copy
    df_clean = df.copy()

    # Handle missing values
    # Keep a column only if at least half of its rows are non-null. `thresh`
    # is documented as an int, so use ceil(n / 2) rather than the float n * 0.5.
    min_non_null = (len(df_clean) + 1) // 2
    df_clean = df_clean.dropna(thresh=min_non_null, axis=1)
    # numeric_only=True: mean() raises TypeError on string columns in pandas >= 2.0.
    df_clean = df_clean.fillna(df_clean.mean(numeric_only=True))

    # Handle duplicates
    df_clean = df_clean.drop_duplicates()

    # Remove outliers (optional)
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        q1 = df_clean[col].quantile(0.25)
        q3 = df_clean[col].quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr
        is_outlier = (df_clean[col] < lower_fence) | (df_clean[col] > upper_fence)
        df_clean = df_clean[~is_outlier]

    return df_clean

Analysis Template

def analyze_dataset(df, target_column):
    """Print a short EDA summary and draw a 2x2 overview figure.

    Panels: target value counts (top-left), correlation heatmap of the numeric
    columns (top-right), overlaid histograms of the numeric columns
    (bottom-left), and missing-value counts per column (bottom-right).

    Args:
        df: DataFrame to analyse.
        target_column: Name of the column whose value distribution is reported.

    Returns:
        The same ``df`` that was passed in.
    """
    # Exploratory Data Analysis
    print("=== EXPLORATORY DATA ANALYSIS ===")
    print(f"Dataset shape: {df.shape}")
    print(f"Target distribution:\n{df[target_column].value_counts()}")

    # Visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Target distribution
    df[target_column].value_counts().plot(kind='bar', ax=axes[0, 0])
    axes[0, 0].set_title('Target Distribution')

    numeric_cols = df.select_dtypes(include=[np.number]).columns

    # Correlation heatmap
    # numeric_only=True: corr() raises on string columns in pandas >= 2.0.
    # Skipped when there are no numeric columns, since there is nothing to plot.
    if len(numeric_cols) > 0:
        corr_matrix = df.corr(numeric_only=True)
        sns.heatmap(corr_matrix, annot=True, ax=axes[0, 1])
    axes[0, 1].set_title('Correlation Matrix')

    # Feature distributions
    # DataFrame.hist() wants one Axes per column and clears the entire figure
    # when handed a single Axes for several columns, wiping the other panels.
    # plot(kind='hist') overlays all columns on the one Axes instead.
    if len(numeric_cols) > 0:
        df[numeric_cols].plot(kind='hist', bins=20, alpha=0.5, ax=axes[1, 0])
    axes[1, 0].set_title('Feature Distributions')

    # Missing values
    df.isnull().sum().plot(kind='bar', ax=axes[1, 1])
    axes[1, 1].set_title('Missing Values')

    plt.tight_layout()
    plt.show()

    return df

This comprehensive cheatsheet covers the essential Python data science tools and techniques. Keep it handy for quick reference during your data science projects!

NumPy - Numerical Computing

Array Creation

Array Properties

Array Indexing and Slicing

Array Operations

Linear Algebra

Pandas - Data Manipulation

Series Creation and Operations

DataFrame Creation

DataFrame Properties and Info

DataFrame Indexing and Selection

Data Cleaning and Transformation

DataFrame Operations

Merging and Joining

Pivot Tables and Reshaping

Matplotlib - Data Visualization

Basic Plotting

Chart Types

Subplots and Layouts

Customization

Seaborn - Statistical Visualization

Basic Seaborn Plots

Advanced Seaborn Plots

Customization in Seaborn

Jupyter Notebooks

Essential Jupyter Commands

Jupyter Display and Output

Jupyter Widgets

Scientific Computing Patterns

NumPy Broadcasting

Vectorization

Memory Efficiency

Data Analysis Workflows

Exploratory Data Analysis (EDA)

Data Preprocessing Pipeline

Statistical Operations

Machine Learning Basics

Scikit-learn Overview

Model Evaluation

Data Visualization Best Practices

Choosing the Right Chart Type

Effective Visualization

Common Patterns and Workflows

Data Loading and Exploration

Data Cleaning Pipeline

Analysis Template

NumPy - Numerical Computing

Array Creation

Array Properties

Array Indexing and Slicing

Array Operations

Linear Algebra

Pandas - Data Manipulation

Series Creation and Operations

DataFrame Creation

DataFrame Properties and Info

DataFrame Indexing and Selection

Data Cleaning and Transformation

DataFrame Operations

Merging and Joining

Pivot Tables and Reshaping

Matplotlib - Data Visualization

Basic Plotting

Chart Types

Subplots and Layouts

Customization

Seaborn - Statistical Visualization

Basic Seaborn Plots

Advanced Seaborn Plots

Customization in Seaborn

Jupyter Notebooks

Essential Jupyter Commands

Jupyter Display and Output

Jupyter Widgets

Scientific Computing Patterns

NumPy Broadcasting

Vectorization

Memory Efficiency

Data Analysis Workflows

Exploratory Data Analysis (EDA)

Data Preprocessing Pipeline

Statistical Operations

Machine Learning Basics

Scikit-learn Overview

Model Evaluation

Data Visualization Best Practices

Choosing the Right Chart Type

Effective Visualization

Common Patterns and Workflows

Data Loading and Exploration

Data Cleaning Pipeline

Analysis Template