Skip to main content

Data Science

NumPy - Numerical Computing

Array Creation

import numpy as np

# Create arrays
arr1d = np.array([1, 2, 3, 4, 5])
arr2d = np.array([[1, 2, 3], [4, 5, 6]])
arr3d = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])

# Common array creation functions
np.zeros(5) # [0. 0. 0. 0. 0.]
np.ones((3, 4)) # 3x4 array of ones
np.full((2, 3), 7) # 2x3 array filled with 7
np.eye(3) # 3x3 identity matrix
np.arange(0, 10, 2) # [0, 2, 4, 6, 8]
np.linspace(0, 1, 5) # [0. 0.25 0.5 0.75 1.]
np.random.random((3, 3)) # 3x3 random array
np.random.randint(0, 10, (2, 3)) # 2x3 random integers

Array Properties

arr = np.array([[1, 2, 3], [4, 5, 6]])

# Basic properties
arr.shape # (2, 3)
arr.size # 6
arr.ndim # 2
arr.dtype # int64 (NumPy's default integer is platform-dependent; it may be int32)
arr.itemsize # 8 bytes per element for int64 (4 for int32)

# Reshape and resize
# None of these modify arr itself; each expression evaluates to a new array object.
arr.reshape(3, 2) # Reshape to 3x2 (total element count must stay 6)
arr.flatten() # Flatten to 1D (always a copy)
arr.T # Transpose
arr.ravel() # Flatten (returns view if possible)

Array Indexing and Slicing

arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])

# Basic indexing
arr[0] # First row: [1, 2, 3, 4]
arr[0, 1] # Element at row 0, col 1: 2
arr[-1] # Last row: [9, 10, 11, 12]

# Slicing
arr[0:2] # First 2 rows
arr[:, 1:3] # All rows, columns 1-2
arr[1, :] # Row 1, all columns
arr[::2, ::2] # Every 2nd row and column

# Boolean indexing
arr > 5 # Boolean array
arr[arr > 5] # Elements greater than 5
arr[arr % 2 == 0] # Even elements

# Fancy indexing
arr[[0, 2], [1, 3]] # Elements at (0,1) and (2,3)
arr[np.array([0, 1, 2]), np.array([1, 2, 3])] # Multiple elements

Array Operations

arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])

# Arithmetic operations (element-wise)
arr1 + arr2 # [5, 7, 9]
arr1 - arr2 # [-3, -3, -3]
arr1 * arr2 # [4, 10, 18]
arr1 / arr2 # [0.25, 0.4, 0.5]
arr1 ** 2 # [1, 4, 9]

# Mathematical functions
np.sqrt(arr1) # Square root
np.exp(arr1) # Exponential
np.log(arr1) # Natural logarithm
np.sin(arr1) # Sine
np.cos(arr1) # Cosine
np.abs(arr1) # Absolute value

# Statistical functions
np.mean(arr1) # Mean
np.median(arr1) # Median
np.std(arr1) # Standard deviation
np.var(arr1) # Variance
np.min(arr1) # Minimum
np.max(arr1) # Maximum
np.sum(arr1) # Sum
np.prod(arr1) # Product

Linear Algebra

# Matrix operations
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# Matrix multiplication
np.dot(A, B) # Matrix multiplication
A @ B # Alternative syntax
np.matmul(A, B) # Matrix multiplication

# Linear algebra functions
np.linalg.inv(A) # Matrix inverse
np.linalg.det(A) # Determinant
np.linalg.eig(A) # Eigenvalues and eigenvectors
np.linalg.solve(A, B) # Solve linear system Ax = B
np.linalg.norm(A) # Matrix norm

Pandas - Data Manipulation

Series Creation and Operations

import pandas as pd

# Create Series
s1 = pd.Series([1, 2, 3, 4, 5]) # Default integer index
s2 = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e']) # Explicit labels
s3 = pd.Series({'a': 1, 'b': 2, 'c': 3}) # Dict keys become the index

# Series properties
s2.values # Underlying array
s2.index # Index
s2.dtype # Data type
s2.shape # Shape
s2.size # Size

# Series operations
s2['a'] # Access by label
# Use .iloc for positions: s2 has string labels, so plain s2[0] is a label
# lookup whose positional fallback is deprecated in recent pandas.
s2.iloc[0] # Access by position
s2[['a', 'c']] # Multiple labels
s2[s2 > 2] # Boolean indexing
s2.head(3) # First 3 elements
s2.tail(2) # Last 2 elements

DataFrame Creation

# Create DataFrame (dict keys become column names, list items become rows)
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})

# From a dictionary of lists (one equal-length list per column)
df2 = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie'],
'age': [25, 30, 35],
'city': ['New York', 'London', 'Tokyo']
})

# From list of dictionaries (one dict per row; keys become column names)
df3 = pd.DataFrame([
{'name': 'Alice', 'age': 25, 'city': 'New York'},
{'name': 'Bob', 'age': 30, 'city': 'London'}
])

# From CSV file
df = pd.read_csv('data.csv')
df = pd.read_csv('data.csv', sep=';', header=0, index_col=0)

# From other formats
df = pd.read_excel('data.xlsx')
df = pd.read_json('data.json')
df = pd.read_sql('SELECT * FROM table', connection)

DataFrame Properties and Info

df = pd.DataFrame({
'A': [1, 2, 3, 4],
'B': [5, 6, 7, 8],
'C': ['x', 'y', 'z', 'w']
})

# Basic info
df.shape # (4, 3)
df.size # 12
df.columns # Column names
df.index # Row index
df.dtypes # Data types
df.info() # Detailed info
df.describe() # Statistical summary
df.head() # First 5 rows
df.tail() # Last 5 rows
df.sample(3) # Random 3 rows

DataFrame Indexing and Selection

# Column selection
df['A'] # Single column (Series)
df[['A', 'B']] # Multiple columns (DataFrame)
df.A # Dot notation (if valid identifier)

# Row selection
df.loc[0] # By label
df.iloc[0] # By position
df.loc[0:2] # Label-based slicing (end label included: labels 0, 1 and 2)
df.iloc[0:2] # Position-based slicing (end excluded: positions 0 and 1)

# Boolean indexing
df[df['A'] > 2] # Rows where A > 2
df[df['C'].str.contains('x')] # Rows where C contains 'x'
df[(df['A'] > 1) & (df['B'] < 8)] # Multiple conditions

# Setting values
df.loc[0, 'A'] = 10 # Set single value
df.loc[df['A'] > 2, 'B'] = 99 # Set multiple values
df['D'] = df['A'] + df['B'] # Add new column

Data Cleaning and Transformation

# Handle missing data
# Every call below returns a new object; df itself is left unchanged.
df.isnull() # Check for missing values
df.notnull() # Check for non-missing values
df.isnull().sum() # Count missing values per column
df.dropna() # Drop rows with missing values
df.dropna(axis=1) # Drop columns with missing values
df.fillna(0) # Fill missing values with 0
# numeric_only=True: df also holds the string column 'C', and pandas >= 2.0
# raises instead of silently skipping non-numeric columns.
df.fillna(df.mean(numeric_only=True)) # Fill numeric columns with their mean
# fillna(method=...) never accepted 'forward'/'backward' (only 'ffill'/'bfill')
# and the parameter is deprecated; use the dedicated methods instead.
df.ffill() # Forward fill
df.bfill() # Backward fill

# Remove duplicates
df.duplicated() # Check for duplicates
df.drop_duplicates() # Remove duplicates
df.drop_duplicates(subset=['A']) # Based on specific columns

# Data transformation
df['A'].astype(str) # Change data type
df['A'].astype('category') # Convert to category
pd.to_numeric(df['A']) # Convert to numeric
pd.to_datetime(df['date']) # Convert to datetime

DataFrame Operations

# Sorting
df.sort_values('A') # Sort by column A
df.sort_values(['A', 'B']) # Sort by multiple columns
df.sort_values('A', ascending=False) # Descending order
df.sort_index() # Sort by index

# Grouping and aggregation
df.groupby('C').sum() # Group by column C and sum
df.groupby('C').agg(['mean', 'std']) # Multiple aggregations
df.groupby(['C', 'D']).mean() # Group by multiple columns

# Apply functions
df['A'].apply(lambda x: x**2) # Apply function to column
df.apply(lambda row: row['A'] + row['B'], axis=1) # Apply to rows
# Restrict to numeric columns: squaring the string column 'C' raises TypeError.
# DataFrame.map is the pandas >= 2.1 name of the deprecated DataFrame.applymap
# (on older pandas, call .applymap here instead).
df.select_dtypes(include='number').map(lambda x: x**2) # Apply to all numeric elements

# String operations
df['C'].str.upper() # Convert to uppercase
df['C'].str.lower() # Convert to lowercase
df['C'].str.len() # String length
df['C'].str.contains('pattern') # Check if contains pattern
df['C'].str.replace('old', 'new') # Replace strings

Merging and Joining

df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})

# Merge DataFrames
pd.merge(df1, df2, on='key') # Inner join
pd.merge(df1, df2, on='key', how='left') # Left join
pd.merge(df1, df2, on='key', how='right') # Right join
pd.merge(df1, df2, on='key', how='outer') # Outer join

# Concatenate DataFrames
pd.concat([df1, df2]) # Vertical concatenation
pd.concat([df1, df2], axis=1) # Horizontal concatenation
pd.concat([df1, df2], ignore_index=True) # Reset index

# Join on index
df1.join(df2, lsuffix='_left', rsuffix='_right')

Pivot Tables and Reshaping

# Pivot table
df.pivot_table(values='value', index='row_col', columns='col_col', aggfunc='mean')

# Melt (wide to long)
pd.melt(df, id_vars=['id'], value_vars=['A', 'B'], var_name='variable', value_name='value')

# Stack and unstack
df.stack() # Wide to long
df.unstack() # Long to wide

# Transpose
df.T # Transpose DataFrame

Matplotlib - Data Visualization

Basic Plotting

import matplotlib.pyplot as plt
import numpy as np

# Line plot
x = np.linspace(0, 10, 100)
y = np.sin(x)
plt.plot(x, y)
plt.xlabel('X axis')
plt.ylabel('Y axis')
plt.title('Sine Wave')
plt.grid(True)
plt.show()

# Multiple lines
y1 = np.sin(x)
y2 = np.cos(x)
plt.plot(x, y1, label='sin(x)')
plt.plot(x, y2, label='cos(x)')
plt.legend()
plt.show()

# Scatter plot
x = np.random.randn(100)
y = np.random.randn(100)
plt.scatter(x, y, alpha=0.5)
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Scatter Plot')
plt.show()

Chart Types

# Bar chart
categories = ['A', 'B', 'C', 'D']
values = [23, 45, 56, 78]
plt.bar(categories, values)
plt.title('Bar Chart')
plt.show()

# Horizontal bar chart
plt.barh(categories, values)
plt.title('Horizontal Bar Chart')
plt.show()

# Histogram
data = np.random.normal(0, 1, 1000)
plt.hist(data, bins=30, alpha=0.7)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram')
plt.show()

# Pie chart
sizes = [15, 30, 45, 10]
labels = ['A', 'B', 'C', 'D']
plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.title('Pie Chart')
plt.show()

# Box plot
data = [np.random.normal(0, std, 100) for std in range(1, 4)]
plt.boxplot(data)
plt.title('Box Plot')
plt.show()

Subplots and Layouts

# Create subplots
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

# Plot in each subplot
axes[0, 0].plot(x, y1)
axes[0, 0].set_title('Plot 1')

axes[0, 1].scatter(x, y2)
axes[0, 1].set_title('Plot 2')

axes[1, 0].hist(data, bins=20)
axes[1, 0].set_title('Plot 3')

axes[1, 1].bar(categories, values)
axes[1, 1].set_title('Plot 4')

plt.tight_layout()
plt.show()

# Alternative syntax
plt.subplot(2, 2, 1)
plt.plot(x, y1)
plt.title('Plot 1')

plt.subplot(2, 2, 2)
plt.scatter(x, y2)
plt.title('Plot 2')

plt.show()

Customization

# Colors and styles
plt.plot(x, y, color='red', linestyle='--', linewidth=2, marker='o', markersize=5)

# Color options
plt.plot(x, y, 'r-') # Red line
plt.plot(x, y, 'bo') # Blue circles
plt.plot(x, y, 'g--') # Green dashed line

# Style options
# The bundled seaborn styles were renamed with a 'seaborn-v0_8' prefix in
# Matplotlib 3.6; the bare 'seaborn' name no longer resolves in current releases.
# plt.style.available lists the names your installation actually provides.
plt.style.use('seaborn-v0_8') # Use seaborn style
plt.style.use('ggplot') # Use ggplot style

# Figure size and DPI
plt.figure(figsize=(10, 6), dpi=100)

# Axis limits
plt.xlim(0, 10)
plt.ylim(-2, 2)

# Axis labels and title
plt.xlabel('X Label', fontsize=12)
plt.ylabel('Y Label', fontsize=12)
plt.title('Title', fontsize=14, fontweight='bold')

# Grid
plt.grid(True, alpha=0.3)

# Save figure
plt.savefig('plot.png', dpi=300, bbox_inches='tight')

Seaborn - Statistical Visualization

Basic Seaborn Plots

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Load sample dataset
tips = sns.load_dataset('tips')
iris = sns.load_dataset('iris')

# Set style
sns.set_style('whitegrid')
sns.set_palette('husl')

# Distribution plots
sns.histplot(tips['total_bill'])
sns.histplot(tips['total_bill'], kde=True)
sns.kdeplot(tips['total_bill'])
sns.boxplot(x='day', y='total_bill', data=tips)
sns.violinplot(x='day', y='total_bill', data=tips)

# Relationship plots
sns.scatterplot(x='total_bill', y='tip', data=tips)
sns.lineplot(x='total_bill', y='tip', data=tips)
sns.regplot(x='total_bill', y='tip', data=tips)

# Categorical plots
sns.barplot(x='day', y='total_bill', data=tips)
sns.countplot(x='day', data=tips)
sns.pointplot(x='day', y='total_bill', data=tips)

Advanced Seaborn Plots

# Heatmap
# numeric_only=True skips the string 'species' column; without it pandas >= 2.0
# raises when it tries to convert the species names to float.
correlation_matrix = iris.corr(numeric_only=True)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')

# Pair plot
sns.pairplot(iris, hue='species')

# Facet grid
g = sns.FacetGrid(tips, col='time', row='smoker')
g.map(sns.scatterplot, 'total_bill', 'tip')

# Joint plot
sns.jointplot(x='total_bill', y='tip', data=tips, kind='scatter')
sns.jointplot(x='total_bill', y='tip', data=tips, kind='reg')
sns.jointplot(x='total_bill', y='tip', data=tips, kind='hex')

# Cluster map
# Correlate the numeric columns only; the string 'species' column cannot be correlated.
sns.clustermap(iris.corr(numeric_only=True), annot=True)

Customization in Seaborn

# Figure size
plt.figure(figsize=(10, 6))
sns.boxplot(x='day', y='total_bill', data=tips)

# Color palette
sns.boxplot(x='day', y='total_bill', data=tips, palette='Set2')

# Hue parameter
sns.boxplot(x='day', y='total_bill', hue='smoker', data=tips)

# Style parameters
sns.set_style('darkgrid')
sns.set_context('talk') # poster, notebook, talk, paper

# Custom color palette
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
sns.set_palette(colors)

Jupyter Notebooks

Essential Jupyter Commands

# IPython magic commands
%timeit code_to_time # Time execution
%time code_to_time # Time single execution
%who # List variables
%whos # Detailed variable info
%reset # Clear namespace
%run script.py # Run external script
%load script.py # Load script content
%save filename 1-5 # Save cells to file
%history # Show command history

# Line vs cell magic
%matplotlib inline # Line magic
%%writefile script.py # Cell magic
%%time # Time entire cell
%%bash # Run bash commands
%%sql # Run SQL queries

Jupyter Display and Output

from IPython.display import display, HTML, Image, Audio, Video
import pandas as pd

# Display DataFrames nicely
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
display(df)

# Display HTML
display(HTML('<h1>Hello World</h1>'))

# Display images
display(Image('image.png'))

# Display multiple items
display(df, HTML('<hr>'), Image('plot.png'))

# Progress bars
from tqdm import tqdm

# Wrapping the iterable in tqdm() yields the same items while drawing a progress bar.
for i in tqdm(range(100)):
    # Do something
    pass

Jupyter Widgets

import ipywidgets as widgets
from IPython.display import display

# Interactive widgets
slider = widgets.IntSlider(value=7, min=0, max=10, step=1, description='Value:')
display(slider)

# Interact decorator
@widgets.interact
def f(x=10):
    """Return the square of ``x``.

    Decorated with ``widgets.interact`` so the notebook renders a control for
    ``x`` (derived from its default value) and re-runs the function whenever
    the control changes.
    """
    return x**2

# Manual interact
widgets.interact(f, x=widgets.IntSlider(min=0, max=20, value=10))

# Text widget
text = widgets.Text(value='Hello World', description='String:')
display(text)

# Dropdown
dropdown = widgets.Dropdown(
options=['Option 1', 'Option 2', 'Option 3'],
value='Option 1',
description='Choose:'
)
display(dropdown)

Scientific Computing Patterns

NumPy Broadcasting

# Broadcasting rules
a = np.array([1, 2, 3]) # (3,)
b = np.array([[1], [2], [3]]) # (3, 1)
c = a + b # (3, 3) - broadcasts to match

# Common broadcasting patterns
arr = np.array([[1, 2, 3], [4, 5, 6]])
arr + 10 # Add scalar to all elements
arr * np.array([1, 2, 3]) # Multiply each row by different values
arr + np.array([[10], [20]]) # Add different values to each row

Vectorization

# Avoid loops with vectorization
# Bad: Using loops
def slow_function(arr):
    """Evaluate ``x**2 + 2*x + 1`` for every element using a Python-level loop.

    Deliberate counter-example: the explicit index loop is kept so it can be
    compared with the vectorized ``fast_function``.

    Args:
        arr: Any indexable sequence of numbers with a length.

    Returns:
        A NumPy array holding the result for each element, in order.
    """
    result = []
    for i in range(len(arr)):
        result.append(arr[i] ** 2 + 2 * arr[i] + 1)
    return np.array(result)

# Good: Vectorized
def fast_function(arr):
    """Evaluate ``x**2 + 2*x + 1`` element-wise with a single array expression.

    Args:
        arr: A NumPy array or any array-like (list, tuple, scalar) of numbers.

    Returns:
        A NumPy array (or NumPy scalar for scalar input) with the same shape
        as ``arr``.
    """
    # asarray is a no-op for ndarrays and lets plain lists work too, matching
    # what the loop-based version accepts.
    arr = np.asarray(arr)
    return arr ** 2 + 2 * arr + 1

# Conditional operations
arr = np.array([1, 2, 3, 4, 5])
np.where(arr > 3, arr, 0) # Replace values <= 3 with 0
np.select([arr < 2, arr > 4], [arr * 2, arr * 3], arr) # Multiple conditions

Memory Efficiency

# Use views instead of copies when possible
arr = np.array([[1, 2, 3], [4, 5, 6]])
view = arr[0] # View (shares memory)
copy = arr[0].copy() # Copy (new memory)

# Check if view or copy
view.base is arr # True for view
copy.base is arr # False for copy

# In-place operations
arr *= 2 # In-place multiplication
np.add(arr, 1, out=arr) # In-place addition

Data Analysis Workflows

Exploratory Data Analysis (EDA)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load and inspect data
df = pd.read_csv('data.csv')
print(df.head())
print(df.info())
print(df.describe())
print(df.isnull().sum())

# Distribution analysis
df.hist(bins=20, figsize=(12, 8))
plt.show()

# Correlation analysis
# numeric_only=True: pandas >= 2.0 raises on string/object columns instead of
# silently dropping them.
correlation_matrix = df.corr(numeric_only=True)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()

# Outlier detection (1.5 * IQR rule)
# Quantiles and comparisons are only defined for numeric columns, so compute
# the rule on the numeric subset and use the row mask to select from df.
numeric_df = df.select_dtypes(include=[np.number])
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1
outliers = df[((numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))).any(axis=1)]

Data Preprocessing Pipeline

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Handle missing values
df = df.dropna() # or forward-fill instead of dropping rows: df.ffill()

# Handle categorical variables
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])

# One-hot encoding
df_encoded = pd.get_dummies(df, columns=['category'], prefix='cat')

# Feature scaling
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[['feature1', 'feature2']])

# Train-test split
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Statistical Operations

# Descriptive statistics
df.describe() # Summary statistics
df.mean() # Mean
df.median() # Median
df.mode() # Mode
df.std() # Standard deviation
df.var() # Variance
df.skew() # Skewness
df.kurtosis() # Kurtosis

# Correlation and covariance
df.corr() # Correlation matrix
df.cov() # Covariance matrix
df['A'].corr(df['B']) # Correlation between two columns

# Hypothesis testing
from scipy import stats

# Independent two-sample t-test; group1 / group2 are placeholders for the two samples.
t_stat, p_value = stats.ttest_ind(group1, group2)

# chi2_contingency returns four values (statistic, p-value, degrees of freedom,
# expected frequencies), so all four must be unpacked.
# NOTE: p_value is reused here and overwrites the t-test p-value above.
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)

Machine Learning Basics

Scikit-learn Overview

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Basic ML pipeline
# 1. Load and prepare data
X = df.drop('target', axis=1)
y = df['target']

# 2. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# 5. Make predictions
y_pred = model.predict(X_test_scaled)

# 6. Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.3f}')
print(classification_report(y_test, y_pred))

Model Evaluation

from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Regression metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Classification metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# ROC curve
# roc_curve / roc_auc_score need continuous scores rather than hard labels.
# Take the probability of the positive class from the fitted model
# (column 1 of predict_proba; assumes a binary target - TODO confirm).
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--') # Diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Cross-validation
cv_scores = cross_val_score(model, X, y, cv=5)
print(f'CV Scores: {cv_scores}')
print(f'Mean CV Score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})')

Data Visualization Best Practices

Choosing the Right Chart Type

# For different data types and purposes:

# Continuous data distribution
sns.histplot(data['column']) # Histogram
sns.kdeplot(data['column']) # Density plot
sns.boxplot(y='column', data=data) # Box plot

# Categorical data
sns.countplot(x='category', data=data) # Count plot
sns.barplot(x='category', y='value', data=data) # Bar plot

# Relationships
sns.scatterplot(x='x', y='y', data=data) # Scatter plot
sns.lineplot(x='x', y='y', data=data) # Line plot
sns.regplot(x='x', y='y', data=data) # Regression plot

# Multiple variables
sns.pairplot(data) # Pair plot
sns.heatmap(data.corr()) # Correlation heatmap

Effective Visualization

# Good practices
plt.figure(figsize=(10, 6))
plt.title('Clear, Descriptive Title', fontsize=16)
plt.xlabel('X Axis Label', fontsize=12)
plt.ylabel('Y Axis Label', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()

# Color considerations
# Use colorblind-friendly palettes
sns.set_palette('colorblind')
# Or manually specify colors
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

# Avoid chart junk
sns.despine() # Remove top and right spines
plt.grid(True, alpha=0.3) # Subtle grid
# Keep it simple and focused

Common Patterns and Workflows

Data Loading and Exploration

# Standard data loading pattern
def load_and_explore(filename):
    """Load a CSV file, print a quick overview of it, and return the DataFrame.

    Prints the shape, column names, dtypes, per-column missing-value counts
    and ``describe()`` summary, then shows the first rows.

    Args:
        filename: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    # Load data
    df = pd.read_csv(filename)

    # Basic info
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"Data types:\n{df.dtypes}")
    print(f"Missing values:\n{df.isnull().sum()}")

    # Quick stats
    print(f"Numerical summary:\n{df.describe()}")

    # Display first few rows. `display` only exists inside IPython/Jupyter,
    # so fall back to print when the function runs as a plain script.
    try:
        from IPython.display import display
    except ImportError:
        display = print
    display(df.head())

    return df

# Usage
df = load_and_explore('data.csv')

Data Cleaning Pipeline

def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. Drop columns in which more than 50% of the values are missing.
      2. Fill remaining gaps in numeric columns with the column mean.
      3. Drop duplicate rows.
      4. For each numeric column, drop rows outside the 1.5 * IQR fences.

    Args:
        df: Input ``pandas.DataFrame``.

    Returns:
        A new, cleaned ``pandas.DataFrame``.
    """
    # Make a copy
    df_clean = df.copy()

    # Handle missing values
    # thresh is the minimum number of non-null values a column needs to be
    # kept; ceil(n / 2) as an int is equivalent to the fractional n * 0.5.
    min_non_null = (len(df_clean) + 1) // 2
    df_clean = df_clean.dropna(thresh=min_non_null, axis=1) # Drop columns with >50% missing
    # numeric_only=True: mean() raises on string columns in pandas >= 2.0.
    df_clean = df_clean.fillna(df_clean.mean(numeric_only=True)) # Fill numeric with mean

    # Handle duplicates
    df_clean = df_clean.drop_duplicates()

    # Remove outliers (optional)
    # Columns are filtered one after another, so later quantiles are computed
    # on the rows that survived the earlier columns.
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        is_outlier = ((df_clean[col] < (Q1 - 1.5 * IQR)) |
                      (df_clean[col] > (Q3 + 1.5 * IQR)))
        df_clean = df_clean[~is_outlier]

    return df_clean

Analysis Template

def analyze_dataset(df, target_column):
    """Print a short EDA summary and draw a 2x2 overview figure for ``df``.

    Panels: target value counts, correlation heatmap of the numeric columns,
    overlaid histograms of the numeric columns, and missing values per column.

    Args:
        df: ``pandas.DataFrame`` to analyze.
        target_column: Name of the target column in ``df``.

    Returns:
        ``df`` unchanged, so the call can be chained.
    """
    # Exploratory Data Analysis
    print("=== EXPLORATORY DATA ANALYSIS ===")
    print(f"Dataset shape: {df.shape}")
    print(f"Target distribution:\n{df[target_column].value_counts()}")

    # Visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Target distribution
    df[target_column].value_counts().plot(kind='bar', ax=axes[0, 0])
    axes[0, 0].set_title('Target Distribution')

    numeric_cols = df.select_dtypes(include=[np.number]).columns

    # Correlation heatmap
    # numeric_only=True: corr() raises on string columns in pandas >= 2.0.
    corr_matrix = df.corr(numeric_only=True)
    if not corr_matrix.empty:
        sns.heatmap(corr_matrix, annot=True, ax=axes[0, 1])
    axes[0, 1].set_title('Correlation Matrix')

    # Feature distributions
    # DataFrame.hist draws one subplot per column and cannot target a single
    # Axes when there are several columns; plot(kind='hist') overlays all
    # numeric columns on the one Axes given.
    if len(numeric_cols) > 0:
        df[numeric_cols].plot(kind='hist', bins=20, alpha=0.5, ax=axes[1, 0])
    axes[1, 0].set_title('Feature Distributions')

    # Missing values
    df.isnull().sum().plot(kind='bar', ax=axes[1, 1])
    axes[1, 1].set_title('Missing Values')

    plt.tight_layout()
    plt.show()

    return df

This comprehensive cheatsheet covers the essential Python data science tools and techniques. Keep it handy for quick reference during your data science projects!