Data Science
NumPy - Numerical Computing
Array Creation
import numpy as np
# Create arrays
arr1d = np.array([1, 2, 3, 4, 5])
arr2d = np.array([[1, 2, 3], [4, 5, 6]])
arr3d = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
# Common array creation functions
np.zeros(5) # [0. 0. 0. 0. 0.]
np.ones((3, 4)) # 3x4 array of ones
np.full((2, 3), 7) # 2x3 array filled with 7
np.eye(3) # 3x3 identity matrix
np.arange(0, 10, 2) # [0, 2, 4, 6, 8]
np.linspace(0, 1, 5) # [0. 0.25 0.5 0.75 1.]
np.random.random((3, 3)) # 3x3 random array
np.random.randint(0, 10, (2, 3)) # 2x3 random integers
Array Properties
arr = np.array([[1, 2, 3], [4, 5, 6]])
# Basic properties
arr.shape # (2, 3)
arr.size # 6
arr.ndim # 2
arr.dtype # int64
arr.itemsize # 8 bytes per element
# Reshape and resize
arr.reshape(3, 2) # Reshape to 3x2
arr.flatten() # Flatten to 1D
arr.T # Transpose
arr.ravel() # Flatten (returns view if possible)
Array Indexing and Slicing
arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
# Basic indexing
arr[0] # First row: [1, 2, 3, 4]
arr[0, 1] # Element at row 0, col 1: 2
arr[-1] # Last row: [9, 10, 11, 12]
# Slicing
arr[0:2] # First 2 rows
arr[:, 1:3] # All rows, columns 1-2
arr[1, :] # Row 1, all columns
arr[::2, ::2] # Every 2nd row and column
# Boolean indexing
arr > 5 # Boolean array
arr[arr > 5] # Elements greater than 5
arr[arr % 2 == 0] # Even elements
# Fancy indexing
arr[[0, 2], [1, 3]] # Elements at (0,1) and (2,3)
arr[np.array([0, 1, 2]), np.array([1, 2, 3])] # Multiple elements
Array Operations
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])
# Arithmetic operations (element-wise)
arr1 + arr2 # [5, 7, 9]
arr1 - arr2 # [-3, -3, -3]
arr1 * arr2 # [4, 10, 18]
arr1 / arr2 # [0.25, 0.4, 0.5]
arr1 ** 2 # [1, 4, 9]
# Mathematical functions
np.sqrt(arr1) # Square root
np.exp(arr1) # Exponential
np.log(arr1) # Natural logarithm
np.sin(arr1) # Sine
np.cos(arr1) # Cosine
np.abs(arr1) # Absolute value
# Statistical functions
np.mean(arr1) # Mean
np.median(arr1) # Median
np.std(arr1) # Standard deviation
np.var(arr1) # Variance
np.min(arr1) # Minimum
np.max(arr1) # Maximum
np.sum(arr1) # Sum
np.prod(arr1) # Product
Linear Algebra
# Matrix operations
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
# Matrix multiplication
np.dot(A, B) # Matrix multiplication
A @ B # Alternative syntax
np.matmul(A, B) # Matrix multiplication
# Linear algebra functions
np.linalg.inv(A) # Matrix inverse
np.linalg.det(A) # Determinant
np.linalg.eig(A) # Eigenvalues and eigenvectors
np.linalg.solve(A, B) # Solve linear system Ax = B
np.linalg.norm(A) # Matrix norm
Pandas - Data Manipulation
Series Creation and Operations
import pandas as pd
# Create Series
s1 = pd.Series([1, 2, 3, 4, 5])
s2 = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
s3 = pd.Series({'a': 1, 'b': 2, 'c': 3})
# Series properties
s2.values # Underlying array
s2.index # Index
s2.dtype # Data type
s2.shape # Shape
s2.size # Size
# Series operations
s2['a'] # Access by label
s2.iloc[0] # Access by position (s2[0] would be looked up as a label on this string index)
s2[['a', 'c']] # Multiple labels
s2[s2 > 2] # Boolean indexing
s2.head(3) # First 3 elements
s2.tail(2) # Last 2 elements
DataFrame Creation
# Create DataFrame
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
# From dictionary of lists (keys become column names)
df2 = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie'],
'age': [25, 30, 35],
'city': ['New York', 'London', 'Tokyo']
})
# From list of dictionaries
df3 = pd.DataFrame([
{'name': 'Alice', 'age': 25, 'city': 'New York'},
{'name': 'Bob', 'age': 30, 'city': 'London'}
])
# From CSV file
df = pd.read_csv('data.csv')
df = pd.read_csv('data.csv', sep=';', header=0, index_col=0)
# From other formats
df = pd.read_excel('data.xlsx')
df = pd.read_json('data.json')
df = pd.read_sql('SELECT * FROM table', connection)
DataFrame Properties and Info
df = pd.DataFrame({
'A': [1, 2, 3, 4],
'B': [5, 6, 7, 8],
'C': ['x', 'y', 'z', 'w']
})
# Basic info
df.shape # (4, 3)
df.size # 12
df.columns # Column names
df.index # Row index
df.dtypes # Data types
df.info() # Detailed info
df.describe() # Statistical summary
df.head() # First 5 rows
df.tail() # Last 5 rows
df.sample(3) # Random 3 rows
DataFrame Indexing and Selection
# Column selection
df['A'] # Single column (Series)
df[['A', 'B']] # Multiple columns (DataFrame)
df.A # Dot notation (if valid identifier)
# Row selection
df.loc[0] # By label
df.iloc[0] # By position
df.loc[0:2] # Label-based slicing (end label 2 is included)
df.iloc[0:2] # Position-based slicing (end position 2 is excluded)
# Boolean indexing
df[df['A'] > 2] # Rows where A > 2
df[df['C'].str.contains('x')] # Rows where C contains 'x'
df[(df['A'] > 1) & (df['B'] < 8)] # Multiple conditions
# Setting values
df.loc[0, 'A'] = 10 # Set single value
df.loc[df['A'] > 2, 'B'] = 99 # Set multiple values
df['D'] = df['A'] + df['B'] # Add new column
Data Cleaning and Transformation
# Handle missing data
df.isnull() # Check for missing values
df.notnull() # Check for non-missing values
df.isnull().sum() # Count missing values per column
df.dropna() # Drop rows with missing values
df.dropna(axis=1) # Drop columns with missing values
df.fillna(0) # Fill missing values with 0
df.fillna(df.mean(numeric_only=True)) # Fill numeric columns with their mean
df.ffill() # Forward fill (propagate the last valid value)
df.bfill() # Backward fill (use the next valid value)
# Remove duplicates
df.duplicated() # Check for duplicates
df.drop_duplicates() # Remove duplicates
df.drop_duplicates(subset=['A']) # Based on specific columns
# Data transformation
df['A'].astype(str) # Change data type
df['A'].astype('category') # Convert to category
pd.to_numeric(df['A']) # Convert to numeric
pd.to_datetime(df['date']) # Convert to datetime
DataFrame Operations
# Sorting
df.sort_values('A') # Sort by column A
df.sort_values(['A', 'B']) # Sort by multiple columns
df.sort_values('A', ascending=False) # Descending order
df.sort_index() # Sort by index
# Grouping and aggregation
df.groupby('C').sum() # Group by column C and sum
df.groupby('C').agg(['mean', 'std']) # Multiple aggregations
df.groupby(['C', 'D']).mean() # Group by multiple columns
# Apply functions
df['A'].apply(lambda x: x**2) # Apply function to column
df.apply(lambda row: row['A'] + row['B'], axis=1) # Apply to rows
df.applymap(lambda x: x**2) # Apply to all elements
# String operations
df['C'].str.upper() # Convert to uppercase
df['C'].str.lower() # Convert to lowercase
df['C'].str.len() # String length
df['C'].str.contains('pattern') # Check if contains pattern
df['C'].str.replace('old', 'new') # Replace strings
Merging and Joining
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})
# Merge DataFrames
pd.merge(df1, df2, on='key') # Inner join
pd.merge(df1, df2, on='key', how='left') # Left join
pd.merge(df1, df2, on='key', how='right') # Right join
pd.merge(df1, df2, on='key', how='outer') # Outer join
# Concatenate DataFrames
pd.concat([df1, df2]) # Vertical concatenation
pd.concat([df1, df2], axis=1) # Horizontal concatenation
pd.concat([df1, df2], ignore_index=True) # Reset index
# Join on index
df1.join(df2, lsuffix='_left', rsuffix='_right')
Pivot Tables and Reshaping
# Pivot table
df.pivot_table(values='value', index='row_col', columns='col_col', aggfunc='mean')
# Melt (wide to long)
pd.melt(df, id_vars=['id'], value_vars=['A', 'B'], var_name='variable', value_name='value')
# Stack and unstack
df.stack() # Wide to long
df.unstack() # Long to wide
# Transpose
df.T # Transpose DataFrame
Matplotlib - Data Visualization
Basic Plotting
import matplotlib.pyplot as plt
import numpy as np
# Line plot
x = np.linspace(0, 10, 100)
y = np.sin(x)
plt.plot(x, y)
plt.xlabel('X axis')
plt.ylabel('Y axis')
plt.title('Sine Wave')
plt.grid(True)
plt.show()
# Multiple lines
y1 = np.sin(x)
y2 = np.cos(x)
plt.plot(x, y1, label='sin(x)')
plt.plot(x, y2, label='cos(x)')
plt.legend()
plt.show()
# Scatter plot
x = np.random.randn(100)
y = np.random.randn(100)
plt.scatter(x, y, alpha=0.5)
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Scatter Plot')
plt.show()
Chart Types
# Bar chart
categories = ['A', 'B', 'C', 'D']
values = [23, 45, 56, 78]
plt.bar(categories, values)
plt.title('Bar Chart')
plt.show()
# Horizontal bar chart
plt.barh(categories, values)
plt.title('Horizontal Bar Chart')
plt.show()
# Histogram
data = np.random.normal(0, 1, 1000)
plt.hist(data, bins=30, alpha=0.7)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram')
plt.show()
# Pie chart
sizes = [15, 30, 45, 10]
labels = ['A', 'B', 'C', 'D']
plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.title('Pie Chart')
plt.show()
# Box plot
data = [np.random.normal(0, std, 100) for std in range(1, 4)]
plt.boxplot(data)
plt.title('Box Plot')
plt.show()
Subplots and Layouts
# Create subplots
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
# Plot in each subplot
axes[0, 0].plot(x, y1)
axes[0, 0].set_title('Plot 1')
axes[0, 1].scatter(x, y2)
axes[0, 1].set_title('Plot 2')
axes[1, 0].hist(data, bins=20)
axes[1, 0].set_title('Plot 3')
axes[1, 1].bar(categories, values)
axes[1, 1].set_title('Plot 4')
plt.tight_layout()
plt.show()
# Alternative syntax
plt.subplot(2, 2, 1)
plt.plot(x, y1)
plt.title('Plot 1')
plt.subplot(2, 2, 2)
plt.scatter(x, y2)
plt.title('Plot 2')
plt.show()
Customization
# Colors and styles
plt.plot(x, y, color='red', linestyle='--', linewidth=2, marker='o', markersize=5)
# Color options
plt.plot(x, y, 'r-') # Red line
plt.plot(x, y, 'bo') # Blue circles
plt.plot(x, y, 'g--') # Green dashed line
# Style options
plt.style.use('seaborn-v0_8') # Use seaborn style (called 'seaborn' before Matplotlib 3.6)
plt.style.use('ggplot') # Use ggplot style
# Figure size and DPI
plt.figure(figsize=(10, 6), dpi=100)
# Axis limits
plt.xlim(0, 10)
plt.ylim(-2, 2)
# Axis labels and title
plt.xlabel('X Label', fontsize=12)
plt.ylabel('Y Label', fontsize=12)
plt.title('Title', fontsize=14, fontweight='bold')
# Grid
plt.grid(True, alpha=0.3)
# Save figure
plt.savefig('plot.png', dpi=300, bbox_inches='tight')
Seaborn - Statistical Visualization
Basic Seaborn Plots
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Load sample dataset
tips = sns.load_dataset('tips')
iris = sns.load_dataset('iris')
# Set style
sns.set_style('whitegrid')
sns.set_palette('husl')
# Distribution plots
sns.histplot(tips['total_bill'])
sns.histplot(tips['total_bill'], kde=True)
sns.kdeplot(tips['total_bill'])
sns.boxplot(x='day', y='total_bill', data=tips)
sns.violinplot(x='day', y='total_bill', data=tips)
# Relationship plots
sns.scatterplot(x='total_bill', y='tip', data=tips)
sns.lineplot(x='total_bill', y='tip', data=tips)
sns.regplot(x='total_bill', y='tip', data=tips)
# Categorical plots
sns.barplot(x='day', y='total_bill', data=tips)
sns.countplot(x='day', data=tips)
sns.pointplot(x='day', y='total_bill', data=tips)
Advanced Seaborn Plots
# Heatmap
correlation_matrix = iris.corr(numeric_only=True) # Leave out the text 'species' column
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
# Pair plot
sns.pairplot(iris, hue='species')
# Facet grid
g = sns.FacetGrid(tips, col='time', row='smoker')
g.map(sns.scatterplot, 'total_bill', 'tip')
# Joint plot
sns.jointplot(x='total_bill', y='tip', data=tips, kind='scatter')
sns.jointplot(x='total_bill', y='tip', data=tips, kind='reg')
sns.jointplot(x='total_bill', y='tip', data=tips, kind='hex')
# Cluster map
sns.clustermap(iris.corr(numeric_only=True), annot=True)
Customization in Seaborn
# Figure size
plt.figure(figsize=(10, 6))
sns.boxplot(x='day', y='total_bill', data=tips)
# Color palette
sns.boxplot(x='day', y='total_bill', data=tips, palette='Set2')
# Hue parameter
sns.boxplot(x='day', y='total_bill', hue='smoker', data=tips)
# Style parameters
sns.set_style('darkgrid')
sns.set_context('talk') # poster, notebook, talk, paper
# Custom color palette
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
sns.set_palette(colors)
Jupyter Notebooks
Essential Jupyter Commands
# IPython magic commands
%timeit code_to_time # Time execution
%time code_to_time # Time single execution
%who # List variables
%whos # Detailed variable info
%reset # Clear namespace
%run script.py # Run external script
%load script.py # Load script content
%save filename 1-5 # Save cells to file
%history # Show command history
# Line vs cell magic
%matplotlib inline # Line magic
%%writefile script.py # Cell magic
%%time # Time entire cell
%%bash # Run bash commands
%%sql # Run SQL queries
Jupyter Display and Output
from IPython.display import display, HTML, Image, Audio, Video
import pandas as pd
# Display DataFrames nicely
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
display(df)
# Display HTML
display(HTML('<h1>Hello World</h1>'))
# Display images
display(Image('image.png'))
# Display multiple items
display(df, HTML('<hr>'), Image('plot.png'))
# Progress bars
from tqdm import tqdm
for i in tqdm(range(100)):
    # Do something
    pass
Jupyter Widgets
import ipywidgets as widgets
from IPython.display import display
# Interactive widgets
slider = widgets.IntSlider(value=7, min=0, max=10, step=1, description='Value:')
display(slider)
# Interact decorator
@widgets.interact
def f(x=10):
    """Return the square of ``x``."""
    return x**2
# Manual interact
widgets.interact(f, x=widgets.IntSlider(min=0, max=20, value=10))
# Text widget
text = widgets.Text(value='Hello World', description='String:')
display(text)
# Dropdown
dropdown = widgets.Dropdown(
options=['Option 1', 'Option 2', 'Option 3'],
value='Option 1',
description='Choose:'
)
display(dropdown)
Scientific Computing Patterns
NumPy Broadcasting
# Broadcasting rules
a = np.array([1, 2, 3]) # (3,)
b = np.array([[1], [2], [3]]) # (3, 1)
c = a + b # (3, 3) - broadcasts to match
# Common broadcasting patterns
arr = np.array([[1, 2, 3], [4, 5, 6]])
arr + 10 # Add scalar to all elements
arr * np.array([1, 2, 3]) # (3,) vector applied to every row: column j is scaled by the j-th value
arr + np.array([[10], [20]]) # (2, 1) column: row i gets the i-th value added
Vectorization
# Avoid loops with vectorization
# Bad: Using loops
def slow_function(arr):
    """Evaluate ``x**2 + 2*x + 1`` element by element with an explicit loop.

    This is the intentionally slow form, kept to contrast with vectorized
    NumPy code; the Python-level loop is the point of the example.

    Args:
        arr: Sequence or array of numbers.

    Returns:
        ``np.ndarray`` holding the polynomial value for each element.
    """
    result = []
    # Iterate the elements directly; an index is never needed here.
    for value in arr:
        result.append(value ** 2 + 2 * value + 1)
    return np.array(result)
# Good: Vectorized
def fast_function(arr):
    """Evaluate ``x**2 + 2*x + 1`` for every element in one vectorized expression.

    Args:
        arr: NumPy array (a plain number also works).

    Returns:
        Array of the same shape as ``arr`` with the polynomial applied
        element-wise (or a single number for scalar input).
    """
    return arr ** 2 + 2 * arr + 1
# Conditional operations
arr = np.array([1, 2, 3, 4, 5])
np.where(arr > 3, arr, 0) # Replace values <= 3 with 0
np.select([arr < 2, arr > 4], [arr * 2, arr * 3], arr) # Multiple conditions
Memory Efficiency
# Use views instead of copies when possible
arr = np.array([[1, 2, 3], [4, 5, 6]])
view = arr[0] # View (shares memory)
copy = arr[0].copy() # Copy (new memory)
# Check if view or copy
view.base is arr # True for view
copy.base is arr # False for copy
# In-place operations
arr *= 2 # In-place multiplication
np.add(arr, 1, out=arr) # In-place addition
Data Analysis Workflows
Exploratory Data Analysis (EDA)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load and inspect data
df = pd.read_csv('data.csv')
print(df.head())
print(df.info())
print(df.describe())
print(df.isnull().sum())
# Distribution analysis
df.hist(bins=20, figsize=(12, 8))
plt.show()
# Correlation analysis
correlation_matrix = df.corr(numeric_only=True) # Text columns cannot be correlated
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()
# Outlier detection
numeric_df = df.select_dtypes(include=[np.number]) # The IQR rule only applies to numeric columns
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1
# Keep full rows of df where any numeric column falls outside the 1.5 * IQR fences
outliers = df[((numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))).any(axis=1)]
Data Preprocessing Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
# Handle missing values
df = df.dropna() # or df.ffill() to forward-fill instead of dropping
# Handle categorical variables
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])
# One-hot encoding
df_encoded = pd.get_dummies(df, columns=['category'], prefix='cat')
# Feature scaling
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[['feature1', 'feature2']])
# Train-test split
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Statistical Operations
# Descriptive statistics
df.describe() # Summary statistics
df.mean() # Mean
df.median() # Median
df.mode() # Mode
df.std() # Standard deviation
df.var() # Variance
df.skew() # Skewness
df.kurtosis() # Kurtosis
# Correlation and covariance
df.corr() # Correlation matrix
df.cov() # Covariance matrix
df['A'].corr(df['B']) # Correlation between two columns
# Hypothesis testing
from scipy import stats
t_stat, p_value = stats.ttest_ind(group1, group2)
# chi2_contingency returns four values: statistic, p-value, degrees of freedom, expected frequencies
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
Machine Learning Basics
Scikit-learn Overview
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Basic ML pipeline
# 1. Load and prepare data
X = df.drop('target', axis=1)
y = df['target']
# 2. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 3. Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# 4. Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)
# 5. Make predictions
y_pred = model.predict(X_test_scaled)
# 6. Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.3f}')
print(classification_report(y_test, y_pred))
Model Evaluation
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
# Regression metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
# Classification metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
# ROC curve
# Scores for the positive class; assumes a binary target and a fitted classifier
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--') # Diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
# Cross-validation
cv_scores = cross_val_score(model, X, y, cv=5)
print(f'CV Scores: {cv_scores}')
print(f'Mean CV Score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})')
Data Visualization Best Practices
Choosing the Right Chart Type
# For different data types and purposes:
# Continuous data distribution
sns.histplot(data['column']) # Histogram
sns.kdeplot(data['column']) # Density plot
sns.boxplot(y='column', data=data) # Box plot
# Categorical data
sns.countplot(x='category', data=data) # Count plot
sns.barplot(x='category', y='value', data=data) # Bar plot
# Relationships
sns.scatterplot(x='x', y='y', data=data) # Scatter plot
sns.lineplot(x='x', y='y', data=data) # Line plot
sns.regplot(x='x', y='y', data=data) # Regression plot
# Multiple variables
sns.pairplot(data) # Pair plot
sns.heatmap(data.corr()) # Correlation heatmap
Effective Visualization
# Good practices
plt.figure(figsize=(10, 6))
plt.title('Clear, Descriptive Title', fontsize=16)
plt.xlabel('X Axis Label', fontsize=12)
plt.ylabel('Y Axis Label', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
# Color considerations
# Use colorblind-friendly palettes
sns.set_palette('colorblind')
# Or manually specify colors
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
# Avoid chart junk
sns.despine() # Remove top and right spines
plt.grid(True, alpha=0.3) # Subtle grid
# Keep it simple and focused
Common Patterns and Workflows
Data Loading and Exploration
# Standard data loading pattern
def load_and_explore(filename):
    """Load a CSV file and print a quick overview of its contents.

    Prints the shape, column names, dtypes, per-column missing-value counts
    and the numerical summary, then shows the first rows.

    Args:
        filename: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The loaded ``DataFrame``.
    """
    # Rich display only exists under IPython/Jupyter; fall back to print so
    # the helper also works in a plain interpreter.
    try:
        from IPython.display import display
    except ImportError:
        display = print
    # Load data
    df = pd.read_csv(filename)
    # Basic info
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"Data types:\n{df.dtypes}")
    print(f"Missing values:\n{df.isnull().sum()}")
    # Quick stats
    print(f"Numerical summary:\n{df.describe()}")
    # Display first few rows
    display(df.head())
    return df
# Usage
df = load_and_explore('data.csv')
Data Cleaning Pipeline
def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. Drop columns in which fewer than half of the rows are non-missing.
      2. Fill remaining gaps in numeric columns with the column mean.
      3. Drop duplicate rows.
      4. Drop rows lying outside 1.5 * IQR of the quartiles, one numeric
         column at a time (each column sees the rows left by the previous one).

    Args:
        df: Input ``DataFrame``.

    Returns:
        A new, cleaned ``DataFrame``.
    """
    # Make a copy
    df_clean = df.copy()
    # Handle missing values
    # thresh is the minimum number of non-missing values a column needs to be kept
    min_non_missing = (len(df_clean) + 1) // 2
    df_clean = df_clean.dropna(thresh=min_non_missing, axis=1)  # Drop columns with >50% missing
    # numeric_only=True: mean() raises on text columns in pandas >= 2.0
    df_clean = df_clean.fillna(df_clean.mean(numeric_only=True))  # Fill numeric with mean
    # Handle duplicates
    df_clean = df_clean.drop_duplicates()
    # Remove outliers (optional)
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        is_outlier = (df_clean[col] < (Q1 - 1.5 * IQR)) | (df_clean[col] > (Q3 + 1.5 * IQR))
        df_clean = df_clean[~is_outlier]
    return df_clean
Analysis Template
def analyze_dataset(df, target_column):
    """Print a short EDA summary and draw a 2x2 overview figure.

    The panels are: target value counts, correlation heatmap of the numeric
    columns, overlaid histograms of the numeric columns, and missing-value
    counts per column.

    Args:
        df: ``DataFrame`` to analyze.
        target_column: Name of the column whose distribution is reported.

    Returns:
        ``df`` unchanged, so the call can be chained.
    """
    # Exploratory Data Analysis
    print("=== EXPLORATORY DATA ANALYSIS ===")
    print(f"Dataset shape: {df.shape}")
    print(f"Target distribution:\n{df[target_column].value_counts()}")
    # Visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    # Target distribution
    df[target_column].value_counts().plot(kind='bar', ax=axes[0, 0])
    axes[0, 0].set_title('Target Distribution')
    # Correlation heatmap (numeric_only: text columns cannot be correlated)
    corr_matrix = df.corr(numeric_only=True)
    if not corr_matrix.empty:
        sns.heatmap(corr_matrix, annot=True, ax=axes[0, 1])
    axes[0, 1].set_title('Correlation Matrix')
    # Feature distributions
    # DataFrame.hist needs one axis per column, so draw each column onto the
    # single panel instead.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        df[col].hist(bins=20, alpha=0.5, label=str(col), ax=axes[1, 0])
    if len(numeric_cols) > 0:
        axes[1, 0].legend()
    axes[1, 0].set_title('Feature Distributions')
    # Missing values
    df.isnull().sum().plot(kind='bar', ax=axes[1, 1])
    axes[1, 1].set_title('Missing Values')
    plt.tight_layout()
    plt.show()
    return df
This comprehensive cheatsheet covers the essential Python data science tools and techniques. Keep it handy for quick reference during your data science projects!