Core Principles

1. Never Modify Raw Data

# Always work on a copy
raw_df = pd.read_csv("data.csv")        # original — never touch
df = raw_df.copy()                       # work on this

# Or load fresh each time
def get_data():
    """Return a fresh copy of the dataset, re-read from disk on every call.

    Reloading guarantees no accidental carry-over of earlier in-place
    mutations (Principle 1: never modify raw data).
    """
    return pd.read_csv("data.csv")

2. Document Every Transformation

def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the customer dataset.

    Transformations:
    - Drop rows where name is null (can't identify customer)
    - Fill missing age with median (distribution is ~normal)
    - Remove salary outliers (IQR method, retain >99.5% of data)
    - Standardise department names (lowercase, strip whitespace)

    Args:
        df: Raw customer DataFrame (n rows × m columns)

    Returns:
        Cleaned DataFrame with same columns, fewer rows
    """
    df = df.copy()  # never mutate the caller's frame (Principle 1)
    # ... transformations (each one listed in the docstring above)
    return df

3. Make Analysis Reproducible

# Set random seed
np.random.seed(42)

# Record package versions — importlib.metadata (stdlib, Python 3.8+)
# replaces the deprecated pkg_resources API from setuptools
from importlib import metadata
packages = ["pandas", "numpy", "scikit-learn", "matplotlib"]
for p in packages:
    try:
        print(f"{p}: {metadata.version(p)}")
    except metadata.PackageNotFoundError:
        # Report rather than crash when a package is absent
        print(f"{p}: not installed")

# OR: use a requirements.txt / pyproject.toml

DataFrame Best Practices

Method Chaining

# ✅ Method chaining with parentheses for readability
# ✅ Method chaining with parentheses for readability
result = (
    df
    .dropna(subset=["salary"])
    .query("age >= 18")
    .assign(
        # Lambdas receive the frame AS FILTERED so far; referencing the
        # outer `df` here would misalign rows dropped by dropna/query.
        age_group=lambda d: pd.cut(d["age"], bins=[0, 30, 50, 100],
                                   labels=["young", "mid", "senior"]),
        log_salary=lambda d: np.log1p(d["salary"])
    )
    .groupby("department")
    # Named aggregation gives flat column names, so the sort key below
    # actually exists (a dict agg would create a MultiIndex and raise).
    .agg(avg_salary=("salary", "mean"), n=("salary", "count"))
    .sort_values("avg_salary", ascending=False)
)

pipe() + reset_index()

# ✅ Use .pipe() for custom transformations in chains
def remove_outliers(df, col):
    """Keep only rows of *df* whose *col* value lies inside the 1.5×IQR fences."""
    q1, q3 = df[col].quantile([0.25, 0.75])
    spread = q3 - q1
    lower, upper = q1 - 1.5 * spread, q3 + 1.5 * spread
    return df[df[col].between(lower, upper)]

result = df.pipe(remove_outliers, "salary")

# ✅ reset_index() after groupby when needed
grouped = (
    df.groupby("dept")["salary"]
      .mean()
      .reset_index()
      .rename(columns={"salary": "avg_salary"})  # columns: ["dept", "avg_salary"]
)

EDA Checklist

Visualisation Best Practices

# Always label axes
ax.set_xlabel("Age (years)", fontsize=12)
ax.set_ylabel("Salary (£)", fontsize=12)
ax.set_title("Salary vs Age by Department", fontsize=14, fontweight="bold")

# Always add legends when multiple series
ax.legend(title="Department", loc="best")

# Use tight_layout to prevent clipping
plt.tight_layout()

# Save at high resolution
plt.savefig("figure.png", dpi=150, bbox_inches="tight")

# Colour-blind friendly palettes
sns.set_palette("colorblind")  # or "viridis", "cividis"

Reproducibility

Code Level

# 1. Set random seed at the TOP of notebook/script
import numpy as np, random
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# 2. Document package versions (importlib.metadata is the stdlib,
#    Python 3.8+ replacement for the deprecated pkg_resources API)
from importlib import metadata
packages = ['pandas', 'numpy', 'scikit-learn', 'matplotlib', 'seaborn']
for pkg in packages:
    try:
        print(f"  {pkg}: {metadata.version(pkg)}")
    except metadata.PackageNotFoundError:
        # Missing packages are worth recording too, not a reason to crash
        print(f"  {pkg}: not installed")

# 3. Resolve paths from a stable anchor (the project root), never from
#    the current working directory — os.getcwd() differs between runs
from pathlib import Path
project_root = Path(__file__).parent.parent
data_path = project_root / "data" / "raw.csv"
df = pd.read_csv(data_path)

# 4. Never modify raw data
raw_df = pd.read_csv(data_path)
df = raw_df.copy()  # Always work on a copy

Data Level

# 1. Document data source and date
data_source = "https://api.example.com/data"
download_date = "2026-03-22"
data_version = "v2.1"

# 2. Record data shape and a content fingerprint.
# NOTE: hash_pandas_object is a fast per-row 64-bit hash, NOT an MD5 —
# for a true file checksum, run hashlib.md5 over the raw file bytes.
print(f"Shape: {df.shape}")
print(f"Row-hash fingerprint: {pd.util.hash_pandas_object(df, index=True).sum()}")

# 3. Document every transformation with before/after counts
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the customer dataset, reporting before/after counts per step.

    Steps:
    - Drop rows with negative ages (data-entry errors)
    - Fill missing salaries with 0
    - De-duplicate on (name, dob), keeping the first occurrence
    - Title-case department names

    Args:
        df: Raw customer DataFrame.

    Returns:
        Cleaned copy of the DataFrame (the input is never mutated).
    """
    df = df.copy()

    before = len(df)
    df = df[df['age'] >= 0]
    print(f"After removing negative ages: {len(df)} rows (dropped {before - len(df)})")

    # Report the fill too — every transformation gets a count, not just the first
    n_missing = int(df['salary'].isna().sum())
    df['salary'] = df['salary'].fillna(0)
    print(f"Filled {n_missing} missing salaries with 0")

    before = len(df)
    df = df.drop_duplicates(subset=['name', 'dob'])
    print(f"After de-duplication: {len(df)} rows (dropped {before - len(df)})")

    df['department'] = df['department'].str.title()
    return df

Environment Level

# requirements.txt
pandas==2.0.3
numpy==1.24.3
scikit-learn==1.3.0
matplotlib==3.7.1

# OR use conda environment.yml
# name: analysis-env
# dependencies:
#   - python=3.10
#   - pandas=2.0.3
#   - numpy=1.24.3

# Install: conda env create -f environment.yml

Documentation Level

# Analysis Project

## Overview
Predicts customer churn using historical purchase data.

## Data
- Source: company data warehouse
- Date: 2026-03-22
- Size: 100k customers, 50 features

## Steps to Reproduce
1. conda env create -f environment.yml
2. python scripts/download_data.py
3. jupyter notebook notebooks/01_eda.ipynb

## Key Results
- Model accuracy: 0.85 (test set)
- Top 3 features: tenure, monthly_charge, tech_support

Code Review Checklist

Data Handling

  • Is raw data never modified? (Always work on a copy)
  • Are file paths absolute, not relative?
  • Is data source and version documented?
  • Are column names clear and explained?
  • Are data types correct? (date as datetime, not string)
  • Are missing values documented and handled?
  • Are outliers checked and explained?
  • Are duplicates removed intentionally (not by accident)?

Transformations

  • Is every transformation documented (the WHY, not just what)?
  • Are before/after row counts reported?
  • Are edge cases handled? (empty data, single value)
  • Could someone else reproduce the exact transformations?
  • Are functions reusable? (Not hardcoded magic numbers)

Analysis

  • Is there a clear hypothesis or question?
  • Are the correct statistical methods used?
  • Are assumptions checked? (normality, independence)
  • Are confidence intervals or error bars reported?
  • Are multiple interpretations considered?
  • Is correlation distinguished from causation?

Visualisation

  • Are axes labeled with units?
  • Are titles informative (not generic)?
  • Is a legend present when multiple series shown?
  • Are colour schemes accessible (colourblind-friendly)?
  • Is resolution high enough (dpi ≥ 150)?

Code Quality

  • Is code readable? (Good variable names)
  • Are there comments explaining the WHY, not the WHAT?
  • Is the code DRY? (No repeated blocks)
  • Are loops avoided in favour of vectorised operations?
  • Are type hints used?

Reproducibility

  • Is random seed set?
  • Are package versions documented?
  • Can someone rerun the code and get identical results?
  • Is the analysis documented step-by-step?

Example Code Review

❌ Issues
# Magic number without explanation
df = df[df['salary'] > 100000]

# Transformation not documented
df['log_salary'] = np.log(df['salary'])

# Loop instead of vectorised
results = []
for idx, row in df.iterrows():
    results.append(row['a'] * row['b'])
✅ Better
# Named constant
MIN_SALARY = 100000  # focus on senior
df = df[df['salary'] > MIN_SALARY]

# Documented log transform
# log1p normalises the right-skewed distribution AND is safe when a
# salary is 0 (np.log(0) would yield -inf) — same choice as the
# chaining example earlier in this guide.
df['log_salary'] = np.log1p(df['salary'])
print(f"Skew: {df['salary'].skew():.2f}")

# Vectorised — 100x faster!
df['product'] = df['a'] * df['b']