Core Principles
1. Never Modify Raw Data
# Always work on a copy
raw_df = pd.read_csv("data.csv")  # original — never touch
df = raw_df.copy()  # work on this


# Or load fresh each time
def get_data(path: str = "data.csv") -> pd.DataFrame:
    """Return a freshly loaded copy of the dataset.

    Args:
        path: CSV file to read. Defaults to the project's raw data file,
            so existing callers (`get_data()`) keep working unchanged.

    Returns:
        A new DataFrame on every call, so callers may mutate it freely
        without touching any shared state.
    """
    return pd.read_csv(path)
2. Document Every Transformation
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the customer dataset.

    Transformations applied:
    - Drop rows with a null name (the customer cannot be identified)
    - Fill missing age with the median (distribution is ~normal)
    - Remove salary outliers via the IQR method (keeps >99.5% of rows)
    - Standardise department names (lowercase, whitespace stripped)

    Args:
        df: Raw customer DataFrame (n rows × m columns).

    Returns:
        Cleaned DataFrame with the same columns and possibly fewer rows.
    """
    cleaned = df.copy()
    # ... transformations
    return cleaned
3. Make Analysis Reproducible
# Set random seed
np.random.seed(42)

# Record package versions. importlib.metadata is the stdlib replacement
# for the deprecated setuptools pkg_resources API.
from importlib import metadata

packages = ["pandas", "numpy", "scikit-learn", "matplotlib"]
for p in packages:
    # Tolerate absent packages so the version report never crashes.
    try:
        version = metadata.version(p)
    except metadata.PackageNotFoundError:
        version = "not installed"
    print(f"{p}: {version}")
# OR: use a requirements.txt / pyproject.toml
DataFrame Best Practices
Method Chaining
# ✅ Method chaining with parentheses for readability.
# NOTE: inside a chain, pass lambdas to .assign() so each column is
# computed on the *current* (already filtered) frame — referencing the
# outer `df` directly would misalign rows after dropna()/query().
result = (
    df
    .dropna(subset=["salary"])
    .query("age >= 18")
    .assign(
        age_group=lambda d: pd.cut(d["age"], bins=[0, 30, 50, 100],
                                   labels=["young", "mid", "senior"]),
        log_salary=lambda d: np.log1p(d["salary"]),
    )
    .groupby("department")
    # Named aggregation yields flat columns, so "avg_salary" actually
    # exists for sort_values (a dict-of-lists agg would produce a
    # MultiIndex and make the sort raise KeyError).
    .agg(avg_salary=("salary", "mean"), n=("salary", "count"))
    .sort_values("avg_salary", ascending=False)
)
pipe() + reset_index()
# ✅ Use .pipe() for custom transformations in chains
def remove_outliers(df, col):
    """Drop rows of *df* whose *col* value lies outside the 1.5×IQR fences."""
    lower_q, upper_q = df[col].quantile([0.25, 0.75])
    spread = upper_q - lower_q
    lo = lower_q - 1.5 * spread
    hi = upper_q + 1.5 * spread
    return df[df[col].between(lo, hi)]
result = df.pipe(remove_outliers, "salary")

# ✅ reset_index() after groupby when needed
grouped = (
    df.groupby("dept")["salary"]
    .mean()
    .reset_index()
    # Same outcome as assigning grouped.columns = ["dept", "avg_salary"],
    # but rename() is explicit about which column changes.
    .rename(columns={"salary": "avg_salary"})
)
Visualisation Best Practices
# Visualisation snippet — assumes `ax` (a matplotlib Axes), `plt`
# (matplotlib.pyplot) and `sns` (seaborn) are already in scope.
# Always label axes
ax.set_xlabel("Age (years)", fontsize=12)
ax.set_ylabel("Salary (£)", fontsize=12)
ax.set_title("Salary vs Age by Department", fontsize=14, fontweight="bold")
# Always add legends when multiple series
ax.legend(title="Department", loc="best")
# Use tight_layout to prevent clipping of labels/titles
plt.tight_layout()
# Save at high resolution; bbox_inches="tight" trims surplus margins
plt.savefig("figure.png", dpi=150, bbox_inches="tight")
# Colour-blind friendly palettes
sns.set_palette("colorblind") # or "viridis", "cividis"
Reproducibility
Code Level
# 1. Set random seed at the TOP of notebook/script
import random

import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# 2. Document package versions. importlib.metadata is the stdlib
# replacement for the deprecated setuptools pkg_resources API.
from importlib import metadata

packages = ['pandas', 'numpy', 'scikit-learn', 'matplotlib', 'seaborn']
for pkg in packages:
    # Tolerate absent packages so the version report never crashes.
    try:
        version = metadata.version(pkg)
    except metadata.PackageNotFoundError:
        version = 'not installed'
    print(f" {pkg}: {version}")

# 3. Use absolute paths, not relative
from pathlib import Path

project_root = Path(__file__).parent.parent
data_path = project_root / "data" / "raw.csv"

# 4. Never modify raw data — read ONCE (the original read the file twice
# and discarded the first result), then work on a copy.
raw_df = pd.read_csv(data_path)
df = raw_df.copy()  # Always work on a copy
Data Level
# 1. Document data source and date
data_source = "https://api.example.com/data"
download_date = "2026-03-22"
data_version = "v2.1"

# 2. Record data shape and checksums
import hashlib

print(f"Shape: {df.shape}")
# hash_pandas_object yields one uint64 per row; feed those bytes into a
# real MD5 so the printed label is truthful (the original printed the
# *sum* of the row hashes, which is not an MD5 checksum at all).
digest = hashlib.md5(
    pd.util.hash_pandas_object(df, index=True).values.tobytes()
).hexdigest()
print(f"MD5: {digest}")
# 3. Document every transformation with before/after counts
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the customer dataset, logging row counts after each step.

    Steps, in order: drop negative ages, fill missing salaries with 0,
    de-duplicate on (name, dob), Title-Case department names.

    Args:
        df: Raw customer DataFrame with at least the columns
            'age', 'salary', 'name', 'dob', 'department'.

    Returns:
        A cleaned copy; the input frame is never modified.
    """
    df = df.copy()

    before = len(df)
    df = df[df['age'] >= 0]
    print(f"After removing negative ages: {len(df)} rows (dropped {before - len(df)})")

    # Log every step, as the section heading promises — the original
    # only reported the first transformation.
    before = len(df)
    df['salary'] = df['salary'].fillna(0)
    print(f"After filling missing salaries: {len(df)} rows (dropped {before - len(df)})")

    before = len(df)
    df = df.drop_duplicates(subset=['name', 'dob'])
    print(f"After dropping duplicates: {len(df)} rows (dropped {before - len(df)})")

    df['department'] = df['department'].str.title()
    return df
Environment Level
# requirements.txt
pandas==2.0.3
numpy==1.24.3
scikit-learn==1.3.0
matplotlib==3.7.1
# OR use conda environment.yml
# name: analysis-env
# dependencies:
# - python=3.10
# - pandas=2.0.3
# - numpy=1.24.3
# Install: conda env create -f environment.yml
Documentation Level
# Analysis Project
## Overview
Predicts customer churn using historical purchase data.
## Data
- Source: company data warehouse
- Date: 2026-03-22
- Size: 100k customers, 50 features
## Steps to Reproduce
1. conda env create -f environment.yml
2. python scripts/download_data.py
3. jupyter notebook notebooks/01_eda.ipynb
## Key Results
- Model accuracy: 0.85 (test set)
- Top 3 features: tenure, monthly_charge, tech_support