🐍 Python Fundamentals

# Variables & Types
x = 42            # int
x = 3.14          # float
x = "hello"       # str
x = True          # bool
x = None          # NoneType
x = [1,2,3]       # list (mutable)
x = (1,2,3)       # tuple (immutable)
x = {"a":1}       # dict
x = {1,2,3}       # set

# String f-strings
f"Hello {name}! Score: {score:.2f}"

# List comprehension
[x**2 for x in range(10) if x % 2 == 0]

# Dict comprehension
{k: v for k, v in items.items() if v > 0}

# Unpacking
a, *b, c = [1,2,3,4,5]   # b = [2,3,4]
x, y = y, x               # swap variables

# Context manager
with open("file.txt") as f:
    data = f.read()

# Lambda
square = lambda x: x**2
sorted(items, key=lambda x: x["name"])

🏗️ OOP Quick Reference

class Animal:
    """Reference class demonstrating core OOP features.

    Shows class vs instance variables, a validated property, an
    alternate constructor (@classmethod), a utility @staticmethod,
    and the common dunder methods.
    """

    species = "Unknown"   # class variable, shared by all instances

    def __init__(self, name: str, age: int):
        self.name = name     # instance variable (public)
        self._age = age      # "protected" by convention (single underscore)

    @property
    def age(self):
        """Read access to the protected ``_age`` attribute."""
        return self._age

    @age.setter
    def age(self, v):
        # Validate before mutating so _age can never go negative via the property.
        if v < 0:
            raise ValueError("age must be non-negative")
        self._age = v

    @classmethod
    def from_birth_year(cls, name, year, current_year=2026):
        """Alternate constructor: derive age from a birth year.

        ``current_year`` was previously hard-coded as 2026; it is now a
        parameter (same default, so existing callers are unaffected).
        """
        return cls(name, current_year - year)

    @staticmethod
    def is_valid_age(age):
        """Utility check that touches no class or instance state."""
        return age >= 0

    def __str__(self):
        # Human-readable form, e.g. "Rex (3)"
        return f"{self.name} ({self._age})"

    def __repr__(self):
        # Unambiguous debugging form
        return f"Animal(name={self.name!r})"

    def __eq__(self, other):
        # Fix: comparing against a non-Animal used to raise AttributeError
        # (other.name). Returning NotImplemented lets Python fall back to
        # the other operand / identity, the documented protocol.
        # NOTE: defining __eq__ without __hash__ makes instances unhashable.
        if not isinstance(other, Animal):
            return NotImplemented
        return self.name == other.name


class Dog(Animal):
    """A dog: an Animal that additionally records its breed."""

    def __init__(self, name, age, breed):
        """Initialise via the Animal constructor, then store the breed."""
        super().__init__(name, age)
        self.breed = breed

    def speak(self):
        """Return the noise a dog makes."""
        return "Woof!"

# Dataclass
from dataclasses import dataclass
@dataclass
class Point:
    """A 2-D point with a distance-from-origin helper."""

    x: float
    y: float

    def distance(self):
        """Return the Euclidean distance of this point from the origin."""
        squared_sum = self.x**2 + self.y**2
        return squared_sum**0.5

📊 Data Analysis Quick Reference

import pandas as pd
import numpy as np

# Load & inspect
df = pd.read_csv("data.csv")
df.head() | df.info() | df.describe()
df.isnull().sum() | df.dtypes

# Select
df["col"] | df[["col1","col2"]]
df.iloc[0] | df.loc[0, "col"]
df[(df["age"] > 30) & df["dept"].isin(["Engineering"])]   # parenthesise: & binds tighter than >

# Transform
df["new"] = df["old"].apply(func)
df.assign(new_col=lambda df: df["a"] + df["b"])

# GroupBy
df.groupby("dept")["salary"].agg(["mean","count"])
df.groupby("dept").agg(avg=("salary","mean"), n=("id","count"))

# Merge
pd.merge(df1, df2, on="id", how="left")

# NumPy
np.array([1,2,3])
arr[arr > 5]          # boolean indexing
arr * 2               # vectorised operation
np.mean/std/min/max/sum(arr)

🤖 ML Quick Reference

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y
)

# Pipeline (prevents leakage!)
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", RandomForestClassifier(n_estimators=100))
])
pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)

# Cross-validate
scores = cross_val_score(pipe, X, y, cv=5, scoring="roc_auc")

# Evaluate
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))

# Tune
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(pipe, {"model__n_estimators": [100,200]}, cv=5)
gs.fit(X_train, y_train)

🧠 AI in Python Quick Reference

# OpenAI
from openai import OpenAI
client = OpenAI()  # reads OPENAI_API_KEY from env

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are helpful."},
        {"role": "user", "content": "Hello!"}
    ],
    temperature=0.7
)
text = response.choices[0].message.content

# Anthropic
import anthropic
client = anthropic.Anthropic()
msg = client.messages.create(
    model="claude-opus-4-6",
    max_tokens=1024,
    messages=[{"role":"user","content":"Hello!"}]
)
text = msg.content[0].text

# Embeddings
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(["text 1", "text 2"])

# LangChain RAG
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

vectorstore = FAISS.from_texts(docs, OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

chain = (
    {"context": retriever, "question": lambda x: x}
    | ChatPromptTemplate.from_template("Context: {context}\nQ: {question}")
    | ChatOpenAI()
    | StrOutputParser()
)
answer = chain.invoke("my question")

📐 Key Concepts Summary

Topic — Key Concepts
Python Basics — Variables, types, operators, f-strings
Control Flow — if/elif/else, for, while, comprehensions
Functions — *args, **kwargs, closures, decorators
Data Structures — list, dict, tuple, set, Counter, defaultdict
OOP Pillars — Encapsulation, Abstraction, Inheritance, Polymorphism
OOP Patterns — Singleton, Factory, Observer, Strategy
NumPy — Arrays, broadcasting, vectorisation, linear algebra
Pandas — Series, DataFrame, groupby, merge
EDA — Missing values, distributions, correlations
ML Workflow — Split → Preprocess → Train → Evaluate → Tune
Supervised — Logistic/Linear Regression, Trees, RF, XGBoost, SVM
Unsupervised — K-Means, DBSCAN, PCA, t-SNE
LLMs — Tokenisation, attention, context window, temperature
RAG — Embed → Store → Retrieve → Generate

📋 Practice Questions

Click any question to reveal the answer.

Pathway 1: Python Fundamentals

Q1. What is the output of x = [1, 2, 3]; x[1] = 5; print(x)?
[1, 5, 3] — Lists are mutable, indexing is 0-based.
Q2. Unpack [1, 2, 3, 4] so that a=1, b=4, c=[2,3]
a, *c, b = [1, 2, 3, 4]
# a = 1, c = [2, 3], b = 4
Q3. What's the difference between == and is?
== compares value, is compares object identity. [1] == [1] is True, but [1] is [1] is False.
Q4. How do list comprehensions compare to map() in readability?
Comprehensions are more readable and generally faster. Preferred in Python.
Q5. Write a function that accepts *args and **kwargs
# *args collects extra positional arguments into a tuple,
# **kwargs collects extra keyword arguments into a dict.
def func(*args, **kwargs):
    print(f"Args: {args}, Kwargs: {kwargs}")

func(1, 2, name="John")
# Args: (1, 2), Kwargs: {'name': 'John'}

Pathway 2: OOP

Q6. Explain the Single Responsibility Principle with code
Each class has one job:
# ❌ Violates SRP — one class mixes domain data, persistence, and messaging
class User:
    def save_to_db(self): ...
    def send_email(self): ...

# ✅ Correct — each class has exactly one reason to change
class User: pass
class UserRepository:
    # persistence only
    def save(self, user): ...
class EmailService:
    # messaging only
    def send(self, user): ...
Q7. What is the difference between @property and a regular method?
@property lets you use obj.attr syntax while running a method. Regular methods require obj.method().
Q8. How does inheritance differ from composition?
Inheritance: "is-a" (Dog IS-A Animal). Composition: "has-a" (Car HAS-A Engine). Prefer composition for flexibility.
Q9. What does super().__init__() do?
Calls the parent class __init__(), initialising base class attributes.
Q10. Create a dataclass with a custom __post_init__ method
from dataclasses import dataclass

@dataclass
class Person:
    """A person record; validation runs right after field assignment."""

    name: str
    age: int

    def __post_init__(self):
        # Dataclass hook invoked automatically at the end of __init__.
        if self.age >= 0:
            return
        raise ValueError("Age can't be negative")

Pathway 3: Data Analysis

Q11. How do you remove duplicate rows in Pandas?
df.drop_duplicates()
Q12. What's the difference between .loc[] and .iloc[]?
.loc[] is label-based, .iloc[] is integer position-based.
Q13. Fill missing values only in the 'age' column
df['age'] = df['age'].fillna(df['age'].median())
Q14. Group by a column and get the mean of another
df.groupby('column').agg({'other_column': 'mean'})
Q15. What does df.assign() do differently?
.assign() returns a new DataFrame without modifying the original (non-mutating). Great for method chaining.

Pathway 4: Machine Learning

Q16. Why fit the scaler on training data only?
Fitting on all data (including test) leaks test information into training. The model "sees" test data statistics during training, giving unrealistically good performance.
Q17. Cross-validation vs train-test split?
CV: split into k folds, train k times. Train-test: single split. CV gives a more robust, unbiased estimate of model performance.
Q18. Explain data leakage with an example
Using future information (e.g., a target date) as a feature for predicting that same target. Example: using "last payment date" to predict loan default when that date is AFTER the prediction point.
Q19. What is the purpose of a Pipeline in sklearn?
Pipeline chains preprocessing and model training, automatically preventing data leakage by ensuring preprocessing is fit only on training data.
Q20. Why report multiple metrics, not just accuracy?
Accuracy alone is misleading, especially with imbalanced data. Use precision, recall, F1, ROC-AUC depending on the problem and business context.

Pathway 5: AI in Python

Q21. What is a token in the context of LLMs?
A token is an atomic unit of text — roughly 1 word or 4 characters. LLMs process text as sequences of tokens.
Q22. How do you set the temperature in an API call?
response = client.chat.completions.create(
    model="gpt-4",
    temperature=0.7,  # 0=deterministic, 1=random
    messages=[...]
)
Q23. Embeddings vs text generation?
Embeddings: dense vectors representing semantic meaning (used for similarity search). Text generation: predicting next token sequences (used for chat, writing).
Q24. What is RAG (Retrieval-Augmented Generation)?
RAG: Retrieve relevant documents from a vector store, then generate an answer using those documents as context. Reduces hallucination and keeps answers grounded in source material.
Q25. How do you handle API authentication for OpenAI?
from openai import OpenAI

# Best: environment variable
client = OpenAI()  # reads OPENAI_API_KEY from env

# Or explicitly (but keep out of source control!)
client = OpenAI(api_key="sk-...")