🐍 Python Fundamentals
# Variables & Types
x = 42 # int
x = 3.14 # float
x = "hello" # str
x = True # bool
x = None # NoneType
x = [1,2,3] # list (mutable)
x = (1,2,3) # tuple (immutable)
x = {"a":1} # dict
x = {1,2,3} # set
# String f-strings
f"Hello {name}! Score: {score:.2f}"
# List comprehension
[x**2 for x in range(10) if x % 2 == 0]
# Dict comprehension
{k: v for k, v in items.items() if v > 0}
# Unpacking
a, *b, c = [1,2,3,4,5] # b = [2,3,4]
x, y = y, x # swap variables
# Context manager
with open("file.txt") as f:
data = f.read()
# Lambda
square = lambda x: x**2
sorted(items, key=lambda x: x["name"])
🏗️ OOP Quick Reference
class Animal:
    """Reference class showing class/instance variables, properties,
    alternate constructors, and the common dunder methods."""

    species = "Unknown"  # class variable, shared by all instances

    def __init__(self, name: str, age: int):
        self.name = name  # instance variable
        self._age = age   # "protected" by convention (single leading underscore)

    @property
    def age(self) -> int:
        """Read access with attribute syntax: obj.age."""
        return self._age

    @age.setter
    def age(self, v: int) -> None:
        """Validated write access: obj.age = v. Raises ValueError if negative."""
        if v < 0:
            raise ValueError()
        self._age = v

    @classmethod
    def from_birth_year(cls, name, year):
        """Alternate constructor: derive age from a birth year.

        NOTE: reference year is hard-coded to 2026.
        """
        return cls(name, 2026 - year)

    @staticmethod
    def is_valid_age(age):
        """Utility needing neither the instance nor the class."""
        return age >= 0

    def __str__(self) -> str:
        # Human-readable form, used by print() / str().
        return f"{self.name} ({self._age})"

    def __repr__(self) -> str:
        # Unambiguous developer-facing form.
        return f"Animal(name={self.name!r})"

    def __eq__(self, other):
        # Return NotImplemented for foreign types so Python can fall back to
        # the reflected comparison instead of raising AttributeError
        # (the original `other.name` blew up on e.g. Animal(...) == 42).
        if not isinstance(other, Animal):
            return NotImplemented
        return self.name == other.name

    # Defining __eq__ implicitly sets __hash__ = None (instances become
    # unhashable). Restore hashability, keyed on the same field as __eq__.
    def __hash__(self):
        return hash(self.name)
class Dog(Animal):
    """Subclass example: extend the base initializer and add behaviour."""

    def __init__(self, name, age, breed):
        # Let the base class set up name/age, then add subclass state.
        super().__init__(name, age)
        self.breed = breed

    def speak(self):
        """Dogs say woof."""
        return "Woof!"
# Dataclass
from dataclasses import dataclass
@dataclass
class Point:
    """A 2-D point with generated __init__, __repr__, and __eq__."""

    x: float
    y: float

    def distance(self):
        """Euclidean distance from the origin."""
        squared = self.x * self.x + self.y * self.y
        return squared ** 0.5
📊 Data Analysis Quick Reference
import pandas as pd
import numpy as np
# Load & inspect
df = pd.read_csv("data.csv")
df.head() | df.info() | df.describe()
df.isnull().sum() | df.dtypes
# Select
df["col"] | df[["col1","col2"]]
df.iloc[0] | df.loc[0, "col"]
df[(df["age"] > 30) & df["dept"].isin(["Engineering"])]  # parenthesise comparisons: & binds tighter than >
# Transform
df["new"] = df["old"].apply(func)
df.assign(new_col=lambda df: df["a"] + df["b"])
# GroupBy
df.groupby("dept")["salary"].agg(["mean","count"])
df.groupby("dept").agg(avg=("salary","mean"), n=("id","count"))
# Merge
pd.merge(df1, df2, on="id", how="left")
# NumPy
np.array([1,2,3])
arr[arr > 5] # boolean indexing
arr * 2 # vectorised operation
np.mean/std/min/max/sum(arr)
🤖 ML Quick Reference
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, stratify=y
)
# Pipeline (prevents leakage!)
pipe = Pipeline([
("scaler", StandardScaler()),
("model", RandomForestClassifier(n_estimators=100))
])
pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)
# Cross-validate
scores = cross_val_score(pipe, X, y, cv=5, scoring="roc_auc")
# Evaluate
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))
# Tune
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(pipe, {"model__n_estimators": [100,200]}, cv=5)
gs.fit(X_train, y_train)
🧠 AI in Python Quick Reference
# OpenAI
from openai import OpenAI
client = OpenAI() # reads OPENAI_API_KEY from env
response = client.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "system", "content": "You are helpful."},
{"role": "user", "content": "Hello!"}
],
temperature=0.7
)
text = response.choices[0].message.content
# Anthropic
import anthropic
client = anthropic.Anthropic()
msg = client.messages.create(
model="claude-opus-4-6",
max_tokens=1024,
messages=[{"role":"user","content":"Hello!"}]
)
text = msg.content[0].text
# Embeddings
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(["text 1", "text 2"])
# LangChain RAG
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
vectorstore = FAISS.from_texts(docs, OpenAIEmbeddings())
retriever = vectorstore.as_retriever()
chain = (
{"context": retriever, "question": lambda x: x}
| ChatPromptTemplate.from_template("Context: {context}\nQ: {question}")
| ChatOpenAI()
| StrOutputParser()
)
answer = chain.invoke("my question")
📋 Practice Questions
Click any question to reveal the answer.
Pathway 1: Python Fundamentals
[1, 5, 3] — Lists are mutable, indexing is 0-based.
a, *c, b = [1, 2, 3, 4]
# a = 1, c = [2, 3], b = 4
`==` compares values; `is` compares object identity. `[1] == [1]` evaluates to `True`, but `[1] is [1]` evaluates to `False` (two distinct list objects).
Comprehensions are more readable and generally faster. Preferred in Python.
def func(*args, **kwargs):
    """Demonstrate how *args collects extra positionals into a tuple
    and **kwargs collects extra keyword arguments into a dict."""
    print(f"Args: {args}, Kwargs: {kwargs}")


func(1, 2, name="John")
# prints: Args: (1, 2), Kwargs: {'name': 'John'}
Pathway 2: OOP
Each class has one job:
# ❌ Violates SRP: one class owns persistence AND notification concerns
class User:
    def save_to_db(self): ...
    def send_email(self): ...

# ✅ Correct
# Each class now has exactly one reason to change.
class User: pass

class UserRepository:
    # Persistence lives here.
    def save(self, user): ...

class EmailService:
    # Notification lives here.
    def send(self, user): ...
@property lets you use obj.attr syntax while running a method. Regular methods require obj.method().
Inheritance: "is-a" (Dog IS-A Animal). Composition: "has-a" (Car HAS-A Engine). Prefer composition for flexibility.
Calls the parent class __init__(), initialising base class attributes.
from dataclasses import dataclass
@dataclass
class Person:
    """Dataclass with post-construction validation."""

    name: str
    age: int

    def __post_init__(self):
        # Runs right after the generated __init__ — the standard hook
        # for validating or deriving fields.
        if self.age < 0:
            raise ValueError("Age can't be negative")
Pathway 3: Data Analysis
.loc[] is label-based, .iloc[] is integer position-based.
df['age'] = df['age'].fillna(df['age'].median())
df.groupby('column').agg({'other_column': 'mean'})
.assign() returns a new DataFrame without modifying the original (non-mutating). Great for method chaining.
Pathway 4: Machine Learning
Fitting on all data (including test) leaks test information into training. The model "sees" test data statistics during training, giving unrealistically good performance.
CV: split into k folds, train k times. Train-test: single split. CV gives a more robust, unbiased estimate of model performance.
Using future information (e.g., a target date) as a feature for predicting that same target. Example: using "last payment date" to predict loan default when that date is AFTER the prediction point.
Pipeline chains preprocessing and model training, automatically preventing data leakage by ensuring preprocessing is fit only on training data.
Accuracy alone is misleading, especially with imbalanced data. Use precision, recall, F1, ROC-AUC depending on the problem and business context.
Pathway 5: AI in Python
A token is an atomic unit of text — roughly ¾ of a word, or about 4 characters of English text. LLMs process text as sequences of tokens.
response = client.chat.completions.create(
model="gpt-4",
temperature=0.7, # 0=deterministic, 1=random
messages=[...]
)
Embeddings: dense vectors representing semantic meaning (used for similarity search). Text generation: predicting next token sequences (used for chat, writing).
RAG: Retrieve relevant documents from a vector store, then generate an answer using those documents as context. Reduces hallucination and keeps answers grounded in source material.
from openai import OpenAI
# Best: environment variable
client = OpenAI() # reads OPENAI_API_KEY from env
# Or explicitly (but keep out of source control!)
client = OpenAI(api_key="sk-...")