08 Weights & Biases

Chapter 08: Weights & Biases (W&B)🔗

"W&B is the experiment tracking platform trusted by OpenAI, MidJourney, Cohere, and 30+ foundation model builders."


8.1 What is Weights & Biases?🔗

Weights & Biases (W&B / wandb) is a commercial ML experiment tracking platform with a generous free tier. It offers richer visualizations, collaboration features, and LLMOps capabilities compared to MLflow.

W&B vs MLflow at a Glance🔗

┌──────────────────────────────────────────────────────────────┐
│                    W&B vs MLFLOW                             │
├────────────────────────┬──────────────┬──────────────────────┤
│ Feature                │  MLflow      │  W&B                  │
├────────────────────────┼──────────────┼──────────────────────┤
│ Hosting                │ Self-hosted  │ Cloud-hosted (free)   │
│ Setup time             │ Minutes      │ Seconds               │
│ UI quality             │ Good         │ Excellent             │
│ Collaboration          │ Limited      │ Rich (teams, reports) │
│ Sweeps (HPO)           │ Basic        │ Advanced (Bayesian)   │
│ Artifacts              │ Yes          │ Yes (better UI)       │
│ LLMOps                 │ MLflow AI    │ Native (Prompts, etc) │
│ Alerts                 │ No           │ Yes                   │
│ Cost                   │ Free + infra │ Free tier; then $     │
└────────────────────────┴──────────────┴──────────────────────┘

8.2 W&B Core Components🔗

┌─────────────────────────────────────────────────────────────┐
│                    W&B COMPONENTS                           │
│                                                             │
│  Runs          → Individual training/eval experiments       │
│  Projects      → Group of related runs                      │
│  Sweeps        → Automated HPO (grid/random/Bayesian)       │
│  Artifacts     → Datasets, models, results (versioned)      │
│  Tables        → Interactive data/prediction analysis       │
│  Reports       → Shareable analysis documents               │
│  Alerts        → Slack/email notifications on metrics       │
│  Registry      → Enterprise model registry                  │
│  Prompts       → LLM prompt versioning/evaluation           │
└─────────────────────────────────────────────────────────────┘

8.3 W&B Experiment Tracking🔗

# pip install wandb

import wandb
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score

# NOTE: X_train/y_train/X_test/y_test and the `cv` splitter are assumed to be
# defined earlier in the chapter (tutorial placeholders).

# Initialize run: one run == one experiment. Everything in `config` is
# recorded with the run so experiments are comparable in the W&B UI.
run = wandb.init(
    project="churn-prediction",
    name="GBM-experiment-v3",
    tags=["gbm", "baseline", "v3"],
    config={
        "n_estimators": 200,
        "learning_rate": 0.05,
        "max_depth": 5,
        "dataset": "churn_v2",
        "model_type": "GradientBoosting"
    }
)

# Access config (hyperparams)
cfg = wandb.config

# Only these keys are valid GradientBoostingClassifier arguments. The config
# also carries bookkeeping entries ("dataset", "model_type") that sklearn
# would reject, so never splat the whole config into the estimator.
model_params = {
    "n_estimators": cfg.n_estimators,
    "learning_rate": cfg.learning_rate,
    "max_depth": cfg.max_depth,
}

# Train the final model on the full training set
model = GradientBoostingClassifier(**model_params)
model.fit(X_train, y_train)

# Log metrics per epoch (for neural nets, or per fold for CV)
for fold_i, (train_idx, val_idx) in enumerate(cv.split(X_train)):
    # BUG FIX: the original used GradientBoostingClassifier(**cfg), which
    # forwards the non-hyperparameter keys above and raises
    # TypeError: __init__() got an unexpected keyword argument 'dataset'.
    fold_model = GradientBoostingClassifier(**model_params)
    fold_model.fit(X_train[train_idx], y_train[train_idx])
    val_acc = accuracy_score(y_train[val_idx], fold_model.predict(X_train[val_idx]))

    wandb.log({
        "fold": fold_i,
        "val_accuracy": val_acc,
    })

# Final evaluation on the held-out test set
y_pred = model.predict(X_test)
wandb.log({
    "test_accuracy": accuracy_score(y_test, y_pred),
    "test_f1": f1_score(y_test, y_pred),
})

# Log confusion matrix (rendered as an interactive chart in the W&B UI)
wandb.log({
    "confusion_matrix": wandb.plot.confusion_matrix(
        preds=y_pred,
        y_true=y_test,
        class_names=["Not Churned", "Churned"]
    )
})

# Log artifacts (versioned datasets and models)
artifact = wandb.Artifact("churn-model", type="model")
artifact.add_file("models/model.pkl")
run.log_artifact(artifact)

# Finish run (flushes pending logs and marks the run complete)
wandb.finish()

8.4 W&B Sweeps — Automated HPO🔗

W&B Sweeps automate hyperparameter search with Bayesian optimization, random search, or grid search.

# Sweep definition — can equally live in a sweep_config.yaml file.
sweep_config = {
    "method": "bayes",  # search strategy: "bayes", "random", or "grid"
    # The run metric the sweep optimizes; must match a key passed to wandb.log().
    "metric": {"name": "test_accuracy", "goal": "maximize"},
    # Hyperband early stopping: abandon unpromising trials after >= 3 iterations.
    "early_terminate": {"type": "hyperband", "min_iter": 3},
    # Search space: discrete value lists plus continuous distributions.
    "parameters": {
        "n_estimators": {"values": [100, 200, 300, 500]},
        "learning_rate": {
            "distribution": "log_uniform_values",
            "min": 0.001,
            "max": 0.3,
        },
        "max_depth": {"values": [3, 5, 7, 10]},
        "subsample": {"distribution": "uniform", "min": 0.6, "max": 1.0},
    },
}

def train_sweep():
    # One sweep trial: wandb.init() (no args) pulls this trial's
    # hyperparameters from the sweep controller into run.config.
    with wandb.init() as run:
        params = run.config

        clf = GradientBoostingClassifier(
            n_estimators=params.n_estimators,
            learning_rate=params.learning_rate,
            max_depth=params.max_depth,
            subsample=params.subsample,
        )
        clf.fit(X_train, y_train)

        # Log the sweep's target metric under the exact name the
        # sweep config optimizes.
        preds = clf.predict(X_test)
        wandb.log({"test_accuracy": accuracy_score(y_test, preds)})

# Register the sweep with the W&B server; returns an id agents use to pull trials
sweep_id = wandb.sweep(sweep_config, project="churn-prediction")

# Run agents (can run on multiple machines in parallel!)
# Each agent repeatedly fetches a hyperparameter set from the controller and
# calls train_sweep with it; count=50 caps this agent at 50 trials.
wandb.agent(sweep_id, function=train_sweep, count=50)

8.5 W&B Artifacts (Data & Model Versioning)🔗

# ── Logging artifacts ─────────────────────────────────────
with wandb.init(project="churn-prediction") as run:

    # Version the processed dataset so downstream jobs can pin an exact copy.
    data_art = wandb.Artifact(
        name="churn-dataset",
        type="dataset",
        description="Customer churn dataset v2",
        metadata={"rows": 50000, "source": "gs://my-bucket/churn/"},
    )
    data_art.add_file("data/processed/features.csv")
    run.log_artifact(data_art)

    # Version the trained model together with its headline metrics.
    trained_art = wandb.Artifact(
        name="churn-model",
        type="model",
        metadata={"accuracy": 0.92, "framework": "sklearn"},
    )
    trained_art.add_file("models/model.pkl")
    run.log_artifact(trained_art)

# ── Using artifacts in downstream jobs ───────────────────
with wandb.init(project="churn-prediction") as run:

    # Download specific version
    # "churn-model:v3" pins version v3; use ":latest" for the newest version.
    artifact = run.use_artifact("churn-model:v3")
    artifact_dir = artifact.download()  # returns the local directory path

    import pickle
    # NOTE(review): pickle.load executes arbitrary code from the file —
    # only load model artifacts from trusted projects/teams.
    with open(f"{artifact_dir}/model.pkl", "rb") as f:
        model = pickle.load(f)

8.6 W&B Alerts🔗

import wandb

# Send alert when accuracy drops
with wandb.init(project="churn-prediction") as run:
    # evaluate_model() is assumed to be defined elsewhere (tutorial placeholder).
    accuracy = evaluate_model()

    # Triggers a notification (Slack/email, per workspace settings) tied to
    # this run; levels: INFO, WARN, CRITICAL.
    if accuracy < 0.80:
        wandb.alert(
            title="Model Accuracy Alert",
            text=f"Accuracy dropped to {accuracy:.3f} — below 0.80 threshold!",
            level=wandb.AlertLevel.CRITICAL,
        )

8.7 W&B Reports🔗

Reports are collaborative, shareable analysis documents combining charts, text, and code.

Use cases for W&B Reports:
  - Weekly model performance reviews
  - Experiment comparison for stakeholders
  - Model validation before production push
  - Post-mortem analysis of model failures
  - Sharing findings with non-technical team

Next → Chapter 09: ClearML & Neptune.ai