From ai-ml-pro
ML workflows, training pipelines, model evaluation, deployment, experiment tracking. Use when building or reviewing ML systems.
How this skill is triggered — by the user, by Claude, or both
Slash command
/ai-ml-pro:ai-ml-pro
The summary Claude sees in its skill listing — used to decide when to auto-load this skill
Build production ML systems: data pipelines, training, evaluation, model serving, experiment tracking, and MLOps. Treat models as software — versioned, tested, monitored, and reproducible.
Build production ML systems: data pipelines, training, evaluation, model serving, experiment tracking, and MLOps. Treat models as software — versioned, tested, monitored, and reproducible.
Use this when:
Use this ESPECIALLY when:
Don't skip when:
project/
data/
raw/
processed/
features/
notebooks/
01_eda.ipynb
02_feature_engineering.ipynb
src/
features/
build_features.py
models/
train.py
predict.py
evaluate.py
pipelines/
training_pipeline.py
inference_pipeline.py
tests/
test_features.py
test_model.py
configs/
experiment_001.yaml
experiment_002.yaml
models/ ← Trained artifacts (gitignored)
reports/
figures/
pyproject.toml
requirements.txt
Dockerfile
.dvc/config ← Data version control
# Experiment tracking with MLflow (Weights & Biases is a comparable alternative)
import mlflow
mlflow.set_tracking_uri("http://mlflow:5000")
mlflow.set_experiment("project-name")

# Everything recorded for the run, grouped by kind so each group is logged
# by one loop. Dict/tuple order fixes the order of the logging calls.
run_params = {
    "model_type": "random_forest",
    "n_estimators": 100,
    "max_depth": 10,
}
run_metrics = {
    "accuracy": 0.94,
    "f1_score": 0.92,
    "latency_ms": 45,
}
run_artifacts = ("confusion_matrix.png", "feature_importance.png")

with mlflow.start_run():
    # Log parameters
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    # Log metrics
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Log artifacts
    for artifact_path in run_artifacts:
        mlflow.log_artifact(artifact_path)

    # Log model (`model` is not defined in this snippet; it must already be
    # in scope when this runs)
    mlflow.sklearn.log_model(model, "model")
# src/pipelines/training_pipeline.py
from pathlib import Path
import yaml
class TrainingPipeline:
    """Config-driven training run: load, validate, featurize, split, train, evaluate, save.

    The step methods called by ``run`` (``load_data``, ``validate_data``,
    ``build_features``, ``train_model``, ``evaluate``, ``save_model`` and
    ``save_metrics``) are not defined in this class body; they have to be
    provided elsewhere, e.g. by a subclass.
    """

    def __init__(self, config_path: str):
        """Read the YAML experiment config stored at ``config_path``.

        Args:
            config_path: Path to a YAML file whose top level is a mapping.

        Raises:
            ValueError: If the file is empty or its top level is not a mapping.
        """
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(config_path, encoding="utf-8") as f:
            config = yaml.safe_load(f)
        # safe_load returns None for an empty file; fail here with a clear
        # message instead of a TypeError on the first key lookup in run().
        if not isinstance(config, dict):
            raise ValueError(
                f"Config {config_path!r} must contain a YAML mapping, "
                f"got {type(config).__name__}"
            )
        self.config = config

    def run(self):
        """Execute the full pipeline once.

        Reads ``config['data']['test_size']`` and ``config['data']['seed']``
        for the train/test split.

        Returns:
            A ``(model, metrics)`` tuple: the value returned by
            ``train_model`` and the value returned by ``evaluate``.
        """
        # train_test_split was used without being imported in this module;
        # importing it here keeps the fix local to the method that needs it.
        from sklearn.model_selection import train_test_split

        # 1. Load and validate data
        data = self.load_data()
        self.validate_data(data)

        # 2. Feature engineering
        features = self.build_features(data)

        # 3. Train/test split (seeded from config)
        split_cfg = self.config['data']
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            data.target,
            test_size=split_cfg['test_size'],
            random_state=split_cfg['seed'],
        )

        # 4. Train model
        model = self.train_model(X_train, y_train)

        # 5. Evaluate on the held-out split only
        metrics = self.evaluate(model, X_test, y_test)

        # 6. Save artifacts
        self.save_model(model)
        self.save_metrics(metrics)
        return model, metrics
# src/models/evaluate.py
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report, roc_auc_score,
)
def evaluate_classification(model, X_test, y_test):
    """Score a fitted classifier on a held-out set.

    Args:
        model: Object with ``predict`` and, optionally, ``predict_proba``.
        X_test: Feature matrix passed unchanged to the model.
        y_test: True labels for ``X_test``.

    Returns:
        Dict with ``accuracy`` and support-weighted ``precision``, ``recall``
        and ``f1``. ``roc_auc`` is added only when the model exposes
        ``predict_proba``.
    """
    y_pred = model.predict(X_test)
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'f1': f1_score(y_test, y_pred, average='weighted'),
    }

    if not hasattr(model, 'predict_proba'):
        return metrics

    proba = model.predict_proba(X_test)
    if proba.shape[1] == 2:
        # Binary case: score with the probability of the second class column.
        metrics['roc_auc'] = roc_auc_score(y_test, proba[:, 1])
    else:
        # Multiclass case: a single probability column is not a valid score,
        # so use the full matrix with one-vs-rest averaging. Weighted
        # averaging matches the other metrics above. `labels` tells
        # roc_auc_score which class each probability column refers to.
        metrics['roc_auc'] = roc_auc_score(
            y_test,
            proba,
            multi_class='ovr',
            average='weighted',
            labels=getattr(model, 'classes_', None),
        )
    return metrics
# Test: Model metrics must exceed baseline
# Fails unless metrics['f1'] is strictly greater than BASELINE_F1; both
# `metrics` and `BASELINE_F1` must already be defined where this runs.
assert metrics['f1'] > BASELINE_F1, f"F1 {metrics['f1']} below baseline {BASELINE_F1}"
# Test: No data leakage (time-based split for time series)
# TODO: placeholder only; no check for this is implemented here.
# Test: Reproducibility (same seed → same results)
# TODO: placeholder only; no check for this is implemented here.
# FastAPI inference endpoint
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
# Application object; the prediction route below is registered on it.
app = FastAPI()
# Loaded once when this module is imported and shared by every request.
# NOTE(review): `load_model` is neither defined nor imported in this snippet.
# The `.pkl` suffix suggests a pickle file; if so, only load artifacts from a
# trusted location, since unpickling can execute arbitrary code.
model = load_model("models/production/model.pkl")
class PredictionRequest(BaseModel):
    """Request body for the prediction endpoint."""

    # One flat feature vector. The handler checks its length against
    # EXPECTED_FEATURES and wraps it into a single-row batch for the model.
    features: list[float]
class PredictionResponse(BaseModel):
    """Response body returned by the prediction endpoint."""

    # First element of model.predict for the submitted row.
    prediction: float
    # Largest value from predict_proba for the row; None when the model has
    # no predict_proba method.
    probability: float | None
    # NOTE(review): under pydantic v2, field names starting with `model_` can
    # trigger a protected-namespace warning; confirm against the pinned version.
    model_version: str
    # Time spent inside the handler, in milliseconds.
    latency_ms: float
@app.post("/api/v1/predict", response_model=PredictionResponse)
async def predict(request: PredictionRequest):
    """Run the module-level model on one feature vector.

    Args:
        request: Body carrying a flat list of feature values.

    Returns:
        PredictionResponse with the prediction, the top class probability
        (``None`` if the model has no ``predict_proba``), the model version
        and the handler latency in milliseconds.

    Raises:
        HTTPException: 400 when the number of features differs from
            ``EXPECTED_FEATURES``.
    """
    import time

    # perf_counter is monotonic, so the measured latency cannot go negative or
    # jump when the system clock is adjusted (time.time() can).
    start = time.perf_counter()

    if len(request.features) != EXPECTED_FEATURES:
        raise HTTPException(400, f"Expected {EXPECTED_FEATURES} features")

    # NOTE: the model calls below run synchronously inside this coroutine, so
    # the event loop is blocked for as long as inference takes.
    batch = [request.features]  # the model expects a 2-D, single-row input
    prediction = model.predict(batch)[0]

    probability = None
    if hasattr(model, 'predict_proba'):
        probability = float(model.predict_proba(batch)[0].max())

    latency = (time.perf_counter() - start) * 1000
    return PredictionResponse(
        # Convert to a plain float so the response model never has to
        # validate a numpy scalar type.
        prediction=float(prediction),
        probability=probability,
        model_version=MODEL_VERSION,
        latency_ms=round(latency, 2),
    )
# Data validation with pandera (alternatives: Great Expectations, pandas profiling)
import pandas as pd
import pandera as pa
class InputSchema(pa.DataFrameModel):
    """Pandera schema for the input frame: column types plus allowed ranges and values."""

    # Integer in the closed range [0, 120].
    age: int = pa.Field(ge=0, le=120)
    # Float, zero or greater.
    income: float = pa.Field(ge=0)
    # Integer in the closed range [0, 30].
    education_years: int = pa.Field(ge=0, le=30)
    # Must be exactly one of the four listed labels.
    employment_status: str = pa.Field(isin=['employed', 'unemployed', 'student', 'retired'])
@pa.check_types
def validate_input(df: pd.DataFrame) -> pd.DataFrame:
    """Validate ``df`` against ``InputSchema`` and the extra domain checks.

    Args:
        df: Input frame with the columns declared on ``InputSchema``.

    Returns:
        The same frame, unchanged, when every check passes.

    Raises:
        AssertionError: If ``age`` has missing values or ``income`` has a
            value that is not >= 0. Schema violations raise whatever
            ``InputSchema.validate`` raises.
    """
    # NOTE(review): check_types acts on pandera-typed annotations. With plain
    # pd.DataFrame annotations the explicit call below appears to be what
    # enforces the schema; confirm.
    InputSchema.validate(df)

    # Domain-specific checks.
    # These are raised explicitly rather than with `assert`, so they still run
    # under `python -O`. AssertionError is kept so existing callers that catch
    # it keep working.
    if not df['age'].notna().all():
        raise AssertionError("Missing age values")
    # Zero income is valid: the schema allows ge=0 and the message only
    # complains about negative values, so compare with >= rather than >.
    if not (df['income'] >= 0).all():
        raise AssertionError("Negative income values")
    return df
# Model comparison in production
class ModelRouter:
    """Route prediction traffic between a control and a treatment model.

    Each user id maps to one of 100 buckets; buckets below
    ``traffic_split * 100`` go to the treatment model, the rest to control.

    ``log_prediction`` is called by ``predict`` but is not defined in this
    class body; it has to be provided elsewhere, e.g. by a subclass.
    """

    def __init__(self, traffic_split: float = 0.1):
        """Load both model versions and set the treatment share.

        Args:
            traffic_split: Fraction of users sent to the treatment model,
                between 0.0 and 1.0. Defaults to 0.1 (10% to v2).

        Raises:
            ValueError: If ``traffic_split`` is outside [0.0, 1.0].
        """
        if not 0.0 <= traffic_split <= 1.0:
            raise ValueError(
                f"traffic_split must be between 0.0 and 1.0, got {traffic_split!r}"
            )
        self.control_model = load_model("v1")
        self.treatment_model = load_model("v2")
        self.traffic_split = traffic_split

    @staticmethod
    def _bucket(user_id: str) -> int:
        """Map ``user_id`` to a bucket in [0, 100) that is stable across processes."""
        import hashlib

        # The built-in hash() of a str is salted per interpreter process, so it
        # would send the same user to different variants in different workers
        # or after a restart. A cryptographic digest is deterministic.
        digest = hashlib.sha256(user_id.encode("utf-8")).digest()
        return int.from_bytes(digest[:8], "big") % 100

    def predict(self, features, user_id: str):
        """Predict with the variant assigned to ``user_id`` and log the outcome.

        Args:
            features: One feature row; wrapped into a single-row batch.
            user_id: Identifier that decides the variant.

        Returns:
            The first element of the chosen model's ``predict`` output.
        """
        # Consistent routing (same user → same model)
        if self._bucket(user_id) < self.traffic_split * 100:
            model = self.treatment_model
            variant = 'treatment'
        else:
            model = self.control_model
            variant = 'control'

        result = model.predict([features])[0]
        self.log_prediction(user_id, variant, result)
        return result
Guides creation, editing, and verification of skills for AI coding agents using test-driven development with subagent scenarios. Use when authoring or debugging skills.
npx claudepluginhub haj1t/senior-dev-squad-skills --plugin ai-ml-pro