# slurm.conf

# Title: Slurm Workload Manager Configuration for Weather MLOps Platform
#
# Purpose: Defines a containerized HPC cluster for distributed ML training workloads.
#          Provides computational resources and scheduling for weather forecasting models.
#
# Owner: MLOps Platform Team
# Source: slurm-config/slurm.conf
# Last-Reviewed: 2025-08-24
# Depends-On: docker-compose.yml, slurm_job_trigger.py, PostgreSQL database
#
# Change-Log:
#   2025-08-24: Updated file header, improved comments, and enhanced documentation structure
#
# Links:
#   - Weather MLOps Platform: https://github.com/user/weather_ml_platform
#   - Slurm Documentation: https://slurm.schedmd.com/slurm.conf.html
#
# Overview: This configuration establishes a 3-node Slurm cluster (master + 2 workers) for
#           executing ML training jobs. WHY: Enables scalable, containerized HPC resources
#           for weather forecasting model training without requiring physical infrastructure.

# =============================================================================
# CLUSTER IDENTITY AND NETWORKING
# =============================================================================
# WHY: Defines cluster identity for job routing and controller discovery in Docker network

ClusterName=docker-cluster  # Logical name for this Slurm cluster instance
SlurmctldHost=slurm-master  # Hostname of the slurm-master container running the primary controller

# Rationale: Standard ports ensure compatibility with existing Slurm tooling and documentation
SlurmctldTimeout=300  # Seconds before an unresponsive primary controller is declared down (5 minutes)
SlurmctldPort=6817    # Standard port for slurmctld daemon (job scheduling controller)
SlurmdPort=6818       # Standard port for slurmd daemons (compute node execution)
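
# Example (assumes the Slurm CLI is available inside the containers): verify
# that the controller is reachable on the configured ports:
#   scontrol ping   # reports whether slurmctld is UP or DOWN
#   sinfo           # lists partitions and node states via the controller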

# =============================================================================
# AUTHENTICATION AND LOGGING
# =============================================================================
# WHY: Ensures secure inter-node communication and provides audit trails for HPC operations

# Rationale: Munge provides lightweight authentication suitable for container environments
AuthType=auth/munge  # Uses Munge for secure authentication between cluster components
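
# Example (assumes the munge CLI tools are installed): verify that nodes share
# the same munge key by round-tripping a credential:
#   munge -n | unmunge                      # succeeds only if encode and decode use the same key
#   munge -n | ssh cpu-worker-1 unmunge     # cross-node check (assumes ssh between containers)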

# Rationale: Centralized logging simplifies debugging and monitoring in containerized setup
SlurmctldLogFile=/var/log/slurm/slurmctld.log  # Central log file for controller activities
SlurmdLogFile=/var/log/slurm/slurmd.log        # Per-node log file for compute daemon activities
MailProg=/bin/true  # Disable email notifications to silence MailProg warnings in containers
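
# Example (a minimal debugging sketch): follow controller activity live from
# the master container:
#   tail -f /var/log/slurm/slurmctld.log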

# =============================================================================
# STATE MANAGEMENT AND PERSISTENCE
# =============================================================================
# WHY: Maintains cluster state and job data persistence across container restarts

# Rationale: Persistent state ensures job continuity and proper resource accounting
StateSaveLocation=/var/spool/slurm/ctld_state   # Persistent storage for controller state information
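# Note: slurmctld must be able to write to this directory; a container
# entrypoint typically creates it before starting the daemon, e.g.:
#   mkdir -p /var/spool/slurm/ctld_state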

# Rationale: Node-specific directories prevent job data conflicts in multi-node setup
SlurmdSpoolDir=/var/spool/slurm/slurmd_node_%n  # Node-specific directories for job execution data (%n = node name)
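# Example: on cpu-worker-1 this resolves to /var/spool/slurm/slurmd_node_cpu-worker-1,
# so each worker keeps its execution data separate even on a shared volume.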

# =============================================================================
# PROCESS TRACKING
# =============================================================================
# WHY: Enables proper job isolation and resource cleanup in containerized environment

# Rationale: PGID tracking works reliably in Docker containers unlike cgroup-based methods
ProctrackType=proctrack/pgid  # Uses process group ID tracking (container-friendly alternative to cgroup)
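
# Example (illustrative): pgid tracking ties every process a job spawns to its
# step's process group, which can be inspected from inside a worker:
#   ps -eo pid,pgid,comm   # processes belonging to one job step share a PGID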

# =============================================================================
# COMPUTE NODE DEFINITIONS
# =============================================================================
# WHY: Defines available computational resources for weather ML training workloads

# Rationale: Topology mirrors the worker containers: 4 logical CPUs
# (1 socket x 2 cores x 2 threads) and ~4 GB RAM per node for ML training
NodeName=cpu-worker-1 CPUs=4 Sockets=1 CoresPerSocket=2 ThreadsPerCore=2 RealMemory=4000 State=UNKNOWN  # Match container HW topology
NodeName=cpu-worker-2 CPUs=4 Sockets=1 CoresPerSocket=2 ThreadsPerCore=2 RealMemory=4000 State=UNKNOWN  # Match container HW topology
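
# Example (run inside a worker container): print the hardware topology slurmd
# detects, to confirm it matches the definitions above; a mismatch can leave a
# node drained or in an invalid state:
#   slurmd -C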

# =============================================================================
# PARTITION CONFIGURATION
# =============================================================================
# WHY: Groups compute resources for logical workload organization and resource allocation

# Rationale: Single partition simplifies scheduling for weather forecasting workloads
PartitionName=cpu-nodes Nodes=cpu-worker-1,cpu-worker-2 Default=YES MaxTime=INFINITE State=UP
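
# Example (train_weather_model.sh is a hypothetical script name): submit a
# training job to this partition from the master container:
#   sbatch --partition=cpu-nodes --ntasks=2 --mem=2000 train_weather_model.sh
# Because Default=YES, jobs submitted without --partition land here as well.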