Yuan Chiang
04/12/2024, 3:51 PM
SLURM_CPU_BIND="cores" with --gpus-per-task=1, but no luck. Here is my SLURMCluster input:
# Keyword arguments for SLURMCluster, gathered in a single dict so the full
# configuration can be read (or reused) before the cluster object is built.
# `nnodes` must already be defined by the surrounding code.
cluster_kwargs = {
    "cores": 4 * nnodes,
    "memory": "64 GB",
    "shebang": "#!/bin/bash",
    "account": "m3828_g",
    "walltime": "02:00:00",
    "job_mem": "0",
    # Shell lines for the job script prologue.
    "job_script_prologue": [
        "source ~/.bashrc",
        "module load python",
        'export SLURM_CPU_BIND="cores"',
        # "CUDA_VISIBLE_DEVICES=$SLURM_PROCID"
    ],
    # "--cpus-per-task" is skipped here and then supplied explicitly in
    # job_extra_directives below.
    "job_directives_skip": ["-n", "--cpus-per-task"],
    "job_extra_directives": [
        "-q preempt",
        "-C gpu",
        f"-N {nnodes}",
        "--ntasks-per-node=4",
        "--cpus-per-task=4",
        "--gpus-per-task=1",
        "--comment=12:00:00",
        "--time-min=01:00:00",
        "--signal=B:USR1@60",
        "--requeue",
        "--open-mode=append",
    ],
    # python="srun python",
    "death_timeout": 86400,
}

cluster = SLURMCluster(**cluster_kwargs)
Yuan Chiang
04/12/2024, 3:52 PM
#!/bin/bash
# Batch script for a single "dask-worker" Slurm job: scheduler directives,
# environment setup, then the Dask worker launch command.

# --- Slurm batch directives ---
#SBATCH -J dask-worker
#SBATCH -A m3828_g
#SBATCH --mem=0
#SBATCH -t 00:30:00
#SBATCH -q debug_preempt
#SBATCH -C gpu
#SBATCH -N 1
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=4
#SBATCH --gpus-per-task=1
#SBATCH --comment=3:00:00
#SBATCH --time-min=00:05:00
#SBATCH --signal=B:USR1@60
#SBATCH --requeue
#SBATCH --open-mode=append

# --- Environment setup ---
source ~/.bashrc
module load python
source activate /pscratch/sd/c/cyrusyc/.conda/mlip-arena
export SLURM_CPU_BIND="cores"

# --- Dask worker launch ---
# The scheduler address must be a plain argument. If it is wrapped in <...>
# (as chat clients render links), the shell reads "<tcp://...>" as an input
# redirection from a file plus an output redirection, and the worker never
# receives the address.
# NOTE(review): the worker is started directly in the batch step, not through
# srun; presumably per-task options such as --gpus-per-task only take effect
# for tasks launched with srun -- confirm against the Slurm documentation.
/pscratch/sd/c/cyrusyc/.conda/mlip-arena/bin/python -m distributed.cli.dask_worker \
  "tcp://128.55.64.30:42243" \
  --name dummy-name \
  --nthreads 1 \
  --memory-limit 14.90GiB \
  --nworkers 4 \
  --nanny \
  --death-timeout 86400
Yuan Chiang
04/12/2024, 3:53 PM
Bianca Hoch
04/17/2024, 4:31 PM
Bianca Hoch
04/17/2024, 4:31 PM