Code Block

#!/bin/bash
#SBATCH -p haswell
#SBATCH -n 4
#SBATCH -t 00:05:00

# Runs the OSU point-to-point latency benchmark from a Singularity image,
# launched with the host's mpirun: first with both ranks on one node, then
# with one rank on each of two nodes. A plain `hostname` run precedes each
# benchmark so the job output shows where the ranks were placed.
#
# sbatch requires the interpreter line above to be the first line of the file.

# Swap the currently loaded PrgEnv (named by PE_ENV, lower-cased) for PrgEnv-gnu.
# The tr character classes are quoted so the shell cannot glob-expand them
# against single-letter file names in the working directory.
module swap "PrgEnv-$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]')" PrgEnv-gnu
module load openmpi/4.0.3
module load singularity

# Singularity exports SINGULARITYENV_<NAME> into the container as <NAME>, so
# this sets the container's LD_LIBRARY_PATH to the host value plus the listed
# host library directories (made visible by the bind mounts below).
export SINGULARITYENV_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/cray/wlm_detect/default/lib64:/etc/alternatives:/usr/lib64:/usr/lib"

export IMAGE=/project/k01/shaima0d/singularity_test/images/openmpi401_latest.sif
# Holds an option and its argument in one string; it is expanded unquoted below
# on purpose so that it splits into the two words `-B` and the path list.
export BIND_MOUNT="-B /sw,/usr/lib64,/opt,/etc,/var"

# Path of the benchmark binary inside the image.
OSU_LATENCY=/usr/local/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_latency

# Two ranks, two per node.
echo "On same node"
mpirun -n 2 -N 2 hostname
# shellcheck disable=SC2086  # BIND_MOUNT must word-split (see above)
mpirun -n 2 -N 2 singularity exec ${BIND_MOUNT} "${IMAGE}" "${OSU_LATENCY}"

# Two ranks, one per node. The heading names the placement, matching the
# "On same node" heading above (both sections run inside the container).
echo "On two nodes"
mpirun -n 2 -N 1 hostname
# shellcheck disable=SC2086  # BIND_MOUNT must word-split (see above)
mpirun -n 2 -N 1 singularity exec ${BIND_MOUNT} "${IMAGE}" "${OSU_LATENCY}"


CPU job

On Ibex, Open MPI is installed on the host. It is generally best to launch Singularity MPI jobs with mpirun, because PMIx integration for SLURM is not available on the host.

Code Block

#!/bin/bash
#SBATCH --ntasks=4
#SBATCH --nodes=2
#SBATCH --gres=gpu:v100:2
#SBATCH --time=00:05:00
#SBATCH --account=ibex-cs

# Runs the OSU point-to-point latency and bandwidth benchmarks from a
# Singularity image, launched with the host's mpirun: first with both ranks on
# one node, then with one rank on each of two nodes. A plain `hostname` run
# precedes each pair so the job output shows where the ranks were placed.
#
# sbatch requires the interpreter line above to be the first line of the file.

module load singularity
module load openmpi/4.0.3-cuda10.2
module list

# Open MPI reads MCA parameters from OMPI_MCA_<param> environment variables:
# select the openib BTL and permit it to drive InfiniBand devices.
# NOTE(review): only `openib` is listed (no `self`/shared-memory BTL) - confirm
# this is intended for the same-node runs.
export OMPI_MCA_btl=openib
export OMPI_MCA_btl_openib_allow_ib=1
export IMAGE=/ibex/scratch/shaima0d/scratch/singularity_mpi_testing/images/osu_cuda_openmpi403_563.sif

# Paths of the benchmark binaries inside the image.
export EXE_lat=/usr/local/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_latency
export EXE_bw=/usr/local/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_bw

# Two ranks, two per node.
echo "On same node"
mpirun -n 2 --map-by ppr:2:node hostname
mpirun -n 2 --map-by ppr:2:node singularity exec "${IMAGE}" "${EXE_lat}"
mpirun -n 2 --map-by ppr:2:node singularity exec --nv "${IMAGE}" "${EXE_bw}"

# Two ranks, one per node. The bandwidth run passes --nv exactly like the
# same-node bandwidth run, so both measurements use the same container setup.
echo "On two nodes"
mpirun -n 2 --map-by ppr:1:node hostname
mpirun -n 2 --map-by ppr:1:node singularity exec "${IMAGE}" "${EXE_lat}"
mpirun -n 2 --map-by ppr:1:node singularity exec --nv "${IMAGE}" "${EXE_bw}"

GPU job

The following SLURM jobscript demonstrates how to run a container with an MPI application on Ibex GPUs, leveraging the GPU Direct RDMA feature to get close to the maximum theoretical bandwidth available from a Host Channel Adapter (HCA).