mirror of
https://github.com/ROCm/jax.git
synced 2025-04-16 11:56:07 +00:00
Merge pull request #12591 from sudhakarsingh27:add_pytest_run_for_jaxlib_release
PiperOrigin-RevId: 478608240
This commit is contained in:
commit
6c3c51e8f3
@ -15,7 +15,7 @@ on:
|
||||
- '**workflows/nightly-ci-multiprocess-gpu.yml'
|
||||
|
||||
jobs:
|
||||
build:
|
||||
jaxlib-nightly:
|
||||
runs-on: self-hosted
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
@ -24,7 +24,46 @@ jobs:
|
||||
run: |
|
||||
export JOBSCRIPTSDIR=${GITHUB_WORKSPACE}/.github/workflows/slurm_job_scripts
|
||||
source $JOBSCRIPTSDIR/slurm_utils_common.sh
|
||||
sbatch -N 2 $JOBSCRIPTSDIR/multinode_pytest.sub | tee output.log
|
||||
sbatch -N 2 $JOBSCRIPTSDIR/multinode_pytest_jaxlib_nightly.sub | tee output.log
|
||||
sleep 2m
|
||||
export SLURM_JOBID=$(grep 'Submitted batch job' "output.log" | awk '{ print $4 }')
|
||||
export SLURM_OUTPUT=$(scontrol show job "${SLURM_JOBID}" | grep 'StdOut' | awk -F '=' '{ print $2 }')
|
||||
job_wait "${SLURM_JOBID}" & PID=$!
|
||||
touch "${SLURM_OUTPUT}"
|
||||
echo -e " ---------------------------------------------------\n" \
|
||||
"----------WAITING FOR SLURM JOB TO BEGIN-----------\n" \
|
||||
"---------------------------------------------------\n"
|
||||
tail --pid="${PID}" -f "${SLURM_OUTPUT}"
|
||||
export SLURM_STATE=$(job_state "${SLURM_JOBID}"); echo "SLURM_JOBID=${SLURM_JOBID} SLURM_STATE='${SLURM_STATE}'"
|
||||
export SLURM_WALLTIME=$(job_time "${SLURM_JOBID}"); echo "SLURM_WALLTIME=${SLURM_WALLTIME} secs"
|
||||
export SLURM_EXITCODE=$(job_exit_code "${SLURM_JOBID}" || echo $?); echo "SLURM_EXITCODE='${SLURM_EXITCODE}'"
|
||||
if [ "${SLURM_EXITCODE}" != "0" ]; then exit ${SLURM_EXITCODE:-999}; fi
|
||||
if [ "${SLURM_STATE}" != "COMPLETED" ]; then exit 1; fi
|
||||
|
||||
- name: Publish Test Results
|
||||
uses: EnricoMi/publish-unit-test-result-action@v2
|
||||
if: always()
|
||||
with:
|
||||
junit_files: "outputs/*.xml"
|
||||
|
||||
- name: Upload run results from all nodes
|
||||
uses: actions/upload-artifact@v3
|
||||
if: always()
|
||||
with:
|
||||
name: output-from-nodes
|
||||
path: "outputs/*.txt"
|
||||
|
||||
jaxlib-release:
|
||||
runs-on: self-hosted
|
||||
needs: jaxlib-nightly
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Launch slurm job and hook output to this shell
|
||||
run: |
|
||||
export JOBSCRIPTSDIR=${GITHUB_WORKSPACE}/.github/workflows/slurm_job_scripts
|
||||
source $JOBSCRIPTSDIR/slurm_utils_common.sh
|
||||
sbatch -N 2 $JOBSCRIPTSDIR/multinode_pytest_jaxlib_release.sub | tee output.log
|
||||
sleep 2m
|
||||
export SLURM_JOBID=$(grep 'Submitted batch job' "output.log" | awk '{ print $4 }')
|
||||
export SLURM_OUTPUT=$(scontrol show job "${SLURM_JOBID}" | grep 'StdOut' | awk -F '=' '{ print $2 }')
|
||||
@ -55,7 +94,7 @@ jobs:
|
||||
|
||||
report:
|
||||
name: report
|
||||
needs: build
|
||||
needs: [jaxlib-nightly, jaxlib-release]
|
||||
if: |
|
||||
failure()
|
||||
&& github.event_name == 'schedule'
|
||||
|
@ -50,7 +50,7 @@ OUTPUT_DIR="${BASE_WORKSPACE_DIR}/outputs/"
|
||||
mkdir -p $OUTPUT_DIR
|
||||
|
||||
# redirect both stdout and stderr in the same file for ease of analysis
|
||||
OUTFILE="${OUTPUT_DIR}/output-%j-%n.txt"
|
||||
OUTFILE="${OUTPUT_DIR}/output-test-jaxlib-nightly-%j-%n.txt"
|
||||
|
||||
# Run any setup commands before the actual pytest command to make sure
|
||||
# that the processes are launched together
|
80
.github/workflows/slurm_job_scripts/multinode_pytest_jaxlib_release.sub
vendored
Normal file
80
.github/workflows/slurm_job_scripts/multinode_pytest_jaxlib_release.sub
vendored
Normal file
@ -0,0 +1,80 @@
|
||||
#!/bin/bash
# Slurm batch script: installs the released "jax[cuda]" wheels inside a
# container on every node, then runs tests/multiprocess_gpu_test.py across
# the allocation.
#
# Requires:
#   GITHUB_WORKSPACE - path of the repository checkout on the host; it is
#                      mounted into the container at /workspace.
# Outputs (on the host, under ${GITHUB_WORKSPACE}/outputs):
#   output-test-jaxlib-release-%j-%n.txt - combined stdout/stderr of the steps
#                                          (%j / %n are expanded by srun)
#   junit_output_<SLURM_PROCID>.xml      - pytest JUnit report of each task
# Exit status:
#   non-zero status of the first failing srun step, otherwise 0.
#
#SBATCH -A ci-jax-gpu
#SBATCH -p compute
#SBATCH -N 2                    # number of nodes
#SBATCH -t 00:15:00             # wall time
#SBATCH -J "ci-jax-gpu"         # job name
#SBATCH --exclusive             # exclusive node access
#SBATCH --mem=0                 # all mem avail
#SBATCH --mail-type=FAIL        # only send email on failures
#SBATCH --overcommit            # Needed for pytorch

set -x

# NOTE: `set -e` is intentionally not enabled. `read -r -d ''` below returns a
# non-zero status when it hits the end of the here-document, which would abort
# the script. Failures that matter are checked explicitly instead.

# Without the checkout path the mount source and the output directory would
# silently resolve to "" and "/outputs", so fail early.
if [[ -z "${GITHUB_WORKSPACE:-}" ]]; then
  echo "GITHUB_WORKSPACE must be set to the repository checkout" >&2
  exit 1
fi

# File system and volume glue code
#-------------------------------------------------------------------------------
CONTAINER="nvcr.io/nvidian/jax_t5x:cuda11.4-cudnn8.2-ubuntu20.04-manylinux2014-multipython"
CONTAINER_NAME="multinode_ci_test_container"

BASE_WORKSPACE_DIR=$GITHUB_WORKSPACE
WORKSPACE_DIR=/workspace

# WORKSPACE_DIR is already absolute, so no extra "/" goes after the colon.
MOUNTS="--container-mounts=${BASE_WORKSPACE_DIR}:${WORKSPACE_DIR}"

# Since the docker container doesn't contain MLX drivers for IB, the following
# flags are needed to make NCCL work with an ethernet setup
# Note:@sudhakarsingh27 This is very specific, need to abstract this out
EXPORTS="--export=ALL,NCCL_SOCKET_IFNAME=enp45s0f0,NCCL_SOCKET_NTHREADS=2,NCCL_NSOCKS_PERTHREAD=2"
#-------------------------------------------------------------------------------

# Setup command to be run before the actual pytest command.
# The here-document is unquoted on purpose: backslash-newline pairs are removed
# so the command ends up on one line, and ${WORKSPACE_DIR} is expanded here.
read -r -d '' setup_cmd <<EOF
python3.8 -m pip install --upgrade "jax[cuda]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html \
&& python3.8 -m pip install pytest \
&& python3.8 -m pip install pytest-forked \
&& mkdir -p ${WORKSPACE_DIR}/outputs/
EOF

# Main pytest command that runs the tests.
# \${SLURM_PROCID} is escaped so that it is expanded per task by the
# `bash -c` started by srun, not by this script.
read -r -d '' cmd <<EOF
date \
&& python3.8 -m pip list | grep jax \
&& python3.8 -m pytest -m SlurmMultiNodeGpuTest --forked -v -s --continue-on-collection-errors \
--junit-xml=${WORKSPACE_DIR}/outputs/junit_output_\${SLURM_PROCID}.xml \
${WORKSPACE_DIR}/tests/multiprocess_gpu_test.py
EOF

# create run specific output directory for ease of analysis
OUTPUT_DIR="${BASE_WORKSPACE_DIR}/outputs"
mkdir -p -- "${OUTPUT_DIR}" || exit 1

# redirect both stdout and stderr in the same file for ease of analysis
OUTFILE="${OUTPUT_DIR}/output-test-jaxlib-release-%j-%n.txt"

# Container arguments shared by the setup step and the test step.
container_args=(
  --container-writable
  --container-image="${CONTAINER}"
  --container-name="${CONTAINER_NAME}"
  "${MOUNTS}"
  "${EXPORTS}"
)

# Run any setup commands before the actual pytest command to make sure
# that the processes are launched together
echo "${setup_cmd}"
srun -o "${OUTFILE}" -e "${OUTFILE}" \
  --ntasks-per-node=1 \
  "${container_args[@]}" \
  bash -c "${setup_cmd}"
setup_status=$?
if (( setup_status != 0 )); then
  # Do not run the tests against a container whose installation failed.
  echo "setup step failed with status ${setup_status}; skipping tests" >&2
  exit "${setup_status}"
fi

# Barrier command
# (the srun above runs in the foreground, so there is nothing left to wait
# for; kept as an explicit marker between the two steps)
wait

# Run the actual pytest command
echo "${cmd}"
srun -o "${OUTFILE}" -e "${OUTFILE}" \
  --ntasks-per-node=8 \
  --open-mode=append \
  "${container_args[@]}" \
  bash -c "${cmd}"
test_status=$?

set +x

# Propagate the test step's status; otherwise the trailing `set +x` would make
# the batch script always finish with status 0.
exit "${test_status}"
|
Loading…
x
Reference in New Issue
Block a user