diff --git a/build/rocm/Dockerfile.ms b/build/rocm/Dockerfile.ms new file mode 100644 index 000000000..1f28658bb --- /dev/null +++ b/build/rocm/Dockerfile.ms @@ -0,0 +1,80 @@ +################################################################################ +FROM rocm/dev-ubuntu-20.04:5.3-complete AS rt_build +LABEL maintainer="Chao Chen" +################################################################################ +ARG ROCM_PATH=/opt/rocm-5.3.0 + +ARG DEBIAN_FRONTEND=noninteractive +ENV HOME=/root/ +ENV ROCM_PATH=$ROCM_PATH + +RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + build-essential \ + software-properties-common \ + clang-6.0 \ + clang-format-6.0 \ + curl \ + g++-multilib \ + git \ + vim \ + libnuma-dev \ + virtualenv \ + python3-pip \ + pciutils \ + wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + +# Set up paths +ENV HCC_HOME=$ROCM_PATH/hcc +ENV HIP_PATH=$ROCM_PATH/hip +ENV OPENCL_ROOT=$ROCM_PATH/opencl +ENV PATH="$HCC_HOME/bin:$HIP_PATH/bin:${PATH}" +ENV PATH="$ROCM_PATH/bin:${PATH}" +ENV PATH="$OPENCL_ROOT/bin:${PATH}" + +# Add target file to help determine which device(s) to build for +RUN bash -c 'echo -e "gfx900\ngfx906\ngfx908\ngfx90a\ngfx1030" >> ${ROCM_PATH}/bin/target.lst' + +# Need to explicitly create the $ROCM_PATH/.info/version file to work around what seems to be a bazel bug +# The env vars being set via --action_env in .bazelrc and .tf_configure.bazelrc files are sometimes +# not getting set in the build command being spawned by bazel (in theory this should not happen) +# As a consequence ROCM_PATH is sometimes not set for the hipcc commands. 
+# When hipcc invokes hcc, it specifies $ROCM_PATH/.../include dirs via the `-isystem` options +# If ROCM_PATH is not set, it defaults to /opt/rocm, and as a consequence a dependency is generated on the +# header files included within `/opt/rocm`, which then leads to bazel dependency errors +# Explicitly creating the $ROCM_PATH/.info/version allows ROCM path to be set correctly, even when ROCM_PATH +# is not explicitly set, and thus avoids the eventual bazel dependency error. +# The bazel bug needs to be root-caused and addressed, but that is out of our control and may take a long time +# to come to fruition, so implementing the workaround to make do till then +# Filed https://github.com/bazelbuild/bazel/issues/11163 for tracking this +RUN touch ${ROCM_PATH}/.info/version + +ENV PATH="/root/bin:/root/.local/bin:$PATH" + + +# Install python3.9 +RUN add-apt-repository -y ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get install -y python3.9-dev \ + python3-pip \ + python3.9-distutils \ + python-is-python3 + +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 + +RUN pip3 install --upgrade --force-reinstall setuptools pip + +RUN pip3 install absl-py numpy==1.20.0 scipy wheel six setuptools pytest pytest-rerunfailures matplotlib + +# Get jax and build it with ROCm +RUN git clone https://github.com/google/jax.git + +################################################################################ +FROM rt_build AS ci_build +################################################################################ +WORKDIR /jax +RUN ./build/rocm/build_rocm.sh +RUN ./build/rocm/run_single_gpu.py +RUN ./build/rocm/run_multi_gpu.sh diff --git a/build/rocm/README.md b/build/rocm/README.md index 13061244e..e029f51bf 100644 --- a/build/rocm/README.md +++ b/build/rocm/README.md @@ -1,23 +1,23 @@ # JAX Builds on ROCm -This directory contains files and setup instructions t0 build and test JAX for ROCm in Docker environment. 
You can build, test and run JAX on ROCm yourself! +This directory contains files and setup instructions to build and test JAX for ROCm in a Docker environment (runtime and CI). You can build, test and run JAX on ROCm yourself! *** -### Build JAX-ROCm in docker +### Build JAX-ROCm in docker for the runtime 1. Install Docker: Follow the [instructions on the docker website](https://docs.docker.com/engine/installation/). - 2. Build JAX by running the following command from JAX root folder. +2. Build a runtime JAX-ROCm docker container and keep this image by running the following command from the JAX root folder. - ./build/rocm/ci_build.sh --keep_image bash -c "./build/rocm/build_rocm.sh" + ./build/rocm/ci_build.sh --keep_image --runtime bash -c "./build/rocm/build_rocm.sh" - 3. Launch a container: If the build was successful, there should be a docker image with name "jax_ci.rocm" in list of docker images (use "docker images" command to list them). - ``` - sudo docker run -it --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --entrypoint /bin/bash jax_ci.rocm:latest - ``` +3. To launch a JAX-ROCm container: If the build was successful, there should be a docker image with name "jax-rocm:latest" in the list of docker images (use the "docker images" command to list them). +``` +sudo docker run -it --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --entrypoint /bin/bash jax-rocm:latest +``` *** -### Build and Test JAX-ROCm in docker (suitable for CI jobs) +### Build and Test JAX-ROCm in docker for CI jobs This folder has all the scripts necessary to build and run tests for JAX-ROCm. The following command will build JAX on ROCm and run all the tests inside docker (script should be called from JAX root folder). 
``` -./build/rocm/ci_build.sh bash -c "./build/rocm/build_rocm.sh&&./build/rocm/run_single_gpu.py&&build/rocm/run_multi_gpu.sh" +./build/rocm/ci_build.sh ``` diff --git a/build/rocm/build_rocm.sh b/build/rocm/build_rocm.sh index 7fb21e192..677acb813 100755 --- a/build/rocm/build_rocm.sh +++ b/build/rocm/build_rocm.sh @@ -29,6 +29,7 @@ then cd - fi + python3 ./build/build.py --enable_rocm --rocm_path=${ROCM_PATH} --bazel_options=--override_repository=org_tensorflow=/tmp/tensorflow-upstream pip3 install --force-reinstall dist/*.whl # installs jaxlib (includes XLA) pip3 install --force-reinstall . # installs jax diff --git a/build/rocm/ci_build.sh b/build/rocm/ci_build.sh index 01b223c03..431ef860d 100755 --- a/build/rocm/ci_build.sh +++ b/build/rocm/ci_build.sh @@ -30,11 +30,13 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/build_common.sh" CONTAINER_TYPE="rocm" -DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile.rocm" +DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile.ms" DOCKER_CONTEXT_PATH="${SCRIPT_DIR}" KEEP_IMAGE="--rm" POSITIONAL_ARGS=() +RUNTIME_FLAG=0 + while [[ $# -gt 0 ]]; do case $1 in --dockerfile) @@ -46,6 +48,10 @@ while [[ $# -gt 0 ]]; do KEEP_IMAGE="" shift 1 ;; + --runtime) + RUNTIME_FLAG=1 + shift 1 + ;; *) POSITIONAL_ARGS+=("$1") shift @@ -67,13 +73,12 @@ function upsearch (){ cd .. && upsearch "$1" } -# Set up WORKSPACE and BUILD_TAG. Jenkins will set them for you or we pick -# reasonable defaults if you run it outside of Jenkins. +# Set up WORKSPACE and BUILD_TAG; defaults are used when the caller has not set them. WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}" -BUILD_TAG="${BUILD_TAG:-jax_ci}" +BUILD_TAG="${BUILD_TAG:-jax}" -# Determine the docker image name -DOCKER_IMG_NAME="${BUILD_TAG}.${CONTAINER_TYPE}" +# Determine the docker image name from BUILD_TAG and CONTAINER_TYPE. +DOCKER_IMG_NAME="${BUILD_TAG}_${CONTAINER_TYPE}" # Under Jenkins matrix build, the build tag may contain characters such as # commas (,) and equal signs (=), which are not valid inside docker image names. 
@@ -89,9 +94,15 @@ echo "BUILD_TAG: ${BUILD_TAG}" echo " (docker container name will be ${DOCKER_IMG_NAME})" echo "" -echo "Building container (${DOCKER_IMG_NAME})..." -docker build -t ${DOCKER_IMG_NAME} \ - -f "${DOCKERFILE_PATH}" "${DOCKER_CONTEXT_PATH}" +if [[ "${RUNTIME_FLAG}" -eq 1 ]]; then + echo "Building (runtime) container (${DOCKER_IMG_NAME}) with Dockerfile($DOCKERFILE_PATH)..." + docker build --target rt_build --tag ${DOCKER_IMG_NAME} \ + -f "${DOCKERFILE_PATH}" "${DOCKER_CONTEXT_PATH}" +else + echo "Building (CI) container (${DOCKER_IMG_NAME}) with Dockerfile($DOCKERFILE_PATH)..." + docker build --target ci_build --tag ${DOCKER_IMG_NAME} \ + -f "${DOCKERFILE_PATH}" "${DOCKER_CONTEXT_PATH}" +fi # Check docker build status if [[ $? != "0" ]]; then @@ -115,7 +126,8 @@ if [[ "${KEEP_IMAGE}" != "--rm" ]] && [[ $? == "0" ]]; then echo "Committing the docker container as jax-rocm" docker stop ${DOCKER_IMG_NAME} docker commit ${DOCKER_IMG_NAME} jax-rocm - docker rm ${DOCKER_IMG_NAME} + docker rm ${DOCKER_IMG_NAME} # remove this temp container + docker rmi ${DOCKER_IMG_NAME} # remove this temp image fi -echo "ROCm build was successful!" +echo "Jax-ROCm build was successful!"