mirror of
https://github.com/ROCm/jax.git
synced 2025-04-18 12:56:07 +00:00
Changes to make jax[tpu] work better in a docker container.
1. In cloud_tpu_init.py, check whether we're on a Cloud TPU VM by looking for the libtpu Python package, instead of /lib/libtpu.so (which isn't necessarily present in a docker container). JAX now relies on the libtpu package instead of the system libtpu.so, so this makes more sense either way. This means we'll try/catch an ImportError in all non-TPU environments when importing jax, which hopefully isn't noticeably slow. 2. Add requests as a jax[tpu] dependency, since it's needed by cloud_tpu_init.py. This comes pre-installed on Cloud TPU VMs, but may not be installed in docker containers, virtualenvs, etc. I manually tested by creating the following Dockerfile on a Cloud TPU VM: ``` FROM ubuntu:18.04 RUN apt update && apt install git python3-pip -y RUN git clone https://github.com/skye/jax && cd jax && git checkout tpu_docker WORKDIR jax RUN python3 -m pip install --upgrade pip RUN python3 -m pip install .[tpu] -f https://storage.googleapis.com/jax-releases/libtpu_releases.html CMD ["python3", "-c", "import jax; print(jax.device_count())"] ``` And then running the following commands: ``` $ sudo docker build -t jax-test . $ sudo docker run --privileged jax-test 8 ``` Note the `--privileged` flag is necessary to let the container access the TPU devices in /dev.
This commit is contained in:
parent
10569871b7
commit
b2fd6a772b
@ -15,12 +15,14 @@
|
||||
import os
|
||||
|
||||
def cloud_tpu_init():
|
||||
"""Automatically sets Cloud TPU topology env vars.
|
||||
"""Automatically sets Cloud TPU topology and other env vars.
|
||||
|
||||
**This must be called before the TPU runtime is loaded, which happens as soon
|
||||
as JAX's C++ backend is loaded! I.e. call this before xla_bridge or xla_client
|
||||
is imported.**
|
||||
|
||||
Safe to call in non-Cloud TPU environments.
|
||||
|
||||
Some of these environment variables are used to tell the TPU runtime what kind
|
||||
of mesh topology to use. It assumes a single-host topology by default, so we
|
||||
manually set them here to default to the full pod slice if applicable.
|
||||
@ -28,20 +30,18 @@ def cloud_tpu_init():
|
||||
This will not set any env vars if a single topology-related env var is already
|
||||
set.
|
||||
"""
|
||||
if not _running_in_cloud_tpu_vm():
|
||||
return
|
||||
|
||||
# Use pip-installed libtpu if applicable, rather than system default.
|
||||
try:
|
||||
# pylint: disable=import-outside-toplevel
|
||||
# pytype: disable=import-error
|
||||
import libtpu
|
||||
# pytype: enable=import-error
|
||||
# pylint: enable=import-outside-toplevel
|
||||
libtpu.configure_library_path()
|
||||
except ImportError:
|
||||
pass
|
||||
# We assume libtpu is installed iff we're in a correctly-configured Cloud
|
||||
# TPU environment. Exit early if we're not running on Cloud TPU.
|
||||
return
|
||||
|
||||
libtpu.configure_library_path()
|
||||
os.environ.setdefault('GRPC_VERBOSITY', 'ERROR')
|
||||
|
||||
# If the user has set any topology-related env vars, don't set any
|
||||
@ -56,7 +56,6 @@ def cloud_tpu_init():
|
||||
]):
|
||||
return
|
||||
|
||||
# Don't assume non-Cloud TPU environments have requests installed
|
||||
# pylint: disable=import-outside-toplevel
|
||||
# pytype: disable=import-error
|
||||
import requests
|
||||
@ -98,7 +97,3 @@ def cloud_tpu_init():
|
||||
os.environ['TPU_MESH_CONTROLLER_ADDRESS'] = worker_network_endpoints.split(
|
||||
',')[0].split(':')[2] + ':8476'
|
||||
os.environ['TPU_MESH_CONTROLLER_PORT'] = '8476'
|
||||
|
||||
|
||||
def _running_in_cloud_tpu_vm():
|
||||
return os.path.isfile('/lib/libtpu.so')
|
||||
|
4
setup.py
4
setup.py
@ -51,7 +51,9 @@ setup(
|
||||
# Cloud TPU VM jaxlib can be installed via:
|
||||
# $ pip install jax[tpu] -f https://storage.googleapis.com/jax-releases/jax_releases.html
|
||||
'tpu': [f'jaxlib=={_current_jaxlib_version}',
|
||||
f'libtpu-nightly=={_libtpu_version}'],
|
||||
f'libtpu-nightly=={_libtpu_version}',
|
||||
# Required by cloud_tpu_init.py
|
||||
'requests'],
|
||||
|
||||
# CUDA installations require adding jax releases URL; e.g.
|
||||
# $ pip install jax[cuda110] -f https://storage.googleapis.com/jax-releases/jax_releases.html
|
||||
|
Loading…
x
Reference in New Issue
Block a user