2022-09-22 12:26:48 -07:00
|
|
|
# Copyright 2021 The JAX Authors.
|
2021-03-05 14:57:36 -08:00
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# https://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
|
2025-02-12 08:15:15 -08:00
|
|
|
import datetime
|
2021-03-05 14:57:36 -08:00
|
|
|
import os
|
2025-02-12 08:15:15 -08:00
|
|
|
import re
|
2025-03-10 10:37:11 -07:00
|
|
|
import warnings
|
2024-05-24 13:55:34 -07:00
|
|
|
from jax import version
|
2024-05-28 13:42:18 -07:00
|
|
|
from jax._src import config
|
|
|
|
from jax._src import hardware_utils
|
2021-03-05 14:57:36 -08:00
|
|
|
|
2023-03-09 13:09:20 -08:00
|
|
|
# Module-level flag. cloud_tpu_init() sets it to True once it decides to go
# ahead with TPU setup; is_cloud_tpu_older_than() returns False while it is
# still False.
running_in_cloud_tpu_vm: bool = False
|
2022-04-18 16:39:01 -07:00
|
|
|
|
2023-01-25 09:50:56 -08:00
|
|
|
|
|
|
|
def maybe_import_libtpu():
  """Returns the `libtpu` module if it can be imported, otherwise None.

  The import is attempted inside the function, so an environment without the
  `libtpu` package only pays for a caught ImportError.
  """
  try:
    # pylint: disable=import-outside-toplevel
    # pytype: disable=import-error
    import libtpu as libtpu_module
    # pytype: enable=import-error
    # pylint: enable=import-outside-toplevel
  except ImportError:
    libtpu_module = None
  return libtpu_module
|
|
|
|
|
|
|
|
|
2024-06-26 14:44:52 -04:00
|
|
|
def get_tpu_library_path() -> str | None:
|
2024-06-12 11:54:57 -07:00
|
|
|
path_from_env = os.getenv("TPU_LIBRARY_PATH")
|
|
|
|
if path_from_env is not None and os.path.isfile(path_from_env):
|
|
|
|
return path_from_env
|
|
|
|
|
|
|
|
libtpu_module = maybe_import_libtpu()
|
|
|
|
if libtpu_module is not None:
|
|
|
|
return libtpu_module.get_library_path()
|
|
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
2023-01-25 09:50:56 -08:00
|
|
|
def jax_force_tpu_init() -> bool:
  """Reports whether `JAX_FORCE_TPU_INIT` is present in the environment.

  Only the presence of the variable matters; its value (even an empty string)
  is ignored.
  """
  return os.environ.get('JAX_FORCE_TPU_INIT') is not None
|
|
|
|
|
|
|
|
|
2023-03-09 13:09:20 -08:00
|
|
|
def cloud_tpu_init() -> None:
  """Prepares the process environment and JAX config for Cloud TPU.

  **This must be called before the TPU runtime is loaded, which happens as soon
  as JAX's C++ backend is loaded! I.e. call this before xla_bridge or xla_client
  is imported.**

  Safe to call in non-Cloud TPU environments: unless `JAX_FORCE_TPU_INIT` is
  present in the environment, the function returns without changing anything
  when no libtpu library can be located or when zero TPU chips are reported.

  When it does proceed, it:
    * sets the module-level `running_in_cloud_tpu_vm` flag to True;
    * fills in defaults for several environment variables, leaving any value
      that is already set untouched;
    * appends `--xla_tpu_use_enhanced_launch_barrier=true` to
      `LIBTPU_INIT_ARGS` unless that flag name already appears there;
    * defaults the `jax_platforms` and `jax_pjrt_client_create_options` config
      options when their current value is None.

  Before the early-exit check, a warning is emitted if the reported TPU id is
  v5e or newer and transparent hugepages are not enabled.
  """
  global running_in_cloud_tpu_vm

  library_path = get_tpu_library_path()
  chip_count, device_id = hardware_utils.num_available_tpu_chips_and_device_id()

  hugepages_matter = (
      device_id is not None and device_id >= hardware_utils.TpuVersion.v5e
  )
  if hugepages_matter and not hardware_utils.transparent_hugepages_enabled():
    warnings.warn(
        'Transparent hugepages are not enabled. TPU runtime startup and'
        ' shutdown time should be significantly improved on TPU v5e and newer.'
        ' If not already set, you may need to enable transparent hugepages in'
        ' your VM image (sudo sh -c "echo always >'
        ' /sys/kernel/mm/transparent_hugepage/enabled")'
    )

  # Exit early if we're not running on a Cloud TPU VM or libtpu isn't
  # installed, unless the user explicitly forces initialization.
  tpu_usable = library_path is not None and chip_count != 0
  if not tpu_usable and not jax_force_tpu_init():
    return

  running_in_cloud_tpu_vm = True

  # Defaults only: setdefault() never overrides a value that is already set.
  env_defaults = (
      ('GRPC_VERBOSITY', 'ERROR'),
      ('TPU_ML_PLATFORM', 'JAX'),
      ('TPU_ML_PLATFORM_VERSION', version.__version__),
      ('ENABLE_RUNTIME_UPTIME_TELEMETRY', '1'),
      # These two make tensorstore serialization work better on TPU.
      ('TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS', '60'),
      ('TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES', '256'),
  )
  for env_name, env_default in env_defaults:
    os.environ.setdefault(env_name, env_default)

  # Add the launch-barrier flag only if the flag name is not already present
  # in LIBTPU_INIT_ARGS, so an explicit setting is kept as-is.
  init_args = os.environ.get('LIBTPU_INIT_ARGS', '')
  if '--xla_tpu_use_enhanced_launch_barrier' not in init_args:
    os.environ['LIBTPU_INIT_ARGS'] = (
        init_args + ' --xla_tpu_use_enhanced_launch_barrier=true'
    )

  # If the JAX_PLATFORMS env variable isn't set, config.jax_platforms defaults
  # to None. In this case, we set it to 'tpu,cpu' to ensure that JAX uses the
  # TPU backend.
  if config.jax_platforms.value is None:
    config.update('jax_platforms', 'tpu,cpu')

  if config.jax_pjrt_client_create_options.value is None:
    framework_tag = (
        f'ml_framework_name:JAX;ml_framework_version:{version.__version__}'
    )
    config.update('jax_pjrt_client_create_options', framework_tag)
|
2025-02-12 08:15:15 -08:00
|
|
|
|
|
|
|
|
|
|
|
def is_cloud_tpu_older_than(year: int, month: int, day: int) -> bool:
  """Reports whether the Cloud TPU runtime was built before the given date.

  Args:
    year: Year of the cutoff date.
    month: Month of the cutoff date.
    day: Day of the cutoff date.

  Returns:
    False when `running_in_cloud_tpu_vm` is not set. Otherwise True if the
    build timestamp found on the last line of the backend's
    `platform_version` is strictly earlier than the cutoff date. Also True if
    no usable timestamp can be extracted from that line (the runtime is then
    treated as old).

  Raises:
    ValueError: if `year`, `month` and `day` do not form a valid date. This is
      checked before the `running_in_cloud_tpu_vm` test.
  """
  # We import locally because the functions above must run before the runtime
  # modules are imported.
  from jax._src import xla_bridge  # pytype: disable=import-error

  cutoff = datetime.date(year, month, day)
  if not running_in_cloud_tpu_vm:
    return False
  # The format of Cloud TPU platform_version is like:
  # PJRT C API
  # TFRT TPU v2
  # Built on Oct 30 2023 03:04:42 (1698660263) cl/577737722
  build_line = xla_bridge.get_backend().platform_version.split('\n')[-1]
  parenthesized = re.findall(r'\(.*?\)', build_line)
  if len(parenthesized) != 1:
    return True
  try:
    # Strip the surrounding parentheses and read the rest as an epoch
    # timestamp. fromtimestamp() converts using the local time zone.
    build_date = datetime.date.fromtimestamp(int(parenthesized[0][1:-1]))
  except (ValueError, OverflowError, OSError):
    # The parenthesized text is not a usable timestamp; handle it like any
    # other unrecognized format instead of propagating the parse error.
    return True
  return build_date < cutoff
|