# Copyright 2021 The JAX Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from collections import defaultdict
from collections.abc import Callable, Sequence
import enum
import functools
import math
import operator as op
from typing import Any, TYPE_CHECKING, cast

from jax._src import api
from jax._src import basearray
from jax._src import config
from jax._src import core
from jax._src import deprecations
from jax._src import dispatch
from jax._src import dtypes
from jax._src import errors
from jax._src import profiler
from jax._src import util
from jax._src import xla_bridge
from jax._src.mesh import set_concrete_mesh
from jax._src.interpreters import mlir
from jax._src.interpreters import pxla
from jax._src.interpreters import xla
from jax._src.layout import AutoLayout, DeviceLocalLayout, Layout
from jax._src.lib import xla_client as xc
from jax._src.lib import xla_extension as xe
from jax._src.sharding import Sharding
from jax._src.sharding_impls import (
    PmapSharding, SingleDeviceSharding,
    device_replica_id_map, hashed_index, num_addressable_indices, local_to_global_shape)  # pyformat: disable
from jax._src.typing import ArrayLike, DLDeviceType
from jax._src.util import safe_zip, unzip3, use_cpp_class, use_cpp_method, cache
import numpy as np


Shape = tuple[int, ...]
Device = xc.Device
Index = tuple[slice, ...]
PRNGKeyArray = Any  # TODO(jakevdp): fix cycles and import this.


def _get_device(a: ArrayImpl) -> Device:
  devices = a.sharding._internal_device_list  # pytype: disable=attribute-error
  if len(devices) != 1:
    raise ValueError(
        "When making an array from single-device arrays the input arrays must "
        f"have one shard each. An argument array had {len(devices)} shard(s).")
  return devices[0]


class Shard:
  """A single data shard of an Array.

  Attributes:
    device : Which device this shard resides on.
    index : The index into the global array of this shard.
    replica_id : Integer id indicating which replica of the global array this
      shard is part of. Always 0 for fully sharded data
      (i.e. when there’s only 1 replica).
    data : The data of this shard. None if ``device`` is non-local.
  """

  def __init__(self, device: Device, sharding: Sharding, global_shape: Shape,
               data: None | ArrayImpl | PRNGKeyArray = None):
    self._device = device
    self._sharding = sharding
    self._global_shape = global_shape
    self._data = data

  def __repr__(self):
    try:
      return (f'Shard(device={self.device!r}, index={self.index}, '
              f'replica_id={self.replica_id}, data={self.data})')
    except ValueError:
      return f'Shard(device={self.device!r}, data={self.data})'

  @functools.cached_property
  def index(self) -> Index:
    try:
      device_indices_map_fn = self._sharding.devices_indices_map
    except AttributeError:
      raise ValueError('Cannot calculate indices from sharding: '
                       f'{self._sharding}. Please create a device to index '
                       'mapping for your sharding.') from None
    index = device_indices_map_fn(self._global_shape)[self.device]
    assert index is not None
    return index

  @functools.cached_property
  def replica_id(self) -> int:
    return device_replica_id_map(self._sharding, self._global_shape)[self.device]

  @property
  def device(self):
    return self._device

  @property
  def data(self):
    return self._data
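
# Illustrative sketch (not executed as part of this module): how `Shard`
# objects typically surface to users, via `jax.Array.addressable_shards`.
# Assumes at least one addressable device; run in a separate script or REPL.
#
#   import jax
#   import jax.numpy as jnp
#
#   x = jax.device_put(jnp.arange(8.0))
#   for shard in x.addressable_shards:
#     # Each shard records the device it lives on, its index into the global
#     # array, its replica id, and the per-device data buffer.
#     print(shard.device, shard.index, shard.replica_id, shard.data.shape)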


def _reconstruct_array(fun, args, arr_state, aval_state):
  """Method to reconstruct a device array from a serialized state."""
  np_value = fun(*args)
  np_value.__setstate__(arr_state)
  jnp_value = api.device_put(np_value)
  # TODO(slebedev): Remove this branch after December 10th 2024.
  if "named_shape" in aval_state:
    deprecations.warn(
        "jax-aval-named-shape",
        "Pickled array contains an aval with a named_shape attribute. This is"
        " deprecated and the code path supporting such avals will be removed."
        " Please re-pickle the array.",
        stacklevel=2,
    )
    del aval_state["named_shape"]
  jnp_value.aval = jnp_value.aval.update(**aval_state)
  return jnp_value


@cache(max_size=4096, trace_context_in_key=False)
def _cached_index_calc(s, shape):
  map_ = s.addressable_devices_indices_map(shape)
  seen_h_indices = set()
  l = []
  for array_index, index in enumerate(map_.values()):
    h_index = hashed_index(index)
    if h_index not in seen_h_indices:
      seen_h_indices.add(h_index)
      l.append((array_index, index))
  return l


@cache(max_size=4096, trace_context_in_key=False)
def _process_has_full_value_in_mcjax(s, shape):
  # Return False for single host as a fast path.
  if xla_bridge.process_count() == 1:
    return False

  num_unique_indices = len(
      {hashed_index(v) for v in s.devices_indices_map(shape).values()})
  num_addressable_unique_indices = len(
      {hashed_index(v) for v in s.addressable_devices_indices_map(shape).values()})
  return num_unique_indices == num_addressable_unique_indices


def _validate_shape_and_dtype_for_per_device_arrays(
    arrays: Sequence[ArrayImpl | np.ndarray],
    sharding: Sharding,
    aval: core.ShapedArray,
    expected_shape: Shape,
):
  """Validates that per-device arrays are valid and consistent."""
  expected_dtype = aval.dtype
  for db in arrays:
    if db.dtype != expected_dtype:
      raise ValueError(
          "Input buffers to `Array` must have matching dtypes. "
          f"Got {db.dtype}, expected {expected_dtype} for buffer: {db}"
      )
    if db.shape != expected_shape:
      raise ValueError(
          f"Expected shard shape {expected_shape} doesn't match the single "
          f"device array shape {db.shape}. Shape of Array is "
          f"{aval.str_short()} with sharding {sharding}"
      )


class ArrayImpl(basearray.Array):
  # TODO(yashkatariya): Add __slots__ here.

  aval: core.ShapedArray
  _sharding: Sharding
  _arrays: list[ArrayImpl]
  _committed: bool
  _skip_checks: bool
  _npy_value: np.ndarray | None

  @use_cpp_method()
  def __init__(self, aval: core.ShapedArray, sharding: Sharding,
               arrays: Sequence[ArrayImpl],
               committed: bool, _skip_checks: bool = False):
    # NOTE: the actual implementation of the constructor is moved to C++.

    self.aval = aval
    self._sharding = sharding
    self._committed = committed
    self._npy_value = None
    arrays = [a._arrays[0] for a in arrays]

    # Don't rearrange if skip_checks is enabled because this assumes that the
    # input buffers are already arranged properly. This usually happens when
    # Arrays are created as output of a JAX transformation
    # (like pjit, etc).
    if not _skip_checks or config.enable_checks.value:
      arrays = self._check_and_rearrange(arrays, self._sharding, self.aval)
    self._arrays = arrays

  def _check_and_rearrange(self, arrays, sharding, aval):
    device_id_to_buffer = {_get_device(db).id: db for db in arrays}

    addressable_dev = sharding.addressable_devices
    if len(arrays) != len(addressable_dev):
      raise ValueError(
          f"Expected {len(addressable_dev)} per-device arrays "
          "(this is how many devices are addressable by the sharding), but "
          f"got {len(arrays)}")

    array_device_ids = set(device_id_to_buffer.keys())
    addressable_device_ids = {d.id for d in addressable_dev}
    if len(array_device_ids) != len(arrays):
      buffer_device_ids = [_get_device(db).id for db in arrays]
      raise ValueError(
          "When making an array from single-device arrays, the input arrays"
          " must be from distinct devices, but got device IDs"
          f" {buffer_device_ids}")

    # Calculate a symmetric difference because the device ids between sharding
    # and _arrays should match.
    diff = array_device_ids ^ addressable_device_ids
    if diff:
      dev_in_sharding_not_in_arrays = addressable_device_ids - array_device_ids
      dev_in_arrays_not_in_sharding = array_device_ids - addressable_device_ids
      err_msg = (
          "Addressable devices and per-device arrays devices do not match.")
      if dev_in_sharding_not_in_arrays:
        err_msg += (f" Sharding contains devices {dev_in_sharding_not_in_arrays} "
                    "that are not present in per-device arrays.")
      if dev_in_arrays_not_in_sharding:
        err_msg += (f" Per-device arrays contain devices {dev_in_arrays_not_in_sharding} "
                    "that are not present in the sharding.")
      raise ValueError(err_msg)

    _validate_shape_and_dtype_for_per_device_arrays(
        arrays,
        sharding=sharding,
        aval=aval,
        expected_shape=sharding.shard_shape(aval.shape),
    )

    # Rearrange arrays based on the device assignment.
    addressable_da = sharding._addressable_device_assignment
    return [device_id_to_buffer[device.id] for device in addressable_da]

  @property
  def shape(self) -> Shape:
    return self.aval.shape

  @property
  def dtype(self):
    return self.aval.dtype

  @property
  def ndim(self):
    return len(self.shape)

  @property
  def size(self):
    return math.prod(self.shape)

  @property
  def sharding(self):
    return self._sharding

  @property
  def device(self):
    self._check_if_deleted()
    if isinstance(self.sharding, SingleDeviceSharding):
      return list(self.sharding.device_set)[0]
    return self.sharding

  @property
  def weak_type(self):
    return self.aval.weak_type

  @property
  def committed(self) -> bool:
    return self._committed

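  # Illustrative sketch (not executed here): the basic metadata accessors
  # defined above. Note that `.device` returns a concrete `Device` only when
  # the array lives on a single device; for multi-device arrays it returns the
  # `Sharding`. Assumes a default backend with at least one device.
  #
  #   import jax
  #   import jax.numpy as jnp
  #
  #   x = jnp.ones((4, 2), dtype=jnp.float32)
  #   print(x.shape, x.dtype, x.ndim, x.size)  # (4, 2) float32 2 8
  #   print(x.sharding)                        # a SingleDeviceSharding here
  #   print(x.device)                          # the single device it lives on
  #   print(x.committed)                       # typically False when no
  #                                            # explicit placement was asked
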
  def __str__(self):
    return str(self._value)

  def __len__(self):
    try:
      return self.shape[0]
    except IndexError as err:
      raise TypeError("len() of unsized object") from err  # same as numpy error

  def __bool__(self):
    core.check_bool_conversion(self)
    return bool(self._value)

  def __float__(self):
    core.check_scalar_conversion(self)
    return self._value.__float__()

  def __int__(self):
    core.check_scalar_conversion(self)
    return self._value.__int__()

  def __complex__(self):
    core.check_scalar_conversion(self)
    return self._value.__complex__()

  def __hex__(self):
    core.check_integer_conversion(self)
    return hex(self._value)

  def __oct__(self):
    core.check_integer_conversion(self)
    return oct(self._value)

  def __index__(self):
    core.check_integer_conversion(self)
    return op.index(self._value)

  def tobytes(self, order="C"):
    return self._value.tobytes(order)

  def tolist(self):
    return self._value.tolist()

  def __format__(self, format_spec):
    # Simulates behavior of https://github.com/numpy/numpy/pull/9883
    if self.ndim == 0:
      return format(self._value[()], format_spec)
    else:
      return format(self._value, format_spec)

  def __getitem__(self, idx):
    from jax._src.lax import lax
    from jax._src.numpy import indexing
    self._check_if_deleted()

    if isinstance(self.sharding, PmapSharding):
      if config.pmap_no_rank_reduction.value:
        cidx = idx if isinstance(idx, tuple) else (idx,)

        padded_cidx = tuple(
            slice(i, i + 1, None) if isinstance(i, int) else i for i in cidx
        ) + (slice(None),) * (len(self.shape) - len(cidx))
      else:
        if not isinstance(idx, tuple):
          padded_cidx = (idx,) + (slice(None),) * (len(self.shape) - 1)
        else:
          padded_cidx = idx + (slice(None),) * (len(self.shape) - len(idx))

      indices = tuple(self.sharding.devices_indices_map(self.shape).values())
      try:
        arr_idx = indices.index(padded_cidx)
      except ValueError:
        arr_idx = None
      if arr_idx is not None:
        out = self._arrays[arr_idx]
        sharding = SingleDeviceSharding(_get_device(out))

        if config.pmap_no_rank_reduction.value:
          # If cidx was the index of a single shard, then it corresponds to one
          # shard of the chunked dimension.
          dims = tuple(i for i, x in enumerate(cidx) if isinstance(x, int))
          # Squeeze on committed arrays to avoid data movement to shard 0.
          out = lax.squeeze(out, dimensions=dims)

        return ArrayImpl(
            out.aval, sharding, [out], committed=False, _skip_checks=True)

    return indexing.rewriting_take(self, idx)
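
  # Illustrative sketch (not executed here): the `PmapSharding` fast path in
  # `__getitem__` above. Indexing a pmap output with an integer along the
  # mapped axis can be served directly from the matching per-device shard,
  # avoiding a cross-device gather. Assumes at least one local device.
  #
  #   import jax
  #   import jax.numpy as jnp
  #
  #   n = jax.local_device_count()
  #   y = jax.pmap(lambda v: v * 2)(jnp.arange(n))
  #   first = y[0]  # resolved from the shard on the first device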

  def __iter__(self):
    if self.ndim == 0:
      raise TypeError("iteration over a 0-d array")  # same as numpy error
    else:
      assert self.is_fully_replicated or self.is_fully_addressable
      if dispatch.is_single_device_sharding(self.sharding) or self.is_fully_replicated:
        return (sl for chunk in self._chunk_iter(100) for sl in chunk._unstack())
      elif isinstance(self.sharding, PmapSharding):
        return (self[i] for i in range(self.shape[0]))
      else:
        # TODO(yashkatariya): Don't bounce to host and use `_chunk_iter` path
        # here after uneven partitioning support is added.
        return (api.device_put(self._value[i]) for i in range(self.shape[0]))

  @property
  def is_fully_replicated(self) -> bool:
    return self.sharding.is_fully_replicated

  def __repr__(self):
    prefix = 'Array('
    if self.aval is not None and self.aval.weak_type:
      dtype_str = f'dtype={self.dtype.name}, weak_type=True)'
    else:
      dtype_str = f'dtype={self.dtype.name})'

    if self.is_fully_addressable or self.is_fully_replicated:
      line_width = np.get_printoptions()["linewidth"]
      if self.size == 0:
        s = f"[], shape={self.shape}"
      else:
        s = np.array2string(self._value, prefix=prefix, suffix=',',
                            separator=', ', max_line_width=line_width)
      last_line_len = len(s) - s.rfind('\n') + 1
      sep = ' '
      if last_line_len + len(dtype_str) + 1 > line_width:
        sep = ' ' * len(prefix)
      return f"{prefix}{s},{sep}{dtype_str}"
    else:
      return f"{prefix}{self.shape}, {dtype_str}"

  @property
  def is_fully_addressable(self) -> bool:
    """Is this Array fully addressable?

    A jax.Array is fully addressable if the current process can address all of
    the devices named in the :class:`Sharding`. ``is_fully_addressable`` is
    equivalent to "is_local" in multi-process JAX.

    Note that fully replicated is not the same as fully addressable: a
    jax.Array which is fully replicated can span multiple hosts and therefore
    not be fully addressable.
    """
    return self.sharding.is_fully_addressable

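  # Illustrative sketch (not executed here): in a single-process setting every
  # array is fully addressable, while `is_fully_replicated` depends on the
  # sharding. Assumes a single-process run with the default backend.
  #
  #   import jax.numpy as jnp
  #
  #   x = jnp.zeros((8,))
  #   assert x.is_fully_addressable  # single process addresses all devices
  #   assert x.is_fully_replicated   # one device holds the whole value
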
  def __array__(self, dtype=None, context=None, copy=None):
    # copy argument is supported by np.asarray starting in numpy 2.0
    kwds = {} if copy is None else {'copy': copy}
    return np.asarray(self._value, dtype=dtype, **kwds)

  def __dlpack__(self, *, stream: int | Any | None = None,
                 max_version: tuple[int, int] | None = None,
                 dl_device: tuple[DLDeviceType, int] | None = None,
                 copy: bool | None = None):
    from jax._src.dlpack import to_dlpack  # pylint: disable=g-import-not-at-top

    device_set = self.sharding.device_set
    if len(device_set) > 1:
      raise BufferError(
          "to_dlpack can only pack a dlpack tensor from an array on a singular "
          f"device, but an array with a Sharding over {len(device_set)} devices "
          "was provided."
      )
    device, = device_set
    return to_dlpack(self, stream=stream,
                     max_version=max_version,
                     src_device=device,
                     dl_device=dl_device,
                     copy=copy)

  def __dlpack_device__(self) -> tuple[enum.Enum, int]:
    if len(self._arrays) != 1:
      raise BufferError("__dlpack__ only supported for unsharded arrays.")

    from jax._src.dlpack import DLDeviceType  # pylint: disable=g-import-not-at-top

    if self.platform() == "cpu":
      return DLDeviceType.kDLCPU, 0

    elif self.platform() == "gpu":
      platform_version = _get_device(self).client.platform_version
      if "cuda" in platform_version:
        dl_device_type = DLDeviceType.kDLCUDA
      elif "rocm" in platform_version:
        dl_device_type = DLDeviceType.kDLROCM
      else:
        raise BufferError("Unknown GPU platform for __dlpack__: "
                          f"{platform_version}")

      local_hardware_id = _get_device(self).local_hardware_id
      if local_hardware_id is None:
        raise BufferError("Couldn't get local_hardware_id for __dlpack__")

      return dl_device_type, local_hardware_id

    else:
      raise BufferError(
          "__dlpack__ device only supported for CPU and GPU, got platform: "
          f"{self.platform()}"
      )

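  # Illustrative sketch (not executed here): the DLPack protocol implemented
  # above lets other libraries ingest single-device jax.Arrays, without a copy
  # where possible. Assumes a CPU device is available and numpy >= 1.22.
  #
  #   import jax
  #   import jax.numpy as jnp
  #   import numpy as np
  #
  #   x = jax.device_put(jnp.arange(4.0), jax.devices("cpu")[0])
  #   np_view = np.from_dlpack(x)  # consumes x.__dlpack__ / x.__dlpack_device__
  #   print(np_view)
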
  def __reduce__(self):
    fun, args, arr_state = self._value.__reduce__()
    aval_state = {'weak_type': self.aval.weak_type}
    return (_reconstruct_array, (fun, args, arr_state, aval_state))

  @use_cpp_method()
  def unsafe_buffer_pointer(self):
    if len(self._arrays) != 1:
      raise ValueError("unsafe_buffer_pointer() is supported only for unsharded"
                       " arrays.")
    return self._arrays[0].unsafe_buffer_pointer()

  @property
  @use_cpp_method()
  def __cuda_array_interface__(self):
    if len(self._arrays) != 1:
      raise ValueError("__cuda_array_interface__() is supported only for "
                       "unsharded arrays.")
    return self._arrays[0].__cuda_array_interface__  # pytype: disable=attribute-error  # bind-properties

  @use_cpp_method()
  def on_device_size_in_bytes(self):
    """Returns the total global on-device size of the array in bytes."""
    arr = self._arrays[0]
    per_shard_size = arr.on_device_size_in_bytes()
    return per_shard_size * self.sharding.num_devices

|
|
|
|
def devices(self) -> set[Device]:
|
2022-08-18 15:58:40 -07:00
|
|
|
|
self._check_if_deleted()
|
2023-03-09 20:42:45 -08:00
|
|
|
|
return self.sharding.device_set
|
2022-08-17 12:25:14 -07:00
|
|
|
|
|
2022-09-15 13:26:57 -07:00
|
|
|
|
@property
|
2024-04-24 17:26:38 -07:00
|
|
|
|
def device_buffer(self):
|
|
|
|
|
raise AttributeError(
|
|
|
|
|
"arr.device_buffer has been deprecated. Use arr.addressable_data(0)")
|
2022-09-15 13:26:57 -07:00
|
|
|
|
|
|
|
|
|
@property
|
2024-04-24 17:26:38 -07:00
|
|
|
|
def device_buffers(self):
|
|
|
|
|
raise AttributeError(
|
|
|
|
|
"arr.device_buffers has been deprecated. Use [x.data for x in arr.addressable_shards]")
|
2022-09-15 13:26:57 -07:00
|
|
|
|
|
2022-09-26 16:17:26 -07:00
|
|
|
|
def addressable_data(self, index: int) -> ArrayImpl:
|
2022-09-21 18:18:57 -07:00
|
|
|
|
self._check_if_deleted()
|
2023-06-01 09:36:32 -07:00
|
|
|
|
if self.is_fully_replicated:
|
2023-04-17 10:05:01 -07:00
|
|
|
|
return self._fully_replicated_shard()
|
2023-03-29 12:58:34 -07:00
|
|
|
|
return self._arrays[index]
|
2022-09-21 18:18:57 -07:00
|
|
|
|
|
2022-11-29 16:39:45 -08:00
|
|
|
|
  @functools.cached_property
  def addressable_shards(self) -> Sequence[Shard]:
    self._check_if_deleted()
    out = []
    for a in self._arrays:
      out.append(Shard(_get_device(a), self.sharding, self.shape, a))
    return out

  @property
  def layout(self):
    # TODO(yashkatariya): Remove the deleted check from here.
    if self.is_deleted():
      return Layout(None, self.sharding)
    try:
      return Layout(DeviceLocalLayout.from_pjrt_layout(self._pjrt_layout),
                    self.sharding)
    except xe.XlaRuntimeError as e:
      msg, *_ = e.args
      if type(msg) is str and msg.startswith("UNIMPLEMENTED"):
        return Layout(None, self.sharding)
      else:
        raise

  @property
  def global_shards(self) -> Sequence[Shard]:
    """Returns list of all `Shard`s of the Array across all devices.

    The result includes shards that are not addressable by the current process.
    If a `Shard` is not addressable, then its `data` will be `None`.
    """
    self._check_if_deleted()
    if self.is_fully_addressable:  # pylint: disable=using-constant-test
      return self.addressable_shards

    out = []
    device_id_to_buffer = {_get_device(a).id: a for a in self._arrays}
    for global_d in self.sharding.device_set:
      if device_id_to_buffer.get(global_d.id, None) is not None:
        array = device_id_to_buffer[global_d.id]
      else:
        array = None
      out.append(Shard(global_d, self.sharding, self.shape, array))
    return out

  @use_cpp_method()
  def delete(self):
    if self._arrays is None:
      return
    for buf in self._arrays:
      buf.delete()
    self._arrays = None
    self._npy_value = None

  @use_cpp_method()
  def is_deleted(self):
    if self._arrays is None:
      return True
    # This path is taken when a view of `Array` is created and the original
    # Array is deleted. In that case, the buffers the view represents also get
    # deleted.
    return any(buf.is_deleted() for buf in self._arrays)

  def _check_if_deleted(self):
    if self.is_deleted():
      raise RuntimeError(
          f"Array has been deleted with shape={self.aval.str_short()}.")

  @use_cpp_method()
  def block_until_ready(self):
    self._check_if_deleted()
    for db in self._arrays:
      db.block_until_ready()
    return self

  @use_cpp_method()
  def _single_device_array_to_np_array_did_copy(self) -> tuple[np.ndarray, bool]:  # type: ignore
    ...  # pytype: disable=bad-return-type

  @use_cpp_method()
  def _copy_single_device_array_to_host_async(self):
    self._arrays[0].copy_to_host_async()

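  # Illustrative sketch (not executed here): the lifetime helpers defined
  # above. `block_until_ready` waits for async dispatch to finish; `delete`
  # frees the per-device buffers, and later use raises a RuntimeError.
  #
  #   import jax.numpy as jnp
  #
  #   x = jnp.ones((1024, 1024)) @ jnp.ones((1024, 1024))
  #   x.block_until_ready()  # wait for the computation backing x
  #   x.delete()
  #   assert x.is_deleted()
  #   # np.asarray(x) would now raise "Array has been deleted with shape=...".
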
  @profiler.annotate_function
  def copy_to_host_async(self):
    self._check_if_deleted()
    if self._npy_value is None:
      if self.is_fully_replicated:
        self._copy_single_device_array_to_host_async()
        return
      for i, _ in _cached_index_calc(self.sharding, self.shape):
        self._arrays[i]._copy_single_device_array_to_host_async()

  @property
  @functools.partial(profiler.annotate_function, name="np.asarray(jax.Array)")
  def _value(self) -> np.ndarray:
    self._check_if_deleted()

    if self._npy_value is None:
      if self.is_fully_replicated:
        npy_value, did_copy = self._single_device_array_to_np_array_did_copy()
        npy_value.flags.writeable = False
        if did_copy:
          self._npy_value = npy_value
        return npy_value

      # TODO(yashkatariya): Merge `_process_has_full_value_in_mcjax` with
      # is_fully_addressable.
      if (not self.is_fully_addressable and
          not _process_has_full_value_in_mcjax(self.sharding, self.shape)):
        raise RuntimeError(
            "Fetching value for `jax.Array` that spans non-addressable"
            " (non process local) devices is not possible. You can use"
            " `jax.experimental.multihost_utils.process_allgather` to print the"
            " global array or use `.addressable_shards` method of jax.Array to"
            " inspect the addressable (process local) shards."
        )

      for i, _ in _cached_index_calc(self.sharding, self.shape):
        self._arrays[i]._copy_single_device_array_to_host_async()

      npy_value = np.empty(self.shape, self.dtype)
      for i, ind in _cached_index_calc(self.sharding, self.shape):
        npy_value[ind], _ = self._arrays[i]._single_device_array_to_np_array_did_copy()
      self._npy_value = npy_value
      self._npy_value.flags.writeable = False
    return self._npy_value

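
# Illustrative sketch (not executed here): `_value` is what backs
# `np.asarray(jax_array)` and `print(jax_array)`. The host copy is cached on
# the array and marked read-only. Assumes a fully addressable array and no
# dtype conversion in the `np.asarray` call.
#
#   import jax.numpy as jnp
#   import numpy as np
#
#   x = jnp.arange(6).reshape(3, 2)
#   host = np.asarray(x)  # triggers the device-to-host transfer above
#   assert not host.flags.writeable  # the cached host buffer is read-only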

# TODO(b/273265390): ideally we would write this as a decorator on the ArrayImpl
# class, however this triggers a pytype bug. Workaround: apply the decorator
# after the fact.
if not TYPE_CHECKING:
  ArrayImpl = use_cpp_class(xc.ArrayImpl)(ArrayImpl)


def _get_shape_from_index(slc: Index, shape: Shape) -> Shape:
  return tuple(
      (s.stop or dim) - (s.start or 0)
      for s, dim in safe_zip(slc, shape)
      if isinstance(s, slice)  # If element is int, this dimension is reduced
  )


# explicitly set to be unhashable.
setattr(ArrayImpl, "__hash__", None)
setattr(ArrayImpl, "__array_priority__", 100)

# TODO(yashkatariya): Remove None from callback input type.


def make_array_from_callback(
    shape: Shape, sharding: Sharding | Layout,
    data_callback: Callable[[Index | None], ArrayLike]) -> ArrayImpl:
  # pyformat: disable
  """Returns a ``jax.Array`` via data fetched from ``data_callback``.

  ``data_callback`` is used to fetch the data for each addressable shard of the
  returned ``jax.Array``. This function must return concrete arrays, meaning that
  ``make_array_from_callback`` has limited compatibility with JAX transformations
  like :func:`jit` or :func:`vmap`.

  Args:
    shape : Shape of the ``jax.Array``.
    sharding: A ``Sharding`` instance which describes how the ``jax.Array`` is
      laid out across devices.
    data_callback : Callback that takes indices into the global array value as
      input and returns the corresponding data of the global array value.
      The data can be returned as any array-like object, e.g. a ``numpy.ndarray``.

  Returns:
    A ``jax.Array`` via data fetched from ``data_callback``.

  Examples:

    >>> import math
    >>> from jax.sharding import Mesh
    >>> from jax.sharding import PartitionSpec as P
    >>> import numpy as np
    ...
    >>> input_shape = (8, 8)
    >>> global_input_data = np.arange(math.prod(input_shape)).reshape(input_shape)
    >>> global_mesh = Mesh(np.array(jax.devices()).reshape(2, 4), ('x', 'y'))
    >>> inp_sharding = jax.sharding.NamedSharding(global_mesh, P('x', 'y'))
    ...
    >>> def cb(index):
    ...  return global_input_data[index]
    ...
    >>> arr = jax.make_array_from_callback(input_shape, inp_sharding, cb)
    >>> arr.addressable_data(0).shape
    (4, 2)
  """
  # pyformat: enable
  dll = sharding.device_local_layout if isinstance(sharding, Layout) else None
  if isinstance(dll, AutoLayout):
    raise TypeError(
        "`DeviceLocalLayout.AUTO` cannot be used in place of a device-local"
        f" layout when calling `jax.make_array_from_callback`. Got {sharding}")
  sharding = sharding.sharding if isinstance(sharding, Layout) else sharding
  if not isinstance(sharding, Sharding):
    raise TypeError(
        f"sharding should be an instance of `jax.sharding`. Got {sharding} of"
        f" type {type(sharding)}")

  def get_data(index: Index | None) -> ArrayImpl | np.ndarray:
    # Perhaps cache on index here, then we can unify fully_replicated
    # and non-fully_replicated cases below and become faster for
    # partially replicated cases.
    assert index is not None
    r = data_callback(index)
    if isinstance(r, core.Tracer):
      raise errors.UnexpectedTracerError(
          "jax.make_array_from_callback cannot be called within a traced"
          " context."
      )
    # Value can be a python scalar, resolve it into something with a dtype.
    return xla.canonicalize_dtype(r)

  if sharding.is_fully_replicated:
    devices = list(sharding._internal_device_list.addressable_device_list)  # type: ignore
    # Only compute data once.
    per_device_values = [get_data((slice(None),) * len(shape))] * len(devices)
  else:
    device_to_index_map = sharding.addressable_devices_indices_map(shape)
    devices = list(device_to_index_map.keys())
    per_device_values = [
        get_data(device_to_index_map[device]) for device in devices
    ]

  first_value = per_device_values[0]
  expected_dtype = first_value.dtype
  expected_shape = sharding.shard_shape(shape)
  aval = core.update_aval_with_sharding(
      core.ShapedArray(shape, expected_dtype), sharding)
  _validate_shape_and_dtype_for_per_device_arrays(
      per_device_values,
      expected_shape=expected_shape,
      aval=aval,
      sharding=sharding,
  )
  if (isinstance(first_value, ArrayImpl)
      and first_value._committed
      and sharding.is_fully_replicated
      and first_value.is_fully_replicated
      and first_value.sharding._device_assignment == tuple(devices)
      and first_value.layout.device_local_layout == dll):
    return first_value

  if dtypes.issubdtype(aval.dtype, dtypes.extended):
    # TODO(yashkatariya): Can this also use batched_device_put?
    arrays = api.device_put(per_device_values, devices)
    return aval.dtype._rules.make_sharded_array(
        aval, sharding, arrays, committed=True
    )

  if dll is not None:
    devices = [Layout(dll, SingleDeviceSharding(d)) for d in devices]
    # pxla.batched_device_put doesn't support Layout, so take the slow route.
    arrays = api.device_put(per_device_values, devices)
    return ArrayImpl(aval, sharding, arrays, committed=True)

  if isinstance(first_value, ArrayImpl) and len(first_value.devices()) > 1:
    # The output of the callback is already a sharded array; move it to the
    # target devices.
    per_device_values = api.device_put(per_device_values, devices)

  return pxla.batched_device_put(aval, sharding, per_device_values, devices)
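
# Illustrative sketch (not executed here): with a fully replicated sharding,
# `make_array_from_callback` above invokes the callback only once and reuses
# the result for every addressable device (see "Only compute data once").
# Assumes a single-process run with the default backend.
#
#   import jax
#   import numpy as np
#   from jax.sharding import Mesh, NamedSharding, PartitionSpec as P
#
#   mesh = Mesh(np.array(jax.devices()), ("x",))
#   replicated = NamedSharding(mesh, P())  # no partitioned dimensions
#   data = np.arange(16.0).reshape(4, 4)
#   arr = jax.make_array_from_callback(data.shape, replicated,
#                                      lambda idx: data[idx])
#   assert arr.is_fully_replicated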


def make_array_from_process_local_data(
    sharding: Sharding,
    local_data: np.ndarray,
    global_shape: Shape | None = None,
) -> ArrayImpl:
  # pyformat: disable
  """Creates a distributed tensor using the data available in this process.

  This function is a common special case of `make_array_from_callback`. It
  assumes that the data is available in the process and takes care of the
  index wrangling.

  The most common case is when the array is sharded across the batch
  dimension and each host just loads its corresponding sub-batch. This function
  supports more general cases as well, such as mixed multi-host and multi-axis
  replication and sharding, but you would need to compute the size and the
  contents of process-local data correctly to satisfy the sharding constraints.

  In particular, if any two hosts are replicas, host_local_data should be
  identical as well.

  The global_shape is optional. If not provided it will be inferred from
  the local_data and sharding, under the assumption that
  each host represents only their own data for uniform sharding. If sharding
  is non-uniform (see note below), an exception will be raised.

  Setting global_shape explicitly allows for finer-grained control and works
  with non-uniform shardings. Each dimension of global_shape must either match
  host_local_data, or match the inferred global shape of the sharding (in which
  case it is equivalent to setting it to None, but is more explicit).

  For example if dimension `i` is fully sharded then this size would be
  `per_device_shape[i] * jax.local_device_count()`. Each device will be mapped
  into a local slice of the `local_data` array. For example, if a given process
  addresses slices (8, 12) and (24, 28), then these slices will be mapped
  into (0, 4) and (4, 8) of the `local_data`.

  For each dimension where global_shape matches local_shape, each device
  will look up its slice in the local_data. For example if
  global_shape == local_data.shape, the local data is assumed to be the
  actual target array that will be sharded onto the devices.

  If global_shape is the same as local_data.shape, then the data must
  be the same across all hosts.

  Examples:
    >>> from jax.sharding import PartitionSpec as P
    >>> mesh_rows = 2
    >>> mesh_cols = jax.device_count() // 2
    ...
    >>> mesh = jax.sharding.Mesh(np.array(jax.devices()).reshape(mesh_rows, mesh_cols), ('x', 'y'))

    >>> sharding = jax.sharding.NamedSharding(mesh, P(('x', 'y'),))
    >>> rows_per_device = 2
    >>> feature_length = 32
    >>> per_device_shape = (rows_per_device, feature_length)
    >>> per_host_shape = (rows_per_device * len(mesh.local_devices), feature_length)
    >>> per_host_generator = lambda : np.arange(np.prod(per_host_shape)).reshape(per_host_shape)
    >>> per_host_data = per_host_generator()  # replace with your own per-host data pipeline that outputs numpy arrays
    >>> global_shape = (rows_per_device * len(sharding.device_set), ) + per_device_shape[1:]
    >>> output_global_array = jax.make_array_from_process_local_data(sharding, per_host_data, global_shape)
    ...
    >>> assert output_global_array.addressable_data(0).shape == per_device_shape
    >>> assert output_global_array.shape == global_shape

  NB: While most shardings are uniform, it is possible to design an exotic
  sharding mesh where each process's devices will be arranged in a non-grid
  like pattern in some dimensions, or for indices to overlap non-trivially.
  Such a sharding is called "non-uniform" in those dimensions. In that case,
  the global shape along those directions must match the local shape, as there
  is no meaningful way to represent all of the needed per-process data in a
  non-overlapping fashion. For example, for global_shape 4x4,
  if the sharding looks like this:

    0123
    2103
    4675
    4567

  with 4 processes containing devices (0,1), (2,3), (4,5), (6,7) respectively,
  then the data for each host looks like

    xx.. ..xx .... ....
    .xx. x..x .... ....
    .... .... x..x .xx.
    .... .... xx.. ..xx

  The sharding is uniform on rows (each host requires either rows 1-2, or rows 3-4)
  and non-uniform on columns (hosts require overlapping but not matching
  sets of columns). Thus local data must have the shape 2x4 or 4x4
  for all hosts, even though each host could potentially fit into a 2x2 shape.
  In this case the user must provide global_shape explicitly, and for
  local_shape=(2, 4), the potentially valid global shapes are (2, 4) and (4, 4).

  On the other hand, for the sharding:

    0213 x.x. .x.x. .... ....
    0213 x.x. .x.x. .... ....
    4657 .... .... .x.x x.x.
    4657 .... .... .x.x x.x.

  for local_shape=(2, 2) this function can accept a choice of 2x2, 2x4, 4x2
|
|
|
|
|
and 4x4 global shapes. Setting global_shape to None, is equivalent to
|
|
|
|
|
setting it to (4, 4) in this case.
|
|
|
|
|
|
2024-05-15 22:06:11 -07:00
|
|
|
|
Args:
|
2024-09-21 10:22:36 -07:00
|
|
|
|
sharding: Sharding of the global array.
|
|
|
|
|
local_data: Data on the host to be placed on local devices. Each
|
2024-05-15 22:06:11 -07:00
|
|
|
|
dimension should either match global_shape, or match
|
|
|
|
|
num_addressable_indices(dim).
|
2024-09-21 10:22:36 -07:00
|
|
|
|
global_shape: The target shape of the global array. If None,
|
2024-08-16 17:21:10 -07:00
|
|
|
|
will infer from local_data and sharding.
|
2024-05-15 22:06:11 -07:00
|
|
|
|
|
|
|
|
|
Returns:
|
2024-06-06 12:40:21 -07:00
|
|
|
|
Tensor that will have sharding=sharding and of shape global_shape.
|
2024-05-15 22:06:11 -07:00
|
|
|
|
"""
|
|
|
|
|
# pyformat: enable
|
2024-09-21 10:22:36 -07:00
|
|
|
|
if xla_bridge.process_count() == 1:
|
|
|
|
|
return api.device_put(local_data, sharding)
|
|
|
|
|
|
2024-06-06 12:40:21 -07:00
|
|
|
|
# TODO(sandler): consider supporting partially specified global_shape or
|
|
|
|
|
# making local_to_global_shape available in the api.
|
|
|
|
|
local_shape = local_data.shape
|
|
|
|
|
if global_shape is None:
|
|
|
|
|
global_shape = local_to_global_shape(sharding, local_shape) # type: ignore[assignment]
|
|
|
|
|
assert global_shape is not None
|
|
|
|
|
if None in global_shape:
|
|
|
|
|
raise ValueError(
|
|
|
|
|
"Unable to compute global_shape due to non-uniform sharding."
|
|
|
|
|
f" Specify global shape directly. Partially computed {global_shape=}."
|
|
|
|
|
)
|
|
|
|
|
elif None in global_shape:
|
|
|
|
|
raise ValueError(f"{global_shape=} has Nones. This is not supported.")
|
2024-05-15 22:06:11 -07:00
|
|
|
|
full_dim = []
|
|
|
|
|
for i, (data_dim, global_dim) in enumerate(
|
|
|
|
|
zip(local_data.shape, global_shape)
|
|
|
|
|
):
|
|
|
|
|
full_dim.append(data_dim == global_dim)
|
|
|
|
|
if data_dim != global_dim:
|
|
|
|
|
process_slice = num_addressable_indices(sharding, i, global_shape)
|
|
|
|
|
if process_slice != data_dim:
|
|
|
|
|
raise ValueError(
|
|
|
|
|
"Invalid host data, each dimension should match either global or "
|
|
|
|
|
f"process shape. In dimension {i=}, the process data has {data_dim}"
|
|
|
|
|
f"elements. Process addresses {process_slice} elements and "
|
|
|
|
|
f"{global_shape=}."
|
|
|
|
|
)
|
|
|
|
|
addressable_shards = sharding.addressable_devices_indices_map(global_shape)
|
2024-06-27 20:59:25 -07:00
|
|
|
|
shard = next(iter(addressable_shards.values()))
|
|
|
|
|
assert shard is not None
|
|
|
|
|
shard_shape = _get_shape_from_index(shard, global_shape)
|
2024-05-15 22:06:11 -07:00
|
|
|
|
slices_for_each_dim: list[list[int]] = [[] for _ in global_shape]
|
|
|
|
|
for shard_index in addressable_shards.values():
|
|
|
|
|
assert shard_index is not None
|
|
|
|
|
for i, slc in enumerate(shard_index):
|
|
|
|
|
slices_for_each_dim[i].append(slc.start or 0)
|
|
|
|
|
for i in range(len(global_shape)):
|
|
|
|
|
slices_for_each_dim[i] = sorted(set(slices_for_each_dim[i]))
|
|
|
|
|
|
2024-06-27 20:59:25 -07:00
|
|
|
|
@functools.lru_cache(maxsize=4096)
|
|
|
|
|
def local_slice(i, start):
|
2024-05-15 22:06:11 -07:00
|
|
|
|
# Looks up the index of this slice in the list of slices for this dimension.
|
|
|
|
|
# This will determine the slice in host_local_data
|
2024-06-27 20:59:25 -07:00
|
|
|
|
start = slices_for_each_dim[i].index(start or 0) * shard_shape[i]
|
2024-05-15 22:06:11 -07:00
|
|
|
|
end = start + shard_shape[i]
|
|
|
|
|
return slice(start, end)
|
|
|
|
|
|
|
|
|
|
def cb(index: Index | None) -> ArrayLike:
|
|
|
|
|
assert index is not None
|
2024-06-27 20:59:25 -07:00
|
|
|
|
data_slice = (
|
|
|
|
|
slc if full_dim[i] else local_slice(i, slc.start)
|
2024-05-15 22:06:11 -07:00
|
|
|
|
for i, slc in enumerate(index)
|
2024-06-27 20:59:25 -07:00
|
|
|
|
)
|
2024-05-15 22:06:11 -07:00
|
|
|
|
return local_data[tuple(data_slice)]
|
|
|
|
|
|
|
|
|
|
return make_array_from_callback(global_shape, sharding, cb)
|
|
|
|
|
|
|
|
|
|
|
2022-09-26 16:17:26 -07:00
|
|
|
|
def make_array_from_single_device_arrays(
|
2025-02-26 16:56:47 -08:00
|
|
|
|
shape: Shape, sharding: Sharding, arrays: Sequence[basearray.Array]
|
2023-03-14 14:19:25 -07:00
|
|
|
|
) -> ArrayImpl:
|
2023-10-15 21:55:10 +00:00
|
|
|
|
r"""Returns a ``jax.Array`` from a sequence of ``jax.Array``\s each on a single device.
|
|
|
|
|
Every device in input ``sharding``\'s mesh must have an array in ``arrays``\s.
|
2022-11-11 15:20:27 -08:00
|
|
|
|
|
|
|
|
|
Args:
|
2023-10-15 21:55:10 +00:00
|
|
|
|
shape : Shape of the output ``jax.Array``. This conveys information already included with
|
|
|
|
|
``sharding`` and ``arrays`` and serves as a double check.
|
|
|
|
|
sharding: Sharding: A global Sharding instance which describes how the output jax.Array is laid out across devices.
|
|
|
|
|
arrays: Sequence of ``jax.Array``\s that are each single device addressable. ``len(arrays)``
|
|
|
|
|
must equal ``len(sharding.addressable_devices)`` and the shape of each array must be the same. For multiprocess code,
|
|
|
|
|
each process will call with a different ``arrays`` argument that corresponds to that processes' data.
|
|
|
|
|
These arrays are commonly created via ``jax.device_put``.
|
2022-11-11 15:20:27 -08:00
|
|
|
|
|
|
|
|
|
Returns:
|
2023-10-15 21:55:10 +00:00
|
|
|
|
A global ``jax.Array``, sharded as ``sharding``, with shape equal to ``shape``, and with per-device
|
|
|
|
|
contents matching ``arrays``.
|
2022-11-11 15:20:27 -08:00
|
|
|
|
|
2023-10-15 21:55:10 +00:00
|
|
|
|
Examples:
|
|
|
|
|
|
2023-02-28 12:40:30 -08:00
|
|
|
|
>>> import math
|
2023-02-09 05:47:59 -08:00
|
|
|
|
>>> from jax.sharding import Mesh
|
|
|
|
|
>>> from jax.sharding import PartitionSpec as P
|
2022-11-11 15:20:27 -08:00
|
|
|
|
>>> import numpy as np
|
|
|
|
|
...
|
2023-10-15 21:55:10 +00:00
|
|
|
|
>>> mesh_rows = 2
|
|
|
|
|
>>> mesh_cols = jax.device_count() // 2
|
|
|
|
|
...
|
2023-05-04 19:11:26 -07:00
|
|
|
|
>>> global_shape = (8, 8)
|
2023-10-15 21:55:10 +00:00
|
|
|
|
>>> mesh = Mesh(np.array(jax.devices()).reshape(mesh_rows, mesh_cols), ('x', 'y'))
|
|
|
|
|
>>> sharding = jax.sharding.NamedSharding(mesh, P('x', 'y'))
|
2023-05-04 19:11:26 -07:00
|
|
|
|
>>> inp_data = np.arange(math.prod(global_shape)).reshape(global_shape)
|
2022-11-11 15:20:27 -08:00
|
|
|
|
...
|
|
|
|
|
>>> arrays = [
|
2023-10-15 21:55:10 +00:00
|
|
|
|
... jax.device_put(inp_data[index], d)
|
|
|
|
|
... for d, index in sharding.addressable_devices_indices_map(global_shape).items()]
|
2022-11-11 15:20:27 -08:00
|
|
|
|
...
|
2023-05-04 19:11:26 -07:00
|
|
|
|
>>> arr = jax.make_array_from_single_device_arrays(global_shape, sharding, arrays)
|
2023-10-15 21:55:10 +00:00
|
|
|
|
>>> assert arr.shape == (8,8) # arr.shape is (8,8) regardless of jax.device_count()
|
2023-05-04 19:11:26 -07:00
|
|
|
|
|
2024-07-12 18:09:27 -07:00
|
|
|
|
For cases where you have a local array and want to convert it to a global
|
|
|
|
|
jax.Array, use ``jax.make_array_from_process_local_data``.
|
2022-11-11 15:20:27 -08:00
|
|
|
|
"""
|
2022-09-26 12:43:13 -07:00
|
|
|
|
# All input arrays should be committed. Checking it is expensive on
|
|
|
|
|
# single-controller systems.
|
2025-01-20 15:12:12 -08:00
|
|
|
|
aval = core.update_aval_with_sharding(
|
2025-02-26 16:56:47 -08:00
|
|
|
|
core.ShapedArray(shape, arrays[0].dtype, weak_type=False), sharding)
|
2023-07-24 14:29:37 -07:00
|
|
|
|
if dtypes.issubdtype(aval.dtype, dtypes.extended):
|
2024-04-15 12:37:46 -07:00
|
|
|
|
return aval.dtype._rules.make_sharded_array(aval, sharding, arrays,
|
|
|
|
|
committed=True)
|
2023-08-18 16:50:36 -04:00
|
|
|
|
# TODO(phawkins): ideally the cast() could be checked.
|
2025-01-25 07:11:18 +00:00
|
|
|
|
try:
|
|
|
|
|
return ArrayImpl(aval, sharding, cast(Sequence[ArrayImpl], arrays),
|
|
|
|
|
committed=True)
|
|
|
|
|
except TypeError:
|
|
|
|
|
if not isinstance(arrays, Sequence):
|
|
|
|
|
raise TypeError("jax.make_array_from_single_device_arrays `arrays` "
|
|
|
|
|
"argument must be a Sequence (list or tuple), but got "
|
|
|
|
|
f"{type(arrays)}.")
|
|
|
|
|
if any(isinstance(arr, core.Tracer) for arr in arrays):
|
|
|
|
|
raise ValueError(
|
|
|
|
|
"jax.make_array_from_single_device_arrays requires a list of concrete"
|
|
|
|
|
f" arrays as input, but got types {set(map(type, arrays))}")
|
|
|
|
|
raise
|
2022-09-26 12:43:13 -07:00
|
|
|
|
|
2022-09-26 16:17:26 -07:00
|
|
|
|
xla.canonicalize_dtype_handlers[ArrayImpl] = pxla.identity
|
2024-12-17 13:47:58 -08:00
|
|
|
|
|
2024-08-29 10:49:30 -07:00
|
|
|
|
def _get_aval_array(self):
|
2025-01-20 15:12:12 -08:00
|
|
|
|
return core.update_aval_with_sharding(self.aval, self.sharding)
|
2024-12-17 13:47:58 -08:00
|
|
|
|
core.pytype_aval_mappings[ArrayImpl] = _get_aval_array
|
|
|
|
|
|
2022-12-19 13:13:15 -08:00
|
|
|
|
# TODO(jakevdp) replace this with true inheritance at the C++ level.
|
|
|
|
|
basearray.Array.register(ArrayImpl)
|
2022-08-12 12:09:22 -07:00
|
|
|
|
|
|
|
|
|
|
2023-08-17 06:43:31 -07:00
|
|
|
|
def _array_mlir_constant_handler(val):
|
2024-06-29 14:36:03 -07:00
|
|
|
|
try:
|
2024-07-01 08:42:48 -04:00
|
|
|
|
return mlir.ir_constant(val._value)
|
2024-06-29 14:36:03 -07:00
|
|
|
|
except RuntimeError as e:
|
|
|
|
|
# TODO(yashkatariya): Ideally we would catch a custom exception from
|
|
|
|
|
# `_value` function in ArrayImpl instead of checking the error string.
|
|
|
|
|
if 'Fetching value for `jax.Array` that spans non-addressable' in str(e):
|
|
|
|
|
raise RuntimeError(
|
|
|
|
|
"Closing over jax.Array that spans non-addressable (non process"
|
|
|
|
|
" local) devices is not allowed. Please pass such arrays as arguments"
|
|
|
|
|
f" to the function. Got jax.Array: {val.aval.str_short()}") from e
|
|
|
|
|
raise
|
|
|
|
|
|
2022-09-26 16:17:26 -07:00
|
|
|
|
mlir.register_constant_handler(ArrayImpl, _array_mlir_constant_handler)
|
2022-06-10 07:31:43 -07:00
|
|
|
|
|
2022-06-24 10:04:31 -07:00
|
|
|
|
|
2023-07-20 09:43:40 -07:00
|
|
|
|
# NOTE(skye): we could refactor to generate _multi_slice parameters directly
|
|
|
|
|
# from the input ShardingSpec, rather than the indices. However, this would
|
|
|
|
|
# require duplicating the ordering logic of spec_to_indices, which is more
|
|
|
|
|
# subtle and more likely to change than the index logic we have to support here.
|
|
|
|
|
def as_slice_indices(arr: Any, idx: Index) -> tuple[
|
|
|
|
|
tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
|
|
|
|
|
"""Returns start_indices, limit_indices, removed_dims"""
|
|
|
|
|
start_indices = [0] * arr.ndim
|
|
|
|
|
limit_indices = list(arr.shape)
|
2023-10-11 12:54:51 -07:00
|
|
|
|
removed_dims: list[int] = []
|
2023-07-20 09:43:40 -07:00
|
|
|
|
|
|
|
|
|
tuple_idx = idx if isinstance(idx, tuple) else (idx,)
|
|
|
|
|
for dim, sub_idx in enumerate(tuple_idx):
|
|
|
|
|
if isinstance(sub_idx, int):
|
|
|
|
|
start_indices[dim] = sub_idx
|
|
|
|
|
limit_indices[dim] = sub_idx + 1
|
|
|
|
|
removed_dims.append(dim)
|
|
|
|
|
elif sub_idx == slice(None):
|
|
|
|
|
continue
|
|
|
|
|
else:
|
|
|
|
|
assert isinstance(sub_idx, slice), sub_idx
|
|
|
|
|
assert isinstance(sub_idx.start, int), sub_idx
|
|
|
|
|
assert isinstance(sub_idx.stop, int), sub_idx
|
|
|
|
|
start_indices[dim] = sub_idx.start
|
|
|
|
|
limit_indices[dim] = sub_idx.stop
|
|
|
|
|
|
2024-05-17 09:46:36 +01:00
|
|
|
|
return tuple(start_indices), tuple(limit_indices), tuple(removed_dims)
|
2023-07-20 09:43:40 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def shard_device_array(x, devices, indices, sharding):
|
|
|
|
|
start_indices, limit_indices, removed_dims = unzip3(
|
|
|
|
|
as_slice_indices(x, idx) for idx in indices)
|
2023-08-11 10:29:41 -07:00
|
|
|
|
if sharding.is_fully_replicated:
|
|
|
|
|
shards = [x] * len(devices)
|
|
|
|
|
else:
|
2025-02-18 15:22:06 -08:00
|
|
|
|
# TODO(yashkatariya): Maybe this should be set when we call the handler in
|
|
|
|
|
# InputsHandler.__call__?
|
|
|
|
|
with set_concrete_mesh(None):
|
|
|
|
|
shards = x._multi_slice(start_indices, limit_indices, removed_dims)
|
2024-12-19 07:06:12 -08:00
|
|
|
|
aval = core.shaped_abstractify(x)
|
2023-08-10 15:25:39 -07:00
|
|
|
|
return pxla.batched_device_put(aval, sharding, shards, devices)
|
2023-07-20 09:43:40 -07:00
|
|
|
|
|
2024-04-12 21:40:47 -07:00
|
|
|
|
|
2023-07-20 09:43:40 -07:00
|
|
|
|
def shard_sharded_device_array_slow_path(x, devices, indices, sharding):
|
|
|
|
|
candidates = defaultdict(list)
|
2024-04-12 21:40:47 -07:00
|
|
|
|
bufs = [buf.data for buf in x.addressable_shards]
|
|
|
|
|
arr_indices = tuple(x.sharding.devices_indices_map(x.shape).values())
|
2023-07-20 09:43:40 -07:00
|
|
|
|
for buf, idx in safe_zip(bufs, arr_indices):
|
2024-07-20 09:08:16 -07:00
|
|
|
|
candidates[hashed_index(idx)].append(buf)
|
2023-07-20 09:43:40 -07:00
|
|
|
|
|
|
|
|
|
bufs = []
|
|
|
|
|
for idx, device in safe_zip(indices, devices):
|
|
|
|
|
# Look up all buffers that contain the correct slice of the logical array.
|
2024-07-20 09:08:16 -07:00
|
|
|
|
candidates_list = candidates[hashed_index(idx)]
|
2023-07-20 09:43:40 -07:00
|
|
|
|
if not candidates_list:
|
2024-11-07 15:50:32 -08:00
|
|
|
|
return pxla.shard_args([sharding], [None], [None], [x._value],
|
2024-08-19 15:10:00 -07:00
|
|
|
|
canonicalize=False)[0]
|
2023-07-20 09:43:40 -07:00
|
|
|
|
# Try to find a candidate buffer already on the correct device,
|
|
|
|
|
# otherwise copy one of them.
|
|
|
|
|
for buf in candidates_list:
|
2023-11-29 16:52:09 -08:00
|
|
|
|
if buf.devices() == {device}:
|
2023-07-20 09:43:40 -07:00
|
|
|
|
bufs.append(buf)
|
|
|
|
|
break
|
|
|
|
|
else:
|
2024-12-09 06:52:25 -08:00
|
|
|
|
bufs.append(candidates_list[-1])
|
2023-07-20 09:43:40 -07:00
|
|
|
|
return pxla.batched_device_put(x.aval, sharding, bufs, devices)
|
|
|
|
|
|
|
|
|
|
|
2024-06-11 12:46:11 -07:00
|
|
|
|
@cache(max_size=4096, trace_context_in_key=False)
|
2024-04-22 10:23:47 -07:00
|
|
|
|
def _sharding_indices_and_eq(src_sharding, shape, dst_sharding):
|
|
|
|
|
src_indices = src_sharding.addressable_devices_indices_map(shape).values()
|
|
|
|
|
dst_indices = dst_sharding.addressable_devices_indices_map(shape).values()
|
|
|
|
|
return dst_indices, tuple(src_indices) == tuple(dst_indices)
|
|
|
|
|
|
|
|
|
|
|
2024-11-07 15:50:32 -08:00
|
|
|
|
def _array_shard_arg(xs, shardings, layouts, copy_semantics):
|
2024-12-11 16:54:52 -05:00
|
|
|
|
util.test_event("_array_shard_arg")
|
2024-06-13 13:09:35 -07:00
|
|
|
|
results = []
|
|
|
|
|
batch_xs, batch_devs, batch_shardings, batch_indices = [], [], [], []
|
2024-11-07 15:50:32 -08:00
|
|
|
|
batch_cs = []
|
2024-08-19 15:10:00 -07:00
|
|
|
|
|
2024-11-07 15:50:32 -08:00
|
|
|
|
for i, (x, sharding, layout, cs) in enumerate(
|
|
|
|
|
safe_zip(xs, shardings, layouts, copy_semantics)):
|
2024-06-13 13:09:35 -07:00
|
|
|
|
x._check_if_deleted()
|
2024-08-19 15:10:00 -07:00
|
|
|
|
indices, same_indices = _sharding_indices_and_eq(x.sharding, x.shape, sharding)
|
2024-08-28 11:05:45 -07:00
|
|
|
|
same_layout = (True if layout is None else
|
|
|
|
|
x.layout.device_local_layout == layout)
|
2022-08-19 21:36:43 -07:00
|
|
|
|
|
2024-06-13 13:09:35 -07:00
|
|
|
|
if not x.is_fully_addressable:
|
2024-08-19 15:10:00 -07:00
|
|
|
|
if same_indices and same_layout:
|
2024-06-13 13:09:35 -07:00
|
|
|
|
results.append(x)
|
|
|
|
|
else:
|
|
|
|
|
raise NotImplementedError(
|
|
|
|
|
"Cannot reshard an input that is not fully addressable")
|
2022-10-08 11:39:05 -07:00
|
|
|
|
else:
|
2024-06-13 13:09:35 -07:00
|
|
|
|
devices = sharding._addressable_device_assignment
|
2024-08-19 15:10:00 -07:00
|
|
|
|
if same_indices and same_layout:
|
2024-06-13 13:09:35 -07:00
|
|
|
|
# Add a placeholder result that will be filled in later.
|
|
|
|
|
results.append(None)
|
|
|
|
|
# Accumulate arguments to `batched_copy_array_to_devices_with_sharding`.
|
|
|
|
|
batch_xs.append(x)
|
|
|
|
|
batch_devs.append(list(devices))
|
|
|
|
|
batch_shardings.append(sharding)
|
|
|
|
|
batch_indices.append(i)
|
2024-11-07 15:50:32 -08:00
|
|
|
|
batch_cs.append(cs)
|
2024-06-13 13:09:35 -07:00
|
|
|
|
# Resharding starts here:
|
2024-08-19 15:10:00 -07:00
|
|
|
|
elif not same_layout:
|
|
|
|
|
results.append(api.device_put(x, Layout(layout, sharding)))
|
2024-06-13 13:09:35 -07:00
|
|
|
|
elif dispatch.is_single_device_sharding(x.sharding):
|
|
|
|
|
results.append(shard_device_array(x, devices, indices, sharding))
|
|
|
|
|
else:
|
|
|
|
|
results.append(
|
|
|
|
|
shard_sharded_device_array_slow_path(x, devices, indices, sharding))
|
|
|
|
|
|
2024-12-11 16:54:52 -05:00
|
|
|
|
util.test_event("batched_copy_array")
|
2024-12-09 07:34:26 -08:00
|
|
|
|
copy_outs = xc.batched_copy_array_to_devices_with_sharding(
|
|
|
|
|
batch_xs, batch_devs, batch_shardings, batch_cs)
|
2024-06-13 13:09:35 -07:00
|
|
|
|
for i, copy_out in safe_zip(batch_indices, copy_outs):
|
|
|
|
|
assert results[i] is None
|
|
|
|
|
results[i] = copy_out
|
|
|
|
|
return results
|
2022-09-26 16:17:26 -07:00
|
|
|
|
pxla.shard_arg_handlers[ArrayImpl] = _array_shard_arg
|
2022-06-10 07:31:43 -07:00
|
|
|
|
|
|
|
|
|
|
2024-02-28 15:21:50 -08:00
|
|
|
|
def _array_global_result_handler(global_aval, out_sharding, committed):
|
2025-01-20 15:12:12 -08:00
|
|
|
|
global_aval = core.update_aval_with_sharding(global_aval, out_sharding)
|
2022-08-29 22:02:32 -07:00
|
|
|
|
if global_aval.dtype == dtypes.float0:
|
2024-05-17 09:46:36 +01:00
|
|
|
|
return lambda _: np.zeros(global_aval.shape, dtypes.float0)
|
2023-07-24 14:29:37 -07:00
|
|
|
|
if dtypes.issubdtype(global_aval.dtype, dtypes.extended):
|
2022-08-30 13:25:49 -07:00
|
|
|
|
return global_aval.dtype._rules.global_sharded_result_handler(
|
2024-02-28 15:21:50 -08:00
|
|
|
|
global_aval, out_sharding, committed)
|
2023-03-13 17:09:06 -07:00
|
|
|
|
return xc.array_result_handler(
|
|
|
|
|
global_aval, out_sharding, committed=committed, _skip_checks=True
|
|
|
|
|
)
|
2023-03-20 09:09:15 -07:00
|
|
|
|
pxla.global_result_handlers[core.ShapedArray] = _array_global_result_handler
|
2024-04-18 11:09:02 -07:00
|
|
|
|
|
2022-08-31 15:06:58 -07:00
|
|
|
|
# Only used for Arrays that come out of pmap.
|
2022-08-10 20:11:06 -07:00
|
|
|
|
def _array_local_result_handler(aval, sharding, indices):
|
2023-02-17 11:52:08 -08:00
|
|
|
|
if aval.dtype == dtypes.float0:
|
2024-05-17 09:46:36 +01:00
|
|
|
|
return lambda _: np.zeros(aval.shape, dtypes.float0)
|
2023-07-24 14:29:37 -07:00
|
|
|
|
if dtypes.issubdtype(aval.dtype, dtypes.extended):
|
2022-08-30 13:25:49 -07:00
|
|
|
|
return aval.dtype._rules.local_sharded_result_handler(
|
|
|
|
|
aval, sharding, indices)
|
2023-03-13 17:09:06 -07:00
|
|
|
|
return xc.array_result_handler(
|
|
|
|
|
aval, sharding, committed=True, _skip_checks=True
|
|
|
|
|
)
|
2023-03-20 09:09:15 -07:00
|
|
|
|
pxla.local_result_handlers[core.ShapedArray] = _array_local_result_handler
|
2024-05-10 10:11:55 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Token handlers
|
|
|
|
|
|
2024-11-07 15:50:32 -08:00
|
|
|
|
def _token_shard_arg(xs, shardings, layouts, copy_semantics):
|
2025-01-16 11:23:39 -08:00
|
|
|
|
results = []
|
|
|
|
|
for x, sharding, layout in safe_zip(xs, shardings, layouts):
|
|
|
|
|
x.block_until_ready()
|
|
|
|
|
x = np.array([], dtype=bool)
|
|
|
|
|
results.append(api.device_put(x, Layout(layout, sharding)))
|
|
|
|
|
return results
|
2024-05-10 10:11:55 -07:00
|
|
|
|
pxla.shard_arg_handlers[core.Token] = _token_shard_arg
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _token_global_result_handler(global_aval, out_sharding, committed):
|
|
|
|
|
array_handler = _array_global_result_handler(
|
2025-02-03 17:59:44 -08:00
|
|
|
|
core.get_token_aval(), out_sharding, committed)
|
2024-05-10 10:11:55 -07:00
|
|
|
|
|
|
|
|
|
def wrapper(*args, **kwargs):
|
|
|
|
|
out_buf = array_handler(*args, **kwargs)
|
|
|
|
|
return core.Token(out_buf)
|
|
|
|
|
return wrapper
|
|
|
|
|
pxla.global_result_handlers[core.AbstractToken] = _token_global_result_handler
|