rocm_jax/jax/_src/interpreters/pxla.py

# Copyright 2018 The JAX Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implementation of pmap and related functionality."""

from __future__ import annotations

import enum
from contextlib import contextmanager
from collections import namedtuple
from collections.abc import Sequence, Iterable
import dataclasses
from functools import partial, lru_cache, cached_property
import itertools as it
import logging
import math
import threading
from typing import Any, Callable, NamedTuple, TypeVar, Union, cast
from collections.abc import Iterator
import warnings

import numpy as np

import jax
from jax.errors import JAXTypeError

from jax._src import api_util
from jax._src import compiler
from jax._src import config
from jax._src import core
from jax._src import dispatch
from jax._src import dtypes
from jax._src import effects
from jax._src import linear_util as lu
from jax._src import mesh as mesh_lib
from jax._src import op_shardings
from jax._src import sharding_specs
from jax._src import profiler
from jax._src import sharding_impls
from jax._src import source_info_util
from jax._src import stages
from jax._src import tree_util
from jax._src import util
from jax._src import xla_bridge as xb
from jax._src.abstract_arrays import array_types
from jax._src.core import DShapedArray
from jax._src.core import ShapedArray
from jax._src.interpreters import ad
from jax._src.interpreters import batching
from jax._src.interpreters import partial_eval as pe
from jax._src.interpreters import mlir
from jax._src.interpreters import xla
from jax._src.layout import XLACompatibleLayout, SpecifiedLayout, LayoutRequest
from jax._src.lib import xla_client as xc
from jax._src.lib import xla_extension_version
from jax._src.lib.mlir import ir
from jax._src.lib.mlir.dialects import hlo
from jax._src.partition_spec import PartitionSpec
from jax._src.sharding_impls import (
    ArrayMapping, ArrayMappingOrAutoOrUnspecified, AUTO, UNSPECIFIED,
    UnspecifiedValue, get_array_mapping as _get_array_mapping, is_auto,
    is_unspecified, is_unspecified_or_auto, array_mapping_to_axis_resources,
    SingleDeviceSharding, GSPMDSharding)
from jax._src.util import (safe_map, safe_zip, partition_list, wrap_name,
                           tuple_update, tuple_delete, distributed_debug_log,
                           unzip2, HashableFunction, weakref_lru_cache)
from jax._src.state.types import AbstractRef, RefEffect


# Built in Python lists don't support weak refs but subclasses of lists do.
class WeakRefList(list):
  pass


xe = xc._xla

unsafe_map, map = map, safe_map  # type: ignore

logger = logging.getLogger(__name__)

Index = Union[int, slice, tuple[Union[int, slice], ...]]

NoSharding = sharding_specs.NoSharding
Chunked = sharding_specs.Chunked
Unstacked = sharding_specs.Unstacked

ShardedAxis = sharding_specs.ShardedAxis
Replicated = sharding_specs.Replicated

AvalDimSharding = Union[Unstacked, Chunked, NoSharding]
Mesh = mesh_lib.Mesh
MeshAxisName = sharding_impls.MeshAxisName
MeshDimAssignment = Union[ShardedAxis, Replicated]
ShardingSpec = sharding_specs.ShardingSpec

### util

def identity(x): return x

def shard_arg(arg, sharding, canonicalize=True):
  if canonicalize:
    arg = xla.canonicalize_dtype(arg)
  return shard_arg_handlers[type(arg)](arg, sharding)


@profiler.annotate_function
def shard_args(
    shardings: Sequence[sharding_impls.XLACompatibleSharding], args,
) -> Sequence[jax.Array]:
  return [shard_arg(arg, shardings[i]) for i, arg in enumerate(args)]

shard_arg_handlers: dict[Any, Callable[[Any, Any], Any]] = {}


@lru_cache(maxsize=1024)
def get_addressable_devices_for_shard_arg(
    s: sharding_impls.XLACompatibleSharding) -> tuple[xc.Device, ...]:
  return s._addressable_device_assignment

@lru_cache(maxsize=1024)
def _get_replicated_slices(num_addressable_devices: int):
  return ((slice(None),),) * num_addressable_devices

def _shard_token(x, sharding):
  devices = get_addressable_devices_for_shard_arg(sharding)
  indices = _get_replicated_slices(len(devices))
  zeros = np.zeros((), dtype=np.dtype(np.bool_))
  aval = api_util.shaped_abstractify(zeros)
  return batched_device_put(aval, sharding, [zeros for _ in indices], devices)
shard_arg_handlers[core.Token] = _shard_token

def _masked_array_error(x, sharding):
  raise ValueError("numpy masked arrays are not supported as direct inputs to JAX functions. "
                   "Use arr.filled() to convert the value to a standard numpy array.")
shard_arg_handlers[np.ma.MaskedArray] = _masked_array_error

def _shard_array(x, sharding):
  devices = get_addressable_devices_for_shard_arg(sharding)
  if x.dtype == dtypes.float0:
    x = np.zeros(x.shape, dtype=np.dtype(bool))
  aval = api_util.shaped_abstractify(x)
  if sharding.is_fully_replicated:
    shards = [x] * len(devices)
  else:
    indices = tuple(sharding.addressable_devices_indices_map(x.shape).values())
    shards = [x[i] for i in indices]
  return batched_device_put(aval, sharding, shards, devices)
for _t in array_types:
  shard_arg_handlers[_t] = _shard_array

def _shard_darray(x, sharding):
  return shard_arg(x._data, sharding)
shard_arg_handlers[core.DArray] = _shard_darray

def _shard_mutable_array(x, sharding):
  return shard_arg(x._buf, sharding)
shard_arg_handlers[core.MutableArray] = _shard_mutable_array

def batched_device_put(aval: core.ShapedArray,
                       sharding: jax.sharding.Sharding, xs: Sequence[Any],
                       devices: Sequence[jax.Device], committed: bool = True):
  from jax._src import array

  bufs = [x for x, d in safe_zip(xs, devices)
          if (isinstance(x, array.ArrayImpl) and
              dispatch.is_single_device_sharding(x.sharding) and
              x.devices() == {d})]
  if len(bufs) == len(xs):
    return array.ArrayImpl(
        aval, sharding, bufs, committed=committed, _skip_checks=True)
  return xc.batched_device_put(aval, sharding, xs, list(devices), committed)  # type: ignore

def _shard_aval(size, axis: int, aval):
  try:
    return _shard_aval_handlers[type(aval)](size, axis, aval)
  except KeyError as err:
    raise TypeError(f"No _shard_aval handler for type: {type(aval)}") from err
_shard_aval_handlers: dict[type[core.AbstractValue], Callable[[int, int, Any], Any]] = {}

def _shard_abstract_array(size, axis: int, x):
  try:
    if x.shape[axis] != size:
      raise ValueError(f"Axis size {size} does not match dimension {axis} of "
                       f"shape {x.shape}")
  except IndexError:
    raise ValueError("Cannot split a {x.dim}D value along axis {axis}") from None
  if config.pmap_no_rank_reduction.value:
    return x.update(shape=tuple_update(x.shape, axis, 1))
  else:
    return x.update(shape=tuple_delete(x.shape, axis))
_shard_aval_handlers[ShapedArray] = _shard_abstract_array


def local_aval_to_result_handler(
    aval: core.AbstractValue,
    sharding: sharding_impls.XLACompatibleSharding,
    indices: tuple[Index, ...] | None,
) -> Callable[[list[xc.ArrayImpl]], Any]:
  """Returns a function for handling the raw buffers of a single output aval.

  Args:
    aval: The local output AbstractValue.
    sharding_spec: Indicates how the output is sharded across devices, or None
      for non-array avals.
    indices: The pre-computed result of spec_to_indices, or None for non-array
      avals.

  Returns:
    A function for handling the Buffers that will eventually be produced
    for this output. The function will return an object suitable for returning
    to the user, e.g. an Array.
  """
  try:
    return local_result_handlers[(type(aval))](aval, sharding, indices)
  except KeyError as err:
    raise TypeError(
        f"No pxla_result_handler for type: {type(aval)}") from err

PxlaResultHandler = Callable[..., Callable[[Any], Any]]
local_result_handlers: dict[type[core.AbstractValue], PxlaResultHandler] = {}


def global_aval_to_result_handler(
    aval: core.AbstractValue, out_sharding, committed: bool
) -> Callable[[Sequence[xc.ArrayImpl]], Any]:
  """Returns a function for handling the raw buffers of a single output aval.

  Args:
    aval: The global output AbstractValue.
    out_axis_resources: A PartitionSpec specifying the sharding of outputs.
      Used for creating GSDAs.
    global_mesh: The global device mesh that generated this output. Used
      for creating GSDAs.

  Returns:
    A function for handling the Buffers that will eventually be produced
    for this output. The function will return an object suitable for returning
    to the user, e.g. an Array.
  """
  try:
    return global_result_handlers[type(aval)](aval, out_sharding, committed)
  except KeyError as err:
    raise TypeError(
        f"No pxla_result_handler for type: {type(aval)}") from err

global_result_handlers: dict[type[core.AbstractValue], PxlaResultHandler] = {}

### lazy device-memory persistence and result handling

### the xla_pmap primitive and its rules are comparable to xla_call in xla.py


def xla_pmap_impl_lazy(
    fun: lu.WrappedFun,
    *args,
    backend: str | None,
    axis_name: core.AxisName,
    axis_size: int,
    global_axis_size: int,
    devices: Sequence[Any] | None,
    name: str,
    in_axes: Sequence[int | None],
    out_axes_thunk: Callable[[], Sequence[int | None]],
    donated_invars: Sequence[bool],
    is_explicit_global_axis_size: bool,
) -> Callable:
  if (config.disable_jit.value and config.eager_pmap.value and
      not is_explicit_global_axis_size and not any(d for d in donated_invars)):
    def _emap_apply_fn(*args):
      return _emap_impl(fun, *args, backend=backend, axis_name=axis_name,
                        axis_size=axis_size, global_axis_size=global_axis_size,
                        devices=devices, name=name, in_axes=in_axes,
                        out_axes_thunk=out_axes_thunk,
                        donated_invars=donated_invars,
                        is_explicit_global_axis_size=is_explicit_global_axis_size)
    return _emap_apply_fn
  abstract_args = unsafe_map(xla.abstractify, args)
  compiled_fun, fingerprint = parallel_callable(
      fun, backend, axis_name, axis_size, global_axis_size, devices, name,
      in_axes, out_axes_thunk, donated_invars,
      is_explicit_global_axis_size, *abstract_args)

  # Don't re-abstractify args unless logging is enabled for performance.
  if config.distributed_debug.value:
    distributed_debug_log(("Running pmapped function", name),
                          ("python function", fun.f),
                          ("devices", devices),
                          ("abstract args", map(xla.abstractify, args)),
                          ("fingerprint", fingerprint))
  return compiled_fun

def xla_pmap_impl(fun: lu.WrappedFun, *args, **params):
  compiled_fun = xla_pmap_impl_lazy(fun, *args, **params)
  return compiled_fun(*args)

class EmapInfo(NamedTuple):
  backend: str | None
  devices: Sequence[Any] | None

def _emap_impl(fun: lu.WrappedFun, *args,
               backend: str | None,
               axis_name: core.AxisName,
               axis_size: int,
               global_axis_size: int,
               devices: Sequence[Any] | None,
               name: str,
               in_axes: Sequence[int | None],
               out_axes_thunk: Callable[[], Sequence[int | None]],
               donated_invars: Sequence[bool],
               is_explicit_global_axis_size: bool,
               ):
  from jax._src import array
  # TODO(sharadmv,mattjj): implement these cases
  if any(d for d in donated_invars):
    raise NotImplementedError("Buffer donation not supported in eager pmap.")
  if is_explicit_global_axis_size:
    raise NotImplementedError("Non-default global_axis_size not supported in "
                              "eager pmap.")

  emap_info = EmapInfo(backend, devices)
  shard_axes = [{} if in_axis is None else {axis_name: in_axis} for in_axis in in_axes]
  with core.new_base_main(MapTrace, emap_info=emap_info) as main:
    with core.new_sublevel(), core.extend_axis_env(axis_name, axis_size, main):
      t = main.with_cur_sublevel()
      tracers = [MapTracer(t, arg, s) for arg, s in zip(args, shard_axes)]
      ans = fun.call_wrapped(*tracers)
      out_tracers = map(t.full_raise, ans)
      outvals, out_axes_src = unzip2((t.val, t.shard_axes) for t in out_tracers)
    del main
  out_axes = out_axes_thunk()

  platform = xb.get_backend(backend).platform
  donate_argnums = (1,) if platform in {"cuda", "rocm", "tpu"} else ()
  new_outvals = []
  for out_axis_src, out_axis, outval in zip(out_axes_src, out_axes, outvals):
    with jax.disable_jit(False):
      donate_argnums_ = donate_argnums
      if isinstance(outval, array.ArrayImpl):
        # We don't want to donate if it's already sharded.
        donate_argnums_ = ()
      out = jax.pmap(
          lambda _, x: x,
          in_axes=(0, out_axis_src.get(axis_name)),
          out_axes=out_axis,
          devices=(None if devices is None else list(devices)),
          backend=backend,
          donate_argnums=donate_argnums_)(np.arange(axis_size), outval)
      new_outvals.append(out)
  return new_outvals

def _map_schedule(idx: tuple[int | None, ...]) -> tuple[int | None, ...]:
  # In order to do a multi-map (a simultaneous map over several axes), we will
  # nest several maps. Each time we do a map, we "remove" an input axis so we
  # need to update the remaining map axes. For example, if we are to map over
  # the axes 0, 3, and 4, we make three calls to pmap with in_axes as 0, 2, 2.
  return tuple(None if i is None else
               i - sum(j is not None and j < i for j in idx[:l])
               for l, i in enumerate(idx))


# We're often creating `f`s on the fly and we try to carefully make them have
# the right __hash__ and __eq__. However, despite our attempts pmap's caching
# still ends up not working, because it has a separate cache per
# _function object_. Adding this annotation here lets us reuse the same pmap
# callable for all equivalent primitive pmaps.
@lru_cache
def _multi_pmap(f: Callable, info: EmapInfo, names: list[core.AxisName],
                all_axes: list[tuple[int | None, ...]]
                ) -> tuple[Callable, dict[core.AxisName, int]]:
  used_names = []
  for i, name in reversed(list(enumerate(names))):
    in_axes = tuple(arg_axis[i] for arg_axis in all_axes)
    if any(in_axis is not None for in_axis in in_axes):
      f = jax.pmap(
          f,
          in_axes=in_axes,
          axis_name=name,
          out_axes=0,
          backend=info.backend,
          devices=(None if info.devices is None else list(info.devices)))
      used_names.append(name)
  out_shard_axes = {name: i for i, name in enumerate(reversed(used_names))}
  return f, out_shard_axes

FakePrimitive = namedtuple("FakePrimitive", ["multiple_results", "bind"])

class MapTrace(core.Trace):

  def __init__(self, *args, emap_info):
    super().__init__(*args)
    self.emap_info = emap_info

  def pure(self, val):
    return MapTracer(self, val, {})

  def sublift(self, tracer):
    return MapTracer(self, tracer.val, tracer.shard_axes)

  def process_primitive(self, primitive, tracers, params):
    info = self.main.payload["emap_info"]
    vals, shard_axes = unzip2([(t.val, t.shard_axes) for t in tracers])
    names = tuple(f.name for f in core.thread_local_state.trace_state.axis_env
                  if f.main_trace is self.main)
    all_axes = tuple(_map_schedule(map(s.get, names)) for s in shard_axes)  # pytype: disable=wrong-arg-types  # always-use-return-annotations
    f = HashableFunction(lambda *args: primitive.bind(*args, **params),
                         (primitive, tuple(params.items())))
    f_mapped, out_shard_axes = _multi_pmap(f, info, names, all_axes)
    with core.eval_context(), jax.disable_jit(False):
      outvals = f_mapped(*vals)
    if primitive.multiple_results:
      return [MapTracer(self, val, out_shard_axes) for val in outvals]
    return MapTracer(self, outvals, out_shard_axes)

  def process_call(self, call_primitive, fun, tracers, params):
    raise NotImplementedError

  def process_map(self, map_primitive, fun, tracers, params):
    if params['devices'] is not None:
      raise ValueError("Nested pmap with explicit devices argument.")
    if not config.disable_jit.value:
      bind = HashableFunction(
          lambda *args, **kwargs: map_primitive.bind(fun, *args, **kwargs),
          (map_primitive, fun))
      fake_primitive = FakePrimitive(multiple_results=True, bind=bind)
      return self.process_primitive(fake_primitive, tracers, params)
    axis_name, in_axes, out_axes_thunk, axis_size = (params["axis_name"],
        params["in_axes"], params["out_axes_thunk"], params["axis_size"])
    vals, shard_axes = unzip2((t.val, t.shard_axes) for t in tracers)
    shard_axes = [{axis_name: _annot_to_flat(np.ndim(v), s.values(), ax), **s}
                  if ax is not None else s
                  for v, ax, s in zip(vals, in_axes, shard_axes)]
    # TODO(mattjj): use _emap_subtrace here?
    with core.new_sublevel(), core.extend_axis_env(axis_name, axis_size, self.main):
      t = self.main.with_cur_sublevel()
      in_tracers = map(partial(MapTracer, t), vals, shard_axes)
      ans = fun.call_wrapped(*in_tracers)
      out_tracers = map(t.full_raise, ans)
      out, outaxes = unzip2((t.val, t.shard_axes) for t in out_tracers)
      del t, in_tracers, ans, out_tracers
    out, outaxes = unzip2(_match_annot(axis_name, axis_size, v, s, dst)
                           for v, s, dst in zip(out, outaxes, out_axes_thunk()))
    return map(partial(MapTracer, self), out, outaxes)

  def process_custom_jvp_call(self, prim, fun, jvp, tracers, *, symbolic_zeros):
    if symbolic_zeros:
      msg = ("custom_jvp with symbolic_zeros=True not supported with eager pmap. "
             "Please open an issue at https://github.com/google/jax/issues !")
      raise NotImplementedError(msg)
    del prim, jvp, symbolic_zeros  # always base main, can drop jvp
    in_vals, in_axes = unzip2((t.val, t.shard_axes) for t in tracers)
    fun, out_axes = _emap_subtrace(fun, self.main, in_axes)
    with core.new_sublevel():
      out_vals = fun.call_wrapped(*in_vals)
    return map(partial(MapTracer, self), out_vals, out_axes())

  def process_custom_vjp_call(self, primitive, fun, fwd, bwd, tracers,
                              out_trees, symbolic_zeros):
    if symbolic_zeros:
      msg = ("custom_vjp with symbolic_zeros=True not supported with eager pmap. "
             "Please open an issue at https://github.com/google/jax/issues !")
      raise NotImplementedError(msg)
    del primitive, fwd, bwd, out_trees, symbolic_zeros  # always base main, drop vjp
    in_vals, in_axes = unzip2((t.val, t.shard_axes) for t in tracers)
    fun, out_axes = _emap_subtrace(fun, self.main, in_axes)
    with core.new_sublevel():
      out_vals = fun.call_wrapped(*in_vals)
    return map(partial(MapTracer, self), out_vals, out_axes())

  def process_axis_index(self, frame):
    bind = HashableFunction(
        lambda _: jax.lax.axis_index(frame.name),
        (jax.lax.axis_index, frame.name))
    fake_primitive = FakePrimitive(multiple_results=False, bind=bind)
    with core.eval_context():
      range = jax.lax.iota(np.int32, frame.size)
    dummy_tracer = MapTracer(self, range, {frame.name: 0})
    return self.process_primitive(fake_primitive, (dummy_tracer,), {})

@lu.transformation_with_aux
def _emap_subtrace(main, in_axes, *in_vals):
  t = main.with_cur_sublevel()
  in_tracers = map(partial(MapTracer, t), in_vals, in_axes)
  ans = yield in_tracers, {}
  out_tracers = map(t.full_raise, ans)
  out_vals, out_axes = unzip2((t.val, t.shard_axes) for t in out_tracers)
  del t, in_tracers, ans, out_tracers
  yield out_vals, out_axes

def _annot_to_flat(ndim: int, mapped_axes: Iterable[int],
                 annotation: int | None) -> int | None:
  if annotation is None: return None
  mapped_axes_ = set(mapped_axes)
  return [i for i in range(ndim) if i not in mapped_axes_][annotation]

def _match_annot(axis_name: core.AxisName, axis_size: int, val: Any,
                 shard_axis_src: dict[core.AxisName, int],
                 dst_annotation: int | None
                 ) -> tuple[Any, dict[core.AxisName, int]]:
  shard_axis_out = dict(shard_axis_src)
  src = shard_axis_out.pop(axis_name, None)
  dst = _annot_to_flat(np.ndim(val) + (src is None), shard_axis_out.values(),
                       dst_annotation)
  with core.eval_context():
    if src == dst:
      outval = val
    elif type(src) == type(dst) == int:
      outval = batching.moveaxis(val, src, dst)
      shard_axis_out = _moveaxis(np.ndim(val), shard_axis_src, src, dst)
    elif src is None and dst is not None:
      outval = batching.broadcast(val, axis_size, dst)
      shard_axis_out = {n: d + (dst <= d) for n, d in shard_axis_out.items()}
    else:
      raise NotImplementedError
  return outval, shard_axis_out

def _moveaxis(ndim: int, shard_axes: dict[core.AxisName, int],
              src: int, dst: int) -> dict[core.AxisName, int]:
  lst: list[core.AxisName | None] = [None] * ndim
  for k, v in shard_axes.items():
    lst[v] = k
  name = lst.pop(src)
  lst.insert(dst - (src < dst), name)
  return {name: i for i, name in enumerate(lst) if name is not None}

class MapTracer(core.Tracer):
  __slots__ = ["val", "shard_axes"]

  def __init__(self, trace: MapTrace, val, shard_axes: dict[core.AxisName, int]):
    self._trace = trace
    self.val = val
    self.shard_axes = shard_axes
    assert all(val < self.val.ndim for val in self.shard_axes.values())

  @property
  def aval(self):
    aval = xla.abstractify(self.val)
    shard_axes = dict(self.shard_axes)
    for axis_idx in sorted(shard_axes.values())[::-1]:
      aval = core.mapped_aval(aval.shape[axis_idx], axis_idx, aval)
    return aval

  def full_lower(self):
    return self

  def __str__(self):
    named_axes = [f"{k}={v}" for k, v in self.shard_axes.items()]
    return f"{self.val}{{{','.join(named_axes)}}}"

@lu.cache
def parallel_callable(fun: lu.WrappedFun,
                      backend_name: str | None,
                      axis_name: core.AxisName,
                      axis_size: int,
                      global_axis_size: int,
                      devices: Sequence[Any] | None,
                      name: str,
                      in_axes: Sequence[int | None],
                      out_axes_thunk: Callable[[], Sequence[int | None]],
                      donated_invars: Sequence[bool],
                      is_explicit_global_axis_size: bool,
                      *avals):
  pmap_computation = lower_parallel_callable(
      fun, backend_name, axis_name, axis_size, global_axis_size, devices, name,
      in_axes, out_axes_thunk, donated_invars,
      is_explicit_global_axis_size, avals,
      lowering_parameters=mlir.LoweringParameters())
  pmap_executable = pmap_computation.compile()
  return WeakRefList([pmap_executable.unsafe_call, pmap_executable.fingerprint])


@dataclasses.dataclass(frozen=True)
class ParallelCallableInfo:
  name: str
  backend: xc.Client
  axis_name: core.AxisName
  axis_size: int
  global_axis_size: int
  devices: Sequence[xc.Device] | None
  in_axes: Iterable[int | None]
  out_axes_thunk: Callable[[], Sequence[int | None]]
  avals: Sequence[core.AbstractValue]

  @cached_property
  def local_devices(self):
    if self.devices:
      out = [d for d in self.devices
             if d.process_index == xb.process_index(self.backend)]
      assert len(out) > 0
    else:
      out = None  # type: ignore
    return out

  @cached_property
  def out_axes(self):
    return self.out_axes_thunk()


class ShardInfo(NamedTuple):
  sharded_avals: Sequence[core.AbstractValue]
  out_sharded_avals: Sequence[core.ShapedArray]
  global_sharded_avals: Sequence[core.AbstractValue]
  num_local_shards: int
  num_global_shards: int


class ReplicaInfo(NamedTuple):
  jaxpr_replicas: int
  num_local_replicas: int
  num_global_replicas: int


def find_replicas(
    jaxpr: core.Jaxpr, axis_size: int, global_axis_size: int
) -> ReplicaInfo:
  # TODO(skyewm): replace this with a chain of pmaps and/or sharded_jits
  jaxpr_replicas = dispatch.jaxpr_replicas(jaxpr)
  num_local_replicas = axis_size * jaxpr_replicas
  num_global_replicas = global_axis_size * jaxpr_replicas
  return ReplicaInfo(jaxpr_replicas, num_local_replicas, num_global_replicas)

@lu.transformation
def _change_argument_ranks(in_axes, out_axes_thunk, *args):
  args = tuple(
      arg if in_axis is None else jax.lax.squeeze(arg, dimensions=(in_axis,))
      for in_axis, arg in zip(in_axes, args)
  )
  results = yield (args, {})
  out_axes = out_axes_thunk()
  yield tuple(
      x if axis is None else jax.lax.expand_dims(x, dimensions=(axis,))
      for x, axis in zip(results, out_axes)
  )


def stage_parallel_callable(
    pci: ParallelCallableInfo, fun: lu.WrappedFun
) -> tuple[core.Jaxpr, list[Any], ReplicaInfo, ShardInfo]:
  sharded_avals = tuple(
      _shard_aval(pci.axis_size, axis, aval) if axis is not None else aval
      for axis, aval in safe_zip(pci.in_axes, pci.avals))

  orig_fun = fun
  if config.pmap_no_rank_reduction.value:
    fun = _change_argument_ranks(fun, pci.in_axes, pci.out_axes_thunk)
  else:
    fun = orig_fun
  with core.extend_axis_env(pci.axis_name, pci.global_axis_size, None):  # type: ignore
    with dispatch.log_elapsed_time(
        "Finished tracing + transforming {fun_name} for pmap in {elapsed_time} sec",
        fun_name=fun.__name__, event=dispatch.JAXPR_TRACE_EVENT):
      jaxpr, out_sharded_avals, consts = pe.trace_to_jaxpr_final(
          fun, sharded_avals, pe.debug_info_final(fun, "pmap"))
  jaxpr = api_util.jaxpr_debug_info(jaxpr, orig_fun.debug_info)
  jaxpr = dispatch.apply_outfeed_rewriter(jaxpr)

  assert len(out_sharded_avals) == len(pci.out_axes), (
      len(out_sharded_avals), len(pci.out_axes))

  replicas = find_replicas(jaxpr, pci.axis_size, pci.global_axis_size)
  num_local_shards = replicas.num_local_replicas
  num_global_shards = replicas.num_global_replicas

  shards = ShardInfo(
      sharded_avals, out_sharded_avals, sharded_avals,
      num_local_shards, num_global_shards)

  return jaxpr, consts, replicas, shards


@profiler.annotate_function
def lower_parallel_callable(
    fun: lu.WrappedFun,
    backend_name: str | None,
    axis_name: core.AxisName,
    axis_size: int,
    global_axis_size: int,
    devices: Sequence[xc.Device] | None,
    name: str,
    in_axes: Iterable[int | None],
    out_axes_thunk: Callable[[], Sequence[int | None]],
    donated_invars: Sequence[bool],
    is_explicit_global_axis_size: bool,
    avals: Sequence[core.AbstractValue],
    *,
    lowering_parameters: mlir.LoweringParameters) -> PmapComputation:
  # Determine global_axis_size for use in AxisEnv.
  # TODO(mattjj,skyewm): revive this check (inner_pmap always False now)
  # if xb.process_count() > 1 and global_axis_size is None and inner_pmap:
  #   raise ValueError("'axis_size' must be specified for nested multi-host pmaps")
  if (xb.process_count() == 1 and is_explicit_global_axis_size
      and global_axis_size != axis_size):
    raise ValueError(
        f"Specified axis_size {global_axis_size} doesn't match received "
        f"axis_size {axis_size}.")

  if devices is not None and backend_name is None:
    backend = xb.get_device_backend(devices[0])
  else:
    backend = xb.get_backend(backend_name)

  no_nested_sharding = False
  must_run_on_all_devices = False
  if not is_explicit_global_axis_size:
    if xb.process_count(backend) > 1:
      if devices:
        # This allows each host in a multi-host pmap to run on a different number
        # of devices, but precludes nested sharding (i.e. inner pmaps).
        no_nested_sharding = True
      else:
        # This assumes all hosts run on the same number of devices. We make sure
        # this assumption is true by requiring that the pmap is run on all devices
        # (and making the further assumption that each host has the same number of
        # devices). Nested sharding is ok in this case.
        must_run_on_all_devices = True

  pci = ParallelCallableInfo(
      name, backend, axis_name, axis_size, global_axis_size, devices,
      in_axes, out_axes_thunk, avals)
  jaxpr, consts, replicas, shards = stage_parallel_callable(pci, fun)
  if logger.isEnabledFor(logging.DEBUG):
    logger.debug("sharded_avals: %s", shards.sharded_avals)
    logger.debug("global_sharded_avals: %s", shards.global_sharded_avals)
    logger.debug("num_replicas: %d  num_local_replicas: %d",
                 replicas.num_global_replicas, replicas.num_local_replicas)
    logger.debug("devices: %s", devices)
    logger.debug("local_devices: %s", pci.local_devices)

  if (xb.process_count(backend) > 1 and must_run_on_all_devices and
      shards.num_local_shards != xb.local_device_count(backend)):
    if shards.num_local_shards == axis_size:
      raise ValueError(
         f"On multi-host platforms, the input to pmapped functions must have "
         f"leading axis size equal to the number of local devices if no "
         f"`devices` argument is specified. Got {axis_size=}, "
         f"num_local_devices={xb.local_device_count(backend)}")
    else:
      raise ValueError(
        f"On multi-host platforms, pmapped functions must run across all "
        f"devices, i.e. num_replicas * num_partitions should equal the "
        f"number of local devices. Got "
        f"num_replicas={replicas.num_local_replicas}, and "
        f"num_local_devices={xb.local_device_count(backend)}")

  if no_nested_sharding and replicas.jaxpr_replicas > 1:
    raise ValueError(
      f"On multi-host platforms, pmapped functions that both have `devices` "
      f"specified and contain an inner_pmap must specify an "
      f"`axis_size` (or remove the `devices` argument). Got nested_replicas="
      f"{replicas.jaxpr_replicas}")

  log_priority = logging.WARNING if config.log_compiles.value else logging.DEBUG
  if logger.isEnabledFor(log_priority):
    logger.log(log_priority,
               "Compiling %s (%d) for %d devices with args %s. (num_replicas=%d)",
               fun.__name__, id(fun),
               shards.num_global_shards, avals, replicas.num_global_replicas)

  axis_env = sharding_impls.AxisEnv(
      replicas.num_global_replicas, (axis_name,), (global_axis_size,))
  name_stack = source_info_util.new_name_stack(wrap_name(name, 'pmap'))
  jaxpr = core.remove_named_axis_effects(jaxpr, {axis_name})
  closed_jaxpr = core.ClosedJaxpr(jaxpr, consts)
  replicated_args = [axis is None for axis in in_axes]
  tuple_args = dispatch.should_tuple_args(len(shards.global_sharded_avals),
                                          backend.platform)
  module_name = f"pmap_{fun.__name__}"
  with maybe_extend_axis_env(axis_name, global_axis_size, None):  # type: ignore
    ordered_effects = list(
        effects.ordered_effects.filter_in(closed_jaxpr.effects))
    if ordered_effects:
      raise ValueError("Ordered effects not supported in `pmap`.")
    unordered_effects = list(
        effects.ordered_effects.filter_not_in(closed_jaxpr.effects))
    with dispatch.log_elapsed_time(
        "Finished jaxpr to MLIR module conversion {fun_name} in {elapsed_time} sec",
        fun_name=str(name_stack), event=dispatch.JAXPR_TO_MLIR_MODULE_EVENT):
      lowering_result = mlir.lower_jaxpr_to_module(
          module_name,
          closed_jaxpr,
          ordered_effects=ordered_effects,
          backend_or_name=backend,
          platforms=lowering_parameters.platforms or (backend.platform,),
          axis_context=sharding_impls.ReplicaAxisContext(axis_env),
          name_stack=name_stack,
          donated_args=donated_invars,
          replicated_args=replicated_args,
          arg_shardings=None,
          result_shardings=None,
          arg_names=jaxpr.debug_info and jaxpr.debug_info.arg_names,
          result_names=jaxpr.debug_info and jaxpr.debug_info.result_paths,
          num_replicas=replicas.num_global_replicas,
          lowering_parameters=lowering_parameters)
  return PmapComputation(lowering_result.module, pci=pci, replicas=replicas,
                         shards=shards, tuple_args=tuple_args,
                         unordered_effects=unordered_effects,
                         ordered_effects=ordered_effects,
                         keepalive=lowering_result.keepalive,
                         host_callbacks=lowering_result.host_callbacks,
                         jaxpr_debug_info=closed_jaxpr.jaxpr.debug_info)


def _pmap_unmap_shaped_array(
    size: int, axis_name: core.AxisName, axis: int | None, aval: ShapedArray
  ) -> ShapedArray:
  named_shape = dict(aval.named_shape)
  named_shape.pop(axis_name, None)  # TODO: make this mandatory
  if axis is None: return aval.update(named_shape=named_shape)
  elif type(axis) is int:
    return ShapedArray(tuple_update(aval.shape, axis, size), aval.dtype,
                       named_shape=named_shape, weak_type=aval.weak_type)
  else: raise TypeError(axis)


AvalMapHandlerPair = tuple[Any, Callable]
_pmap_aval_mapping_handlers: dict[type, AvalMapHandlerPair] = {
    ShapedArray:   (Any, _pmap_unmap_shaped_array),
}

def _pmap_unmapped_aval(size: core.AxisSize, axis_name, axis: int | None,
                       aval: core.AbstractValue) -> core.AbstractValue:
  if not config.pmap_no_rank_reduction.value:
    return core.unmapped_aval(size, axis_name, axis, aval)

  _, handler = _pmap_aval_mapping_handlers.get(type(aval), (None, None))
  if handler is not None:
    return handler(size, axis_name, axis, aval)
  else:
    raise TypeError(f"no unmapping handler for {aval} of type {type(aval)}")


class PmapComputation(stages.XlaLowering):
  _hlo: ir.Module
  _executable: PmapExecutable | None

  def __init__(self, hlo: ir.Module, **compile_args):
    self._executable = None
    self._hlo = hlo
    self.compile_args = compile_args

  # -- stages.XlaLowering overrides

  def stablehlo(self) -> ir.Module:
    return self._hlo

  @profiler.annotate_function
  def compile(self, compiler_options=None) -> PmapExecutable:
    if self._executable is None or compiler_options is not None:
      executable = UnloadedPmapExecutable.from_hlo(
          self._hlo, **self.compile_args,
          compiler_options=compiler_options)
      if compiler_options is None:
        self._executable = executable
      return executable
    return self._executable

def _cast_to_shaped_array(aval: core.AbstractValue) -> ShapedArray:
  assert isinstance(aval, ShapedArray), aval
  return cast(ShapedArray, aval)

@dataclasses.dataclass
class UnloadedPmapExecutable:
  compiled: Any
  backend: xb.XlaBackend
  local_input_avals: Sequence[core.AbstractValue]
  input_shardings: Sequence[sharding_impls.XLACompatibleSharding]
  local_output_avals: Sequence[ShapedArray]
  output_shardings: Sequence[sharding_impls.XLACompatibleSharding]
  unordered_effects: list[core.Effect]
  ordered_effects: list[core.Effect]
  keepalive: Sequence[Any]
  host_callbacks: Sequence[Any]
  jaxpr_debug_info: core.JaxprDebugInfo

  def build_execute_fun(self):
    input_indices = []
    for aval, spec in safe_zip(self.local_input_avals, self.input_shardings):
      assert isinstance(spec, sharding_impls.PmapSharding), spec
      assert isinstance(aval, core.ShapedArray), aval
      input_indices.append(
          sharding_specs.spec_to_indices(aval.shape, spec.sharding_spec)
          if spec.sharding_spec is not None else None)
    handle_outs = local_avals_to_results_handler(self.local_output_avals,
                                                 self.output_shardings)
    handle_args = InputsHandler(self.input_shardings,
                                self.compiled.local_devices(), input_indices)
    execute_fun = ExecuteReplicated(self.compiled, "parallel computation",
                                    self.backend, handle_args, handle_outs,
                                    self.unordered_effects,
                                    self.ordered_effects, self.keepalive,
                                    bool(self.host_callbacks),
                                    set(range(len(input_indices))), None)
    return execute_fun

  def load(self) -> PmapExecutable:
    fingerprint = getattr(self.compiled, "fingerprint", None)

    return PmapExecutable(
        self.compiled, self.build_execute_fun, fingerprint,
        self.local_input_avals, self.jaxpr_debug_info, self)

  @staticmethod
  def from_hlo(hlo: ir.Module,
               pci: ParallelCallableInfo,
               replicas: ReplicaInfo,
               shards: ShardInfo,
               tuple_args: bool,
               unordered_effects: list[core.Effect],
               ordered_effects: list[core.Effect],
               host_callbacks: list[Any],
               keepalive: Any,
               jaxpr_debug_info: core.JaxprDebugInfo,
               compiler_options=None):
    devices = pci.devices
    if devices is None:
      if shards.num_global_shards > xb.device_count(pci.backend):
        msg = ("compiling computation that requires {} logical devices, but only {} XLA "
               "devices are available (num_replicas={})")
        raise ValueError(msg.format(shards.num_global_shards,
                                    xb.device_count(pci.backend),
                                    replicas.num_global_replicas))
      # On a single host, we simply grab the first N devices from jax.devices().
      # In the single host case, we want the default device order of pmap to
      # match jax.devices().
      # On multiple hosts, we create a default device assignment that ensures
      # each host is responsible for a contiguous set of replicas.
      if shards.num_global_shards > shards.num_local_shards:
        # TODO(skye): use a locality-aware assignment that satisfies the above
        # constraint.
        devices = [d for process_index in range(xb.process_count(pci.backend))
                  for d in xb.local_devices(process_index, pci.backend)]
      else:
        devices = xb.local_devices(backend=pci.backend)[:shards.num_local_shards]
    else:
      if shards.num_local_shards != len(pci.local_devices):
        local_devices_str = ", ".join(map(str, pci.local_devices))
        if shards.num_local_shards == pci.axis_size:
          raise ValueError(
              f"Leading axis size of input to pmapped function must equal the "
              f"number of local devices passed to pmap. Got axis_size="
              f"{pci.axis_size}, num_local_devices={len(pci.local_devices)}.\n"
              f"(Local devices available to pmap: {local_devices_str})")
        else:
          raise ValueError(
              f"pmapped function requires {shards.num_local_shards} local "
              f"devices to run due to nested pmapped or other parallel "
              f"functions, but only {len(pci.local_devices)} are available.\n"
              f"(outer axis size: {pci.axis_size}, local devices available to "
              f"pmap: {local_devices_str})")
      if shards.num_global_shards != len(devices):
        raise ValueError("compiling computation that creates %s shards, "
                        "but %s devices were specified" %
                        (shards.num_global_shards, len(devices)))

    # 'devices' may be 1D or 2D at this point (e.g.
    # get_default_device_assignment() returns 2D assignment, caller may have
    # provided 1D list of devices).
    # Convert to 2D in case it's 1D and we have > 1 partitions.
    num_partitions = 1
    device_assignment: np.ndarray = np.array(devices).reshape(
        (replicas.num_global_replicas, num_partitions))
    compile_options = compiler.get_compile_options(
        num_replicas=replicas.num_global_replicas,
        num_partitions=num_partitions,
        device_assignment=device_assignment,
        use_spmd_partitioning=False,
        env_options_overrides=compiler_options,
        detailed_logging=compiler.use_detailed_logging(hlo),
        backend=pci.backend,
    )
    compile_options.parameter_is_tupled_arguments = tuple_args

    process_index = xb.process_index(pci.backend)
    local_device_assignment = np.array([
        d for d in device_assignment.flat if d.process_index == process_index
    ])

    input_sharding_specs = [
        sharding_specs.pmap_sharding_spec(
            replicas.num_local_replicas, pci.axis_size,
            cast(ShapedArray, aval).shape, in_axis)
        for aval, in_axis in safe_zip(shards.sharded_avals, pci.in_axes)]
    in_shardings = _get_pmap_sharding(local_device_assignment,
                                      input_sharding_specs)

    local_unmapped_avals = [
        _cast_to_shaped_array(
            _pmap_unmapped_aval(pci.axis_size, pci.axis_name, out_axis, aval))
        if out_axis is not None else aval
        for aval, out_axis in safe_zip(shards.out_sharded_avals, pci.out_axes)]
    out_specs = [
        sharding_specs.pmap_sharding_spec(
            replicas.num_local_replicas, pci.axis_size, aval.shape, out_axis)
        for aval, out_axis in safe_zip(
            shards.out_sharded_avals, pci.out_axes)]
    out_shardings = _get_pmap_sharding(local_device_assignment, out_specs)

    if hasattr(pci.backend, "compile_replicated"):
      input_indices = [
          sharding_specs.spec_to_indices(aval.shape, spec)
          if spec is not None else None
          for aval, spec in safe_zip(pci.avals, input_sharding_specs)
      ]
      handle_outs = local_avals_to_results_handler(local_unmapped_avals,
                                                   out_shardings)
      return _compile_replicated_pmap_executable_from_hlo(
          hlo, pci, input_indices, in_shardings, handle_outs,
          compile_options, host_callbacks, bool(unordered_effects),
          ordered_effects, jaxpr_debug_info)

    with dispatch.log_elapsed_time(
        "Finished XLA compilation of {fun_name} in {elapsed_time} sec",
        fun_name=pci.name, event=dispatch.BACKEND_COMPILE_EVENT):
      compiled = compiler.compile_or_get_cached(
          pci.backend, hlo, device_assignment, compile_options,
          host_callbacks)

    return UnloadedPmapExecutable(
        compiled=compiled,
        backend=pci.backend,
        local_input_avals=pci.avals,
        input_shardings=in_shardings,
        local_output_avals=local_unmapped_avals,
        output_shardings=out_shardings,
        unordered_effects=unordered_effects,
        ordered_effects=ordered_effects,
        keepalive=keepalive,
        host_callbacks=host_callbacks,
        jaxpr_debug_info=jaxpr_debug_info).load()


def _compile_replicated_pmap_executable_from_hlo(
    hlo: ir.Module, pci, input_indices, in_shardings, handle_outs,
    compile_options, host_callbacks, has_unordered_effects, ordered_effects,
    jaxpr_debug_info):
  # Use the standard out_handler.
  execute_fun = pci.backend.compile_replicated(
      is_trivial=False, name=pci.name, computation=hlo,
      compile_options=compile_options, host_callbacks=host_callbacks,
      has_unordered_effects=has_unordered_effects,
      ordered_effects=ordered_effects, in_avals=pci.avals,
      in_indices=input_indices, in_shardings=in_shardings,
      kept_var_idx=set(range(len(pci.avals))), out_handler=handle_outs)
  # TODO(frostig): need `compile_replicated` to give us the XLA executable
  return PmapExecutable(None, lambda: execute_fun, None, pci.avals,
                        jaxpr_debug_info, None)


class PmapExecutable(stages.XlaExecutable):
  __slots__ = ["xla_executable", "_unsafe_call", "build_unsafe_call",
               "fingerprint", "in_avals", "_jaxpr_debug_info",
               "_unloaded_executable"]

  def __init__(self, xla_executable, build_unsafe_call, fingerprint,
               in_avals, jaxpr_debug_info, unloaded_executable):
    self.xla_executable = xla_executable
    self._unsafe_call = None
    self.build_unsafe_call = build_unsafe_call
    self.fingerprint = fingerprint
    self.in_avals = in_avals
    self._jaxpr_debug_info = jaxpr_debug_info
    self._unloaded_executable = unloaded_executable

  @property
  def unsafe_call(self) -> Callable[..., Any]:
    if self._unsafe_call is None:
      self._unsafe_call = self.build_unsafe_call()
    return self._unsafe_call

  # -- stages.XlaExecutable overrides

  def xla_extension_executable(self):
    return self.xla_executable

  @profiler.annotate_function
  def call(self, *args):
    # TODO(frostig): do we need to check sharding and sharded avals?
    arg_avals = map(xla.abstractify, args)
    check_arg_avals_for_call(self.in_avals, arg_avals, self._jaxpr_debug_info)
    return self.unsafe_call(*args)  # pylint: disable=not-callable


def _get_pmap_sharding(devices, specs):
  return [sharding_impls.PmapSharding(devices, spec) for spec in specs]


class InputsHandler:
  __slots__ = ("handler", "local_devices", "in_shardings", "input_indices")

  def __init__(self, in_shardings, local_devices=None, input_indices=None):
    self.handler = partial(shard_args, in_shardings)
    self.local_devices = local_devices
    self.in_shardings = in_shardings
    self.input_indices = input_indices

  def __call__(self, input_buffers):
    return self.handler(input_buffers)

  def __str__(self):
    return ("InputsHandler(\n"
            f"local_devices={self.local_devices},\n"
            f"in_shardings={self.in_shardings},\n"
            f"input_indices={self.input_indices})")


class ResultsHandler:
  # `out_avals` is the `Array` global avals when using pjit or xmap. It is the
  # local one when using `pmap`.
  __slots__ = ("handlers", "out_shardings", "out_avals")

  def __init__(self, handlers, out_shardings, out_avals):
    self.handlers = handlers
    self.out_shardings = out_shardings
    self.out_avals = out_avals

  def __call__(self, out_bufs):
    return [h(bufs) for h, bufs in safe_zip(self.handlers, out_bufs)]


def local_avals_to_results_handler(
    unmapped_local_out_avals: Sequence[ShapedArray],
    local_shardings: Sequence[sharding_impls.XLACompatibleSharding]) -> ResultsHandler:
  out_indices = [tuple(s.devices_indices_map(aval.shape).values())
                 for s, aval in safe_zip(local_shardings, unmapped_local_out_avals)]
  handlers = [
      local_aval_to_result_handler(aval, s, idcs)
      for aval, s, idcs in safe_zip(unmapped_local_out_avals, local_shardings, out_indices)
  ]
  return ResultsHandler(handlers, local_shardings, unmapped_local_out_avals)


def global_avals_to_results_handler(
    global_out_avals: Sequence[ShapedArray],
    shardings: Sequence[sharding_impls.XLACompatibleSharding],
    committed: bool) -> ResultsHandler:
  handlers = [
      global_aval_to_result_handler(global_aval, s, committed)
      for global_aval, s in safe_zip(global_out_avals, shardings)
  ]
  return ResultsHandler(handlers, shardings, global_out_avals)


class ExecuteReplicated:
  """The logic to shard inputs, execute a replicated model, returning outputs."""
  __slots__ = ['xla_executable', 'name', 'backend', 'in_handler', 'out_handler',
               'has_unordered_effects', 'ordered_effects', 'keepalive',
               'has_host_callbacks', '_local_devices', 'kept_var_idx',
               'mut', '__weakref__']

  def __init__(self, xla_executable, name, backend, in_handler: InputsHandler,
               out_handler: ResultsHandler,
               unordered_effects: list[core.Effect],
               ordered_effects: list[core.Effect], keepalive: Any,
               has_host_callbacks: bool, kept_var_idx: set[int],
               mut: MutationData | None):
    self.xla_executable = xla_executable
    self.name = name
    self.backend = backend
    self.in_handler = in_handler
    self.out_handler = out_handler
    self.has_unordered_effects = bool(unordered_effects)
    self.ordered_effects = ordered_effects
    self._local_devices = self.xla_executable.local_devices()
    self.keepalive = keepalive
    self.has_host_callbacks = has_host_callbacks
    self.kept_var_idx = kept_var_idx
    self.mut = mut

  def _add_tokens_to_inputs(self, input_bufs):
    if self.ordered_effects:
      tokens = [
        dispatch.runtime_tokens.get_token_input(eff, self._local_devices)
        for eff in self.ordered_effects]
      input_bufs = [*tokens, *input_bufs]
    return input_bufs

  def _handle_token_bufs(self, token_bufs, sharded_token):
    # token_bufs: Sequence[Sequence[tokenArray]], for each effect the returned
    # token buffer (as a singleton list).
    # sharded_token: ShardedToken, containing the RuntimeTokens for each device
    for i, device in enumerate(self._local_devices):
      dispatch.runtime_tokens.set_output_runtime_token(
          device, sharded_token.get_token(i))
    for eff, token_buf in zip(self.ordered_effects, token_bufs):
      dispatch.runtime_tokens.set_token_result(eff, token_buf[0])

  @profiler.annotate_function
  def __call__(self, *args):
    args = [x for i, x in enumerate(args) if i in self.kept_var_idx]
    if self.mut:
      args = [*args, *self.mut.in_mut]
    input_bufs = self.in_handler(args)
    if (self.ordered_effects or self.has_unordered_effects
        or self.has_host_callbacks):
      input_bufs = self._add_tokens_to_inputs(input_bufs)
      results = self.xla_executable.execute_sharded(
          input_bufs, with_tokens=True
      )
      result_token_bufs = results.disassemble_prefix_into_single_device_arrays(
          len(self.ordered_effects))
      sharded_runtime_token = results.consume_token()
      self._handle_token_bufs(result_token_bufs, sharded_runtime_token)
    else:
      results = self.xla_executable.execute_sharded(input_bufs)
    if dispatch.needs_check_special():
      out_arrays = results.disassemble_into_single_device_arrays()
      for arrays in out_arrays:
        dispatch.check_special(self.name, arrays)
      out = self.out_handler(out_arrays)
    else:
      out = results.consume_with_handlers(self.out_handler.handlers)
    if self.mut is None:
      return out
    else:
      out_ = []
      for i, o in zip(self.mut.out_mut, out):
        if i is not None:
          args[i]._buf = o
        else:
          out_.append(o)
      return out_


xla_pmap_p = core.MapPrimitive('xla_pmap')
xla_pmap = xla_pmap_p.bind
xla_pmap_p.def_impl(xla_pmap_impl)

def _pmap_partial_eval_custom_params_updater(
    unks_in, inst_in, kept_outs_known, kept_outs_staged, num_res, params_known,
    params_staged):
  # prune inputs to jaxpr_known according to unks_in
  donated_invars_known, _ = partition_list(unks_in, params_known['donated_invars'])
  in_axes_known, _ = partition_list(unks_in, params_known['in_axes'])
  _, out_axes_known = partition_list(kept_outs_known, params_known['out_axes'])
  out_axes_known = out_axes_known + [0] * num_res
  new_params_known = dict(params_known, in_axes=tuple(in_axes_known),
                          out_axes=tuple(out_axes_known),
                          donated_invars=tuple(donated_invars_known))

  # added num_res new inputs to jaxpr_staged, pruning according to inst_in
  _, donated_invars_staged = partition_list(inst_in, params_staged['donated_invars'])
  donated_invars_staged = [False] * num_res + donated_invars_staged
  _, in_axes_staged = partition_list(inst_in, params_staged['in_axes'])
  in_axes_staged = [0] * num_res + in_axes_staged
  _, out_axes_staged = partition_list(kept_outs_staged, params_staged['out_axes'])
  new_params_staged = dict(params_staged, in_axes=tuple(in_axes_staged),
                           out_axes=tuple(out_axes_staged),
                           donated_invars=tuple(donated_invars_staged))
  return new_params_known, new_params_staged

def _pmap_partial_eval_custom_res_maker(params_known, aval):
  return core.unmapped_aval(params_known['axis_size'], core.no_axis_name, 0, aval)

def _pmap_dce_rule(used_outputs, eqn):
  # just like pe.dce_jaxpr_call_rule, except handles in_axes / out_axes
  axis_name = eqn.params["axis_name"]
  with maybe_extend_axis_env(axis_name, eqn.params["global_axis_size"], None):
    new_jaxpr, used_inputs = pe.dce_jaxpr(eqn.params['call_jaxpr'], used_outputs)
  _, donated_invars = partition_list(used_inputs, eqn.params['donated_invars'])
  _, in_axes = partition_list(used_inputs, eqn.params['in_axes'])
  _, out_axes = partition_list(used_outputs, eqn.params['out_axes'])
  new_params = dict(eqn.params, call_jaxpr=new_jaxpr,
                    donated_invars=tuple(donated_invars),
                    in_axes=tuple(in_axes), out_axes=tuple(out_axes))
  if not any(used_inputs) and not any(used_outputs) and not new_jaxpr.effects:
    return used_inputs, None
  else:
    effs = core.filter_named_axis_effects(new_jaxpr.effects, {axis_name})
    new_eqn = pe.new_jaxpr_eqn(
        [v for v, used in zip(eqn.invars, used_inputs) if used],
        [v for v, used in zip(eqn.outvars, used_outputs) if used],
        eqn.primitive, new_params, effs, eqn.source_info)
    return used_inputs, new_eqn


def _xla_call_partial_eval_update_params(
    params: core.ParamDict, kept_inputs: Sequence[bool], num_new_inputs: int
  ) -> core.ParamDict:
  donated_invars = params['donated_invars']
  if not kept_inputs and donated_invars:
    # JaxprTrace.post_process_call creates a call with no input tracers
    donated_invars = (False,) * num_new_inputs
  else:
    assert len(kept_inputs) == len(donated_invars)
    # JaxprTrace.process_call drops known input tracers
    donated_invars = [d for d, kept in zip(donated_invars, kept_inputs) if kept]
    # Any new inputs are prepended to the left, so mark those as not donated.
    donated_invars = [False] * num_new_inputs + donated_invars
  return dict(params, donated_invars=tuple(donated_invars))

def xla_call_jvp_update_params(params, nz_tangents):
  donated_invars = params['donated_invars']
  donated_tangents = [d for d, nz in zip(donated_invars, nz_tangents) if nz]
  new_donated_invars = (*donated_invars, *donated_tangents)
  return dict(params, donated_invars=new_donated_invars)

def _xla_call_transpose_update_params(params, undef_primals, nonzero_cts):
  donated_invars = params['donated_invars']
  donated_primals = [d for d, u in zip(donated_invars, undef_primals) if not u]
  donated_cotangents = [False for nz in nonzero_cts if nz]
  return dict(params, donated_invars=(*donated_primals, *donated_cotangents))


# Set param update handlers to update `donated_invars` just like xla_call_p
pe.call_param_updaters[xla_pmap_p] = _xla_call_partial_eval_update_params
pe.partial_eval_jaxpr_custom_rules[xla_pmap_p] = \
    partial(pe.call_partial_eval_custom_rule,
            'call_jaxpr', _pmap_partial_eval_custom_params_updater,
            res_aval=_pmap_partial_eval_custom_res_maker)
pe.dce_rules[xla_pmap_p] = _pmap_dce_rule
ad.call_param_updaters[xla_pmap_p] = xla_call_jvp_update_params
ad.call_transpose_param_updaters[xla_pmap_p] = _xla_call_transpose_update_params

ad.primitive_transposes[xla_pmap_p] = partial(ad.map_transpose, xla_pmap_p)

def _pmap_axis_subst(params, subst, traverse):
  if 'call_jaxpr' not in params:
    return params
  if not traverse:
    return params
  def shadowed_subst(name):
    return (name,) if name in params['axis_name'] else subst(name)
  with maybe_extend_axis_env(params['axis_name'],
                             params['global_axis_size'], None):
    new_jaxpr = core.subst_axis_names_jaxpr(params['call_jaxpr'],
                                            shadowed_subst)
  return dict(params, call_jaxpr=new_jaxpr)
core.axis_substitution_rules[xla_pmap_p] = _pmap_axis_subst


def _unravel_index_hlo(axis_env):
  div = mlir.ir_constant(
      np.array(axis_env.nreps // math.prod(axis_env.sizes), np.uint32))
  mod = mlir.ir_constant(np.array(axis_env.sizes[-1], np.uint32))
  return hlo.remainder(hlo.divide(hlo.replica_id(), div), mod)

def _hlo_shard(aval, axis_env, xs, in_axis):
  if aval is core.abstract_token:
    return xs
  elif isinstance(aval, core.ShapedArray):
    x, = xs
    dims = list(aval.shape)
    zero = mlir.ir_constant(np.zeros((), dtype=np.uint32))
    idxs = [zero] * len(dims)
    idxs.insert(in_axis, _unravel_index_hlo(axis_env))
    dims_unsqueezed = dims.copy()
    dims_unsqueezed.insert(in_axis, 1)
    dynamic_slice_result = hlo.dynamic_slice(
        x, idxs, mlir.dense_int_array(dims_unsqueezed))
    return [
      hlo.reshape(mlir.aval_to_ir_type(aval), dynamic_slice_result)
    ]
  else:
    raise TypeError(aval)


def _axis_read(axis_env, axis_name):
  try:
    return max(i for i, name in enumerate(axis_env.names) if name == axis_name)
  except ValueError:
    raise NameError(f"unbound axis name: {axis_name}") from None

def axis_groups(axis_env: sharding_impls.AxisEnv, name) -> tuple[tuple[int, ...]]:
  if not isinstance(name, (list, tuple)):
    name = (name,)
  mesh_axes = tuple(unsafe_map(partial(_axis_read, axis_env), name))
  trailing_size, ragged = divmod(axis_env.nreps, math.prod(axis_env.sizes))
  assert not ragged
  mesh_spec = axis_env.sizes + (trailing_size,)
  return _axis_groups(mesh_spec, mesh_axes)

def _axis_groups(mesh_spec, mesh_axes):
  """Computes replica group ids for a collective performed over a subset of the mesh.

  Args:
    mesh_spec: A sequence of integers representing the mesh shape.
    mesh_axes: A sequence of integers between 0 and `len(mesh_spec)` (exclusive)
      indicating over which axes the collective is performed.
  Returns:
    A tuple of replica groups (i.e. tuples containing replica ids).
  """
  iota = np.arange(math.prod(mesh_spec)).reshape(mesh_spec)
  groups = np.reshape(
      np.moveaxis(iota, mesh_axes, np.arange(len(mesh_axes))),
      (math.prod(np.take(mesh_spec, mesh_axes)), -1))
  return tuple(unsafe_map(tuple, groups.T))


# TODO(b/110096942): more efficient gather
def _hlo_unshard(ctx: mlir.LoweringRuleContext, aval, axis_env, out_axis, xs):
  if aval is core.abstract_token:
    return xs
  elif isinstance(aval, core.ShapedArray):
    x, = xs
    dims = list(aval.shape)
    padded_aval = aval.update(shape=[axis_env.sizes[-1]] + dims)
    padded = mlir.full_like_aval(ctx, 0, padded_aval)
    zero = mlir.ir_constant(np.zeros((), dtype=np.uint32))
    idxs = [_unravel_index_hlo(axis_env)] + [zero] * len(dims)
    broadcast_result = hlo.broadcast(x, mlir.dense_int_array([1]))
    padded = hlo.dynamic_update_slice(padded, broadcast_result, idxs)
    replica_groups = mlir.dense_int_elements(
      axis_groups(axis_env, axis_env.names[-1]))
    out = hlo.cross_replica_sum(padded, replica_groups)
    if out_axis != 0:
      # TODO(apaszke,mattjj): Change the indices to DynamicUpdateSlice instead
      perm = list(range(1, len(dims)))
      perm.insert(out_axis, 0)
      transposed_dims = list(dims)
      transposed_dims.insert(out_axis, axis_env.sizes[-1])
      out = hlo.transpose(out, mlir.dense_int_array(perm))

    return out
  else:
    raise TypeError(aval)

def _extend_axis_env(env: sharding_impls.AxisEnv, name, size: int):
  return sharding_impls.AxisEnv(env.nreps, env.names + (name,),
                                env.sizes + (size,))


def _pmap_lowering(ctx, *in_nodes, axis_name,
                   axis_size, global_axis_size, devices, name,
                   call_jaxpr, backend=None, in_axes, out_axes,
                   donated_invars, is_explicit_global_axis_size):
  del donated_invars  # Unused.
  mlir.check_backend_matches(backend, ctx.module_context.platforms)
  # We in-line here rather than generating a Call HLO as in the xla_call
  # translation rule just because the extra tuple stuff is a pain.
  if ctx.module_context.axis_env.names and devices is not None:
    raise ValueError("Nested pmap with explicit devices argument.")
  new_env = _extend_axis_env(ctx.module_context.axis_env, axis_name,
                             global_axis_size)
  # Shard the in_nodes that are mapped
  in_avals = [v.aval for v in call_jaxpr.invars]
  in_nodes_sharded = (
    _hlo_shard(aval, new_env, mlir.wrap_singleton_ir_values(in_node), in_axis)
    if in_axis is not None else mlir.wrap_singleton_ir_values(in_node)
    for aval, in_node, in_axis in zip(in_avals, in_nodes, in_axes))

  with maybe_extend_axis_env(axis_name, global_axis_size, None):  # type: ignore
    sub_ctx = ctx.module_context.replace(
        axis_context=sharding_impls.ReplicaAxisContext(new_env))
    sharded_outs, _ = mlir.jaxpr_subcomp(
        sub_ctx, call_jaxpr,
        ctx.name_stack.extend(util.wrap_name(name, 'pmap')),
        mlir.TokenSet(), (), *in_nodes_sharded,
        dim_var_values=ctx.dim_var_values)
  out_avals = [v.aval for v in call_jaxpr.outvars]
  outs = [_hlo_unshard(ctx, aval, new_env, out_axis, shard)
          for aval, out_axis, shard in zip(out_avals, out_axes, sharded_outs)]
  return outs

mlir.register_lowering(xla_pmap_p, _pmap_lowering)


# ------------------- xmap -------------------

def tile_aval_nd(axis_sizes, in_axes: ArrayMapping, aval):
  assert isinstance(aval, ShapedArray)
  shape = list(aval.shape)
  named_shape = dict(aval.named_shape)
  for name, axis in in_axes.items():
    assert shape[axis] % axis_sizes[name] == 0
    assert name not in named_shape
    named_shape[name] = axis_sizes[name]
    shape[axis] //= axis_sizes[name]
  return aval.update(shape=tuple(shape), named_shape=named_shape)

def untile_aval_nd(axis_sizes, out_axes: ArrayMapping, aval):
  assert isinstance(aval, ShapedArray)
  shape = list(aval.shape)
  named_shape = dict(aval.named_shape)
  for name, axis in out_axes.items():
    shape[axis] *= axis_sizes[name]
    named_shape.pop(name, None)  # The name might be missing --- it's a broadcast.
  return aval.update(shape=tuple(shape), named_shape=named_shape)


def mesh_local_to_global(mesh, axes: ArrayMapping, aval):
  return untile_aval_nd(mesh.shape, axes,
                        tile_aval_nd(mesh.local_mesh.shape, axes, aval))

def mesh_global_to_local(mesh, axes: ArrayMapping, aval):
  return untile_aval_nd(mesh.local_mesh.shape, axes,
                        tile_aval_nd(mesh.shape, axes, aval))


class SPMDBatchTrace(batching.BatchTrace):
  def get_axis_primitive_batcher(self, primitive, frame):
    if primitive in spmd_primitive_batchers:
      return partial(spmd_primitive_batchers[primitive],
          frame.size, frame.name, frame.main_trace.trace_type)
    return super().get_axis_primitive_batcher(primitive, frame)


spmd_primitive_batchers: dict[core.Primitive, Callable] = {}


def vtile_by_mesh(fun: lu.WrappedFun,
                  mesh: Mesh,
                  in_axes: Sequence[ArrayMapping],
                  out_axes: Sequence[ArrayMapping]):
  # We vectorize in reversed order, because vmap is often biased towards
  # moving the batch axis to the front, and this way of stacking transforms
  # will order the batch axes according to the mesh axis order.
  # Not strictly necessary, but seems nicer than reversing it?
  for name, size in reversed(mesh.shape.items()):
    fun = batching.vtile(fun,
                         tuple(a.get(name, None) for a in in_axes),
                         tuple(a.get(name, None) for a in out_axes),
                         tile_size=size,
                         axis_name=name,
                         main_type=SPMDBatchTrace)
  return fun

full_to_shard_p = core.Primitive('full_to_shard')

@full_to_shard_p.def_abstract_eval
def _full_to_shard_abstract_eval(x, axes, mesh, **_):
  # TODO: Assert x is a global aval! Or ideally check that it's global in dims from axes!
  return tile_aval_nd(mesh.shape, axes, x)

def manual_proto(
    aval: core.ShapedArray,
    manual_axes_set: frozenset[sharding_impls.MeshAxisName], mesh: Mesh):
  """Create an OpSharding proto that declares all mesh axes from `axes` as manual
  and all others as replicated.
  """
  named_mesh_shape = mesh.shape
  mesh_shape = list(named_mesh_shape.values())
  axis_order = {axis: i for i, axis in enumerate(mesh.axis_names)}

  manual_axes = sorted(manual_axes_set, key=str)
  replicated_axes = [axis for axis in mesh.axis_names
                     if axis not in manual_axes_set]

  tad_perm = ([axis_order[a] for a in replicated_axes] +
              [axis_order[a] for a in manual_axes])
  tad_shape = [1] * aval.ndim
  tad_shape.append(math.prod([named_mesh_shape[a] for a in replicated_axes]))
  tad_shape.append(math.prod([named_mesh_shape[a] for a in manual_axes]))

  raw_mesh = np.arange(math.prod(mesh_shape)).reshape(mesh_shape)
  proto = xc.OpSharding()
  proto.type = xc.OpSharding.Type.OTHER
  proto.tile_assignment_dimensions = tad_shape
  proto.tile_assignment_devices = list(raw_mesh.transpose(tad_perm).reshape(tad_shape).flat)
  proto.last_tile_dims = [xc.OpSharding.Type.REPLICATED, xc.OpSharding.Type.MANUAL]
  return proto

@partial(mlir.register_lowering, full_to_shard_p)
def _full_to_shard_lowering(ctx, x, *, axes: ArrayMapping, mesh: Mesh,
                            manual_axes: frozenset[sharding_impls.MeshAxisName]):
  # TODO: Can we short-circuit for replicated values? Probably not.
  aval_in, = ctx.avals_in
  aval_out, = ctx.avals_out
  sharding_proto = (
      sharding_impls.NamedSharding(mesh, array_mapping_to_axis_resources(axes))
      ._to_xla_hlo_sharding(aval_in.ndim).to_proto())
  unspecified_dims = set(range(aval_in.ndim)) - set(axes.values())
  sx = mlir.wrap_with_sharding_op(ctx, x, aval_in, sharding_proto,
                                  unspecified_dims=unspecified_dims)
  proto = manual_proto(aval_in, manual_axes, mesh)
  return (mlir.wrap_with_full_to_shard_op(ctx, sx, aval_out, proto,
                                          unspecified_dims=unspecified_dims),)

shard_to_full_p = core.Primitive('shard_to_full')

@shard_to_full_p.def_abstract_eval
def _shard_to_full_abstract_eval(x, axes, mesh, **_):
  # TODO: Assert x is a global aval! Or ideally check that it's global in dims from axes!
  return untile_aval_nd(mesh.shape, axes, x)

@partial(mlir.register_lowering, shard_to_full_p)
def _shard_to_full_lowering(ctx: mlir.LoweringRuleContext, x, *, axes: ArrayMapping, mesh: Mesh,
                            manual_axes: frozenset[sharding_impls.MeshAxisName]):
  aval_in, = ctx.avals_in
  aval_out, = ctx.avals_out
  proto = manual_proto(aval_in, manual_axes, mesh)  # type: ignore
  unspecified_dims = set(range(aval_in.ndim)) - set(axes.values())  # type: ignore
  sx = mlir.wrap_with_sharding_op(ctx, x, aval_in, proto,
                                  unspecified_dims=unspecified_dims)
  sharding_proto = (
      sharding_impls.NamedSharding(mesh, array_mapping_to_axis_resources(axes))
      ._to_xla_hlo_sharding(aval_out.ndim).to_proto())
  return (mlir.wrap_with_shard_to_full_op(ctx, sx, aval_out, sharding_proto,
                                          unspecified_dims),)

@lu.transformation
def vtile_manual(manual_axes: frozenset[sharding_impls.MeshAxisName],
                 mesh: Mesh,
                 in_axes: Sequence[ArrayMapping],
                 out_axes: Sequence[ArrayMapping],
                 *args):
  tiled_args = [full_to_shard_p.bind(arg, axes=axes, mesh=mesh, manual_axes=manual_axes)
                for arg, axes in zip(args, in_axes)]
  tiled_outs = yield tiled_args, {}
  outs = [shard_to_full_p.bind(out, axes=axes, mesh=mesh, manual_axes=manual_axes)
          for out, axes in zip(tiled_outs, out_axes)]
  yield outs


@dataclasses.dataclass(frozen=True)
class TileVectorize:
  pass

@dataclasses.dataclass(frozen=True)
class TileManual:
  manual_axes: frozenset[sharding_impls.MeshAxisName]

TilingMethod = Union[TileVectorize, TileManual]


def check_if_any_auto(
    shardings: Iterable[(sharding_impls.XLACompatibleSharding |
                              AUTO | UnspecifiedValue)]) -> bool:
  for s in shardings:
    if is_auto(s):
      return True
  return False

class MismatchType(enum.Enum):
  ARG_SHARDING = 0
  OUT_SHARDING = 1
  SHARDING_INSIDE_COMPUTATION = 2
  CONTEXT_DEVICES = 3
  IN_SHARDING = 4

  def __str__(self):
    if self.name == 'IN_SHARDING':
      return 'explicit input sharding'
    elif self.name == 'OUT_SHARDING':
      return 'explicit output sharding'
    elif self.name == 'CONTEXT_DEVICES':
      return 'devices'
    return f'{self.name}'


@dataclasses.dataclass
class DeviceAssignmentMismatch:
  da: Sequence[xc.Device]
  m_type: MismatchType
  source_info: dispatch.SourceInfo | None

  @property
  def device_ids(self) -> Sequence[int]:
    return [d.id for d in self.da]

  @property
  def platform(self) -> str:
    return self.da[0].platform.upper()

  def _maybe_api_name(self, api_name) -> str:
    return f" {api_name}'s" if self.m_type == MismatchType.CONTEXT_DEVICES else ""

  @property
  def source_info_str(self):
    return (
        "" if self.source_info is None
        else f" at {source_info_util.summarize(self.source_info.source_info)}"
    )

  @property
  def _dev_ids_plat_str(self):
    return f"device ids {self.device_ids} on platform {self.platform}"

  def m_type_str(self, api_name):
    return (f'{self.source_info and self.source_info.eqn_name} inside {api_name}'
            if self.m_type == MismatchType.SHARDING_INSIDE_COMPUTATION else self.m_type)

  def _str(self, api_name):
    return (f"{self._maybe_api_name(api_name)} {self.m_type_str(api_name)} with "
            f"{self._dev_ids_plat_str}{self.source_info_str}")


class DeviceAssignmentMismatchError(Exception):
  pass


ShardingInfo = tuple[
    Union[sharding_impls.XLACompatibleSharding, UnspecifiedValue, AUTO],
    MismatchType,
    Union[Any, None],  # Any is dispatch.SourceInfo to avoid circular imports
]


def _get_default_device() -> xc.Device:
  return config.default_device.value or xb.local_devices()[0]


class _thread_local_decorator(threading.local):

  def __init__(self, fn):
    self.fn = fn

  def __call__(self, *args, **kwargs):
    return self.fn(*args, **kwargs)


@_thread_local_decorator
def _get_and_check_device_assignment(
    shardings: Iterable[ShardingInfo],
    devices: Sequence[xc.Device] | None,
) -> tuple[xc.Client, tuple[xc.Device, ...]]:
  first_sharding_info = None
  if devices is None:
    devices = ()
  else:
    devices = tuple(devices)

  for i, s_type, source_info in shardings:
    if is_unspecified(i):
      continue

    if first_sharding_info is None:
      first_sharding_info = (
          (i.mesh._flat_devices_tuple, s_type, source_info) if is_auto(i)  # type: ignore
          else (i._device_assignment, s_type, source_info))  # type: ignore
    arr_device_assignment = i.mesh._flat_devices_tuple if is_auto(i) else i._device_assignment  # type: ignore
    if not devices:
      if first_sharding_info[0] != arr_device_assignment:
        raise DeviceAssignmentMismatchError([
            DeviceAssignmentMismatch(*first_sharding_info),
            DeviceAssignmentMismatch(arr_device_assignment, s_type, source_info)])
    else:
      if devices != arr_device_assignment:
        raise DeviceAssignmentMismatchError([
            DeviceAssignmentMismatch(devices, MismatchType.CONTEXT_DEVICES, None),
            DeviceAssignmentMismatch(arr_device_assignment, s_type, source_info)])
  if first_sharding_info is None and devices:
    final_device_assignment = devices
  elif first_sharding_info is None:
    final_device_assignment = (_get_default_device(),)
  else:
    final_device_assignment = first_sharding_info[0]
  return xb.get_device_backend(final_device_assignment[0]), final_device_assignment

MaybeSharding = Union[sharding_impls.XLACompatibleSharding, UnspecifiedValue]


def prune_unused_inputs(
    jaxpr: core.Jaxpr,
) -> tuple[core.Jaxpr, set[int], set[int]]:
  used_outputs = [True] * len(jaxpr.outvars)
  new_jaxpr, used_consts, used_inputs = pe.dce_jaxpr_consts(jaxpr, used_outputs)
  kept_const_idx = {i for i, b in enumerate(used_consts) if b}
  kept_var_idx = {i for i, b in enumerate(used_inputs) if b}
  return new_jaxpr, kept_const_idx, kept_var_idx


@weakref_lru_cache
def _dce_jaxpr(closed_jaxpr, global_in_avals, api_name, fun_name,
               keep_unused, donated_invars, auto_spmd_lowering):
  name_stack = source_info_util.new_name_stack(wrap_name(fun_name, api_name))

  assert isinstance(closed_jaxpr, core.ClosedJaxpr)
  jaxpr = closed_jaxpr.jaxpr
  global_out_avals = closed_jaxpr.out_avals
  consts = closed_jaxpr.consts

  if (keep_unused or auto_spmd_lowering or
      any(hasattr(a, "shape") and not core.is_constant_shape(a.shape)
          for a in global_in_avals)):
    kept_var_idx = set(range(len(global_in_avals)))
  else:
    jaxpr, kept_const_idx, kept_var_idx = prune_unused_inputs(jaxpr)
    consts = [c for i, c in enumerate(consts) if i in kept_const_idx]
    global_in_avals = tuple(a for i, a in enumerate(global_in_avals) if i in kept_var_idx)
    donated_invars = tuple(x for i, x in enumerate(donated_invars) if i in kept_var_idx)
    del kept_const_idx

  jaxpr = dispatch.apply_outfeed_rewriter(jaxpr)
  closed_jaxpr = core.ClosedJaxpr(jaxpr, consts)
  return (closed_jaxpr, global_in_avals, tuple(global_out_avals), donated_invars,
          kept_var_idx, name_stack)

class MutationData(NamedTuple):
  in_mut: list[core.MutableArray]
  out_mut: list[int | None]

@weakref_lru_cache
def _discharge_refs(
    jaxpr: core.ClosedJaxpr
) -> tuple[core.ClosedJaxpr, Sequence[int | None], MutationData]:
  from jax._src.state.discharge import discharge_state
  jaxpr, in_mut = _move_mutable_consts(jaxpr)
  new_jaxpr = core.ClosedJaxpr(*discharge_state(jaxpr.jaxpr, jaxpr.consts))
  count = it.count(len(jaxpr.out_avals))  # new outputs are appended to the end
  inout_map = {i: next(count) for i, a in enumerate(jaxpr.in_avals)
               if isinstance(a, AbstractRef)}
  outin_map = {j: i for i, j in inout_map.items()}
  inout_aliases = tuple(map(inout_map.get, range(len(new_jaxpr.in_avals))))
  out_mut = list(map(outin_map.get, range(len(new_jaxpr.out_avals))))
  return new_jaxpr, inout_aliases, MutationData(in_mut, out_mut)

@weakref_lru_cache
def _move_mutable_consts(
    closed_jaxpr: core.ClosedJaxpr,
) -> tuple[core.ClosedJaxpr, list[core.MutableArray]]:
  jaxpr = closed_jaxpr.jaxpr
  hoist = [isinstance(c, core.MutableArray) for c in closed_jaxpr.consts]
  consts, in_mut = partition_list(hoist, closed_jaxpr.consts)
  constvars, mutvars = partition_list(hoist, jaxpr.constvars)
  invars = (*jaxpr.invars, *mutvars)
  effects = pe.make_jaxpr_effects(constvars, invars, jaxpr.outvars, jaxpr.eqns)
  jaxpr = core.Jaxpr(constvars, invars, jaxpr.outvars, jaxpr.eqns,
                     effects, None)
  return core.ClosedJaxpr(jaxpr, consts), in_mut


@dataclasses.dataclass(frozen=True)
class SemanticallyEqualShardings:
  shardings: tuple[sharding_impls.GSPMDSharding | UnspecifiedValue, ...]

  def __hash__(self):
    return hash(tuple(
        (s._hlo_sharding_hash, s.memory_kind)  # type: ignore
        if isinstance(s, sharding_impls.GSPMDSharding) else s
        for s in self.shardings))

  def __eq__(self, other):
    if not isinstance(other, SemanticallyEqualShardings):
      return False
    return all(
        (op_shardings.are_op_shardings_equal(s._hlo_sharding, o._hlo_sharding)
         and s.memory_kind == o.memory_kind)
        if (isinstance(s, sharding_impls.GSPMDSharding) and
            isinstance(o, sharding_impls.GSPMDSharding))
        else s == o
        for s, o in zip(self.shardings, other.shardings)
    )


def _raise_warnings_or_errors_for_jit_of_pmap(
    nreps: int, backend: xc.Client, name: str, jaxpr: core.Jaxpr) -> None:
  if nreps > 1:
    warnings.warn(
        f"The jitted function {name} includes a pmap. Using "
         "jit-of-pmap can lead to inefficient data movement, as the outer jit "
         "does not preserve sharded data representations and instead collects "
         "input and output arrays onto a single device. "
         "Consider removing the outer jit unless you know what you're doing. "
         "See https://github.com/google/jax/issues/2926.")

  if nreps > xb.device_count(backend):
    raise ValueError(
        f"compiling computation `{name}` that requires {nreps} replicas, but "
        f"only {xb.device_count(backend)} XLA devices are available.")

  if xb.process_count() > 1 and (
      nreps > 1 or dispatch.jaxpr_has_primitive(jaxpr, "xla_pmap")
  ):
    raise NotImplementedError(
        "jit of multi-host pmap not implemented (and jit-of-pmap can cause "
        "extra data movement anyway, so maybe you don't want it after all).")


@weakref_lru_cache
def _cached_lowering_to_hlo(closed_jaxpr, api_name, fun_name, backend,
                            semantic_in_shardings, semantic_out_shardings,
                            in_layouts, out_layouts, num_devices, device_assignment,
                            donated_invars, name_stack, all_default_mem_kind,
                            inout_aliases: None | tuple[None | int, ...],
                            lowering_parameters: mlir.LoweringParameters):
  jaxpr = closed_jaxpr.jaxpr
  in_shardings = semantic_in_shardings.shardings
  out_shardings = semantic_out_shardings.shardings
  global_in_avals = closed_jaxpr.in_avals
  global_out_avals = closed_jaxpr.out_avals

  log_priority = logging.WARNING if config.log_compiles.value else logging.DEBUG
  if logger.isEnabledFor(log_priority):
    logger.log(log_priority,
               "Compiling %s for with global shapes and types %s. "
               "Argument mapping: %s.",
               fun_name, global_in_avals, in_shardings)

  # Look at the number of replcas present in the jaxpr. In
  # lower_sharding_computation, nreps > 1 during `jit(pmap)` cases. This is
  # handled here so as to deprecate the lower_xla_callable codepath when
  # `jax.Array` is turned on by default.
  # TODO(yashkatariya): Remove this when `jit(pmap)` is removed.
  nreps = dispatch.jaxpr_replicas(jaxpr)
  _raise_warnings_or_errors_for_jit_of_pmap(nreps, backend, fun_name, jaxpr)

  in_mlir_shardings: list[sharding_impls.XLACompatibleSharding | None] | None
  out_mlir_shardings: list[sharding_impls.XLACompatibleSharding | None] | None
  axis_ctx: mlir.AxisContext

  if nreps == 1:
    in_mlir_shardings = map(_to_logical_sharding, global_in_avals, in_shardings)
    out_mlir_shardings = map(_to_logical_sharding, global_out_avals, out_shardings)
    replicated_args = [False] * len(global_in_avals)
    axis_ctx = sharding_impls.ShardingContext(num_devices, device_assignment)
    num_partitions = num_devices
  else:
    # This path is triggered for `jit(pmap)` cases.
    replicated_args = None
    in_mlir_shardings = None
    out_mlir_shardings = None
    axis_env = sharding_impls.AxisEnv(nreps, (), ())
    axis_ctx = sharding_impls.ReplicaAxisContext(axis_env)
    num_partitions = 1

  module_name = f"{api_name}_{fun_name}"

  if num_devices > 1:
    unsupported_effects = effects.ordered_effects.filter_in(closed_jaxpr.effects)
    unsupported_effects = effects.shardable_ordered_effects.filter_not_in(
        unsupported_effects)
    if len(unsupported_effects) > 0:
      raise ValueError(
        "The following ordered effects are not supported for "
        f"more than 1 device: {unsupported_effects}")
  ordered_effects = list(effects.ordered_effects.filter_in(closed_jaxpr.effects))

  with dispatch.log_elapsed_time(
        "Finished jaxpr to MLIR module conversion {fun_name} in {elapsed_time} sec",
        fun_name=str(name_stack), event=dispatch.JAXPR_TO_MLIR_MODULE_EVENT):
    lowering_result = mlir.lower_jaxpr_to_module(
        module_name,
        closed_jaxpr,
        ordered_effects=ordered_effects,
        backend_or_name=backend,
        # Optionally, override the lowering platform
        platforms=lowering_parameters.platforms or (backend.platform,),
        axis_context=axis_ctx,
        name_stack=name_stack,
        donated_args=donated_invars,
        replicated_args=replicated_args,
        arg_shardings=in_mlir_shardings,
        result_shardings=out_mlir_shardings,
        in_layouts=in_layouts,
        out_layouts=out_layouts,
        arg_names=jaxpr.debug_info and jaxpr.debug_info.arg_names,
        result_names=jaxpr.debug_info and jaxpr.debug_info.result_paths,
        num_replicas=nreps,
        num_partitions=num_partitions,
        all_default_mem_kind=all_default_mem_kind,
        input_output_aliases=inout_aliases,
        lowering_parameters=lowering_parameters)
  tuple_args = dispatch.should_tuple_args(len(global_in_avals), backend.platform)
  unordered_effects = list(
      effects.ordered_effects.filter_not_in(closed_jaxpr.effects))
  return (lowering_result.module, lowering_result.keepalive,
          lowering_result.host_callbacks, unordered_effects, ordered_effects,
          nreps, tuple_args, lowering_result.shape_poly_state)


@lru_cache(maxsize=2048)
def _create_da_object(  # pytype: disable=invalid-annotation
    device_assignment: tuple[xc.Device, ...]) -> xc.DeviceList:  # type: ignore
  return xc.DeviceList(device_assignment)


def jaxpr_transfer_mem_kinds(
    jaxpr: core.Jaxpr) -> Iterator[sharding_impls.TransferToMemoryKind]:
  for eqn in jaxpr.eqns:
    if (eqn.primitive is dispatch.device_put_p and
        isinstance(eqn.params['device'], sharding_impls.TransferToMemoryKind)):
      yield eqn.params['device']
  for subjaxpr in core.subjaxprs(jaxpr):
    yield from jaxpr_transfer_mem_kinds(subjaxpr)


def are_all_shardings_default_mem_kind(da_object, shardings):
  try:
    default_mem_kind = da_object.default_memory_kind
  except:
    return True
  for i in shardings:
    if is_unspecified_or_auto(i):
      continue
    if i.memory_kind != default_mem_kind:
      return False
  return True

MaybeLayout = Sequence[Union[XLACompatibleLayout, LayoutRequest, None]]


class AllArgsInfo(NamedTuple):
  """Avals, shardings, layouts and debug_info for all arguments prior to DCE."""
  in_avals: Sequence[core.ShapedArray]
  in_shardings: Any
  debug_info: core.JaxprDebugInfo | None


@profiler.annotate_function
def lower_sharding_computation(
    closed_jaxpr: core.ClosedJaxpr,
    api_name: str,
    fun_name: str,
    in_shardings: Sequence[MaybeSharding],
    out_shardings: Sequence[MaybeSharding],
    donated_invars: Sequence[bool],
    global_in_avals: Sequence[core.ShapedArray],
    *,
    keep_unused: bool,
    inline: bool,
    devices_from_context: Sequence[xc.Device] | None = None,
    lowering_parameters: mlir.LoweringParameters,
    in_layouts: MaybeLayout,
    out_layouts: MaybeLayout,
) -> MeshComputation:
  """Lowers a computation to XLA. It can take arbitrary shardings as input.

  The caller of this code can pass in a singleton UNSPECIFIED because the
  number of out_avals might not be known at that time and
  lower_sharding_computation calculates the number of out_avals so it can apply
  the singleton UNSPECIFIED to all out_avals.
  """
  # 1. Trace to jaxpr and preprocess/verify it
  auto_spmd_lowering = check_if_any_auto(
      it.chain.from_iterable([in_shardings, out_shardings]))  # type: ignore

  all_args_info = AllArgsInfo(global_in_avals, in_shardings,
                              closed_jaxpr.jaxpr.debug_info)

  (closed_jaxpr, global_in_avals, global_out_avals, donated_invars,
   kept_var_idx, name_stack) = _dce_jaxpr(
      closed_jaxpr, global_in_avals, api_name, fun_name, keep_unused,
      donated_invars, auto_spmd_lowering)
  in_shardings = tuple(s for i, s in enumerate(in_shardings) if i in kept_var_idx)
  in_layouts = tuple(l for i, l in enumerate(in_layouts) if i in kept_var_idx)

  if any(isinstance(e, RefEffect) for e in closed_jaxpr.effects):
    closed_jaxpr, inout_aliases, mut = _discharge_refs(closed_jaxpr)
    in_shardings = (*in_shardings,) + (UNSPECIFIED,) * len(mut.in_mut)
    in_layouts = (*in_layouts,) + (None,) * len(mut.in_mut)
    donated_invars = (*donated_invars,) + (False,) * len(mut.in_mut)
    out_layouts_ = iter(zip(out_shardings, out_layouts))
    out_shardings, out_layouts = unzip2(
        next(out_layouts_) if i is None else (in_shardings[i], in_layouts[i])
        for i in mut.out_mut)
    assert next(out_layouts_, None) is None
    # TODO(yashkatariya): remove global_in_avals / global_out_avals
    global_in_avals = closed_jaxpr.in_avals
    global_out_avals = closed_jaxpr.out_avals
  else:
    inout_aliases = mut = None

  jaxpr = closed_jaxpr.jaxpr
  assert len(out_shardings) == len(out_layouts) == len(global_out_avals), (
      len(out_shardings), len(out_layouts), len(global_out_avals))

  # Device assignment across all inputs, outputs and shardings inside jaxpr
  # should be the same.
  jaxpr_sharding = list(dispatch.jaxpr_shardings(jaxpr))
  backend, device_assignment = _get_and_check_device_assignment(
      it.chain(
          ((i, MismatchType.ARG_SHARDING, None) for i in util.stable_unique(in_shardings)),
          ((o, MismatchType.OUT_SHARDING, None) for o in util.stable_unique(out_shardings)),
          ((js, MismatchType.SHARDING_INSIDE_COMPUTATION, source_info)
           for js, source_info in util.stable_unique(jaxpr_sharding))),
      devices_from_context)

  # TODO(yashkatariya): Enable this when offload APIs are stable.
  # transfer_mem_kind_in_jaxpr = list(jaxpr_transfer_mem_kinds(jaxpr))

  committed = bool(
      devices_from_context or
      len(device_assignment) > 1 or
      any(not is_unspecified(i) for i in in_shardings) or
      any(not is_unspecified(js) for js, _ in jaxpr_sharding) or
      any(not is_unspecified(o) for o in out_shardings))

  gs = GSPMDSharding.get_replicated(device_assignment)
  if xla_extension_version < 241 or hasattr(backend, "compile_replicated"):
    in_shardings = tuple(gs if is_unspecified(i) else i for i in in_shardings)

  da_object = _create_da_object(tuple(device_assignment))

  all_default_mem_kind = are_all_shardings_default_mem_kind(
      da_object,
      it.chain(in_shardings, out_shardings, [js for js, _ in jaxpr_sharding]))  # type: ignore

  if not da_object.is_fully_addressable:  # type: ignore
    if inline and config.spmd_mode.value != 'allow_all':
      raise RuntimeError(
          "Running operations on `Array`s that are not fully addressable by this "
          "process (i.e. `Array`s with data sharded across multiple devices and "
          "processes.) is dangerous. It’s very important that all processes run "
          "the same cross-process computations in the same order otherwise it "
          "can lead to hangs. "
          "If you’re not already familiar with JAX’s multi-process "
          "programming model, please read "
          "https://jax.readthedocs.io/en/latest/multi_process.html. "
          "To fix this error, run your `jitted` computation inside "
          "`with jax.spmd_mode('allow_all'):` context manager.")

  # 2. Build up the HLO
  semantic_in_shardings = SemanticallyEqualShardings(in_shardings)  # type: ignore
  semantic_out_shardings = SemanticallyEqualShardings(out_shardings)  # type: ignore
  prim_requires_devices = dispatch.jaxpr_has_prim_requiring_devices(jaxpr)

  (module, keepalive, host_callbacks, unordered_effects, ordered_effects,
   nreps, tuple_args, shape_poly_state) = _cached_lowering_to_hlo(
       closed_jaxpr, api_name, fun_name, backend, semantic_in_shardings,
       semantic_out_shardings, in_layouts, out_layouts, len(da_object),
       tuple(da_object) if prim_requires_devices else None, donated_invars,
       name_stack, all_default_mem_kind, inout_aliases,
       lowering_parameters=lowering_parameters)

  # backend and device_assignment is passed through to MeshExecutable because
  # if keep_unused=False and all in_shardings are pruned, then there is no way
  # to get the device_assignment and backend. So pass it to MeshExecutable
  # because we calculate the device_assignment and backend before in_shardings,
  # etc are pruned.
  return MeshComputation(
      str(name_stack),
      module,
      donated_invars,
      global_in_avals=global_in_avals,
      global_out_avals=global_out_avals,
      in_shardings=in_shardings,
      out_shardings=out_shardings,
      spmd_lowering=True,
      tuple_args=tuple_args,
      auto_spmd_lowering=auto_spmd_lowering,
      unordered_effects=unordered_effects,
      ordered_effects=ordered_effects,
      host_callbacks=host_callbacks,
      keepalive=keepalive,
      kept_var_idx=kept_var_idx,
      mut=mut,
      backend=backend,
      device_assignment=da_object,
      committed=committed,
      in_layouts=in_layouts,
      out_layouts=out_layouts,
      pmap_nreps=nreps,
      shape_poly_state=shape_poly_state,
      all_default_mem_kind=all_default_mem_kind,
      all_args_info=all_args_info)


def _to_logical_sharding(
    aval: core.AbstractValue, sharding: MaybeSharding | AUTO
) -> sharding_impls.XLACompatibleSharding | None:
  if is_unspecified(sharding) or is_auto(sharding):
    return None
  elif isinstance(aval, (ShapedArray, DShapedArray, AbstractRef)):
    assert isinstance(sharding, sharding_impls.XLACompatibleSharding)
    return sharding
  elif isinstance(aval, core.AbstractToken):
    return None
  else:
    raise TypeError(aval)


@profiler.annotate_function
def lower_mesh_computation(
    fun_or_jaxpr: lu.WrappedFun | core.ClosedJaxpr,
    api_name: str,
    fun_name: str,
    mesh: Mesh,
    in_shardings: Sequence[sharding_impls.NamedSharding | AUTO],
    out_shardings: Sequence[(sharding_impls.NamedSharding | AUTO |
                                  UnspecifiedValue)],
    donated_invars: Sequence[bool],
    spmd_lowering: bool,
    global_in_avals: Sequence[core.ShapedArray],
    tiling_method: TilingMethod | None,
    lowering_parameters: mlir.LoweringParameters) -> MeshComputation:
  assert not mesh.empty
  backend = xb.get_device_backend(mesh.devices.flat[0])
  name_stack = source_info_util.new_name_stack(wrap_name(fun_name, api_name))

  global_axis_sizes = mesh.shape

  log_priority = logging.WARNING if config.log_compiles.value else logging.DEBUG
  if logger.isEnabledFor(log_priority):
    logger.log(log_priority,
               "Compiling %s for %s mesh with global shapes and types %s. "
               "Argument mapping: %s.",
               fun_name, tuple(global_axis_sizes.items()), global_in_avals,
               in_shardings)

  # 1. Trace to jaxpr and preprocess/verify it
  if spmd_lowering:
    manual_axes: frozenset[MeshAxisName] = frozenset()
    # TODO: Consider handling xmap's 'vectorize' in here. We can vmap once instead of vtile twice!
    if tiling_method is not None:
      if isinstance(tiling_method, TileVectorize):
        tiling_transform = vtile_by_mesh
      elif isinstance(tiling_method, TileManual):
        tiling_transform = lambda f, *args: vtile_manual(f, tiling_method.manual_axes, *args)  # type: ignore
        manual_axes = tiling_method.manual_axes
      else:
        raise NotImplementedError(f"Unrecognized tiling method: {tiling_method}")
      assert not callable(out_shardings)
      assert isinstance(fun_or_jaxpr, lu.WrappedFun)
      # This is the xmap path where there is no `AUTO` or `UNSPECIFIED`, which
      # is why `.spec` can be accessed.
      fun_or_jaxpr = tiling_transform(
          fun_or_jaxpr, mesh, [get_array_mapping(i.spec) for i in in_shardings],  # type: ignore
          [get_array_mapping(o.spec) for o in out_shardings])  # type: ignore
    in_jaxpr_avals = global_in_avals
  else:
    assert isinstance(tiling_method, TileVectorize)
    # In non-spmd lowering path, there is no `AUTO` or `UNSPECIFIED`, which is
    # why `.spec` can be accessed.
    in_tiled_avals = [tile_aval_nd(global_axis_sizes, get_array_mapping(i.spec), aval)  # type: ignore
                      for aval, i in safe_zip(global_in_avals, in_shardings)]
    in_jaxpr_avals = in_tiled_avals

  with core.extend_axis_env_nd(mesh.shape.items()):
    if isinstance(fun_or_jaxpr, lu.WrappedFun):
      with dispatch.log_elapsed_time(
          "Finished tracing + transforming {fun_name} in {elapsed_time} sec",
          fun_name=str(name_stack), event=dispatch.JAXPR_TRACE_EVENT):
        jaxpr, out_jaxpr_avals, consts = pe.trace_to_jaxpr_final(
            fun_or_jaxpr, in_jaxpr_avals)
    else:
      assert isinstance(fun_or_jaxpr, core.ClosedJaxpr)
      jaxpr = fun_or_jaxpr.jaxpr
      out_jaxpr_avals = fun_or_jaxpr.out_avals
      consts = fun_or_jaxpr.consts

  all_args_info = AllArgsInfo(global_in_avals, in_shardings, jaxpr.debug_info)

  assert len(out_shardings) == len(out_jaxpr_avals)
  if spmd_lowering:
    global_out_avals = out_jaxpr_avals
  else:
    # In non-spmd lowering path, there is no `AUTO` or `UNSPECIFIED`, which is
    # why `.spec` can be accessed.
    global_out_avals = [untile_aval_nd(global_axis_sizes, get_array_mapping(o.spec), aval)  # type: ignore
                        for aval, o in safe_zip(out_jaxpr_avals, out_shardings)]

  _sanitize_mesh_jaxpr(jaxpr)
  jaxpr = dispatch.apply_outfeed_rewriter(jaxpr)

  # 2. Build up the HLO
  tuple_args = dispatch.should_tuple_args(len(in_jaxpr_avals), backend.platform)

  in_partitions: list[sharding_impls.XLACompatibleSharding | None] | None
  out_partitions: list[sharding_impls.XLACompatibleSharding | None] | None
  axis_ctx: mlir.AxisContext
  if spmd_lowering:
    in_partitions = map(_to_logical_sharding, global_in_avals, in_shardings)
    out_partitions = map(_to_logical_sharding, global_out_avals, out_shardings)
    replicated_args = [False] * len(in_jaxpr_avals)
    axis_ctx = sharding_impls.SPMDAxisContext(mesh, manual_axes)
    num_replicas = 1
    num_partitions = mesh.devices.size
  else:
    replicated_args = [not get_array_mapping(i.spec) for i in in_shardings]  # type: ignore
    in_partitions = None
    out_partitions = None
    axis_env = sharding_impls.AxisEnv(
        nreps=mesh.size,
        names=tuple(global_axis_sizes.keys()),
        sizes=tuple(global_axis_sizes.values()))
    axis_ctx = sharding_impls.ReplicaAxisContext(axis_env)
    num_replicas = mesh.devices.size
    num_partitions = 1
  jaxpr = core.remove_named_axis_effects(jaxpr, mesh.axis_names)
  closed_jaxpr = core.ClosedJaxpr(jaxpr, consts)
  module_name = f"{api_name}_{fun_name}"
  with core.extend_axis_env_nd(mesh.shape.items()):
    if any(effects.ordered_effects.contains(eff) for eff
           in closed_jaxpr.effects):
      raise ValueError("Ordered effects not supported in mesh computations.")
    unordered_effects = list(effects.ordered_effects.filter_not_in(
      closed_jaxpr.effects))
    ordered_effects = list(effects.ordered_effects.filter_in(
      closed_jaxpr.effects))
    with dispatch.log_elapsed_time(
        "Finished jaxpr to MLIR module conversion {fun_name} in {elapsed_time} sec",
        fun_name=str(name_stack), event=dispatch.JAXPR_TO_MLIR_MODULE_EVENT):
      lowering_result = mlir.lower_jaxpr_to_module(
          module_name,
          closed_jaxpr,
          ordered_effects=ordered_effects,
          backend_or_name=backend,
          platforms=lowering_parameters.platforms or (backend.platform,),
          axis_context=axis_ctx,
          name_stack=name_stack,
          donated_args=donated_invars,
          replicated_args=replicated_args,
          arg_shardings=in_partitions,
          result_shardings=out_partitions,
          arg_names=jaxpr.debug_info and jaxpr.debug_info.arg_names,
          result_names=jaxpr.debug_info and jaxpr.debug_info.result_paths,
          num_replicas=num_replicas,
          num_partitions=num_partitions,
          lowering_parameters=lowering_parameters)

  return MeshComputation(
      str(name_stack),
      lowering_result.module,
      donated_invars,
      global_in_avals=global_in_avals,
      global_out_avals=global_out_avals,
      in_shardings=in_shardings,
      out_shardings=out_shardings,
      spmd_lowering=spmd_lowering,
      tuple_args=tuple_args,
      auto_spmd_lowering=False,
      unordered_effects=unordered_effects,
      ordered_effects=ordered_effects,
      host_callbacks=lowering_result.host_callbacks,
      keepalive=lowering_result.keepalive,
      kept_var_idx=set(range(len(global_in_avals))),
      backend=backend,
      device_assignment=_create_da_object(tuple(mesh.devices.flat)),
      committed=True,
      in_layouts=(None,) * len(global_in_avals),
      out_layouts=(None,) * len(global_out_avals),
      shape_poly_state=lowering_result.shape_poly_state,
      all_args_info=all_args_info)

class MeshComputation(stages.XlaLowering):
  _hlo: ir.Module | None
  _executable: MeshExecutable | None

  def __init__(self, name: str, hlo: ir.Module | None,
               donated_invars: Sequence[bool], **compile_args):
    self._name = name
    self._hlo = hlo
    self._donated_invars = donated_invars
    self.compile_args = compile_args
    self._executable = None

  # -- stages.XlaLowering overrides

  def stablehlo(self) -> ir.Module:
    return self._hlo

  def compile(self, compiler_options=None) -> MeshExecutable:
    if self._executable is None or compiler_options is not None:
      executable = UnloadedMeshExecutable.from_hlo(
          self._name, self._hlo, **self.compile_args,
          compiler_options=compiler_options)
      if compiler_options is None:
        self._executable = executable
      return executable
    return self._executable

  def cost_analysis(self) -> dict[str, float]:
    backend = self.compile_args["backend"]
    if xb.using_pjrt_c_api(backend):
      raise NotImplementedError(
          "Lowered.cost_analysis not implemented on platform "
          f"'{backend.platform}'. Use compile().cost_analysis() for "  # type: ignore
          "post-compilation cost estimates.")
    return xe.hlo_module_cost_analysis(backend, self.hlo().as_hlo_module())


if xla_extension_version < 229:
  def _get_input_indices(
      avals: Sequence[ShapedArray],
      shardings: Sequence[sharding_impls.XLACompatibleSharding],
      da_object: xc.DeviceList | Sequence[xc.Device],  # type: ignore
  ) -> Sequence[tuple[Index | None, ...]]:

    input_indices = []
    if not isinstance(da_object, xc.DeviceList):
      da_object = _create_da_object(tuple(da_object))
    num_addressable_devices = len(da_object.addressable_device_list)

    def _get_replicated_slices(num_addressable_devices: int, ndim: int | None):
      if ndim is None:
        return ((slice(None),),) * num_addressable_devices
      else:
        return ((slice(None),) * ndim,) * num_addressable_devices

    for aval, sharding in zip(avals, shardings):
      if aval is core.abstract_token:
        index = _get_replicated_slices(num_addressable_devices, None)
      else:
        if sharding.is_fully_replicated:
          index = _get_replicated_slices(num_addressable_devices, aval.ndim)
        else:
          index = tuple(
              sharding.addressable_devices_indices_map(aval.shape).values())  # type: ignore
      input_indices.append(index)

    return input_indices


def get_out_shardings_from_executable(
    xla_executable,
    device_assignment: Sequence[xc.Device],
    num_out_avals: int,
    num_ordered_effects: int,
    all_default_mem_kind: bool,
) -> Sequence[sharding_impls.GSPMDSharding] | None:
  from jax._src import pjit

  if config.enable_memories.value:
    if all_default_mem_kind:
      omk = [None] * num_out_avals
    else:
      try:
        omk = xla_executable.get_output_memory_kinds()[0]
        if num_ordered_effects > 0:
          omk = omk[num_ordered_effects:]
      except:
        omk = [None] * num_out_avals
  else:
    omk = [None] * num_out_avals

  assert len(omk) == num_out_avals, (len(omk), num_out_avals)

  # When the device assignment only has 1 device, SPMD partitioner will not run.
  # Hence the op shardings will not be set on the `hlo_module`.
  if len(device_assignment) == 1:
    return [sharding_impls.GSPMDSharding.get_replicated(device_assignment, memory_kind=mk)
            for mk in omk]

  _, out_op_shardings = pjit.get_op_sharding_from_executable(xla_executable)
  if not out_op_shardings:
    return None

  if num_ordered_effects > 0:
    out_op_shardings = out_op_shardings[num_ordered_effects:]

  # This means that there are no outputs for JAX but for XLA there is an empty
  # tuple output which gets a replicated sharding.
  if num_out_avals == 0 and len(out_op_shardings) == 1:
    return None

  # This condition happens when all the elements in the output tuple have the
  # same sharding, so XLA decides to run the `FusionTupleDeduplicator` to
  # put the sharding on ROOT instead of the tuple.
  # TODO(b/245667823): Remove this when XLA fixes this.
  if len(out_op_shardings) == 1 and len(out_op_shardings) < num_out_avals:
    out_op_shardings = out_op_shardings * num_out_avals  # type: ignore

  assert len(out_op_shardings) == num_out_avals == len(omk), (
      len(out_op_shardings), num_out_avals, len(omk))

  return [sharding_impls.GSPMDSharding(device_assignment, os, memory_kind=mk)
          for os, mk in safe_zip(out_op_shardings, omk)]


def _get_in_shardings_from_xla(
    xla_executable, device_assignment: Sequence[xc.Device], num_in_avals: int,
    num_ordered_effects: int
  ) -> Sequence[GSPMDSharding] | None:
  """Returns input shardings from XLA."""
  from jax._src import pjit

  # When the device assignment only has 1 device, SPMD partitioner will not run.
  # Hence the op shardings will not be set on the `hlo_module`.
  if len(device_assignment) == 1:
    return [GSPMDSharding.get_replicated(device_assignment)] * num_in_avals

  in_op_shardings, _ = pjit.get_op_sharding_from_executable(xla_executable)
  if not in_op_shardings:
    return None

  if num_ordered_effects > 0:
    in_op_shardings = in_op_shardings[num_ordered_effects:]

  assert len(in_op_shardings) == num_in_avals, (
      len(in_op_shardings), num_in_avals)

  return [GSPMDSharding(device_assignment, os)
          for os in in_op_shardings]


# TODO(yashkatariya): Remove this function after `AUTO` can return shardings
# without mesh.
def _get_mesh_pspec_shardings_from_executable(
    xla_executable, mesh: Mesh
) -> tuple[Sequence[sharding_impls.NamedSharding],
           Sequence[sharding_impls.NamedSharding]]:
  from jax._src import pjit

  in_pspec, out_pspec = pjit.get_pspec_from_executable(xla_executable, mesh)
  return ([sharding_impls.NamedSharding(mesh, i) for i in in_pspec],
          [sharding_impls.NamedSharding(mesh, o) for o in out_pspec])


_orig_out_sharding_handlers = {}

_ShardingT = TypeVar("_ShardingT", bound=sharding_impls.XLACompatibleSharding)


def _register_out_sharding_handler(
    sharding_cls: type[_ShardingT],
    handler: Callable[[sharding_impls.GSPMDSharding, _ShardingT], _ShardingT],
) -> None:
  _orig_out_sharding_handlers[sharding_cls] = handler


def _gspmd_to_named_sharding_via_mesh(
    out_s: sharding_impls.GSPMDSharding,
    mesh: Mesh) -> sharding_impls.NamedSharding:
  parsed_pspec = sharding_impls.parse_flatten_op_sharding(
      out_s._hlo_sharding, mesh)[0]
  return create_mesh_pspec_sharding(
      mesh, parsed_pspec.get_partition_spec(), parsed_pspec,
      out_s.memory_kind)

def _gspmd_to_named_sharding(
    out_s: sharding_impls.GSPMDSharding,
    orig_in_s: sharding_impls.NamedSharding) -> sharding_impls.NamedSharding:
  return _gspmd_to_named_sharding_via_mesh(out_s, orig_in_s.mesh)

_register_out_sharding_handler(
    sharding_impls.NamedSharding, _gspmd_to_named_sharding)


def _gspmd_to_positional_sharding(
    out_s: sharding_impls.GSPMDSharding,
    orig_in_s: sharding_impls.PositionalSharding
    ) -> sharding_impls.PositionalSharding:
  return sharding_impls._op_sharding_to_pos_sharding(
      out_s._hlo_sharding, orig_in_s._device_assignment, out_s.memory_kind)

_register_out_sharding_handler(
    sharding_impls.PositionalSharding, _gspmd_to_positional_sharding)

def _gspmd_to_single_device_sharding(
    out_s: GSPMDSharding, orig_in_s: SingleDeviceSharding) -> SingleDeviceSharding:
  assert isinstance(orig_in_s, SingleDeviceSharding)
  return SingleDeviceSharding(
      out_s._device_assignment[0], memory_kind=out_s.memory_kind)

_register_out_sharding_handler(
    SingleDeviceSharding, _gspmd_to_single_device_sharding)


def _get_out_sharding_from_orig_sharding(
    out_shardings, out_avals, orig_in_s, orig_aval):
  out = []
  orig_handler = _orig_out_sharding_handlers[type(orig_in_s)]
  for o, out_aval in safe_zip(out_shardings, out_avals):
    if isinstance(o, sharding_impls.GSPMDSharding):
      try:
        # Only return the same input sharding object if the OpShardings and
        # in_aval.ndim and out_aval.ndim match. This is because if OpSharding is
        # replicated then, it doesn't encode the ndim in it. The devices
        # will be the same at this point because those checks happen before.
        if (orig_aval is not None and out_aval is not None and
            out_aval.ndim == orig_aval.ndim
            and sharding_impls.are_op_shardings_equal(
                o._hlo_sharding, orig_in_s._to_xla_hlo_sharding(orig_aval.ndim))
            and o.memory_kind == orig_in_s.memory_kind):
          out.append(orig_in_s)
        else:
          out.append(orig_handler(o, orig_in_s))
      except:
        out.append(o)
    else:
      out.append(o)
  return out

def maybe_get_orig_out_sharding(
    in_shardings, out_shardings, in_avals, out_avals):
  if all(hasattr(o, '_original_sharding') for o in out_shardings):
    return [o._original_sharding for o in out_shardings]

  orig_in_s = None
  orig_aval = None
  for i, aval in safe_zip(in_shardings, in_avals):
    oi = getattr(i, '_original_sharding', None)
    if type(oi) in _orig_out_sharding_handlers:
      orig_in_s = oi
      orig_aval = aval
      break
  if orig_in_s is not None:
    return _get_out_sharding_from_orig_sharding(
        out_shardings, out_avals, orig_in_s, orig_aval)

  return out_shardings


def _get_layouts_from_executable(
    xla_executable, in_layouts, out_layouts, num_ordered_effects
) -> tuple[Sequence[SpecifiedLayout | None], Sequence[SpecifiedLayout | None]]:
  try:
    in_layouts_xla = xla_executable.get_parameter_layouts()
    out_layouts_xla = xla_executable.get_output_layouts()
  except:
    return (None,) * len(in_layouts), (None,) * len(out_layouts)

  if num_ordered_effects > 0:
    in_layouts_xla = in_layouts_xla[num_ordered_effects:]
    out_layouts_xla = out_layouts_xla[num_ordered_effects:]

  new_in_layouts = []
  for x, i in safe_zip(in_layouts_xla, in_layouts):
    x = SpecifiedLayout(x)
    if isinstance(i, SpecifiedLayout):
      if i != x:
        raise AssertionError(
            f"Unexpected XLA layout override: (XLA) {x} != {i} (User sharding)")
      new_in_layouts.append(i)
    else:
      new_in_layouts.append(x)

  new_out_layouts = []
  for x, o in safe_zip(out_layouts_xla, out_layouts):
    x = SpecifiedLayout(x)
    if isinstance(o, SpecifiedLayout):
      if o != x:
        raise AssertionError(
            f"Unexpected XLA layout override: (XLA) {x} != {o} (User sharding)")
      new_out_layouts.append(o)
    else:
      new_out_layouts.append(x)

  assert all(isinstance(i, SpecifiedLayout) for i in new_in_layouts)
  assert all(isinstance(o, SpecifiedLayout) for o in new_out_layouts)
  return new_in_layouts, new_out_layouts  # type: ignore


def get_logical_mesh_ids(mesh_shape):
  return np.arange(math.prod(mesh_shape)).reshape(mesh_shape)


@weakref_lru_cache
def _cached_compilation(computation, name, mesh, spmd_lowering,
                        tuple_args, auto_spmd_lowering, allow_prop_to_inputs,
                        allow_prop_to_outputs, host_callbacks, backend,
                        da, pmap_nreps, compiler_options_keys,
                        compiler_options_values):
  # TODO(phawkins): One would normally just write:
  # dev = np.array(device_assignment)
  # The formulation below is substantially faster if there are many devices.
  # If we were to optimize __getattr__ on xc.Device we might not need this
  # workaround.
  dev = np.vectorize(lambda i: da[i], otypes=[object])(
    np.arange(len(da))
  )
  if pmap_nreps > 1:
    num_replicas, num_partitions = pmap_nreps, 1
  elif spmd_lowering:
    num_replicas, num_partitions = 1, dev.size
  else:
    num_replicas, num_partitions = dev.size, 1

  if pmap_nreps > 1:
    # In `jit` device_assignment is set to None when num_replicas > 1. Do
    # the same thing here too.
    xla_device_assignment = None
  else:
    xla_device_assignment = dev.reshape((num_replicas, num_partitions))

  if compiler_options_keys is None:
    compiler_options = None
  else:
    compiler_options = dict(safe_zip(compiler_options_keys, compiler_options_values))

  fdo_profile = (None if compiler_options is None else
                 compiler_options.pop("fdo_profile", None))

  compile_options = compiler.get_compile_options(
      num_replicas=num_replicas,
      num_partitions=num_partitions,
      device_assignment=xla_device_assignment,
      use_spmd_partitioning=spmd_lowering,
      use_auto_spmd_partitioning=auto_spmd_lowering,
      env_options_overrides=compiler_options,
      fdo_profile=fdo_profile,
      detailed_logging=compiler.use_detailed_logging(computation),
      backend=backend,
  )

  opts = compile_options.executable_build_options
  if auto_spmd_lowering:
    assert mesh is not None
    opts.auto_spmd_partitioning_mesh_shape = list(mesh.shape.values())
    opts.auto_spmd_partitioning_mesh_ids = (
        get_logical_mesh_ids(list(mesh.shape.values()))
        .reshape(-1))
  compile_options.parameter_is_tupled_arguments = tuple_args
  if xla_extension_version >= 241:
    opts.allow_spmd_sharding_propagation_to_parameters = list(allow_prop_to_inputs)
  opts.allow_spmd_sharding_propagation_to_output = list(allow_prop_to_outputs)

  if hasattr(backend, "compile_replicated"):
    return None, compile_options

  with dispatch.log_elapsed_time(
      "Finished XLA compilation of {fun_name} in {elapsed_time} sec",
      fun_name=name, event=dispatch.BACKEND_COMPILE_EVENT):
    xla_executable = compiler.compile_or_get_cached(
        backend, computation, dev, compile_options, host_callbacks)
  return xla_executable, compile_options


def _maybe_get_and_check_in_shardings(
    xla_executable, in_shardings, device_assignment,
    global_in_avals, num_ordered_effects):
  """Returns in_shardings extracted from XLA or checks and returns original
  shardings.

  If in_shardings exist on `jit` or on `jax.Array`, then this function will
  check that sharding against what XLA returns as in_shardings. If they don't
  match, an error is raised.

  If in_sharding is unspecified, then the sharding returned by XLA is returned.
  """
  in_shardings_xla = _get_in_shardings_from_xla(  # type: ignore
      xla_executable, device_assignment, len(global_in_avals),
      num_ordered_effects)  # type: ignore
  if in_shardings_xla is None:
    return in_shardings

  new_in_shardings = []
  for xla_s, orig, aval in safe_zip(in_shardings_xla, in_shardings,
                                    global_in_avals):
    if is_unspecified(orig):
      if (aval is not core.abstract_token and
          dtypes.issubdtype(aval.dtype, dtypes.extended)):
        xla_s = aval.dtype._rules.logical_sharding(aval, xla_s)
      new_in_shardings.append(xla_s)
    else:
      # TODO(yashkatariya): Remove the if branch for abstract_token once
      # choosing input shardings by XLA is enabled again.
      if aval is core.abstract_token:
        new_in_shardings.append(orig)
      else:
        xla_hlo_s = xla_s._to_xla_hlo_sharding(aval.ndim)  # type: ignore
        orig_hlo_s = orig._to_xla_hlo_sharding(aval.ndim)  # type: ignore
        # MANUAL HloSharding comes from other partitioning frameworks.
        if (not dtypes.issubdtype(aval.dtype, dtypes.extended) and
            not xla_hlo_s.is_manual() and
            (not op_shardings.are_op_shardings_equal(xla_hlo_s, orig_hlo_s))):
          raise AssertionError(
              f"Unexpected XLA sharding override: (XLA) {xla_s} != {orig} "
              "(User sharding)")
        new_in_shardings.append(orig)
  return new_in_shardings


def _maybe_get_and_check_out_shardings(
    xla_executable, out_shardings, device_assignment, global_out_avals,
    num_ordered_effects, all_default_mem_kind
  ):
  out_shardings_xla = get_out_shardings_from_executable(  # type: ignore
      xla_executable, device_assignment, len(global_out_avals),
      num_ordered_effects, all_default_mem_kind)  # type: ignore
  if out_shardings_xla is None:
    return out_shardings

  new_out_shardings = []
  for xla_s, orig, aval in safe_zip(out_shardings_xla, out_shardings,
                                    global_out_avals):
    if is_unspecified(orig):
      if (aval is not core.abstract_token and
          dtypes.issubdtype(aval.dtype, dtypes.extended)):
        xla_s = aval.dtype._rules.logical_sharding(aval, xla_s)
      new_out_shardings.append(xla_s)
    else:
      xla_hlo_s = xla_s._to_xla_hlo_sharding(aval.ndim)  # type: ignore
      orig_hlo_s = orig._to_xla_hlo_sharding(aval.ndim)  # type: ignore
      # MANUAL HloSharding comes from other partitioning frameworks.
      if (not dtypes.issubdtype(aval.dtype, dtypes.extended) and
          not xla_hlo_s.is_manual() and
          (not op_shardings.are_op_shardings_equal(xla_hlo_s, orig_hlo_s) or
           xla_s.memory_kind != orig.memory_kind)):  # type: ignore
        raise AssertionError(
            f"Unexpected XLA sharding override: (XLA) {xla_s} != {orig} "
            "(User sharding)")
      new_out_shardings.append(orig)
  return new_out_shardings


def finalize_out_shardings(out_shardings, device_assignment):
  if len(device_assignment) == 1:
    return [SingleDeviceSharding(device_assignment[0], memory_kind=o.memory_kind)
            if isinstance(o, GSPMDSharding) else o for o in out_shardings]
  return out_shardings


@dataclasses.dataclass
class UnloadedMeshExecutable:
  xla_executable: Any
  device_assignment: xc.DeviceList | Sequence[xc.Device]  # type: ignore
  backend: xb.XlaBackend
  input_avals: Sequence[ShapedArray]
  input_shardings: Sequence[sharding_impls.XLACompatibleSharding]
  output_avals: Sequence[ShapedArray]
  output_shardings: Sequence[sharding_impls.XLACompatibleSharding]
  committed: bool
  name: str
  unordered_effects: list[core.Effect]
  ordered_effects: list[core.Effect]
  keepalive: Sequence[Any]
  host_callbacks: Sequence[Any]
  kept_var_idx: set[int]
  mut: MutationData | None
  auto_spmd_lowering: bool
  in_layouts: Sequence[SpecifiedLayout | None]
  out_layouts: Sequence[SpecifiedLayout | None]
  all_args_info: AllArgsInfo | None

  def build_unsafe_call(self):
    if xla_extension_version >= 229:
      handle_args = InputsHandler(self.input_shardings)
    else:
      input_indices = _get_input_indices(self.input_avals, self.input_shardings,
                                         self.device_assignment)
      handle_args = InputsHandler(
          self.input_shardings, self.xla_executable.local_devices(), input_indices)
    handle_outs = global_avals_to_results_handler(
        self.output_avals, self.output_shardings, self.committed)  # type: ignore  # arg-type

    unsafe_call = ExecuteReplicated(  # type: ignore  # assignment
        self.xla_executable, self.name, self.backend, handle_args,
        handle_outs, self.unordered_effects, self.ordered_effects, self.keepalive,
        bool(self.host_callbacks), self.kept_var_idx, self.mut)
    return unsafe_call

  def load(self) -> MeshExecutable:
    return MeshExecutable(self.xla_executable, self.build_unsafe_call,
                          self.input_avals, self.output_avals,
                          self.input_shardings, self.output_shardings,
                          self.auto_spmd_lowering, self.kept_var_idx,
                          self.in_layouts, self.out_layouts,
                          self.all_args_info, self)

  # May return a MeshExecutable in the compile_replicated case.
  @staticmethod
  def from_hlo(name: str,
               hlo: ir.Module,
               global_in_avals: Sequence[ShapedArray],
               global_out_avals: Sequence[ShapedArray],
               in_shardings: Sequence[sharding_impls.XLACompatibleSharding | AUTO],
               out_shardings: Sequence[(sharding_impls.XLACompatibleSharding | AUTO |
                                        UnspecifiedValue)],
               spmd_lowering: bool,
               tuple_args: bool,
               auto_spmd_lowering: bool,
               unordered_effects: list[core.Effect],
               ordered_effects: list[core.Effect],
               host_callbacks: list[Any],
               keepalive: Any,
               kept_var_idx: set[int],
               backend: xb.XlaBackend,
               device_assignment: xc.DeviceList | Sequence[xc.Device],  # type: ignore
               committed: bool,
               in_layouts: MaybeLayout,
               out_layouts: MaybeLayout,
               pmap_nreps: int = 1,
               mut: MutationData | None = None,
               shape_poly_state: mlir.ShapePolyLoweringState | None = None,
               all_default_mem_kind: bool = True,
               all_args_info: AllArgsInfo | None = None,
               compiler_options=None,
  ) -> MeshExecutable:
    if shape_poly_state is not None and shape_poly_state.uses_dim_vars:
      hlo = mlir.refine_polymorphic_shapes(hlo)
    compiler_options_keys = tuple(
        compiler_options.keys()) if compiler_options is not None else None
    compiler_options_values = tuple(
        compiler_options.values()) if compiler_options is not None else None
    if isinstance(device_assignment, xc.DeviceList):
      da = device_assignment
    else:
      da = _create_da_object(tuple(device_assignment))
    del device_assignment

    allow_prop_to_inputs = tuple(is_unspecified(i) for i in in_shardings)
    allow_prop_to_outputs = tuple(is_unspecified(o) for o in out_shardings)

    mesh = None
    if auto_spmd_lowering:
      for i in it.chain.from_iterable([in_shardings, out_shardings]):
        if is_auto(i):
          mesh = i.mesh  # type: ignore
          break

    xla_executable, compile_options = _cached_compilation(
        hlo, name, mesh, spmd_lowering,
        tuple_args, auto_spmd_lowering, allow_prop_to_inputs,
        allow_prop_to_outputs, tuple(host_callbacks), backend, da, pmap_nreps,
        compiler_options_keys, compiler_options_values)

    if hasattr(backend, "compile_replicated"):
      semantics_in_shardings = SemanticallyEqualShardings(in_shardings)  # type: ignore
      semantics_out_shardings = SemanticallyEqualShardings(out_shardings)  # type: ignore
      return _compile_replicated_mesh_executable_from_hlo(
          hlo, name, tuple(global_in_avals), tuple(global_out_avals),
          semantics_in_shardings, semantics_out_shardings, auto_spmd_lowering,
          compile_options, tuple(host_callbacks), bool(unordered_effects),
          tuple(ordered_effects), tuple(kept_var_idx), backend, da, committed,
          pmap_nreps)

    if auto_spmd_lowering:
      assert mesh is not None
      in_shardings_xla, out_shardings_xla = _get_mesh_pspec_shardings_from_executable(
          xla_executable, mesh)
      in_shardings = [x if is_auto(i) else getattr(i, '_original_sharding', i)  # type: ignore
                      for x, i in safe_zip(in_shardings_xla, in_shardings)]
      out_shardings = [x if is_auto(o) else o
                       for x, o in safe_zip(out_shardings_xla, out_shardings)]
    else:
      if pmap_nreps == 1:
        assert mesh is None
        if xla_extension_version >= 241:
          in_shardings = _maybe_get_and_check_in_shardings(
              xla_executable, in_shardings, tuple(da), global_in_avals,
              len(ordered_effects))
        out_shardings = _maybe_get_and_check_out_shardings(
            xla_executable, out_shardings, tuple(da), global_out_avals,
            len(ordered_effects), all_default_mem_kind)
      else:
        in_shardings, out_shardings, committed, da = _get_metadata_jit_pmap(
            xla_executable.local_devices(), len(in_shardings), len(out_shardings))

    if xla_extension_version >= 217:
      in_layouts, out_layouts = _get_layouts_from_executable(
          xla_executable, in_layouts, out_layouts, len(ordered_effects))
    else:
      assert all(i is None for i in in_layouts)
      assert all(o is None for o in out_layouts)

    out_shardings = maybe_get_orig_out_sharding(
        in_shardings, out_shardings, global_in_avals, global_out_avals)

    out_shardings = finalize_out_shardings(out_shardings, da)

    return UnloadedMeshExecutable(
        xla_executable=xla_executable,
        device_assignment=da,  # type: ignore
        backend=backend,
        input_avals=global_in_avals,
        input_shardings=in_shardings,  # type: ignore
        output_avals=global_out_avals,
        output_shardings=out_shardings,  # type: ignore # arg-type
        committed=committed,
        name=name,
        unordered_effects=unordered_effects,
        ordered_effects=ordered_effects,
        keepalive=keepalive,
        host_callbacks=host_callbacks,
        kept_var_idx=kept_var_idx,
        mut=mut,
        auto_spmd_lowering=auto_spmd_lowering,
        in_layouts=in_layouts,  # type: ignore
        out_layouts=out_layouts,  # type: ignore
        all_args_info=all_args_info).load()  # type: ignore


class MeshExecutableFastpathData(NamedTuple):
  xla_executable: xc.LoadedExecutable
  out_pytree_def: Any
  in_shardings: Sequence[sharding_impls.XLACompatibleSharding]
  out_shardings: Sequence[sharding_impls.XLACompatibleSharding]
  out_avals: Sequence[ShapedArray]
  out_committed: Sequence[bool]
  kept_var_bitvec: Iterable[bool]
  # TODO(yashkatariya): Remove once minimum jaxlib version is 0.4.24
  arg_handler_devices: Sequence[xc.Device]
  arg_handler_indices: Sequence[tuple[Index | None, ...]]


def reflatten_outputs_for_dispatch(out_tree, out_flat):
  # We arrive at dispatch having flattened according to the default
  # pytree registry, but we want to re-flatten according to our
  # dispatch-specific registry.
  out_unflat = tree_util.tree_unflatten(out_tree, out_flat)
  return tree_util.dispatch_registry.flatten(out_unflat, None)


class MeshExecutable(stages.XlaExecutable):
  __slots__ = [
      "xla_executable", "_unsafe_call", "build_unsafe_call", "in_avals",
      "out_avals", "_in_shardings", "_out_shardings", "_auto_spmd_lowering",
      "_kept_var_idx", "_in_layouts", "_out_layouts", "_all_args_info",
      "_unloaded_executable",
  ]

  def __init__(self, xla_executable, build_unsafe_call, in_avals, out_avals,
               in_shardings, out_shardings, auto_spmd_lowering, kept_var_idx,
               in_layouts, out_layouts,
               all_args_info: AllArgsInfo | None = None,
               unloaded_executable=None):
    self.xla_executable = xla_executable
    self.build_unsafe_call = build_unsafe_call
    # in_avals is a list of global and local avals. Aval is global if input
    # is a GDA or jax.Array else local.
    self.in_avals = in_avals
    self.out_avals = out_avals
    self._unsafe_call = None
    self._in_shardings = in_shardings
    self._out_shardings = out_shardings
    self._auto_spmd_lowering = auto_spmd_lowering
    self._kept_var_idx = kept_var_idx
    self._in_layouts = in_layouts
    self._out_layouts = out_layouts
    self._all_args_info = all_args_info
    self._unloaded_executable = unloaded_executable

  @property
  def unsafe_call(self) -> Callable[..., Any]:
    if self._unsafe_call is None:
      self._unsafe_call = self.build_unsafe_call()
    return self._unsafe_call  # type: ignore

  # -- stages.XlaExecutable overrides

  def xla_extension_executable(self):
    return self.xla_executable

  def call(self, *args):
    if self._all_args_info is None:
      kept_args = [a for i, a in enumerate(args) if i in self._kept_var_idx]
      ref_avals = self.in_avals
      in_shardings = self._in_shardings
      debug_info = None
    else:
      kept_args = args
      ref_avals = self._all_args_info.in_avals
      iter_in_shardings = iter(self._in_shardings)
      in_shardings = [next(iter_in_shardings) if i in self._kept_var_idx else s
                      for i, s in enumerate(self._all_args_info.in_shardings)]
      debug_info = self._all_args_info.debug_info

    arg_avals = map(xla.abstractify, kept_args)
    check_arg_avals_for_call(ref_avals, arg_avals, debug_info)
    # Check the GDA sharding and the input sharding.
    check_gda_or_array_xla_sharding_match(kept_args, in_shardings, debug_info)
    return self.unsafe_call(*args)  # pylint: disable=not-callable

  def input_shardings(self) -> Sequence[sharding_impls.XLACompatibleSharding]:
    return self._in_shardings

  def output_shardings(self) -> Sequence[sharding_impls.XLACompatibleSharding]:
    return self._out_shardings

  def input_layouts(self):
    return self._in_layouts

  def output_layouts(self):
    return self._out_layouts

  def create_cpp_call(self, no_kwargs, in_tree, out_tree):
    if not (isinstance(self.unsafe_call, ExecuteReplicated) and
            not self.unsafe_call.has_unordered_effects and
            not self.unsafe_call.has_host_callbacks):
      return None

    def aot_cache_miss(*args, **kwargs):
      params = stages.CompiledCallParams(self, no_kwargs, in_tree, out_tree)
      outs, out_flat, args_flat = stages.Compiled.call(params, *args, **kwargs)
      out_flat, out_tree_dispatch = reflatten_outputs_for_dispatch(
          out_tree, out_flat)
      use_fastpath = (all(isinstance(x, xc.ArrayImpl) for x in out_flat))

      if use_fastpath:
        out_avals = [o.aval for o in out_flat]
        out_committed = [o._committed for o in out_flat]
        kept_var_bitvec = [i in self._kept_var_idx
                           for i in range(len(args_flat))]
        in_shardings = [
            a.dtype._rules.physical_sharding(a, s)
            if a is not core.abstract_token and dtypes.issubdtype(a.dtype, dtypes.extended)
            else s
            for s, a in zip(self._in_shardings, self.in_avals)
        ]
        fastpath_data = MeshExecutableFastpathData(
            self.xla_executable, out_tree_dispatch, in_shardings,
            self._out_shardings, out_avals, out_committed, kept_var_bitvec,
            self.unsafe_call.in_handler.local_devices,
            self.unsafe_call.in_handler.input_indices)
      else:
        fastpath_data = None
      return outs, fastpath_data

    if xla_extension_version >= 226:
      return xc._xla.pjit(
          self.unsafe_call.name, None, aot_cache_miss, [], [], [],
          tree_util.dispatch_registry,
          shard_arg if xla_extension_version >= 229 else temp_shard_arg)  # type: ignore
    else:
      return xc._xla.pjit(self.unsafe_call.name, None, aot_cache_miss, [], [], [],  # type: ignore
                          tree_util.dispatch_registry)


# TODO(yashkatariya): Remove once minimum jaxlib version is 0.4.24
def temp_shard_arg(arg, devices, arg_indices, sharding, canonicalize=True):
  return shard_arg(arg, sharding)


def check_arg_avals_for_call(ref_avals, arg_avals,
                             jaxpr_debug_info: core.JaxprDebugInfo | None = None):
  if len(ref_avals) != len(arg_avals):
    raise TypeError(
        f"Computation compiled for {len(ref_avals)} inputs "
        f"but called with {len(arg_avals)}")

  if jaxpr_debug_info is not None:
    arg_names = [f"'{name}'" for name in jaxpr_debug_info.arg_names]
  else:
    num_args = len(ref_avals)
    arg_names = [f"{i + 1}/{num_args}" for i in range(num_args)]

  errors = []
  for ref_aval, arg_aval, name in safe_zip(ref_avals, arg_avals, arg_names):
    if not core.typematch(ref_aval, arg_aval):
      errors.append(
          f"Argument {name} compiled with {ref_aval.str_short()} and called "
          f"with {arg_aval.str_short()}")
  if errors:
    max_num_errors = 5
    str_errors = "\n".join(errors[:max_num_errors])
    if len(errors) >= max_num_errors:
      num_mismatch_str = f"The first {max_num_errors} of {len(errors)}"
    else:
      num_mismatch_str = "The"
    raise TypeError(
        "Argument types differ from the types for which this computation was "
        f"compiled. {num_mismatch_str} mismatches are:\n{str_errors}")


def _get_metadata_jit_pmap(local_devices, num_in_shardings, num_out_shardings):
  # Create replicated shardings for jit(pmap) path with local devices
  # because multihost jit(pmap) is not allowed.
  gs = sharding_impls.GSPMDSharding.get_replicated(local_devices)
  in_shardings = [gs] * num_in_shardings
  out_shardings = [gs] * num_out_shardings
  # jit(pmap) will generate Arrays with multi-device sharding.
  # It is unsupported for these shardings to be uncommitted, so force
  # the outputs to be committed.
  committed = True
  return in_shardings, out_shardings, committed, tuple(local_devices)


@weakref_lru_cache
def _compile_replicated_mesh_executable_from_hlo(
    computation, name, global_in_avals, global_out_avals, semantics_in_shardings,
    semantics_out_shardings, auto_spmd_lowering, compile_options,
    host_callbacks, has_unordered_effects, ordered_effects, kept_var_idx,
    backend, da, committed, pmap_nreps):
  assert not auto_spmd_lowering
  in_shardings = semantics_in_shardings.shardings
  out_shardings = semantics_out_shardings.shardings

  kept_var_idx = set(kept_var_idx)
  # Will compute out_handler with executable information.
  unsafe_call = backend.compile_replicated(
      is_trivial=False, name=name, computation=computation,
      compile_options=compile_options, host_callbacks=host_callbacks,
      has_unordered_effects=has_unordered_effects,
      device_assignment=da, ordered_effects=ordered_effects,
      in_avals=global_in_avals,
      in_shardings=in_shardings, kept_var_idx=kept_var_idx,
      out_avals=global_out_avals, out_shardings=out_shardings,
      committed=committed, pmap_nreps=pmap_nreps)
  xla_executable = None
  return MeshExecutable(xla_executable, lambda: unsafe_call, global_in_avals,
                        global_out_avals, in_shardings, out_shardings,
                        auto_spmd_lowering, kept_var_idx,
                        (None,) * len(global_in_avals),
                        (None,) * len(global_out_avals))


@lru_cache
def create_mesh_pspec_sharding(
    mesh: Mesh, pspec: PartitionSpec | None, parsed_pspec=None,
    memory_kind: str | None = None) -> sharding_impls.NamedSharding:
  if pspec is None:
    pspec, parsed_pspec = PartitionSpec(), None
  return sharding_impls.NamedSharding(mesh, pspec, _parsed_pspec=parsed_pspec,
                                      memory_kind=memory_kind)


def check_device_backend_on_shardings(shardings) -> bool:
  for i in shardings:
    if is_unspecified(i) or is_auto(i):
      continue
    if hasattr(i, '_original_sharding') and getattr(
        i._original_sharding, '_device_backend', False):
      return True
  return False


def check_gda_or_array_xla_sharding_match(
    args, in_xla_shardings: Sequence[sharding_impls.XLACompatibleSharding],
    jaxpr_debug_info: core.JaxprDebugInfo | None) -> None:
  from jax._src.array import ArrayImpl
  arg_names = ([''] * len(args) if jaxpr_debug_info is None else
               jaxpr_debug_info.arg_names)
  errors = []
  num_errors = 5
  for arg, xs, name in safe_zip(args, in_xla_shardings, arg_names):
    if not isinstance(arg, ArrayImpl):
      continue
    if is_unspecified_or_auto(xs):
      continue

    db_xs = check_device_backend_on_shardings([xs])
    if not db_xs:
      xs = getattr(xs, '_original_sharding', xs)

    # Raise memory kind mismatch error even if the arg is uncommitted.
    if arg.sharding.memory_kind != xs.memory_kind:
      errors.append(
          "Got input sharding(s) that compiled object was called with: "
          f"{arg.sharding} and sharding(s) the computation was compiled "
          f"with: {xs} for arg {name} with shape: {arg.aval.str_short()}")

    if (not db_xs and arg._committed and
        not op_shardings.are_op_shardings_equal(
            arg.sharding._to_xla_hlo_sharding(arg.ndim),
            xs._to_xla_hlo_sharding(arg.ndim))):
      errors.append(
          "Got input sharding(s) that compiled object was called with: "
          f"{arg.sharding} and sharding(s) the computation was compiled "
          f"with: {xs} for arg {name} with shape: {arg.aval.str_short()}")

  if errors:
    str_errors = '\n'.join(errors[:num_errors])
    num_mismatch_str = (
        f'the {len(errors)} mismatches' if len(errors) < num_errors else
        f"{num_errors} mismatches out of {len(errors)}")
    raise ValueError(
          "Compiled object called with input sharding(s) does not match the "
          "sharding(s) the computation was compiled with. "
          f"Here are {num_mismatch_str}:\n{str_errors}")


def get_array_mapping(pspec: PartitionSpec) -> ArrayMappingOrAutoOrUnspecified:
  parsed_pspec, _, _ = sharding_impls.prepare_axis_resources(
      pspec, "pspec to array_mapping")
  return _get_array_mapping(parsed_pspec)


_forbidden_primitives = {
  'xla_pmap': 'pmap',
}
def _sanitize_mesh_jaxpr(jaxpr):
  if isinstance(jaxpr, core.ClosedJaxpr):
    jaxpr = jaxpr.jaxpr
  for eqn in jaxpr.eqns:
    if eqn.primitive.name in _forbidden_primitives:
      raise RuntimeError(f"Nesting {_forbidden_primitives[eqn.primitive.name]} "
                         f"inside xmaps not supported!")
    core.traverse_jaxpr_params(_sanitize_mesh_jaxpr, eqn.params)


custom_resource_typing_rules: dict[core.Primitive, Callable] = {}

def resource_typecheck(jaxpr, resource_env, axis_resources, what_jaxpr_thunk):
  if isinstance(jaxpr, core.ClosedJaxpr):
    jaxpr = jaxpr.jaxpr
  def _check_aval(aval, what_thunk):
    if not hasattr(aval, 'named_shape'):
      return
    resource_to_axis = {}
    for axis in aval.named_shape:
      if axis_resources:
        for resource in axis_resources[axis]:
          if resource in resource_to_axis:
            other_axis = resource_to_axis[resource]
            axis, other_axis = sorted([str(axis), str(other_axis)])
            raise JAXTypeError(
                f"Axes `{axis}` and `{other_axis}` are both mapped to the "
                f"resource `{resource}`, but they coincide in the named_shape "
                f"of {what_thunk()}")
          resource_to_axis[resource] = axis

  what_thunk = lambda: (f"an input to {what_jaxpr_thunk()}")
  for v in jaxpr.constvars:
    _check_aval(v.aval, what_thunk)
  for v in jaxpr.invars:
    _check_aval(v.aval, what_thunk)
  what_thunk = lambda: (f"a value returned from a primitive {eqn.primitive} created "
                        f"at {source_info_util.summarize(eqn.source_info)}")
  rec_what_jaxpr_thunk = lambda: (f"a primitive {eqn.primitive} created at"
                                  f"{source_info_util.summarize(eqn.source_info)}")
  for eqn in jaxpr.eqns:
    typing_rule = custom_resource_typing_rules.get(eqn.primitive, None)
    if typing_rule:
      typing_rule([v.aval for v in eqn.invars], eqn.params, eqn.source_info,
                  resource_env, axis_resources)
    else:
      core.traverse_jaxpr_params(partial(resource_typecheck,
                                         resource_env=resource_env,
                                         axis_resources=axis_resources,
                                         what_jaxpr_thunk=rec_what_jaxpr_thunk),
                                 eqn.params)
    for v in eqn.outvars:
      _check_aval(v.aval, what_thunk)


@contextmanager
def maybe_extend_axis_env(*args, **kwargs):
  with core.extend_axis_env(*args, **kwargs):
    yield


def device_put(x, devices: Sequence[xc.ArrayImpl],
               replicate: bool=False) -> list[xc.ArrayImpl]:
  """Call device_put on a sequence of devices and return a flat sequence of buffers."""
  if replicate:
    return [jax.device_put(x, device) for device in devices]
  else:
    return [jax.device_put(val, device) for val, device in safe_zip(x, devices)]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								# Copyright 2018 The JAX Authors.
 								#
 								# Licensed under the Apache License, Version 2.0 (the "License");
 								# you may not use this file except in compliance with the License.
 								# You may obtain a copy of the License at
 								#
 								#     https://www.apache.org/licenses/LICENSE-2.0
 								#
 								# Unless required by applicable law or agreed to in writing, software
 								# distributed under the License is distributed on an "AS IS" BASIS,
 								# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								# See the License for the specific language governing permissions and
 								# limitations under the License.
 								"""Implementation of pmap and related functionality."""
 								from __future__ import annotations
 								import enum
-												Split Mesh and ResourceEnv into a new module jax._src.mesh.

This work is an effort to reduce cyclic dependencies in JAX internals.

Move the _global_to_local and _local_to_global methods out of Mesh and into pxla as free functions. This removes the need for jax._src.mesh to depend on things like avals.

PiperOrigin-RevId: 515667671

											
										
										
											2023-03-10 10:07:37 -08:00
+								from contextlib import contextmanager
-												Move _array_shard_arg helpers from pxla into array.

Refactoring only which fixes a TODO.

Add a canonicalize argument to pxla.shard_arg so we can call that API from array yet  avoid double-canonicalization.

PiperOrigin-RevId: 549658117

											
										
										
											2023-07-20 09:43:40 -07:00
+								from collections import namedtuple
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								from collections.abc import Sequence, Iterable
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								import dataclasses
 								from functools import partial, lru_cache, cached_property
 								import itertools as it
 								import logging
-												Replace jax._src.util.prod with math.prod.

math.prod() was added in Python 3.8, so we can assume it is always present.

PiperOrigin-RevId: 513011144

											
										
										
											2023-02-28 12:40:30 -08:00
+								import math
-												Make device_put of non_addressable_jax_array and non_addressable_sharding (with different device order) thread safe by making _get_and_check_device_assignment thread local.

PiperOrigin-RevId: 568358309

											
										
										
											2023-09-25 16:41:43 -07:00
+								import threading
-												Upgrade remaining sources to Python 3.9

This PR is a follow up to #18881.

The changes were generated by adding

    from __future__ import annotations

to the files which did not already have them and running

    pyupgrade --py39-plus --keep-percent-format {jax,tests,jaxlib,examples,benchmarks}/**/*.py

											
										
										
											2023-12-11 13:59:29 +00:00
+								from typing import Any, Callable, NamedTuple, TypeVar, Union, cast
-												Upgrade most .py sources to 3.9

This commit was generated by running

    pyupgrade --py39-plus --keep-percent-format {jax,tests,jaxlib,examples,benchmarks}/**/*.py

											
										
										
											2023-12-08 12:09:04 +00:00
+								from collections.abc import Iterator
-												Move some utilities out of dispatch.py next to their users, add more types.

Internal cleanups only, no user-visible changes intended.

PiperOrigin-RevId: 554876522

											
										
										
											2023-08-08 10:51:38 -07:00
+								import warnings
-												migrate internal dependencies from `jax.interpreters.ad` to `jax._src.interpreters.ad`

... in preparation for paring down `jax.interpreters.ad`'s exported symbols.

Includes some import fixups along the way.

PiperOrigin-RevId: 507684262

											
										
										
											2023-02-06 22:51:50 -08:00
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								import numpy as np
 								import jax
 								from jax.errors import JAXTypeError
 								from jax._src import api_util
-												Move compiler APIs out of dispatch.py and xla_bridge.py into a new jax._src.compiler module.

Refactoring only, no user-visible changes intended.

PiperOrigin-RevId: 557116160

											
										
										
											2023-08-15 06:38:56 -07:00
+								from jax._src import compiler
-												Migrate a subset of internal modules to use state objects

The motivation here is to gradually replace all dynamic lookups on `jax.config`
with statically-typed state objects, which are more type checker/IDE friendly.

PiperOrigin-RevId: 571932143

											
										
										
											2023-10-09 07:28:18 -07:00
+								from jax._src import config
 								from jax._src import core
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								from jax._src import dispatch
-												migrate internal dependencies from `jax.interpreters.ad` to `jax._src.interpreters.ad`

... in preparation for paring down `jax.interpreters.ad`'s exported symbols.

Includes some import fixups along the way.

PiperOrigin-RevId: 507684262

											
										
										
											2023-02-06 22:51:50 -08:00
+								from jax._src import dtypes
-												Refactor effects system to use effect types, not objects

											
										
										
											2023-02-01 17:50:00 -08:00
+								from jax._src import effects
-												migrate internal dependencies from `jax.interpreters.ad` to `jax._src.interpreters.ad`

... in preparation for paring down `jax.interpreters.ad`'s exported symbols.

Includes some import fixups along the way.

PiperOrigin-RevId: 507684262

											
										
										
											2023-02-06 22:51:50 -08:00
+								from jax._src import linear_util as lu
-												Split sharding_impls into its own Bazel target.

* Move dependencies of sharding_impls into sharding_impls to avoid creating cyclic dependencies.
* Fix a handful of new pytype errors.

PiperOrigin-RevId: 523146076

											
										
										
											2023-04-10 10:15:08 -07:00
+								from jax._src import mesh as mesh_lib
-												Rename jax._src.sharding_utils to jax._src.op_shardings.

Move some more op_sharding related helpers to that module.

PiperOrigin-RevId: 522343010

											
										
										
											2023-04-06 08:31:47 -07:00
+								from jax._src import op_shardings
-												Split ShardingSpecs and most of the helpers for constructing them into a separate file (jax/_src/sharding_specs.py).

PiperOrigin-RevId: 522360232

											
										
										
											2023-04-06 09:48:14 -07:00
+								from jax._src import sharding_specs
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								from jax._src import profiler
-												Split Sharding subclasses out of _src/sharding.py into _src/sharding_impls.py

By defining the Sharding base class in its own module, we can pull it out into a separate Bazel submodule, which will help pytype inference when defining Array.

PiperOrigin-RevId: 516223009

											
										
										
											2023-03-13 08:49:39 -07:00
+								from jax._src import sharding_impls
-												migrate internal dependencies from `jax.interpreters.ad` to `jax._src.interpreters.ad`

... in preparation for paring down `jax.interpreters.ad`'s exported symbols.

Includes some import fixups along the way.

PiperOrigin-RevId: 507684262

											
										
										
											2023-02-06 22:51:50 -08:00
+								from jax._src import source_info_util
 								from jax._src import stages
-												[JAX] Add support for multiple pytree registries.

We have a number of potential use cases where we want different functions that interpret pytrees differently. By allowing multiple pytree registries the same tree node can be registered in registry but not another.

One motivating use case is the new opaque PRNG array type. We want `jit` to treat these objects as if they were pytrees, but we want other transformations to leave them alone or handle them specially.

PiperOrigin-RevId: 549301796

											
										
										
											2023-07-19 06:47:46 -07:00
+								from jax._src import tree_util
-												migrate internal dependencies from `jax.interpreters.ad` to `jax._src.interpreters.ad`

... in preparation for paring down `jax.interpreters.ad`'s exported symbols.

Includes some import fixups along the way.

PiperOrigin-RevId: 507684262

											
										
										
											2023-02-06 22:51:50 -08:00
+								from jax._src import util
-												[JAX] Move jax._src.lib.xla_bridge to jax._src.xla_bridge.

Limit jax._src.lib to shims around jaxlib and nothing else.

The goal of this change is to avoid a dependency cycle between the rest of jax and jax._src.lib in a Bazel build. This allows the types for jax._src.lib to be inferred by pytype in isolation without referring to the rest of JAX.

PiperOrigin-RevId: 512922397

											
										
										
											2023-02-28 07:01:14 -08:00
+								from jax._src import xla_bridge as xb
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								from jax._src.abstract_arrays import array_types
-												Explicitly check for DShapedArray in _to_logical_sharding instead of returning a sharding by default.

											
										
										
											2023-07-12 13:08:57 -04:00
+								from jax._src.core import DShapedArray
-												Remove pxla.OutputType enum class now that the only output can be jax.Array

PiperOrigin-RevId: 517985356

											
										
										
											2023-03-20 09:09:15 -07:00
+								from jax._src.core import ShapedArray
-												migrate internal dependencies from `jax.interpreters.ad` to `jax._src.interpreters.ad`

... in preparation for paring down `jax.interpreters.ad`'s exported symbols.

Includes some import fixups along the way.

PiperOrigin-RevId: 507684262

											
										
										
											2023-02-06 22:51:50 -08:00
+								from jax._src.interpreters import ad
-												migrate internal dependencies from `jax.interpreters.batching` to `jax._src.interpreters.batching`

... in preparation for paring down `jax.interpreters.batching`'s exported symbols.

PiperOrigin-RevId: 508487887

											
										
										
											2023-02-09 15:11:20 -08:00
+								from jax._src.interpreters import batching
-												Move jax.interpreters.partial_eval to jax._src.interpreters.partial_eval.

Also fix up some other internal imports of jax.interpreters.* to use jax._src.interpreters.

PiperOrigin-RevId: 519813664

											
										
										
											2023-03-27 13:29:59 -07:00
+								from jax._src.interpreters import partial_eval as pe
-												migrate internal dependencies from `jax.interpreters.batching` to `jax._src.interpreters.batching`

... in preparation for paring down `jax.interpreters.batching`'s exported symbols.

PiperOrigin-RevId: 508487887

											
										
										
											2023-02-09 15:11:20 -08:00
+								from jax._src.interpreters import mlir
-												Move jax.interpreters.xla to jax._src.interpreters.xla.

Replace jax.interpreters.xla with a shim that re-exports names that are likely to be used externally.

PiperOrigin-RevId: 507895040

											
										
										
											2023-02-07 15:00:56 -08:00
+								from jax._src.interpreters import xla
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								from jax._src.layout import XLACompatibleLayout, SpecifiedLayout, LayoutRequest
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								from jax._src.lib import xla_client as xc
-												[JAX] Introduce `DeviceList` backed by C++ `xla::ifrt::DeviceList`

This change adds `xla_client.DeviceList` that is implemented in C++
`jax::PyDeviceList`. `jax::PyDeviceList` implements the features of
`pxla._DeviceAssignment` as a functional drop-in replacement.
`jax::PyDeviceList` internally has `xla::ifrt::DeviceList`, which will be used
when using IFRT APIs without having to construct a new copy of a potentially
large device list.

`pxla._DeviceAssignment`'s interface is changed slightly to encourage avoiding
conversion to tuple.

Note that for the backward compatibility (and fast `xla_client.Device`
conversion), `jax::PyDeviceList` still uses a Python tuple whose element can be
any Python object matches `xla_client.Device` interface with duck typing. This
duck typing support will be removed when such use case is deprecated.
Eventually, we can try to avoid any type conversion to remove a shadow copy of
device list in JAX.

PiperOrigin-RevId: 555317152

											
										
										
											2023-08-09 16:57:28 -07:00
+								from jax._src.lib import xla_extension_version
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								from jax._src.lib.mlir import ir
 								from jax._src.lib.mlir.dialects import hlo
-												Move PartitionSpec into its own file (jax/_src/partition_spec.py).

No functional changes intended.

A subsequent change will move ParsedPartitionSpec and array mapping utilities here also.

PiperOrigin-RevId: 522393166

											
										
										
											2023-04-06 11:42:45 -07:00
+								from jax._src.partition_spec import PartitionSpec
-												Split sharding_impls into its own Bazel target.

* Move dependencies of sharding_impls into sharding_impls to avoid creating cyclic dependencies.
* Fix a handful of new pytype errors.

PiperOrigin-RevId: 523146076

											
										
										
											2023-04-10 10:15:08 -07:00
+								from jax._src.sharding_impls import (
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								    ArrayMapping, ArrayMappingOrAutoOrUnspecified, AUTO, UNSPECIFIED,
 								    UnspecifiedValue, get_array_mapping as _get_array_mapping, is_auto,
-												Preserve single device NamedSharding/PositionalSharding on the output instead of always return SingleDeviceShardings.

Fixes https://github.com/google/jax/issues/19459

PiperOrigin-RevId: 600999853

											
										
										
											2024-01-23 21:28:33 -08:00
+								    is_unspecified, is_unspecified_or_auto, array_mapping_to_axis_resources,
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								    SingleDeviceSharding, GSPMDSharding)
 								from jax._src.util import (safe_map, safe_zip, partition_list, wrap_name,
 								                           tuple_update, tuple_delete, distributed_debug_log,
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								                           unzip2, HashableFunction, weakref_lru_cache)
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
+								from jax._src.state.types import AbstractRef, RefEffect
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								# Built in Python lists don't support weak refs but subclasses of lists do.
 								class WeakRefList(list):
 								  pass
 								xe = xc._xla
 								unsafe_map, map = map, safe_map  # type: ignore
 								logger = logging.getLogger(__name__)
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								Index = Union[int, slice, tuple[Union[int, slice], ...]]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Split ShardingSpecs and most of the helpers for constructing them into a separate file (jax/_src/sharding_specs.py).

PiperOrigin-RevId: 522360232

											
										
										
											2023-04-06 09:48:14 -07:00
+								NoSharding = sharding_specs.NoSharding
 								Chunked = sharding_specs.Chunked
 								Unstacked = sharding_specs.Unstacked
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Split ShardingSpecs and most of the helpers for constructing them into a separate file (jax/_src/sharding_specs.py).

PiperOrigin-RevId: 522360232

											
										
										
											2023-04-06 09:48:14 -07:00
+								ShardedAxis = sharding_specs.ShardedAxis
 								Replicated = sharding_specs.Replicated
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								AvalDimSharding = Union[Unstacked, Chunked, NoSharding]
-												Split sharding_impls into its own Bazel target.

* Move dependencies of sharding_impls into sharding_impls to avoid creating cyclic dependencies.
* Fix a handful of new pytype errors.

PiperOrigin-RevId: 523146076

											
										
										
											2023-04-10 10:15:08 -07:00
+								Mesh = mesh_lib.Mesh
 								MeshAxisName = sharding_impls.MeshAxisName
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								MeshDimAssignment = Union[ShardedAxis, Replicated]
-												Split ShardingSpecs and most of the helpers for constructing them into a separate file (jax/_src/sharding_specs.py).

PiperOrigin-RevId: 522360232

											
										
										
											2023-04-06 09:48:14 -07:00
+								ShardingSpec = sharding_specs.ShardingSpec
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								### util
 								def identity(x): return x
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								def shard_arg(arg, sharding, canonicalize=True):
-												Move _array_shard_arg helpers from pxla into array.

Refactoring only which fixes a TODO.

Add a canonicalize argument to pxla.shard_arg so we can call that API from array yet  avoid double-canonicalization.

PiperOrigin-RevId: 549658117

											
										
										
											2023-07-20 09:43:40 -07:00
+								  if canonicalize:
 								    arg = xla.canonicalize_dtype(arg)
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								  return shard_arg_handlers[type(arg)](arg, sharding)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								@profiler.annotate_function
-												Add sharding to the signature of shard_args and update
the jax.Array handler unpack to single device arrays after
resharding.

PiperOrigin-RevId: 513624513

											
										
										
											2023-03-02 13:28:25 -08:00
+								def shard_args(
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								    shardings: Sequence[sharding_impls.XLACompatibleSharding], args,
-												Buffer -> Array in some pxla type annotations.

PiperOrigin-RevId: 520975371

											
										
										
											2023-03-31 11:41:49 -07:00
+								) -> Sequence[jax.Array]:
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								  return [shard_arg(arg, shardings[i]) for i, arg in enumerate(args)]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								shard_arg_handlers: dict[Any, Callable[[Any, Any], Any]] = {}
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								@lru_cache(maxsize=1024)
 								def get_addressable_devices_for_shard_arg(
 								    s: sharding_impls.XLACompatibleSharding) -> tuple[xc.Device, ...]:
 								  return s._addressable_device_assignment
 								@lru_cache(maxsize=1024)
 								def _get_replicated_slices(num_addressable_devices: int):
 								  return ((slice(None),),) * num_addressable_devices
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								def _shard_token(x, sharding):
 								  devices = get_addressable_devices_for_shard_arg(sharding)
 								  indices = _get_replicated_slices(len(devices))
-												Remove references to jax.config.jax_array, which is always True at head.

PiperOrigin-RevId: 516970232

											
										
										
											2023-03-15 17:08:21 -07:00
+								  zeros = np.zeros((), dtype=np.dtype(np.bool_))
 								  aval = api_util.shaped_abstractify(zeros)
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								  return batched_device_put(aval, sharding, [zeros for _ in indices], devices)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								shard_arg_handlers[core.Token] = _shard_token
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								def _masked_array_error(x, sharding):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  raise ValueError("numpy masked arrays are not supported as direct inputs to JAX functions. "
 								                   "Use arr.filled() to convert the value to a standard numpy array.")
 								shard_arg_handlers[np.ma.MaskedArray] = _masked_array_error
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								def _shard_array(x, sharding):
 								  devices = get_addressable_devices_for_shard_arg(sharding)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  if x.dtype == dtypes.float0:
 								    x = np.zeros(x.shape, dtype=np.dtype(bool))
-												Remove references to jax.config.jax_array, which is always True at head.

PiperOrigin-RevId: 516970232

											
										
										
											2023-03-15 17:08:21 -07:00
+								  aval = api_util.shaped_abstractify(x)
-												Don't index into numpy array if the sharding is fully replicated.

PiperOrigin-RevId: 615202739

											
										
										
											2024-03-12 16:04:05 -07:00
+								  if sharding.is_fully_replicated:
 								    shards = [x] * len(devices)
 								  else:
 								    indices = tuple(sharding.addressable_devices_indices_map(x.shape).values())
 								    shards = [x[i] for i in indices]
 								  return batched_device_put(aval, sharding, shards, devices)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								for _t in array_types:
 								  shard_arg_handlers[_t] = _shard_array
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								def _shard_darray(x, sharding):
 								  return shard_arg(x._data, sharding)
-												Finish supporing bints and DArrays where the pile tests touch them.

											
										
										
											2023-07-10 15:21:41 -04:00
+								shard_arg_handlers[core.DArray] = _shard_darray
-												[mutable-arrays] move MutableArray, add eager, improve tests, fix bug

1. move MutableArray to core.py, and some handlers to their respective files
2. fix a bug in aliasing setup (it was just broken before, now better test coverage)
3. add eager support by enabling get_p, swap_p, and addupdate_p impls
4. improve tests slightly

											
										
										
											2024-03-01 11:07:45 -08:00
+								def _shard_mutable_array(x, sharding):
 								  return shard_arg(x._buf, sharding)
 								shard_arg_handlers[core.MutableArray] = _shard_mutable_array
-												Split core.py and several files in an SCC with it into a separate Bazel build target.

PiperOrigin-RevId: 520192610

											
										
										
											2023-03-28 18:30:36 -07:00
+								def batched_device_put(aval: core.ShapedArray,
-												If the bufs are on the same devices passed to batched_device_put then create an Array directly rather than going via xc.batched_device_put. Fixing the transfer guard problem should help in removing this workaround too.

PiperOrigin-RevId: 516561791

											
										
										
											2023-03-14 10:19:03 -07:00
+								                       sharding: jax.sharding.Sharding, xs: Sequence[Any],
 								                       devices: Sequence[jax.Device], committed: bool = True):
 								  from jax._src import array
 								  bufs = [x for x, d in safe_zip(xs, devices)
 								          if (isinstance(x, array.ArrayImpl) and
 								              dispatch.is_single_device_sharding(x.sharding) and
-												Deprecate the device() method of JAX arrays

											
										
										
											2023-11-29 16:52:09 -08:00
+								              x.devices() == {d})]
-												If the bufs are on the same devices passed to batched_device_put then create an Array directly rather than going via xc.batched_device_put. Fixing the transfer guard problem should help in removing this workaround too.

PiperOrigin-RevId: 516561791

											
										
										
											2023-03-14 10:19:03 -07:00
+								  if len(bufs) == len(xs):
 								    return array.ArrayImpl(
 								        aval, sharding, bufs, committed=committed, _skip_checks=True)
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								  return xc.batched_device_put(aval, sharding, xs, list(devices), committed)  # type: ignore
-												If the bufs are on the same devices passed to batched_device_put then create an Array directly rather than going via xc.batched_device_put. Fixing the transfer guard problem should help in removing this workaround too.

PiperOrigin-RevId: 516561791

											
										
										
											2023-03-14 10:19:03 -07:00
-												Add a new experimental option jax_pmap_no_rank_reduction.

This option changes the implementation of pmap so that the individual shards have the same rank as the entire array, i.e. in the terminology of pmap using a "chunked" axis instead of an "unstacked" axis.

i.e., previously a typical array used by pmap might have a shape of, say, [8, 100], if sharded across 8 accelerators on its first axis, and each individual shard would have a shape of, say, [100]. With this change, each individual shard has a shape of [1, 100] instead.

Why do this?

The main reason to do this is that XLA's sharding (HloSharding), which is exposed in JAX as GSPMDSharding/NamedSharding/PositionalSharding, cannot represent a change of rank. This means that the kind of sharding used by pmap cannot be represented to XLA as a sharding. If we change the definition of PmapSharding to preserve the array rank instead, then this means that PmapSharding can in the future be represented directly as a kind of sharding known to XLA.

The new definition of PmapSharding will allow a number of internal simplifications to JAX, for example in a subsequent change we can probably delete PmapSharding entirely. This in turn also would allow us to delete the APIs `jax.device_put_replicated` and `jax.device_put_sharded`, which predate the current sharding design.

This change also prepares for an upcoming change where we would like to redefine `pmap` in terms of `jit(shard_map(...))`, allowing us to delete most `pmap` code paths.

Once enabled, this change has the potential to break pmap users who:
a) look at the shards of an array, e.g., via `.addressable_shards`, or `jax.make_array_from_single_device_arrays`, since the shapes of the shards will change.
b) rely on zero-copy behavior in APIs like `jax.device_put_replicated`.

The change is disabled by default, so we do not expect any user visible impacts from this change.

PiperOrigin-RevId: 599787818

											
										
										
											2024-01-19 03:53:01 -08:00
+								def _shard_aval(size, axis: int, aval):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  try:
-												Add a new experimental option jax_pmap_no_rank_reduction.

This option changes the implementation of pmap so that the individual shards have the same rank as the entire array, i.e. in the terminology of pmap using a "chunked" axis instead of an "unstacked" axis.

i.e., previously a typical array used by pmap might have a shape of, say, [8, 100], if sharded across 8 accelerators on its first axis, and each individual shard would have a shape of, say, [100]. With this change, each individual shard has a shape of [1, 100] instead.

Why do this?

The main reason to do this is that XLA's sharding (HloSharding), which is exposed in JAX as GSPMDSharding/NamedSharding/PositionalSharding, cannot represent a change of rank. This means that the kind of sharding used by pmap cannot be represented to XLA as a sharding. If we change the definition of PmapSharding to preserve the array rank instead, then this means that PmapSharding can in the future be represented directly as a kind of sharding known to XLA.

The new definition of PmapSharding will allow a number of internal simplifications to JAX, for example in a subsequent change we can probably delete PmapSharding entirely. This in turn also would allow us to delete the APIs `jax.device_put_replicated` and `jax.device_put_sharded`, which predate the current sharding design.

This change also prepares for an upcoming change where we would like to redefine `pmap` in terms of `jit(shard_map(...))`, allowing us to delete most `pmap` code paths.

Once enabled, this change has the potential to break pmap users who:
a) look at the shards of an array, e.g., via `.addressable_shards`, or `jax.make_array_from_single_device_arrays`, since the shapes of the shards will change.
b) rely on zero-copy behavior in APIs like `jax.device_put_replicated`.

The change is disabled by default, so we do not expect any user visible impacts from this change.

PiperOrigin-RevId: 599787818

											
										
										
											2024-01-19 03:53:01 -08:00
+								    return _shard_aval_handlers[type(aval)](size, axis, aval)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  except KeyError as err:
-												Add a new experimental option jax_pmap_no_rank_reduction.

This option changes the implementation of pmap so that the individual shards have the same rank as the entire array, i.e. in the terminology of pmap using a "chunked" axis instead of an "unstacked" axis.

i.e., previously a typical array used by pmap might have a shape of, say, [8, 100], if sharded across 8 accelerators on its first axis, and each individual shard would have a shape of, say, [100]. With this change, each individual shard has a shape of [1, 100] instead.

Why do this?

The main reason to do this is that XLA's sharding (HloSharding), which is exposed in JAX as GSPMDSharding/NamedSharding/PositionalSharding, cannot represent a change of rank. This means that the kind of sharding used by pmap cannot be represented to XLA as a sharding. If we change the definition of PmapSharding to preserve the array rank instead, then this means that PmapSharding can in the future be represented directly as a kind of sharding known to XLA.

The new definition of PmapSharding will allow a number of internal simplifications to JAX, for example in a subsequent change we can probably delete PmapSharding entirely. This in turn also would allow us to delete the APIs `jax.device_put_replicated` and `jax.device_put_sharded`, which predate the current sharding design.

This change also prepares for an upcoming change where we would like to redefine `pmap` in terms of `jit(shard_map(...))`, allowing us to delete most `pmap` code paths.

Once enabled, this change has the potential to break pmap users who:
a) look at the shards of an array, e.g., via `.addressable_shards`, or `jax.make_array_from_single_device_arrays`, since the shapes of the shards will change.
b) rely on zero-copy behavior in APIs like `jax.device_put_replicated`.

The change is disabled by default, so we do not expect any user visible impacts from this change.

PiperOrigin-RevId: 599787818

											
										
										
											2024-01-19 03:53:01 -08:00
+								    raise TypeError(f"No _shard_aval handler for type: {type(aval)}") from err
 								_shard_aval_handlers: dict[type[core.AbstractValue], Callable[[int, int, Any], Any]] = {}
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								def _shard_abstract_array(size, axis: int, x):
 								  try:
 								    if x.shape[axis] != size:
 								      raise ValueError(f"Axis size {size} does not match dimension {axis} of "
 								                       f"shape {x.shape}")
 								  except IndexError:
 								    raise ValueError("Cannot split a {x.dim}D value along axis {axis}") from None
-												Add a new experimental option jax_pmap_no_rank_reduction.

This option changes the implementation of pmap so that the individual shards have the same rank as the entire array, i.e. in the terminology of pmap using a "chunked" axis instead of an "unstacked" axis.

i.e., previously a typical array used by pmap might have a shape of, say, [8, 100], if sharded across 8 accelerators on its first axis, and each individual shard would have a shape of, say, [100]. With this change, each individual shard has a shape of [1, 100] instead.

Why do this?

The main reason to do this is that XLA's sharding (HloSharding), which is exposed in JAX as GSPMDSharding/NamedSharding/PositionalSharding, cannot represent a change of rank. This means that the kind of sharding used by pmap cannot be represented to XLA as a sharding. If we change the definition of PmapSharding to preserve the array rank instead, then this means that PmapSharding can in the future be represented directly as a kind of sharding known to XLA.

The new definition of PmapSharding will allow a number of internal simplifications to JAX, for example in a subsequent change we can probably delete PmapSharding entirely. This in turn also would allow us to delete the APIs `jax.device_put_replicated` and `jax.device_put_sharded`, which predate the current sharding design.

This change also prepares for an upcoming change where we would like to redefine `pmap` in terms of `jit(shard_map(...))`, allowing us to delete most `pmap` code paths.

Once enabled, this change has the potential to break pmap users who:
a) look at the shards of an array, e.g., via `.addressable_shards`, or `jax.make_array_from_single_device_arrays`, since the shapes of the shards will change.
b) rely on zero-copy behavior in APIs like `jax.device_put_replicated`.

The change is disabled by default, so we do not expect any user visible impacts from this change.

PiperOrigin-RevId: 599787818

											
										
										
											2024-01-19 03:53:01 -08:00
+								  if config.pmap_no_rank_reduction.value:
 								    return x.update(shape=tuple_update(x.shape, axis, 1))
 								  else:
 								    return x.update(shape=tuple_delete(x.shape, axis))
 								_shard_aval_handlers[ShapedArray] = _shard_abstract_array
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								def local_aval_to_result_handler(
 								    aval: core.AbstractValue,
-												Split Sharding subclasses out of _src/sharding.py into _src/sharding_impls.py

By defining the Sharding base class in its own module, we can pull it out into a separate Bazel submodule, which will help pytype inference when defining Array.

PiperOrigin-RevId: 516223009

											
										
										
											2023-03-13 08:49:39 -07:00
+								    sharding: sharding_impls.XLACompatibleSharding,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								    indices: tuple[Index, ...] | None,
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								) -> Callable[[list[xc.ArrayImpl]], Any]:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  """Returns a function for handling the raw buffers of a single output aval.
 								  Args:
 								    aval: The local output AbstractValue.
 								    sharding_spec: Indicates how the output is sharded across devices, or None
 								      for non-array avals.
 								    indices: The pre-computed result of spec_to_indices, or None for non-array
 								      avals.
 								  Returns:
 								    A function for handling the Buffers that will eventually be produced
 								    for this output. The function will return an object suitable for returning
-												Replace references to DeviceArray with Array.

A number of stale references are lurking in our documentation.

											
										
										
											2023-08-18 16:50:36 -04:00
+								    to the user, e.g. an Array.
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  """
 								  try:
-												Remove pxla.OutputType enum class now that the only output can be jax.Array

PiperOrigin-RevId: 517985356

											
										
										
											2023-03-20 09:09:15 -07:00
+								    return local_result_handlers[(type(aval))](aval, sharding, indices)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  except KeyError as err:
 								    raise TypeError(
 								        f"No pxla_result_handler for type: {type(aval)}") from err
-												Merge pull request #15303 from jakevdp:lax-asarray

PiperOrigin-RevId: 520717999

											
										
										
											2023-03-30 20:11:11 +00:00
+								PxlaResultHandler = Callable[..., Callable[[Any], Any]]
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								local_result_handlers: dict[type[core.AbstractValue], PxlaResultHandler] = {}
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								def global_aval_to_result_handler(
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								    aval: core.AbstractValue, out_sharding, committed: bool
-												Buffer -> Array in some pxla type annotations.

PiperOrigin-RevId: 520975371

											
										
										
											2023-03-31 11:41:49 -07:00
+								) -> Callable[[Sequence[xc.ArrayImpl]], Any]:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  """Returns a function for handling the raw buffers of a single output aval.
 								  Args:
 								    aval: The global output AbstractValue.
 								    out_axis_resources: A PartitionSpec specifying the sharding of outputs.
 								      Used for creating GSDAs.
 								    global_mesh: The global device mesh that generated this output. Used
 								      for creating GSDAs.
 								  Returns:
 								    A function for handling the Buffers that will eventually be produced
 								    for this output. The function will return an object suitable for returning
-												Replace references to DeviceArray with Array.

A number of stale references are lurking in our documentation.

											
										
										
											2023-08-18 16:50:36 -04:00
+								    to the user, e.g. an Array.
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  """
 								  try:
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								    return global_result_handlers[type(aval)](aval, out_sharding, committed)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  except KeyError as err:
 								    raise TypeError(
 								        f"No pxla_result_handler for type: {type(aval)}") from err
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								global_result_handlers: dict[type[core.AbstractValue], PxlaResultHandler] = {}
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								### lazy device-memory persistence and result handling
 								### the xla_pmap primitive and its rules are comparable to xla_call in xla.py
 								def xla_pmap_impl_lazy(
 								    fun: lu.WrappedFun,
 								    *args,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								    backend: str | None,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    axis_name: core.AxisName,
 								    axis_size: int,
 								    global_axis_size: int,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								    devices: Sequence[Any] | None,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    name: str,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								    in_axes: Sequence[int | None],
 								    out_axes_thunk: Callable[[], Sequence[int | None]],
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    donated_invars: Sequence[bool],
 								    is_explicit_global_axis_size: bool,
-												Fix pytype errors.

PiperOrigin-RevId: 509984207

											
										
										
											2023-02-15 18:11:55 -08:00
+								) -> Callable:
-												Migrate a subset of internal modules to use state objects

The motivation here is to gradually replace all dynamic lookups on `jax.config`
with statically-typed state objects, which are more type checker/IDE friendly.

PiperOrigin-RevId: 571932143

											
										
										
											2023-10-09 07:28:18 -07:00
+								  if (config.disable_jit.value and config.eager_pmap.value and
-												Remove global_arg_shapes from pmap since it was only used for sharded_jit and sharded_jit was removed from JAX a long time ago

PiperOrigin-RevId: 520356179

											
										
										
											2023-03-29 09:22:34 -07:00
+								      not is_explicit_global_axis_size and not any(d for d in donated_invars)):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    def _emap_apply_fn(*args):
 								      return _emap_impl(fun, *args, backend=backend, axis_name=axis_name,
 								                        axis_size=axis_size, global_axis_size=global_axis_size,
 								                        devices=devices, name=name, in_axes=in_axes,
 								                        out_axes_thunk=out_axes_thunk,
 								                        donated_invars=donated_invars,
 								                        is_explicit_global_axis_size=is_explicit_global_axis_size)
 								    return _emap_apply_fn
 								  abstract_args = unsafe_map(xla.abstractify, args)
 								  compiled_fun, fingerprint = parallel_callable(
 								      fun, backend, axis_name, axis_size, global_axis_size, devices, name,
-												Remove global_arg_shapes from pmap since it was only used for sharded_jit and sharded_jit was removed from JAX a long time ago

PiperOrigin-RevId: 520356179

											
										
										
											2023-03-29 09:22:34 -07:00
+								      in_axes, out_axes_thunk, donated_invars,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      is_explicit_global_axis_size, *abstract_args)
 								  # Don't re-abstractify args unless logging is enabled for performance.
-												Migrate a subset of internal modules to use state objects

The motivation here is to gradually replace all dynamic lookups on `jax.config`
with statically-typed state objects, which are more type checker/IDE friendly.

PiperOrigin-RevId: 571932143

											
										
										
											2023-10-09 07:28:18 -07:00
+								  if config.distributed_debug.value:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    distributed_debug_log(("Running pmapped function", name),
 								                          ("python function", fun.f),
 								                          ("devices", devices),
 								                          ("abstract args", map(xla.abstractify, args)),
 								                          ("fingerprint", fingerprint))
 								  return compiled_fun
 								def xla_pmap_impl(fun: lu.WrappedFun, *args, **params):
 								  compiled_fun = xla_pmap_impl_lazy(fun, *args, **params)
 								  return compiled_fun(*args)
 								class EmapInfo(NamedTuple):
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								  backend: str | None
 								  devices: Sequence[Any] | None
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								def _emap_impl(fun: lu.WrappedFun, *args,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								               backend: str | None,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								               axis_name: core.AxisName,
 								               axis_size: int,
 								               global_axis_size: int,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								               devices: Sequence[Any] | None,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								               name: str,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								               in_axes: Sequence[int | None],
 								               out_axes_thunk: Callable[[], Sequence[int | None]],
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								               donated_invars: Sequence[bool],
 								               is_explicit_global_axis_size: bool,
 								               ):
 								  from jax._src import array
 								  # TODO(sharadmv,mattjj): implement these cases
 								  if any(d for d in donated_invars):
 								    raise NotImplementedError("Buffer donation not supported in eager pmap.")
 								  if is_explicit_global_axis_size:
 								    raise NotImplementedError("Non-default global_axis_size not supported in "
 								                              "eager pmap.")
 								  emap_info = EmapInfo(backend, devices)
 								  shard_axes = [{} if in_axis is None else {axis_name: in_axis} for in_axis in in_axes]
 								  with core.new_base_main(MapTrace, emap_info=emap_info) as main:
 								    with core.new_sublevel(), core.extend_axis_env(axis_name, axis_size, main):
 								      t = main.with_cur_sublevel()
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
+								      tracers = [MapTracer(t, arg, s) for arg, s in zip(args, shard_axes)]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      ans = fun.call_wrapped(*tracers)
 								      out_tracers = map(t.full_raise, ans)
 								      outvals, out_axes_src = unzip2((t.val, t.shard_axes) for t in out_tracers)
 								    del main
 								  out_axes = out_axes_thunk()
 								  platform = xb.get_backend(backend).platform
 								  donate_argnums = (1,) if platform in {"cuda", "rocm", "tpu"} else ()
 								  new_outvals = []
 								  for out_axis_src, out_axis, outval in zip(out_axes_src, out_axes, outvals):
 								    with jax.disable_jit(False):
 								      donate_argnums_ = donate_argnums
-												[JAX] Delete ShardedDeviceArray.

Replace it with a temporary shim that is Any to type checkers and an uninstantiatable class at runtime.

PiperOrigin-RevId: 518074394

											
										
										
											2023-03-20 14:17:25 -07:00
+								      if isinstance(outval, array.ArrayImpl):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								        # We don't want to donate if it's already sharded.
 								        donate_argnums_ = ()
 								      out = jax.pmap(
 								          lambda _, x: x,
 								          in_axes=(0, out_axis_src.get(axis_name)),
 								          out_axes=out_axis,
 								          devices=(None if devices is None else list(devices)),
 								          backend=backend,
 								          donate_argnums=donate_argnums_)(np.arange(axis_size), outval)
 								      new_outvals.append(out)
 								  return new_outvals
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								def _map_schedule(idx: tuple[int | None, ...]) -> tuple[int | None, ...]:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  # In order to do a multi-map (a simultaneous map over several axes), we will
 								  # nest several maps. Each time we do a map, we "remove" an input axis so we
 								  # need to update the remaining map axes. For example, if we are to map over
 								  # the axes 0, 3, and 4, we make three calls to pmap with in_axes as 0, 2, 2.
 								  return tuple(None if i is None else
 								               i - sum(j is not None and j < i for j in idx[:l])
 								               for l, i in enumerate(idx))
 								# We're often creating `f`s on the fly and we try to carefully make them have
 								# the right __hash__ and __eq__. However, despite our attempts pmap's caching
 								# still ends up not working, because it has a separate cache per
 								# _function object_. Adding this annotation here lets us reuse the same pmap
 								# callable for all equivalent primitive pmaps.
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								@lru_cache
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								def _multi_pmap(f: Callable, info: EmapInfo, names: list[core.AxisName],
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								                all_axes: list[tuple[int | None, ...]]
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								                ) -> tuple[Callable, dict[core.AxisName, int]]:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  used_names = []
 								  for i, name in reversed(list(enumerate(names))):
 								    in_axes = tuple(arg_axis[i] for arg_axis in all_axes)
 								    if any(in_axis is not None for in_axis in in_axes):
 								      f = jax.pmap(
 								          f,
 								          in_axes=in_axes,
 								          axis_name=name,
 								          out_axes=0,
 								          backend=info.backend,
 								          devices=(None if info.devices is None else list(info.devices)))
 								      used_names.append(name)
 								  out_shard_axes = {name: i for i, name in enumerate(reversed(used_names))}
 								  return f, out_shard_axes
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								FakePrimitive = namedtuple("FakePrimitive", ["multiple_results", "bind"])
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								class MapTrace(core.Trace):
 								  def __init__(self, *args, emap_info):
 								    super().__init__(*args)
 								    self.emap_info = emap_info
 								  def pure(self, val):
 								    return MapTracer(self, val, {})
 								  def sublift(self, tracer):
 								    return MapTracer(self, tracer.val, tracer.shard_axes)
 								  def process_primitive(self, primitive, tracers, params):
 								    info = self.main.payload["emap_info"]
 								    vals, shard_axes = unzip2([(t.val, t.shard_axes) for t in tracers])
 								    names = tuple(f.name for f in core.thread_local_state.trace_state.axis_env
 								                  if f.main_trace is self.main)
 								    all_axes = tuple(_map_schedule(map(s.get, names)) for s in shard_axes)  # pytype: disable=wrong-arg-types  # always-use-return-annotations
 								    f = HashableFunction(lambda *args: primitive.bind(*args, **params),
 								                         (primitive, tuple(params.items())))
 								    f_mapped, out_shard_axes = _multi_pmap(f, info, names, all_axes)
 								    with core.eval_context(), jax.disable_jit(False):
 								      outvals = f_mapped(*vals)
 								    if primitive.multiple_results:
 								      return [MapTracer(self, val, out_shard_axes) for val in outvals]
 								    return MapTracer(self, outvals, out_shard_axes)
 								  def process_call(self, call_primitive, fun, tracers, params):
-												Deprecated xla_call_p since it has been replaced with pjit.pjit_p

PiperOrigin-RevId: 518921538

											
										
										
											2023-03-23 11:43:49 -07:00
+								    raise NotImplementedError
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												[shard-map] better errors for not-implemented-in-eager features

											
										
										
											2023-04-08 21:12:40 -07:00
+								  def process_map(self, map_primitive, fun, tracers, params):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    if params['devices'] is not None:
 								      raise ValueError("Nested pmap with explicit devices argument.")
-												Migrate a subset of internal modules to use state objects

The motivation here is to gradually replace all dynamic lookups on `jax.config`
with statically-typed state objects, which are more type checker/IDE friendly.

PiperOrigin-RevId: 571932143

											
										
										
											2023-10-09 07:28:18 -07:00
+								    if not config.disable_jit.value:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      bind = HashableFunction(
-												[shard-map] better errors for not-implemented-in-eager features

											
										
										
											2023-04-08 21:12:40 -07:00
+								          lambda *args, **kwargs: map_primitive.bind(fun, *args, **kwargs),
 								          (map_primitive, fun))
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								      fake_primitive = FakePrimitive(multiple_results=True, bind=bind)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      return self.process_primitive(fake_primitive, tracers, params)
 								    axis_name, in_axes, out_axes_thunk, axis_size = (params["axis_name"],
 								        params["in_axes"], params["out_axes_thunk"], params["axis_size"])
-												fix a bug with eager pmap + vmap + custom_jvp interaction

I used the same implementation technique in shard_map.py, e.g. in ShardMapTrace.process_custom_jvp_call, and it's sound, whereas I can't remember why we implementd the eager pmap stuff the way we did.

This fixes an internal test, but unfortunately I wasn't able to figure out a simple repro :/

											
										
										
											2023-12-14 12:41:56 -08:00
+								    vals, shard_axes = unzip2((t.val, t.shard_axes) for t in tracers)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    shard_axes = [{axis_name: _annot_to_flat(np.ndim(v), s.values(), ax), **s}
 								                  if ax is not None else s
 								                  for v, ax, s in zip(vals, in_axes, shard_axes)]
-												fix a bug with eager pmap + vmap + custom_jvp interaction

I used the same implementation technique in shard_map.py, e.g. in ShardMapTrace.process_custom_jvp_call, and it's sound, whereas I can't remember why we implementd the eager pmap stuff the way we did.

This fixes an internal test, but unfortunately I wasn't able to figure out a simple repro :/

											
										
										
											2023-12-14 12:41:56 -08:00
+								    # TODO(mattjj): use _emap_subtrace here?
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    with core.new_sublevel(), core.extend_axis_env(axis_name, axis_size, self.main):
 								      t = self.main.with_cur_sublevel()
 								      in_tracers = map(partial(MapTracer, t), vals, shard_axes)
 								      ans = fun.call_wrapped(*in_tracers)
 								      out_tracers = map(t.full_raise, ans)
 								      out, outaxes = unzip2((t.val, t.shard_axes) for t in out_tracers)
 								      del t, in_tracers, ans, out_tracers
 								    out, outaxes = unzip2(_match_annot(axis_name, axis_size, v, s, dst)
 								                           for v, s, dst in zip(out, outaxes, out_axes_thunk()))
 								    return map(partial(MapTracer, self), out, outaxes)
-												[shard-map] better errors for not-implemented-in-eager features

											
										
										
											2023-04-08 21:12:40 -07:00
+								  def process_custom_jvp_call(self, prim, fun, jvp, tracers, *, symbolic_zeros):
-												fix a bug with eager pmap + vmap + custom_jvp interaction

I used the same implementation technique in shard_map.py, e.g. in ShardMapTrace.process_custom_jvp_call, and it's sound, whereas I can't remember why we implementd the eager pmap stuff the way we did.

This fixes an internal test, but unfortunately I wasn't able to figure out a simple repro :/

											
										
										
											2023-12-14 12:41:56 -08:00
+								    if symbolic_zeros:
 								      msg = ("custom_jvp with symbolic_zeros=True not supported with eager pmap. "
 								             "Please open an issue at https://github.com/google/jax/issues !")
 								      raise NotImplementedError(msg)
 								    del prim, jvp, symbolic_zeros  # always base main, can drop jvp
 								    in_vals, in_axes = unzip2((t.val, t.shard_axes) for t in tracers)
 								    fun, out_axes = _emap_subtrace(fun, self.main, in_axes)
 								    with core.new_sublevel():
 								      out_vals = fun.call_wrapped(*in_vals)
 								    return map(partial(MapTracer, self), out_vals, out_axes())
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								  def process_custom_vjp_call(self, primitive, fun, fwd, bwd, tracers,
-												`custom_vjp` symbolic zeros support, take two

This change re-introduces symbolic zero support for `custom_vjp`.

This time:

* The forward rule API is slightly different, accepting two-field
  records at pytree leaves rather than pairs.

* In the default setting where symbolic_zeros is not set, there are no
  new requirements from pytree node definitions that are involved in
  the primal arguments. This avoids any change in behavior on the
  default path. In particular, custom pytree node definitions that
  aren't completely polymorphic in unflattening can remain as is.

* There is an additional test involving a custom pytree node.

											
										
										
											2023-03-24 14:42:19 -07:00
+								                              out_trees, symbolic_zeros):
-												fix a bug with eager pmap + vmap + custom_jvp interaction

I used the same implementation technique in shard_map.py, e.g. in ShardMapTrace.process_custom_jvp_call, and it's sound, whereas I can't remember why we implementd the eager pmap stuff the way we did.

This fixes an internal test, but unfortunately I wasn't able to figure out a simple repro :/

											
										
										
											2023-12-14 12:41:56 -08:00
+								    if symbolic_zeros:
 								      msg = ("custom_vjp with symbolic_zeros=True not supported with eager pmap. "
 								             "Please open an issue at https://github.com/google/jax/issues !")
 								      raise NotImplementedError(msg)
 								    del primitive, fwd, bwd, out_trees, symbolic_zeros  # always base main, drop vjp
 								    in_vals, in_axes = unzip2((t.val, t.shard_axes) for t in tracers)
 								    fun, out_axes = _emap_subtrace(fun, self.main, in_axes)
 								    with core.new_sublevel():
 								      out_vals = fun.call_wrapped(*in_vals)
 								    return map(partial(MapTracer, self), out_vals, out_axes())
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								  def process_axis_index(self, frame):
 								    bind = HashableFunction(
 								        lambda _: jax.lax.axis_index(frame.name),
 								        (jax.lax.axis_index, frame.name))
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								    fake_primitive = FakePrimitive(multiple_results=False, bind=bind)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    with core.eval_context():
 								      range = jax.lax.iota(np.int32, frame.size)
 								    dummy_tracer = MapTracer(self, range, {frame.name: 0})
 								    return self.process_primitive(fake_primitive, (dummy_tracer,), {})
-												fix a bug with eager pmap + vmap + custom_jvp interaction

I used the same implementation technique in shard_map.py, e.g. in ShardMapTrace.process_custom_jvp_call, and it's sound, whereas I can't remember why we implementd the eager pmap stuff the way we did.

This fixes an internal test, but unfortunately I wasn't able to figure out a simple repro :/

											
										
										
											2023-12-14 12:41:56 -08:00
+								@lu.transformation_with_aux
 								def _emap_subtrace(main, in_axes, *in_vals):
 								  t = main.with_cur_sublevel()
 								  in_tracers = map(partial(MapTracer, t), in_vals, in_axes)
 								  ans = yield in_tracers, {}
 								  out_tracers = map(t.full_raise, ans)
 								  out_vals, out_axes = unzip2((t.val, t.shard_axes) for t in out_tracers)
 								  del t, in_tracers, ans, out_tracers
 								  yield out_vals, out_axes
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								def _annot_to_flat(ndim: int, mapped_axes: Iterable[int],
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								                 annotation: int | None) -> int | None:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  if annotation is None: return None
 								  mapped_axes_ = set(mapped_axes)
 								  return [i for i in range(ndim) if i not in mapped_axes_][annotation]
 								def _match_annot(axis_name: core.AxisName, axis_size: int, val: Any,
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								                 shard_axis_src: dict[core.AxisName, int],
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								                 dst_annotation: int | None
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								                 ) -> tuple[Any, dict[core.AxisName, int]]:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  shard_axis_out = dict(shard_axis_src)
 								  src = shard_axis_out.pop(axis_name, None)
 								  dst = _annot_to_flat(np.ndim(val) + (src is None), shard_axis_out.values(),
 								                       dst_annotation)
 								  with core.eval_context():
 								    if src == dst:
 								      outval = val
 								    elif type(src) == type(dst) == int:
 								      outval = batching.moveaxis(val, src, dst)
 								      shard_axis_out = _moveaxis(np.ndim(val), shard_axis_src, src, dst)
 								    elif src is None and dst is not None:
 								      outval = batching.broadcast(val, axis_size, dst)
 								      shard_axis_out = {n: d + (dst <= d) for n, d in shard_axis_out.items()}
 								    else:
 								      raise NotImplementedError
 								  return outval, shard_axis_out
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								def _moveaxis(ndim: int, shard_axes: dict[core.AxisName, int],
 								              src: int, dst: int) -> dict[core.AxisName, int]:
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								  lst: list[core.AxisName | None] = [None] * ndim
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  for k, v in shard_axes.items():
 								    lst[v] = k
 								  name = lst.pop(src)
 								  lst.insert(dst - (src < dst), name)
 								  return {name: i for i, name in enumerate(lst) if name is not None}
 								class MapTracer(core.Tracer):
 								  __slots__ = ["val", "shard_axes"]
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								  def __init__(self, trace: MapTrace, val, shard_axes: dict[core.AxisName, int]):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    self._trace = trace
 								    self.val = val
 								    self.shard_axes = shard_axes
 								    assert all(val < self.val.ndim for val in self.shard_axes.values())
 								  @property
 								  def aval(self):
 								    aval = xla.abstractify(self.val)
 								    shard_axes = dict(self.shard_axes)
 								    for axis_idx in sorted(shard_axes.values())[::-1]:
 								      aval = core.mapped_aval(aval.shape[axis_idx], axis_idx, aval)
 								    return aval
 								  def full_lower(self):
 								    return self
 								  def __str__(self):
 								    named_axes = [f"{k}={v}" for k, v in self.shard_axes.items()]
 								    return f"{self.val}{{{','.join(named_axes)}}}"
 								@lu.cache
 								def parallel_callable(fun: lu.WrappedFun,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								                      backend_name: str | None,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								                      axis_name: core.AxisName,
 								                      axis_size: int,
 								                      global_axis_size: int,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								                      devices: Sequence[Any] | None,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								                      name: str,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								                      in_axes: Sequence[int | None],
 								                      out_axes_thunk: Callable[[], Sequence[int | None]],
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								                      donated_invars: Sequence[bool],
 								                      is_explicit_global_axis_size: bool,
 								                      *avals):
 								  pmap_computation = lower_parallel_callable(
 								      fun, backend_name, axis_name, axis_size, global_axis_size, devices, name,
-												Remove global_arg_shapes from pmap since it was only used for sharded_jit and sharded_jit was removed from JAX a long time ago

PiperOrigin-RevId: 520356179

											
										
										
											2023-03-29 09:22:34 -07:00
+								      in_axes, out_axes_thunk, donated_invars,
-												Introduce a LoweringParameters dataclass for easier plumbing

There are currently two parameters that are used to configure
lowering: lowering_platform (for cross-platform lowering), and
override_lowering_rules. Each of them are passed as separate arguments
through several layers of lowering internal functions. This is tedious,
and error prone. In fact, override_lowering_rules was not plumbed
in all places, and due to using default arguments in all places,
this leads to silent errors.

We foresee introducing other parameters for lowering: for multi-platform
lowering, for controlling the lowering of effects.

Here is pack all such parameters into a `mlir.LoweringParameters`
dataclass and we plumb that through.

											
										
										
											2023-09-28 12:44:14 +02:00
+								      is_explicit_global_axis_size, avals,
 								      lowering_parameters=mlir.LoweringParameters())
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  pmap_executable = pmap_computation.compile()
 								  return WeakRefList([pmap_executable.unsafe_call, pmap_executable.fingerprint])
 								@dataclasses.dataclass(frozen=True)
 								class ParallelCallableInfo:
 								  name: str
-												Remove more exported names from jax.interpreters.xla.

None of these appear to have public users, and this module is not included in the deprecation policy.

Also:
* shorten a number of alias chains.
* move make_op_metadata() into its only caller in jax2tf
* delete the unused function dtype_to_primitive_type.
PiperOrigin-RevId: 510205315

											
										
										
											2023-02-16 11:54:25 -08:00
+								  backend: xc.Client
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  axis_name: core.AxisName
 								  axis_size: int
 								  global_axis_size: int
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								  devices: Sequence[xc.Device] | None
 								  in_axes: Iterable[int | None]
 								  out_axes_thunk: Callable[[], Sequence[int | None]]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  avals: Sequence[core.AbstractValue]
 								  @cached_property
 								  def local_devices(self):
 								    if self.devices:
 								      out = [d for d in self.devices
 								             if d.process_index == xb.process_index(self.backend)]
 								      assert len(out) > 0
 								    else:
 								      out = None  # type: ignore
 								    return out
 								  @cached_property
 								  def out_axes(self):
 								    return self.out_axes_thunk()
 								class ShardInfo(NamedTuple):
 								  sharded_avals: Sequence[core.AbstractValue]
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								  out_sharded_avals: Sequence[core.ShapedArray]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  global_sharded_avals: Sequence[core.AbstractValue]
 								  num_local_shards: int
 								  num_global_shards: int
 								class ReplicaInfo(NamedTuple):
 								  jaxpr_replicas: int
 								  num_local_replicas: int
 								  num_global_replicas: int
-												Add more return type annotations.

Fix a new pytype error by adding a checked cast.

PiperOrigin-RevId: 523780354

											
										
										
											2023-04-12 12:53:32 -07:00
+								def find_replicas(
 								    jaxpr: core.Jaxpr, axis_size: int, global_axis_size: int
 								) -> ReplicaInfo:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  # TODO(skyewm): replace this with a chain of pmaps and/or sharded_jits
 								  jaxpr_replicas = dispatch.jaxpr_replicas(jaxpr)
 								  num_local_replicas = axis_size * jaxpr_replicas
 								  num_global_replicas = global_axis_size * jaxpr_replicas
 								  return ReplicaInfo(jaxpr_replicas, num_local_replicas, num_global_replicas)
-												Add a new experimental option jax_pmap_no_rank_reduction.

This option changes the implementation of pmap so that the individual shards have the same rank as the entire array, i.e. in the terminology of pmap using a "chunked" axis instead of an "unstacked" axis.

i.e., previously a typical array used by pmap might have a shape of, say, [8, 100], if sharded across 8 accelerators on its first axis, and each individual shard would have a shape of, say, [100]. With this change, each individual shard has a shape of [1, 100] instead.

Why do this?

The main reason to do this is that XLA's sharding (HloSharding), which is exposed in JAX as GSPMDSharding/NamedSharding/PositionalSharding, cannot represent a change of rank. This means that the kind of sharding used by pmap cannot be represented to XLA as a sharding. If we change the definition of PmapSharding to preserve the array rank instead, then this means that PmapSharding can in the future be represented directly as a kind of sharding known to XLA.

The new definition of PmapSharding will allow a number of internal simplifications to JAX, for example in a subsequent change we can probably delete PmapSharding entirely. This in turn also would allow us to delete the APIs `jax.device_put_replicated` and `jax.device_put_sharded`, which predate the current sharding design.

This change also prepares for an upcoming change where we would like to redefine `pmap` in terms of `jit(shard_map(...))`, allowing us to delete most `pmap` code paths.

Once enabled, this change has the potential to break pmap users who:
a) look at the shards of an array, e.g., via `.addressable_shards`, or `jax.make_array_from_single_device_arrays`, since the shapes of the shards will change.
b) rely on zero-copy behavior in APIs like `jax.device_put_replicated`.

The change is disabled by default, so we do not expect any user visible impacts from this change.

PiperOrigin-RevId: 599787818

											
										
										
											2024-01-19 03:53:01 -08:00
+								@lu.transformation
 								def _change_argument_ranks(in_axes, out_axes_thunk, *args):
 								  args = tuple(
 								      arg if in_axis is None else jax.lax.squeeze(arg, dimensions=(in_axis,))
 								      for in_axis, arg in zip(in_axes, args)
 								  )
 								  results = yield (args, {})
 								  out_axes = out_axes_thunk()
 								  yield tuple(
 								      x if axis is None else jax.lax.expand_dims(x, dimensions=(axis,))
 								      for x, axis in zip(results, out_axes)
 								  )
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								def stage_parallel_callable(
-												Add more return type annotations.

Fix a new pytype error by adding a checked cast.

PiperOrigin-RevId: 523780354

											
										
										
											2023-04-12 12:53:32 -07:00
+								    pci: ParallelCallableInfo, fun: lu.WrappedFun
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								) -> tuple[core.Jaxpr, list[Any], ReplicaInfo, ShardInfo]:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  sharded_avals = tuple(
-												Add a new experimental option jax_pmap_no_rank_reduction.

This option changes the implementation of pmap so that the individual shards have the same rank as the entire array, i.e. in the terminology of pmap using a "chunked" axis instead of an "unstacked" axis.

i.e., previously a typical array used by pmap might have a shape of, say, [8, 100], if sharded across 8 accelerators on its first axis, and each individual shard would have a shape of, say, [100]. With this change, each individual shard has a shape of [1, 100] instead.

Why do this?

The main reason to do this is that XLA's sharding (HloSharding), which is exposed in JAX as GSPMDSharding/NamedSharding/PositionalSharding, cannot represent a change of rank. This means that the kind of sharding used by pmap cannot be represented to XLA as a sharding. If we change the definition of PmapSharding to preserve the array rank instead, then this means that PmapSharding can in the future be represented directly as a kind of sharding known to XLA.

The new definition of PmapSharding will allow a number of internal simplifications to JAX, for example in a subsequent change we can probably delete PmapSharding entirely. This in turn also would allow us to delete the APIs `jax.device_put_replicated` and `jax.device_put_sharded`, which predate the current sharding design.

This change also prepares for an upcoming change where we would like to redefine `pmap` in terms of `jit(shard_map(...))`, allowing us to delete most `pmap` code paths.

Once enabled, this change has the potential to break pmap users who:
a) look at the shards of an array, e.g., via `.addressable_shards`, or `jax.make_array_from_single_device_arrays`, since the shapes of the shards will change.
b) rely on zero-copy behavior in APIs like `jax.device_put_replicated`.

The change is disabled by default, so we do not expect any user visible impacts from this change.

PiperOrigin-RevId: 599787818

											
										
										
											2024-01-19 03:53:01 -08:00
+								      _shard_aval(pci.axis_size, axis, aval) if axis is not None else aval
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      for axis, aval in safe_zip(pci.in_axes, pci.avals))
-												Add a new experimental option jax_pmap_no_rank_reduction.

This option changes the implementation of pmap so that the individual shards have the same rank as the entire array, i.e. in the terminology of pmap using a "chunked" axis instead of an "unstacked" axis.

i.e., previously a typical array used by pmap might have a shape of, say, [8, 100], if sharded across 8 accelerators on its first axis, and each individual shard would have a shape of, say, [100]. With this change, each individual shard has a shape of [1, 100] instead.

Why do this?

The main reason to do this is that XLA's sharding (HloSharding), which is exposed in JAX as GSPMDSharding/NamedSharding/PositionalSharding, cannot represent a change of rank. This means that the kind of sharding used by pmap cannot be represented to XLA as a sharding. If we change the definition of PmapSharding to preserve the array rank instead, then this means that PmapSharding can in the future be represented directly as a kind of sharding known to XLA.

The new definition of PmapSharding will allow a number of internal simplifications to JAX, for example in a subsequent change we can probably delete PmapSharding entirely. This in turn also would allow us to delete the APIs `jax.device_put_replicated` and `jax.device_put_sharded`, which predate the current sharding design.

This change also prepares for an upcoming change where we would like to redefine `pmap` in terms of `jit(shard_map(...))`, allowing us to delete most `pmap` code paths.

Once enabled, this change has the potential to break pmap users who:
a) look at the shards of an array, e.g., via `.addressable_shards`, or `jax.make_array_from_single_device_arrays`, since the shapes of the shards will change.
b) rely on zero-copy behavior in APIs like `jax.device_put_replicated`.

The change is disabled by default, so we do not expect any user visible impacts from this change.

PiperOrigin-RevId: 599787818

											
										
										
											2024-01-19 03:53:01 -08:00
+								  orig_fun = fun
 								  if config.pmap_no_rank_reduction.value:
 								    fun = _change_argument_ranks(fun, pci.in_axes, pci.out_axes_thunk)
 								  else:
 								    fun = orig_fun
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  with core.extend_axis_env(pci.axis_name, pci.global_axis_size, None):  # type: ignore
-												Remove the f-string evaluation during logging the elapsed time by passing in fun_name to log_elapsed_time

PiperOrigin-RevId: 532132574

											
										
										
											2023-05-15 09:15:22 -07:00
+								    with dispatch.log_elapsed_time(
 								        "Finished tracing + transforming {fun_name} for pmap in {elapsed_time} sec",
 								        fun_name=fun.__name__, event=dispatch.JAXPR_TRACE_EVENT):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      jaxpr, out_sharded_avals, consts = pe.trace_to_jaxpr_final(
-												Remove global_arg_shapes from pmap since it was only used for sharded_jit and sharded_jit was removed from JAX a long time ago

PiperOrigin-RevId: 520356179

											
										
										
											2023-03-29 09:22:34 -07:00
+								          fun, sharded_avals, pe.debug_info_final(fun, "pmap"))
-												Add a new experimental option jax_pmap_no_rank_reduction.

This option changes the implementation of pmap so that the individual shards have the same rank as the entire array, i.e. in the terminology of pmap using a "chunked" axis instead of an "unstacked" axis.

i.e., previously a typical array used by pmap might have a shape of, say, [8, 100], if sharded across 8 accelerators on its first axis, and each individual shard would have a shape of, say, [100]. With this change, each individual shard has a shape of [1, 100] instead.

Why do this?

The main reason to do this is that XLA's sharding (HloSharding), which is exposed in JAX as GSPMDSharding/NamedSharding/PositionalSharding, cannot represent a change of rank. This means that the kind of sharding used by pmap cannot be represented to XLA as a sharding. If we change the definition of PmapSharding to preserve the array rank instead, then this means that PmapSharding can in the future be represented directly as a kind of sharding known to XLA.

The new definition of PmapSharding will allow a number of internal simplifications to JAX, for example in a subsequent change we can probably delete PmapSharding entirely. This in turn also would allow us to delete the APIs `jax.device_put_replicated` and `jax.device_put_sharded`, which predate the current sharding design.

This change also prepares for an upcoming change where we would like to redefine `pmap` in terms of `jit(shard_map(...))`, allowing us to delete most `pmap` code paths.

Once enabled, this change has the potential to break pmap users who:
a) look at the shards of an array, e.g., via `.addressable_shards`, or `jax.make_array_from_single_device_arrays`, since the shapes of the shards will change.
b) rely on zero-copy behavior in APIs like `jax.device_put_replicated`.

The change is disabled by default, so we do not expect any user visible impacts from this change.

PiperOrigin-RevId: 599787818

											
										
										
											2024-01-19 03:53:01 -08:00
+								  jaxpr = api_util.jaxpr_debug_info(jaxpr, orig_fun.debug_info)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  jaxpr = dispatch.apply_outfeed_rewriter(jaxpr)
 								  assert len(out_sharded_avals) == len(pci.out_axes), (
 								      len(out_sharded_avals), len(pci.out_axes))
 								  replicas = find_replicas(jaxpr, pci.axis_size, pci.global_axis_size)
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								  num_local_shards = replicas.num_local_replicas
 								  num_global_shards = replicas.num_global_replicas
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								  shards = ShardInfo(
-												Remove global_arg_shapes from pmap since it was only used for sharded_jit and sharded_jit was removed from JAX a long time ago

PiperOrigin-RevId: 520356179

											
										
										
											2023-03-29 09:22:34 -07:00
+								      sharded_avals, out_sharded_avals, sharded_avals,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      num_local_shards, num_global_shards)
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								  return jaxpr, consts, replicas, shards
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								@profiler.annotate_function
 								def lower_parallel_callable(
 								    fun: lu.WrappedFun,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								    backend_name: str | None,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    axis_name: core.AxisName,
 								    axis_size: int,
 								    global_axis_size: int,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								    devices: Sequence[xc.Device] | None,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    name: str,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								    in_axes: Iterable[int | None],
 								    out_axes_thunk: Callable[[], Sequence[int | None]],
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    donated_invars: Sequence[bool],
 								    is_explicit_global_axis_size: bool,
-												[jax2tf] Clean up the support for cross-lowering.

In a previous CL we introduced cross-lowering support without any
changes in JAX core, but at the expense of some overly complex code
in jax2tf, along with overriding a JAX core function. Plus, those
changes were not enough to handle some xmap and pmap cases.

Here we introduce a `_experimental_lowering_platform: Optional[str]` parameter
to the `.lower()` methods and then we thread the `lowering_platform`
all the way to the calls to `mlir.lower_jaxpr_to_module2`. That's it.

Note that this parameter to `.lower()` is experimental and not supposed
to be used outside jax2tf. It may also gobble user kwargs.

											
										
										
											2023-02-28 11:30:23 +01:00
+								    avals: Sequence[core.AbstractValue],
 								    *,
-												Add a new experimental option jax_pmap_no_rank_reduction.

This option changes the implementation of pmap so that the individual shards have the same rank as the entire array, i.e. in the terminology of pmap using a "chunked" axis instead of an "unstacked" axis.

i.e., previously a typical array used by pmap might have a shape of, say, [8, 100], if sharded across 8 accelerators on its first axis, and each individual shard would have a shape of, say, [100]. With this change, each individual shard has a shape of [1, 100] instead.

Why do this?

The main reason to do this is that XLA's sharding (HloSharding), which is exposed in JAX as GSPMDSharding/NamedSharding/PositionalSharding, cannot represent a change of rank. This means that the kind of sharding used by pmap cannot be represented to XLA as a sharding. If we change the definition of PmapSharding to preserve the array rank instead, then this means that PmapSharding can in the future be represented directly as a kind of sharding known to XLA.

The new definition of PmapSharding will allow a number of internal simplifications to JAX, for example in a subsequent change we can probably delete PmapSharding entirely. This in turn also would allow us to delete the APIs `jax.device_put_replicated` and `jax.device_put_sharded`, which predate the current sharding design.

This change also prepares for an upcoming change where we would like to redefine `pmap` in terms of `jit(shard_map(...))`, allowing us to delete most `pmap` code paths.

Once enabled, this change has the potential to break pmap users who:
a) look at the shards of an array, e.g., via `.addressable_shards`, or `jax.make_array_from_single_device_arrays`, since the shapes of the shards will change.
b) rely on zero-copy behavior in APIs like `jax.device_put_replicated`.

The change is disabled by default, so we do not expect any user visible impacts from this change.

PiperOrigin-RevId: 599787818

											
										
										
											2024-01-19 03:53:01 -08:00
+								    lowering_parameters: mlir.LoweringParameters) -> PmapComputation:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  # Determine global_axis_size for use in AxisEnv.
 								  # TODO(mattjj,skyewm): revive this check (inner_pmap always False now)
 								  # if xb.process_count() > 1 and global_axis_size is None and inner_pmap:
 								  #   raise ValueError("'axis_size' must be specified for nested multi-host pmaps")
 								  if (xb.process_count() == 1 and is_explicit_global_axis_size
 								      and global_axis_size != axis_size):
 								    raise ValueError(
 								        f"Specified axis_size {global_axis_size} doesn't match received "
 								        f"axis_size {axis_size}.")
 								  if devices is not None and backend_name is None:
 								    backend = xb.get_device_backend(devices[0])
 								  else:
 								    backend = xb.get_backend(backend_name)
 								  no_nested_sharding = False
 								  must_run_on_all_devices = False
 								  if not is_explicit_global_axis_size:
 								    if xb.process_count(backend) > 1:
 								      if devices:
 								        # This allows each host in a multi-host pmap to run on a different number
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								        # of devices, but precludes nested sharding (i.e. inner pmaps).
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								        no_nested_sharding = True
 								      else:
 								        # This assumes all hosts run on the same number of devices. We make sure
 								        # this assumption is true by requiring that the pmap is run on all devices
 								        # (and making the further assumption that each host has the same number of
 								        # devices). Nested sharding is ok in this case.
 								        must_run_on_all_devices = True
 								  pci = ParallelCallableInfo(
 								      name, backend, axis_name, axis_size, global_axis_size, devices,
 								      in_axes, out_axes_thunk, avals)
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								  jaxpr, consts, replicas, shards = stage_parallel_callable(pci, fun)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  if logger.isEnabledFor(logging.DEBUG):
 								    logger.debug("sharded_avals: %s", shards.sharded_avals)
 								    logger.debug("global_sharded_avals: %s", shards.global_sharded_avals)
 								    logger.debug("num_replicas: %d  num_local_replicas: %d",
 								                 replicas.num_global_replicas, replicas.num_local_replicas)
 								    logger.debug("devices: %s", devices)
 								    logger.debug("local_devices: %s", pci.local_devices)
 								  if (xb.process_count(backend) > 1 and must_run_on_all_devices and
 								      shards.num_local_shards != xb.local_device_count(backend)):
 								    if shards.num_local_shards == axis_size:
 								      raise ValueError(
 								         f"On multi-host platforms, the input to pmapped functions must have "
 								         f"leading axis size equal to the number of local devices if no "
 								         f"`devices` argument is specified. Got {axis_size=}, "
 								         f"num_local_devices={xb.local_device_count(backend)}")
 								    else:
 								      raise ValueError(
 								        f"On multi-host platforms, pmapped functions must run across all "
 								        f"devices, i.e. num_replicas * num_partitions should equal the "
 								        f"number of local devices. Got "
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								        f"num_replicas={replicas.num_local_replicas}, and "
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								        f"num_local_devices={xb.local_device_count(backend)}")
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								  if no_nested_sharding and replicas.jaxpr_replicas > 1:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    raise ValueError(
 								      f"On multi-host platforms, pmapped functions that both have `devices` "
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								      f"specified and contain an inner_pmap must specify an "
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      f"`axis_size` (or remove the `devices` argument). Got nested_replicas="
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								      f"{replicas.jaxpr_replicas}")
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Migrate a subset of internal modules to use state objects

The motivation here is to gradually replace all dynamic lookups on `jax.config`
with statically-typed state objects, which are more type checker/IDE friendly.

PiperOrigin-RevId: 571932143

											
										
										
											2023-10-09 07:28:18 -07:00
+								  log_priority = logging.WARNING if config.log_compiles.value else logging.DEBUG
-												[Micro-optimization] Only log the avals and shardings if logging is enabled for that level.

PiperOrigin-RevId: 524845969

											
										
										
											2023-04-17 07:52:56 -07:00
+								  if logger.isEnabledFor(log_priority):
 								    logger.log(log_priority,
 								               "Compiling %s (%d) for %d devices with args %s. (num_replicas=%d)",
 								               fun.__name__, id(fun),
 								               shards.num_global_shards, avals, replicas.num_global_replicas)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Split sharding_impls into its own Bazel target.

* Move dependencies of sharding_impls into sharding_impls to avoid creating cyclic dependencies.
* Fix a handful of new pytype errors.

PiperOrigin-RevId: 523146076

											
										
										
											2023-04-10 10:15:08 -07:00
+								  axis_env = sharding_impls.AxisEnv(
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      replicas.num_global_replicas, (axis_name,), (global_axis_size,))
-												Remove circular dependency between source_info_util and util.

Move util.new_name_stack into source_info_util. Replace uses of util.extend_name_stack with stack.extend().

PiperOrigin-RevId: 512685810

											
										
										
											2023-02-27 11:37:10 -08:00
+								  name_stack = source_info_util.new_name_stack(wrap_name(name, 'pmap'))
-												Axis names are now tracked via an effect

This allows propagating the names bottom up -- from equations to the jaxpr,
instead of "discovering" them top-down by traversing (and rebuilding) the
jaxpr via core.subst_axis_names.

PiperOrigin-RevId: 612416803

											
										
										
											2024-03-04 05:41:29 -08:00
+								  jaxpr = core.remove_named_axis_effects(jaxpr, {axis_name})
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  closed_jaxpr = core.ClosedJaxpr(jaxpr, consts)
 								  replicated_args = [axis is None for axis in in_axes]
 								  tuple_args = dispatch.should_tuple_args(len(shards.global_sharded_avals),
 								                                          backend.platform)
 								  module_name = f"pmap_{fun.__name__}"
 								  with maybe_extend_axis_env(axis_name, global_axis_size, None):  # type: ignore
-												Refactor effects system to use effect types, not objects

											
										
										
											2023-02-01 17:50:00 -08:00
+								    ordered_effects = list(
 								        effects.ordered_effects.filter_in(closed_jaxpr.effects))
 								    if ordered_effects:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      raise ValueError("Ordered effects not supported in `pmap`.")
-												Refactor effects system to use effect types, not objects

											
										
										
											2023-02-01 17:50:00 -08:00
+								    unordered_effects = list(
 								        effects.ordered_effects.filter_not_in(closed_jaxpr.effects))
-												Log the time it takes to lower from jaxpr to stableHLO

PiperOrigin-RevId: 532115098

											
										
										
											2023-05-15 08:07:31 -07:00
+								    with dispatch.log_elapsed_time(
-												Remove the f-string evaluation during logging the elapsed time by passing in fun_name to log_elapsed_time

PiperOrigin-RevId: 532132574

											
										
										
											2023-05-15 09:15:22 -07:00
+								        "Finished jaxpr to MLIR module conversion {fun_name} in {elapsed_time} sec",
 								        fun_name=str(name_stack), event=dispatch.JAXPR_TO_MLIR_MODULE_EVENT):
-												Log the time it takes to lower from jaxpr to stableHLO

PiperOrigin-RevId: 532115098

											
										
										
											2023-05-15 08:07:31 -07:00
+								      lowering_result = mlir.lower_jaxpr_to_module(
 								          module_name,
 								          closed_jaxpr,
-												Introduce a LoweringParameters dataclass for easier plumbing

There are currently two parameters that are used to configure
lowering: lowering_platform (for cross-platform lowering), and
override_lowering_rules. Each of them are passed as separate arguments
through several layers of lowering internal functions. This is tedious,
and error prone. In fact, override_lowering_rules was not plumbed
in all places, and due to using default arguments in all places,
this leads to silent errors.

We foresee introducing other parameters for lowering: for multi-platform
lowering, for controlling the lowering of effects.

Here is pack all such parameters into a `mlir.LoweringParameters`
dataclass and we plumb that through.

											
										
										
											2023-09-28 12:44:14 +02:00
+								          ordered_effects=ordered_effects,
 								          backend_or_name=backend,
-												Cleanup the handling of single- and multi-platform lowering in ModuleContext

Previously, we introduced support for multi-platform lowering, by
adding a new LoweringParameters object that can be used to specify
a cross-lowering platform or even multiple platforms. But we had
kept the ModuleContext.platform in place because some lowering rules
were still referencing it. Now we replace ModuleContext.platform with
ModuleContext.platforms, which removes the redundancy, simplifies
the code, and makes it clearer that the lowering rules should not
simply assume single-platform lowering.

PiperOrigin-RevId: 576575376

											
										
										
											2023-10-25 10:39:47 -07:00
+								          platforms=lowering_parameters.platforms or (backend.platform,),
-												Introduce a LoweringParameters dataclass for easier plumbing

There are currently two parameters that are used to configure
lowering: lowering_platform (for cross-platform lowering), and
override_lowering_rules. Each of them are passed as separate arguments
through several layers of lowering internal functions. This is tedious,
and error prone. In fact, override_lowering_rules was not plumbed
in all places, and due to using default arguments in all places,
this leads to silent errors.

We foresee introducing other parameters for lowering: for multi-platform
lowering, for controlling the lowering of effects.

Here is pack all such parameters into a `mlir.LoweringParameters`
dataclass and we plumb that through.

											
										
										
											2023-09-28 12:44:14 +02:00
+								          axis_context=sharding_impls.ReplicaAxisContext(axis_env),
 								          name_stack=name_stack,
 								          donated_args=donated_invars,
-												Log the time it takes to lower from jaxpr to stableHLO

PiperOrigin-RevId: 532115098

											
										
										
											2023-05-15 08:07:31 -07:00
+								          replicated_args=replicated_args,
 								          arg_shardings=None,
 								          result_shardings=None,
 								          arg_names=jaxpr.debug_info and jaxpr.debug_info.arg_names,
 								          result_names=jaxpr.debug_info and jaxpr.debug_info.result_paths,
-												Introduce a LoweringParameters dataclass for easier plumbing

There are currently two parameters that are used to configure
lowering: lowering_platform (for cross-platform lowering), and
override_lowering_rules. Each of them are passed as separate arguments
through several layers of lowering internal functions. This is tedious,
and error prone. In fact, override_lowering_rules was not plumbed
in all places, and due to using default arguments in all places,
this leads to silent errors.

We foresee introducing other parameters for lowering: for multi-platform
lowering, for controlling the lowering of effects.

Here is pack all such parameters into a `mlir.LoweringParameters`
dataclass and we plumb that through.

											
										
										
											2023-09-28 12:44:14 +02:00
+								          num_replicas=replicas.num_global_replicas,
 								          lowering_parameters=lowering_parameters)
-												Small cleanups to pxla.py.

Remove stale references to XlaComputation and code left over from handling both XlaComputations and ir.Modules.

No functional changes intended.

PiperOrigin-RevId: 526139679

											
										
										
											2023-04-21 14:37:52 -07:00
+								  return PmapComputation(lowering_result.module, pci=pci, replicas=replicas,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								                         shards=shards, tuple_args=tuple_args,
 								                         unordered_effects=unordered_effects,
 								                         ordered_effects=ordered_effects,
-												Small cleanups to pxla.py.

Remove stale references to XlaComputation and code left over from handling both XlaComputations and ir.Modules.

No functional changes intended.

PiperOrigin-RevId: 526139679

											
										
										
											2023-04-21 14:37:52 -07:00
+								                         keepalive=lowering_result.keepalive,
 								                         host_callbacks=lowering_result.host_callbacks,
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								                         jaxpr_debug_info=closed_jaxpr.jaxpr.debug_info)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Add a new experimental option jax_pmap_no_rank_reduction.

This option changes the implementation of pmap so that the individual shards have the same rank as the entire array, i.e. in the terminology of pmap using a "chunked" axis instead of an "unstacked" axis.

i.e., previously a typical array used by pmap might have a shape of, say, [8, 100], if sharded across 8 accelerators on its first axis, and each individual shard would have a shape of, say, [100]. With this change, each individual shard has a shape of [1, 100] instead.

Why do this?

The main reason to do this is that XLA's sharding (HloSharding), which is exposed in JAX as GSPMDSharding/NamedSharding/PositionalSharding, cannot represent a change of rank. This means that the kind of sharding used by pmap cannot be represented to XLA as a sharding. If we change the definition of PmapSharding to preserve the array rank instead, then this means that PmapSharding can in the future be represented directly as a kind of sharding known to XLA.

The new definition of PmapSharding will allow a number of internal simplifications to JAX, for example in a subsequent change we can probably delete PmapSharding entirely. This in turn also would allow us to delete the APIs `jax.device_put_replicated` and `jax.device_put_sharded`, which predate the current sharding design.

This change also prepares for an upcoming change where we would like to redefine `pmap` in terms of `jit(shard_map(...))`, allowing us to delete most `pmap` code paths.

Once enabled, this change has the potential to break pmap users who:
a) look at the shards of an array, e.g., via `.addressable_shards`, or `jax.make_array_from_single_device_arrays`, since the shapes of the shards will change.
b) rely on zero-copy behavior in APIs like `jax.device_put_replicated`.

The change is disabled by default, so we do not expect any user visible impacts from this change.

PiperOrigin-RevId: 599787818

											
										
										
											2024-01-19 03:53:01 -08:00
+								def _pmap_unmap_shaped_array(
 								    size: int, axis_name: core.AxisName, axis: int | None, aval: ShapedArray
 								  ) -> ShapedArray:
 								  named_shape = dict(aval.named_shape)
 								  named_shape.pop(axis_name, None)  # TODO: make this mandatory
 								  if axis is None: return aval.update(named_shape=named_shape)
 								  elif type(axis) is int:
 								    return ShapedArray(tuple_update(aval.shape, axis, size), aval.dtype,
 								                       named_shape=named_shape, weak_type=aval.weak_type)
 								  else: raise TypeError(axis)
 								AvalMapHandlerPair = tuple[Any, Callable]
 								_pmap_aval_mapping_handlers: dict[type, AvalMapHandlerPair] = {
 								    ShapedArray:   (Any, _pmap_unmap_shaped_array),
 								}
 								def _pmap_unmapped_aval(size: core.AxisSize, axis_name, axis: int | None,
 								                       aval: core.AbstractValue) -> core.AbstractValue:
 								  if not config.pmap_no_rank_reduction.value:
 								    return core.unmapped_aval(size, axis_name, axis, aval)
 								  _, handler = _pmap_aval_mapping_handlers.get(type(aval), (None, None))
 								  if handler is not None:
 								    return handler(size, axis_name, axis, aval)
 								  else:
 								    raise TypeError(f"no unmapping handler for {aval} of type {type(aval)}")
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								class PmapComputation(stages.XlaLowering):
 								  _hlo: ir.Module
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								  _executable: PmapExecutable | None
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								  def __init__(self, hlo: ir.Module, **compile_args):
 								    self._executable = None
 								    self._hlo = hlo
 								    self.compile_args = compile_args
 								  # -- stages.XlaLowering overrides
 								  def stablehlo(self) -> ir.Module:
 								    return self._hlo
 								  @profiler.annotate_function
-												expose `compiler_options` on `compile()`

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 520782460

											
										
										
											2023-03-30 17:13:46 -07:00
+								  def compile(self, compiler_options=None) -> PmapExecutable:
 								    if self._executable is None or compiler_options is not None:
 								      executable = UnloadedPmapExecutable.from_hlo(
 								          self._hlo, **self.compile_args,
 								          compiler_options=compiler_options)
 								      if compiler_options is None:
 								        self._executable = executable
 								      return executable
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    return self._executable
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								def _cast_to_shaped_array(aval: core.AbstractValue) -> ShapedArray:
 								  assert isinstance(aval, ShapedArray), aval
 								  return cast(ShapedArray, aval)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								@dataclasses.dataclass
 								class UnloadedPmapExecutable:
 								  compiled: Any
 								  backend: xb.XlaBackend
-												migrate more internal dependencies from `jax.core` to `jax._src.core`

PiperOrigin-RevId: 509736368

											
										
										
											2023-02-14 23:00:40 -08:00
+								  local_input_avals: Sequence[core.AbstractValue]
-												Split Sharding subclasses out of _src/sharding.py into _src/sharding_impls.py

By defining the Sharding base class in its own module, we can pull it out into a separate Bazel submodule, which will help pytype inference when defining Array.

PiperOrigin-RevId: 516223009

											
										
										
											2023-03-13 08:49:39 -07:00
+								  input_shardings: Sequence[sharding_impls.XLACompatibleSharding]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  local_output_avals: Sequence[ShapedArray]
-												Split Sharding subclasses out of _src/sharding.py into _src/sharding_impls.py

By defining the Sharding base class in its own module, we can pull it out into a separate Bazel submodule, which will help pytype inference when defining Array.

PiperOrigin-RevId: 516223009

											
										
										
											2023-03-13 08:49:39 -07:00
+								  output_shardings: Sequence[sharding_impls.XLACompatibleSharding]
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								  unordered_effects: list[core.Effect]
 								  ordered_effects: list[core.Effect]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  keepalive: Sequence[Any]
 								  host_callbacks: Sequence[Any]
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								  jaxpr_debug_info: core.JaxprDebugInfo
 								  def build_execute_fun(self):
 								    input_indices = []
 								    for aval, spec in safe_zip(self.local_input_avals, self.input_shardings):
 								      assert isinstance(spec, sharding_impls.PmapSharding), spec
 								      assert isinstance(aval, core.ShapedArray), aval
 								      input_indices.append(
 								          sharding_specs.spec_to_indices(aval.shape, spec.sharding_spec)
 								          if spec.sharding_spec is not None else None)
 								    handle_outs = local_avals_to_results_handler(self.local_output_avals,
 								                                                 self.output_shardings)
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								    handle_args = InputsHandler(self.input_shardings,
 								                                self.compiled.local_devices(), input_indices)
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								    execute_fun = ExecuteReplicated(self.compiled, "parallel computation",
 								                                    self.backend, handle_args, handle_outs,
 								                                    self.unordered_effects,
 								                                    self.ordered_effects, self.keepalive,
 								                                    bool(self.host_callbacks),
-												Set out_mut to `None` as default on `from_hlo` instead of in `__init__` of `MeshComputation` and correct the types too.

PiperOrigin-RevId: 611814102

											
										
										
											2024-03-01 09:27:57 -08:00
+								                                    set(range(len(input_indices))), None)
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								    return execute_fun
 								  def load(self) -> PmapExecutable:
 								    fingerprint = getattr(self.compiled, "fingerprint", None)
 								    return PmapExecutable(
 								        self.compiled, self.build_execute_fun, fingerprint,
 								        self.local_input_avals, self.jaxpr_debug_info, self)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								  @staticmethod
-												Small cleanups to pxla.py.

Remove stale references to XlaComputation and code left over from handling both XlaComputations and ir.Modules.

No functional changes intended.

PiperOrigin-RevId: 526139679

											
										
										
											2023-04-21 14:37:52 -07:00
+								  def from_hlo(hlo: ir.Module,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								               pci: ParallelCallableInfo,
 								               replicas: ReplicaInfo,
 								               shards: ShardInfo,
 								               tuple_args: bool,
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								               unordered_effects: list[core.Effect],
 								               ordered_effects: list[core.Effect],
 								               host_callbacks: list[Any],
-												expose `compiler_options` on `compile()`

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 520782460

											
										
										
											2023-03-30 17:13:46 -07:00
+								               keepalive: Any,
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								               jaxpr_debug_info: core.JaxprDebugInfo,
-												expose `compiler_options` on `compile()`

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 520782460

											
										
										
											2023-03-30 17:13:46 -07:00
+								               compiler_options=None):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    devices = pci.devices
 								    if devices is None:
 								      if shards.num_global_shards > xb.device_count(pci.backend):
 								        msg = ("compiling computation that requires {} logical devices, but only {} XLA "
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								               "devices are available (num_replicas={})")
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								        raise ValueError(msg.format(shards.num_global_shards,
 								                                    xb.device_count(pci.backend),
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								                                    replicas.num_global_replicas))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      # On a single host, we simply grab the first N devices from jax.devices().
 								      # In the single host case, we want the default device order of pmap to
 								      # match jax.devices().
 								      # On multiple hosts, we create a default device assignment that ensures
 								      # each host is responsible for a contiguous set of replicas.
 								      if shards.num_global_shards > shards.num_local_shards:
 								        # TODO(skye): use a locality-aware assignment that satisfies the above
 								        # constraint.
 								        devices = [d for process_index in range(xb.process_count(pci.backend))
 								                  for d in xb.local_devices(process_index, pci.backend)]
 								      else:
 								        devices = xb.local_devices(backend=pci.backend)[:shards.num_local_shards]
 								    else:
 								      if shards.num_local_shards != len(pci.local_devices):
 								        local_devices_str = ", ".join(map(str, pci.local_devices))
 								        if shards.num_local_shards == pci.axis_size:
 								          raise ValueError(
 								              f"Leading axis size of input to pmapped function must equal the "
 								              f"number of local devices passed to pmap. Got axis_size="
 								              f"{pci.axis_size}, num_local_devices={len(pci.local_devices)}.\n"
 								              f"(Local devices available to pmap: {local_devices_str})")
 								        else:
 								          raise ValueError(
 								              f"pmapped function requires {shards.num_local_shards} local "
 								              f"devices to run due to nested pmapped or other parallel "
 								              f"functions, but only {len(pci.local_devices)} are available.\n"
 								              f"(outer axis size: {pci.axis_size}, local devices available to "
 								              f"pmap: {local_devices_str})")
 								      if shards.num_global_shards != len(devices):
 								        raise ValueError("compiling computation that creates %s shards, "
 								                        "but %s devices were specified" %
 								                        (shards.num_global_shards, len(devices)))
 								    # 'devices' may be 1D or 2D at this point (e.g.
 								    # get_default_device_assignment() returns 2D assignment, caller may have
 								    # provided 1D list of devices).
 								    # Convert to 2D in case it's 1D and we have > 1 partitions.
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								    num_partitions = 1
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    device_assignment: np.ndarray = np.array(devices).reshape(
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								        (replicas.num_global_replicas, num_partitions))
-												Move compiler APIs out of dispatch.py and xla_bridge.py into a new jax._src.compiler module.

Refactoring only, no user-visible changes intended.

PiperOrigin-RevId: 557116160

											
										
										
											2023-08-15 06:38:56 -07:00
+								    compile_options = compiler.get_compile_options(
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								        num_replicas=replicas.num_global_replicas,
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								        num_partitions=num_partitions,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								        device_assignment=device_assignment,
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								        use_spmd_partitioning=False,
-												expose `compiler_options` on `compile()`

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 520782460

											
										
										
											2023-03-30 17:13:46 -07:00
+								        env_options_overrides=compiler_options,
-												Restrict retrieving XLA-AutoFDO profile version to TPU workloads.

XLA-AutoFDO is supported only for TPUs, so requesting the latest
profile version for non-TPU workloads is unnecessary and can delay
the completion of initialization.

Testing: test workload.
PiperOrigin-RevId: 584148686

											
										
										
											2023-11-20 15:51:27 -08:00
+								        detailed_logging=compiler.use_detailed_logging(hlo),
 								        backend=pci.backend,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    )
 								    compile_options.parameter_is_tupled_arguments = tuple_args
 								    process_index = xb.process_index(pci.backend)
 								    local_device_assignment = np.array([
 								        d for d in device_assignment.flat if d.process_index == process_index
 								    ])
 								    input_sharding_specs = [
-												Split ShardingSpecs and most of the helpers for constructing them into a separate file (jax/_src/sharding_specs.py).

PiperOrigin-RevId: 522360232

											
										
										
											2023-04-06 09:48:14 -07:00
+								        sharding_specs.pmap_sharding_spec(
 								            replicas.num_local_replicas, pci.axis_size,
 								            cast(ShapedArray, aval).shape, in_axis)
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								        for aval, in_axis in safe_zip(shards.sharded_avals, pci.in_axes)]
 								    in_shardings = _get_pmap_sharding(local_device_assignment,
 								                                      input_sharding_specs)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    local_unmapped_avals = [
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								        _cast_to_shaped_array(
-												Add a new experimental option jax_pmap_no_rank_reduction.

This option changes the implementation of pmap so that the individual shards have the same rank as the entire array, i.e. in the terminology of pmap using a "chunked" axis instead of an "unstacked" axis.

i.e., previously a typical array used by pmap might have a shape of, say, [8, 100], if sharded across 8 accelerators on its first axis, and each individual shard would have a shape of, say, [100]. With this change, each individual shard has a shape of [1, 100] instead.

Why do this?

The main reason to do this is that XLA's sharding (HloSharding), which is exposed in JAX as GSPMDSharding/NamedSharding/PositionalSharding, cannot represent a change of rank. This means that the kind of sharding used by pmap cannot be represented to XLA as a sharding. If we change the definition of PmapSharding to preserve the array rank instead, then this means that PmapSharding can in the future be represented directly as a kind of sharding known to XLA.

The new definition of PmapSharding will allow a number of internal simplifications to JAX, for example in a subsequent change we can probably delete PmapSharding entirely. This in turn also would allow us to delete the APIs `jax.device_put_replicated` and `jax.device_put_sharded`, which predate the current sharding design.

This change also prepares for an upcoming change where we would like to redefine `pmap` in terms of `jit(shard_map(...))`, allowing us to delete most `pmap` code paths.

Once enabled, this change has the potential to break pmap users who:
a) look at the shards of an array, e.g., via `.addressable_shards`, or `jax.make_array_from_single_device_arrays`, since the shapes of the shards will change.
b) rely on zero-copy behavior in APIs like `jax.device_put_replicated`.

The change is disabled by default, so we do not expect any user visible impacts from this change.

PiperOrigin-RevId: 599787818

											
										
										
											2024-01-19 03:53:01 -08:00
+								            _pmap_unmapped_aval(pci.axis_size, pci.axis_name, out_axis, aval))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								        if out_axis is not None else aval
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								        for aval, out_axis in safe_zip(shards.out_sharded_avals, pci.out_axes)]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    out_specs = [
-												Split ShardingSpecs and most of the helpers for constructing them into a separate file (jax/_src/sharding_specs.py).

PiperOrigin-RevId: 522360232

											
										
										
											2023-04-06 09:48:14 -07:00
+								        sharding_specs.pmap_sharding_spec(
-												Delete some dead code that pertained to sharded_jit.

sharded_jit is long gone.

PiperOrigin-RevId: 523711890

											
										
										
											2023-04-12 08:49:07 -07:00
+								            replicas.num_local_replicas, pci.axis_size, aval.shape, out_axis)
 								        for aval, out_axis in safe_zip(
 								            shards.out_sharded_avals, pci.out_axes)]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    out_shardings = _get_pmap_sharding(local_device_assignment, out_specs)
 								    if hasattr(pci.backend, "compile_replicated"):
 								      input_indices = [
-												Split ShardingSpecs and most of the helpers for constructing them into a separate file (jax/_src/sharding_specs.py).

PiperOrigin-RevId: 522360232

											
										
										
											2023-04-06 09:48:14 -07:00
+								          sharding_specs.spec_to_indices(aval.shape, spec)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								          if spec is not None else None
 								          for aval, spec in safe_zip(pci.avals, input_sharding_specs)
 								      ]
 								      handle_outs = local_avals_to_results_handler(local_unmapped_avals,
 								                                                   out_shardings)
 								      return _compile_replicated_pmap_executable_from_hlo(
-												Small cleanups to pxla.py.

Remove stale references to XlaComputation and code left over from handling both XlaComputations and ir.Modules.

No functional changes intended.

PiperOrigin-RevId: 526139679

											
										
										
											2023-04-21 14:37:52 -07:00
+								          hlo, pci, input_indices, in_shardings, handle_outs,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								          compile_options, host_callbacks, bool(unordered_effects),
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								          ordered_effects, jaxpr_debug_info)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								    with dispatch.log_elapsed_time(
-												Remove the f-string evaluation during logging the elapsed time by passing in fun_name to log_elapsed_time

PiperOrigin-RevId: 532132574

											
										
										
											2023-05-15 09:15:22 -07:00
+								        "Finished XLA compilation of {fun_name} in {elapsed_time} sec",
 								        fun_name=pci.name, event=dispatch.BACKEND_COMPILE_EVENT):
-												Move compiler APIs out of dispatch.py and xla_bridge.py into a new jax._src.compiler module.

Refactoring only, no user-visible changes intended.

PiperOrigin-RevId: 557116160

											
										
										
											2023-08-15 06:38:56 -07:00
+								      compiled = compiler.compile_or_get_cached(
-												Small cleanups to pxla.py.

Remove stale references to XlaComputation and code left over from handling both XlaComputations and ir.Modules.

No functional changes intended.

PiperOrigin-RevId: 526139679

											
										
										
											2023-04-21 14:37:52 -07:00
+								          pci.backend, hlo, device_assignment, compile_options,
-												Include the device_kind in the compilation cache key.

PiperOrigin-RevId: 525726898

											
										
										
											2023-04-20 06:16:12 -07:00
+								          host_callbacks)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								    return UnloadedPmapExecutable(
 								        compiled=compiled,
 								        backend=pci.backend,
 								        local_input_avals=pci.avals,
 								        input_shardings=in_shardings,
 								        local_output_avals=local_unmapped_avals,
 								        output_shardings=out_shardings,
 								        unordered_effects=unordered_effects,
 								        ordered_effects=ordered_effects,
 								        keepalive=keepalive,
 								        host_callbacks=host_callbacks,
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								        jaxpr_debug_info=jaxpr_debug_info).load()
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Rollforward with fixes: Remove _execute_replicated from UnloadedMeshExecutable.load since it is not required anymore for jit(pmap) cases

PiperOrigin-RevId: 516317920

											
										
										
											2023-03-13 14:08:48 -07:00
+								def _compile_replicated_pmap_executable_from_hlo(
-												Small cleanups to pxla.py.

Remove stale references to XlaComputation and code left over from handling both XlaComputations and ir.Modules.

No functional changes intended.

PiperOrigin-RevId: 526139679

											
										
										
											2023-04-21 14:37:52 -07:00
+								    hlo: ir.Module, pci, input_indices, in_shardings, handle_outs,
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								    compile_options, host_callbacks, has_unordered_effects, ordered_effects,
 								    jaxpr_debug_info):
-												Rollforward with fixes: Remove _execute_replicated from UnloadedMeshExecutable.load since it is not required anymore for jit(pmap) cases

PiperOrigin-RevId: 516317920

											
										
										
											2023-03-13 14:08:48 -07:00
+								  # Use the standard out_handler.
 								  execute_fun = pci.backend.compile_replicated(
-												Small cleanups to pxla.py.

Remove stale references to XlaComputation and code left over from handling both XlaComputations and ir.Modules.

No functional changes intended.

PiperOrigin-RevId: 526139679

											
										
										
											2023-04-21 14:37:52 -07:00
+								      is_trivial=False, name=pci.name, computation=hlo,
-												Rollforward with fixes: Remove _execute_replicated from UnloadedMeshExecutable.load since it is not required anymore for jit(pmap) cases

PiperOrigin-RevId: 516317920

											
										
										
											2023-03-13 14:08:48 -07:00
+								      compile_options=compile_options, host_callbacks=host_callbacks,
 								      has_unordered_effects=has_unordered_effects,
 								      ordered_effects=ordered_effects, in_avals=pci.avals,
 								      in_indices=input_indices, in_shardings=in_shardings,
 								      kept_var_idx=set(range(len(pci.avals))), out_handler=handle_outs)
 								  # TODO(frostig): need `compile_replicated` to give us the XLA executable
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								  return PmapExecutable(None, lambda: execute_fun, None, pci.avals,
 								                        jaxpr_debug_info, None)
-												Rollforward with fixes: Remove _execute_replicated from UnloadedMeshExecutable.load since it is not required anymore for jit(pmap) cases

PiperOrigin-RevId: 516317920

											
										
										
											2023-03-13 14:08:48 -07:00
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								class PmapExecutable(stages.XlaExecutable):
-												Redefine `compile_and_serialize` as `serialize(lowered.compile())`.

This has the downside of keeping around the UnloadedMeshComputation,
but it makes the serialize() API easier to understand.

PiperOrigin-RevId: 518715469

											
										
										
											2023-03-22 17:22:39 -07:00
+								  __slots__ = ["xla_executable", "_unsafe_call", "build_unsafe_call",
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								               "fingerprint", "in_avals", "_jaxpr_debug_info",
 								               "_unloaded_executable"]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Redefine `compile_and_serialize` as `serialize(lowered.compile())`.

This has the downside of keeping around the UnloadedMeshComputation,
but it makes the serialize() API easier to understand.

PiperOrigin-RevId: 518715469

											
										
										
											2023-03-22 17:22:39 -07:00
+								  def __init__(self, xla_executable, build_unsafe_call, fingerprint,
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								               in_avals, jaxpr_debug_info, unloaded_executable):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    self.xla_executable = xla_executable
-												Redefine `compile_and_serialize` as `serialize(lowered.compile())`.

This has the downside of keeping around the UnloadedMeshComputation,
but it makes the serialize() API easier to understand.

PiperOrigin-RevId: 518715469

											
										
										
											2023-03-22 17:22:39 -07:00
+								    self._unsafe_call = None
 								    self.build_unsafe_call = build_unsafe_call
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    self.fingerprint = fingerprint
 								    self.in_avals = in_avals
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								    self._jaxpr_debug_info = jaxpr_debug_info
-												Redefine `compile_and_serialize` as `serialize(lowered.compile())`.

This has the downside of keeping around the UnloadedMeshComputation,
but it makes the serialize() API easier to understand.

PiperOrigin-RevId: 518715469

											
										
										
											2023-03-22 17:22:39 -07:00
+								    self._unloaded_executable = unloaded_executable
 								  @property
 								  def unsafe_call(self) -> Callable[..., Any]:
 								    if self._unsafe_call is None:
 								      self._unsafe_call = self.build_unsafe_call()
 								    return self._unsafe_call
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								  # -- stages.XlaExecutable overrides
 								  def xla_extension_executable(self):
 								    return self.xla_executable
 								  @profiler.annotate_function
 								  def call(self, *args):
 								    # TODO(frostig): do we need to check sharding and sharded avals?
 								    arg_avals = map(xla.abstractify, args)
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								    check_arg_avals_for_call(self.in_avals, arg_avals, self._jaxpr_debug_info)
-												Redefine `compile_and_serialize` as `serialize(lowered.compile())`.

This has the downside of keeping around the UnloadedMeshComputation,
but it makes the serialize() API easier to understand.

PiperOrigin-RevId: 518715469

											
										
										
											2023-03-22 17:22:39 -07:00
+								    return self.unsafe_call(*args)  # pylint: disable=not-callable
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								def _get_pmap_sharding(devices, specs):
-												Split Sharding subclasses out of _src/sharding.py into _src/sharding_impls.py

By defining the Sharding base class in its own module, we can pull it out into a separate Bazel submodule, which will help pytype inference when defining Array.

PiperOrigin-RevId: 516223009

											
										
										
											2023-03-13 08:49:39 -07:00
+								  return [sharding_impls.PmapSharding(devices, spec) for spec in specs]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								class InputsHandler:
 								  __slots__ = ("handler", "local_devices", "in_shardings", "input_indices")
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								  def __init__(self, in_shardings, local_devices=None, input_indices=None):
 								    self.handler = partial(shard_args, in_shardings)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    self.local_devices = local_devices
 								    self.in_shardings = in_shardings
 								    self.input_indices = input_indices
 								  def __call__(self, input_buffers):
 								    return self.handler(input_buffers)
 								  def __str__(self):
 								    return ("InputsHandler(\n"
 								            f"local_devices={self.local_devices},\n"
 								            f"in_shardings={self.in_shardings},\n"
 								            f"input_indices={self.input_indices})")
 								class ResultsHandler:
-												Remove --jax_parallel_functions_output_gda.

PiperOrigin-RevId: 616898032

											
										
										
											2024-03-18 11:41:17 -07:00
+								  # `out_avals` is the `Array` global avals when using pjit or xmap. It is the
 								  # local one when using `pmap`.
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  __slots__ = ("handlers", "out_shardings", "out_avals")
 								  def __init__(self, handlers, out_shardings, out_avals):
 								    self.handlers = handlers
 								    self.out_shardings = out_shardings
 								    self.out_avals = out_avals
 								  def __call__(self, out_bufs):
 								    return [h(bufs) for h, bufs in safe_zip(self.handlers, out_bufs)]
 								def local_avals_to_results_handler(
 								    unmapped_local_out_avals: Sequence[ShapedArray],
-												Split Sharding subclasses out of _src/sharding.py into _src/sharding_impls.py

By defining the Sharding base class in its own module, we can pull it out into a separate Bazel submodule, which will help pytype inference when defining Array.

PiperOrigin-RevId: 516223009

											
										
										
											2023-03-13 08:49:39 -07:00
+								    local_shardings: Sequence[sharding_impls.XLACompatibleSharding]) -> ResultsHandler:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  out_indices = [tuple(s.devices_indices_map(aval.shape).values())
 								                 for s, aval in safe_zip(local_shardings, unmapped_local_out_avals)]
 								  handlers = [
 								      local_aval_to_result_handler(aval, s, idcs)
 								      for aval, s, idcs in safe_zip(unmapped_local_out_avals, local_shardings, out_indices)
 								  ]
 								  return ResultsHandler(handlers, local_shardings, unmapped_local_out_avals)
 								def global_avals_to_results_handler(
 								    global_out_avals: Sequence[ShapedArray],
-												Split Sharding subclasses out of _src/sharding.py into _src/sharding_impls.py

By defining the Sharding base class in its own module, we can pull it out into a separate Bazel submodule, which will help pytype inference when defining Array.

PiperOrigin-RevId: 516223009

											
										
										
											2023-03-13 08:49:39 -07:00
+								    shardings: Sequence[sharding_impls.XLACompatibleSharding],
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								    committed: bool) -> ResultsHandler:
-												Remove references to jax.config.jax_array, which is always True at head.

PiperOrigin-RevId: 516970232

											
										
										
											2023-03-15 17:08:21 -07:00
+								  handlers = [
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								      global_aval_to_result_handler(global_aval, s, committed)
 								      for global_aval, s in safe_zip(global_out_avals, shardings)
-												Remove references to jax.config.jax_array, which is always True at head.

PiperOrigin-RevId: 516970232

											
										
										
											2023-03-15 17:08:21 -07:00
+								  ]
 								  return ResultsHandler(handlers, shardings, global_out_avals)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								class ExecuteReplicated:
 								  """The logic to shard inputs, execute a replicated model, returning outputs."""
 								  __slots__ = ['xla_executable', 'name', 'backend', 'in_handler', 'out_handler',
 								               'has_unordered_effects', 'ordered_effects', 'keepalive',
 								               'has_host_callbacks', '_local_devices', 'kept_var_idx',
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								               'mut', '__weakref__']
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								  def __init__(self, xla_executable, name, backend, in_handler: InputsHandler,
 								               out_handler: ResultsHandler,
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								               unordered_effects: list[core.Effect],
 								               ordered_effects: list[core.Effect], keepalive: Any,
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
+								               has_host_callbacks: bool, kept_var_idx: set[int],
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								               mut: MutationData | None):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    self.xla_executable = xla_executable
 								    self.name = name
 								    self.backend = backend
 								    self.in_handler = in_handler
 								    self.out_handler = out_handler
 								    self.has_unordered_effects = bool(unordered_effects)
 								    self.ordered_effects = ordered_effects
 								    self._local_devices = self.xla_executable.local_devices()
 								    self.keepalive = keepalive
 								    self.has_host_callbacks = has_host_callbacks
 								    self.kept_var_idx = kept_var_idx
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								    self.mut = mut
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Add PyArrayResultHandler which behaves like
functools.partial(jax.arrays.ArrayImpl) with the added benefit
that the new PyExecuteResults type can explode directly into
ArrayImpls if passed to explode_with_handlers().

Note that this also helps with deprecating PyBuffer as the fastpath
does not need to call the PyBuffer constructor.

PiperOrigin-RevId: 512788757

											
										
										
											2023-02-27 18:26:12 -08:00
+								  def _add_tokens_to_inputs(self, input_bufs):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    if self.ordered_effects:
-												[callbacks] Add support for shardable ordered effects.

Ordered effects currently are not allowed in multi-device computations.
This is too restrictive sometimes, e.g., `io_callback(ordered=True)` uses
maximal sharding on one device and the callback would be issued only
once even in multi-device computations.

Here we add support for ordered shardable effects, which behave like
ordered effects except they are allowed in SPMD computations.
Currently, only `callback.IOOrderedEffect` is declared shardable.

In general, if the sharding of the side-effecting operation is not
maximal, then such effects would appear in a partial order, with
effects appearing ordered by program point and unordered among
the different devices at a given program point.

We also generalize the mechanism for tracking runtime tokens and
token buffers to work with multiple devices.

PiperOrigin-RevId: 566242557

											
										
										
											2023-09-18 02:49:53 -07:00
+								      tokens = [
 								        dispatch.runtime_tokens.get_token_input(eff, self._local_devices)
 								        for eff in self.ordered_effects]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      input_bufs = [*tokens, *input_bufs]
-												Add PyArrayResultHandler which behaves like
functools.partial(jax.arrays.ArrayImpl) with the added benefit
that the new PyExecuteResults type can explode directly into
ArrayImpls if passed to explode_with_handlers().

Note that this also helps with deprecating PyBuffer as the fastpath
does not need to call the PyBuffer constructor.

PiperOrigin-RevId: 512788757

											
										
										
											2023-02-27 18:26:12 -08:00
+								    return input_bufs
 								  def _handle_token_bufs(self, token_bufs, sharded_token):
-												[callbacks] Add support for shardable ordered effects.

Ordered effects currently are not allowed in multi-device computations.
This is too restrictive sometimes, e.g., `io_callback(ordered=True)` uses
maximal sharding on one device and the callback would be issued only
once even in multi-device computations.

Here we add support for ordered shardable effects, which behave like
ordered effects except they are allowed in SPMD computations.
Currently, only `callback.IOOrderedEffect` is declared shardable.

In general, if the sharding of the side-effecting operation is not
maximal, then such effects would appear in a partial order, with
effects appearing ordered by program point and unordered among
the different devices at a given program point.

We also generalize the mechanism for tracking runtime tokens and
token buffers to work with multiple devices.

PiperOrigin-RevId: 566242557

											
										
										
											2023-09-18 02:49:53 -07:00
+								    # token_bufs: Sequence[Sequence[tokenArray]], for each effect the returned
 								    # token buffer (as a singleton list).
 								    # sharded_token: ShardedToken, containing the RuntimeTokens for each device
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    for i, device in enumerate(self._local_devices):
 								      dispatch.runtime_tokens.set_output_runtime_token(
 								          device, sharded_token.get_token(i))
 								    for eff, token_buf in zip(self.ordered_effects, token_bufs):
-												[callbacks] Add support for shardable ordered effects.

Ordered effects currently are not allowed in multi-device computations.
This is too restrictive sometimes, e.g., `io_callback(ordered=True)` uses
maximal sharding on one device and the callback would be issued only
once even in multi-device computations.

Here we add support for ordered shardable effects, which behave like
ordered effects except they are allowed in SPMD computations.
Currently, only `callback.IOOrderedEffect` is declared shardable.

In general, if the sharding of the side-effecting operation is not
maximal, then such effects would appear in a partial order, with
effects appearing ordered by program point and unordered among
the different devices at a given program point.

We also generalize the mechanism for tracking runtime tokens and
token buffers to work with multiple devices.

PiperOrigin-RevId: 566242557

											
										
										
											2023-09-18 02:49:53 -07:00
+								      dispatch.runtime_tokens.set_token_result(eff, token_buf[0])
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								  @profiler.annotate_function
 								  def __call__(self, *args):
 								    args = [x for i, x in enumerate(args) if i in self.kept_var_idx]
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								    if self.mut:
 								      args = [*args, *self.mut.in_mut]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    input_bufs = self.in_handler(args)
-												Bump minimum jaxlib version to 0.4.6 which means xla_extension_version == 137 and mlir_api_version == 45

PiperOrigin-RevId: 516364523

											
										
										
											2023-03-13 17:09:06 -07:00
+								    if (self.ordered_effects or self.has_unordered_effects
 								        or self.has_host_callbacks):
 								      input_bufs = self._add_tokens_to_inputs(input_bufs)
 								      results = self.xla_executable.execute_sharded(
 								          input_bufs, with_tokens=True
 								      )
-												[callbacks] Add support for shardable ordered effects.

Ordered effects currently are not allowed in multi-device computations.
This is too restrictive sometimes, e.g., `io_callback(ordered=True)` uses
maximal sharding on one device and the callback would be issued only
once even in multi-device computations.

Here we add support for ordered shardable effects, which behave like
ordered effects except they are allowed in SPMD computations.
Currently, only `callback.IOOrderedEffect` is declared shardable.

In general, if the sharding of the side-effecting operation is not
maximal, then such effects would appear in a partial order, with
effects appearing ordered by program point and unordered among
the different devices at a given program point.

We also generalize the mechanism for tracking runtime tokens and
token buffers to work with multiple devices.

PiperOrigin-RevId: 566242557

											
										
										
											2023-09-18 02:49:53 -07:00
+								      result_token_bufs = results.disassemble_prefix_into_single_device_arrays(
 								          len(self.ordered_effects))
 								      sharded_runtime_token = results.consume_token()
 								      self._handle_token_bufs(result_token_bufs, sharded_runtime_token)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    else:
-												Bump minimum jaxlib version to 0.4.6 which means xla_extension_version == 137 and mlir_api_version == 45

PiperOrigin-RevId: 516364523

											
										
										
											2023-03-13 17:09:06 -07:00
+								      results = self.xla_executable.execute_sharded(input_bufs)
 								    if dispatch.needs_check_special():
 								      out_arrays = results.disassemble_into_single_device_arrays()
 								      for arrays in out_arrays:
 								        dispatch.check_special(self.name, arrays)
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
+								      out = self.out_handler(out_arrays)
 								    else:
 								      out = results.consume_with_handlers(self.out_handler.handlers)
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								    if self.mut is None:
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
+								      return out
 								    else:
 								      out_ = []
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								      for i, o in zip(self.mut.out_mut, out):
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
+								        if i is not None:
 								          args[i]._buf = o
 								        else:
 								          out_.append(o)
 								      return out_
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								xla_pmap_p = core.MapPrimitive('xla_pmap')
 								xla_pmap = xla_pmap_p.bind
 								xla_pmap_p.def_impl(xla_pmap_impl)
 								def _pmap_partial_eval_custom_params_updater(
 								    unks_in, inst_in, kept_outs_known, kept_outs_staged, num_res, params_known,
 								    params_staged):
 								  # prune inputs to jaxpr_known according to unks_in
 								  donated_invars_known, _ = partition_list(unks_in, params_known['donated_invars'])
 								  in_axes_known, _ = partition_list(unks_in, params_known['in_axes'])
 								  _, out_axes_known = partition_list(kept_outs_known, params_known['out_axes'])
 								  out_axes_known = out_axes_known + [0] * num_res
 								  new_params_known = dict(params_known, in_axes=tuple(in_axes_known),
 								                          out_axes=tuple(out_axes_known),
 								                          donated_invars=tuple(donated_invars_known))
 								  # added num_res new inputs to jaxpr_staged, pruning according to inst_in
 								  _, donated_invars_staged = partition_list(inst_in, params_staged['donated_invars'])
 								  donated_invars_staged = [False] * num_res + donated_invars_staged
 								  _, in_axes_staged = partition_list(inst_in, params_staged['in_axes'])
 								  in_axes_staged = [0] * num_res + in_axes_staged
 								  _, out_axes_staged = partition_list(kept_outs_staged, params_staged['out_axes'])
 								  new_params_staged = dict(params_staged, in_axes=tuple(in_axes_staged),
 								                           out_axes=tuple(out_axes_staged),
 								                           donated_invars=tuple(donated_invars_staged))
 								  return new_params_known, new_params_staged
 								def _pmap_partial_eval_custom_res_maker(params_known, aval):
 								  return core.unmapped_aval(params_known['axis_size'], core.no_axis_name, 0, aval)
 								def _pmap_dce_rule(used_outputs, eqn):
 								  # just like pe.dce_jaxpr_call_rule, except handles in_axes / out_axes
-												Axis names are now tracked via an effect

This allows propagating the names bottom up -- from equations to the jaxpr,
instead of "discovering" them top-down by traversing (and rebuilding) the
jaxpr via core.subst_axis_names.

PiperOrigin-RevId: 612416803

											
										
										
											2024-03-04 05:41:29 -08:00
+								  axis_name = eqn.params["axis_name"]
 								  with maybe_extend_axis_env(axis_name, eqn.params["global_axis_size"], None):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    new_jaxpr, used_inputs = pe.dce_jaxpr(eqn.params['call_jaxpr'], used_outputs)
 								  _, donated_invars = partition_list(used_inputs, eqn.params['donated_invars'])
 								  _, in_axes = partition_list(used_inputs, eqn.params['in_axes'])
 								  _, out_axes = partition_list(used_outputs, eqn.params['out_axes'])
 								  new_params = dict(eqn.params, call_jaxpr=new_jaxpr,
 								                    donated_invars=tuple(donated_invars),
 								                    in_axes=tuple(in_axes), out_axes=tuple(out_axes))
 								  if not any(used_inputs) and not any(used_outputs) and not new_jaxpr.effects:
 								    return used_inputs, None
 								  else:
-												Axis names are now tracked via an effect

This allows propagating the names bottom up -- from equations to the jaxpr,
instead of "discovering" them top-down by traversing (and rebuilding) the
jaxpr via core.subst_axis_names.

PiperOrigin-RevId: 612416803

											
										
										
											2024-03-04 05:41:29 -08:00
+								    effs = core.filter_named_axis_effects(new_jaxpr.effects, {axis_name})
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    new_eqn = pe.new_jaxpr_eqn(
 								        [v for v, used in zip(eqn.invars, used_inputs) if used],
 								        [v for v, used in zip(eqn.outvars, used_outputs) if used],
-												Axis names are now tracked via an effect

This allows propagating the names bottom up -- from equations to the jaxpr,
instead of "discovering" them top-down by traversing (and rebuilding) the
jaxpr via core.subst_axis_names.

PiperOrigin-RevId: 612416803

											
										
										
											2024-03-04 05:41:29 -08:00
+								        eqn.primitive, new_params, effs, eqn.source_info)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    return used_inputs, new_eqn
-												Move functions out of xla.py closer to their users.

Refactoring only, no changes intended. The goal is to shrink xla.py down to only its HLO-compatibility role, and remove things that aren't related to HLO compatibility.

Remove an unused top_k translation rule as well.

PiperOrigin-RevId: 554946059

											
										
										
											2023-08-08 14:39:57 -07:00
+								def _xla_call_partial_eval_update_params(
 								    params: core.ParamDict, kept_inputs: Sequence[bool], num_new_inputs: int
 								  ) -> core.ParamDict:
 								  donated_invars = params['donated_invars']
 								  if not kept_inputs and donated_invars:
 								    # JaxprTrace.post_process_call creates a call with no input tracers
 								    donated_invars = (False,) * num_new_inputs
 								  else:
 								    assert len(kept_inputs) == len(donated_invars)
 								    # JaxprTrace.process_call drops known input tracers
 								    donated_invars = [d for d, kept in zip(donated_invars, kept_inputs) if kept]
 								    # Any new inputs are prepended to the left, so mark those as not donated.
 								    donated_invars = [False] * num_new_inputs + donated_invars
 								  return dict(params, donated_invars=tuple(donated_invars))
 								def xla_call_jvp_update_params(params, nz_tangents):
 								  donated_invars = params['donated_invars']
 								  donated_tangents = [d for d, nz in zip(donated_invars, nz_tangents) if nz]
 								  new_donated_invars = (*donated_invars, *donated_tangents)
 								  return dict(params, donated_invars=new_donated_invars)
 								def _xla_call_transpose_update_params(params, undef_primals, nonzero_cts):
 								  donated_invars = params['donated_invars']
 								  donated_primals = [d for d, u in zip(donated_invars, undef_primals) if not u]
 								  donated_cotangents = [False for nz in nonzero_cts if nz]
 								  return dict(params, donated_invars=(*donated_primals, *donated_cotangents))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								# Set param update handlers to update `donated_invars` just like xla_call_p
-												Move functions out of xla.py closer to their users.

Refactoring only, no changes intended. The goal is to shrink xla.py down to only its HLO-compatibility role, and remove things that aren't related to HLO compatibility.

Remove an unused top_k translation rule as well.

PiperOrigin-RevId: 554946059

											
										
										
											2023-08-08 14:39:57 -07:00
+								pe.call_param_updaters[xla_pmap_p] = _xla_call_partial_eval_update_params
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								pe.partial_eval_jaxpr_custom_rules[xla_pmap_p] = \
 								    partial(pe.call_partial_eval_custom_rule,
 								            'call_jaxpr', _pmap_partial_eval_custom_params_updater,
 								            res_aval=_pmap_partial_eval_custom_res_maker)
 								pe.dce_rules[xla_pmap_p] = _pmap_dce_rule
-												Move functions out of xla.py closer to their users.

Refactoring only, no changes intended. The goal is to shrink xla.py down to only its HLO-compatibility role, and remove things that aren't related to HLO compatibility.

Remove an unused top_k translation rule as well.

PiperOrigin-RevId: 554946059

											
										
										
											2023-08-08 14:39:57 -07:00
+								ad.call_param_updaters[xla_pmap_p] = xla_call_jvp_update_params
 								ad.call_transpose_param_updaters[xla_pmap_p] = _xla_call_transpose_update_params
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								ad.primitive_transposes[xla_pmap_p] = partial(ad.map_transpose, xla_pmap_p)
 								def _pmap_axis_subst(params, subst, traverse):
 								  if 'call_jaxpr' not in params:
 								    return params
 								  if not traverse:
 								    return params
 								  def shadowed_subst(name):
 								    return (name,) if name in params['axis_name'] else subst(name)
 								  with maybe_extend_axis_env(params['axis_name'],
 								                             params['global_axis_size'], None):
 								    new_jaxpr = core.subst_axis_names_jaxpr(params['call_jaxpr'],
 								                                            shadowed_subst)
 								  return dict(params, call_jaxpr=new_jaxpr)
 								core.axis_substitution_rules[xla_pmap_p] = _pmap_axis_subst
 								def _unravel_index_hlo(axis_env):
 								  div = mlir.ir_constant(
-												Replace jax._src.util.prod with math.prod.

math.prod() was added in Python 3.8, so we can assume it is always present.

PiperOrigin-RevId: 513011144

											
										
										
											2023-02-28 12:40:30 -08:00
+								      np.array(axis_env.nreps // math.prod(axis_env.sizes), np.uint32))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  mod = mlir.ir_constant(np.array(axis_env.sizes[-1], np.uint32))
-												Use MLIR generated convenience functions athing(...) instead of writing AThingOp(...).result.

In most cases these are more succinct.

This change does not update Pallas/Mosaic.

PiperOrigin-RevId: 583448254

											
										
										
											2023-11-17 11:46:24 -08:00
+								  return hlo.remainder(hlo.divide(hlo.replica_id(), div), mod)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								def _hlo_shard(aval, axis_env, xs, in_axis):
 								  if aval is core.abstract_token:
 								    return xs
 								  elif isinstance(aval, core.ShapedArray):
 								    x, = xs
 								    dims = list(aval.shape)
 								    zero = mlir.ir_constant(np.zeros((), dtype=np.uint32))
 								    idxs = [zero] * len(dims)
 								    idxs.insert(in_axis, _unravel_index_hlo(axis_env))
 								    dims_unsqueezed = dims.copy()
 								    dims_unsqueezed.insert(in_axis, 1)
-												Use MLIR generated convenience functions athing(...) instead of writing AThingOp(...).result.

In most cases these are more succinct.

This change does not update Pallas/Mosaic.

PiperOrigin-RevId: 583448254

											
										
										
											2023-11-17 11:46:24 -08:00
+								    dynamic_slice_result = hlo.dynamic_slice(
-												Integrate StableHLO at openxla/stablehlo@ab709fe4

PiperOrigin-RevId: 589908773

											
										
										
											2023-12-11 12:29:57 -08:00
+								        x, idxs, mlir.dense_int_array(dims_unsqueezed))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    return [
-												Use MLIR generated convenience functions athing(...) instead of writing AThingOp(...).result.

In most cases these are more succinct.

This change does not update Pallas/Mosaic.

PiperOrigin-RevId: 583448254

											
										
										
											2023-11-17 11:46:24 -08:00
+								      hlo.reshape(mlir.aval_to_ir_type(aval), dynamic_slice_result)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    ]
 								  else:
 								    raise TypeError(aval)
-												Move functions out of xla.py closer to their users.

Refactoring only, no changes intended. The goal is to shrink xla.py down to only its HLO-compatibility role, and remove things that aren't related to HLO compatibility.

Remove an unused top_k translation rule as well.

PiperOrigin-RevId: 554946059

											
										
										
											2023-08-08 14:39:57 -07:00
+								def _axis_read(axis_env, axis_name):
 								  try:
 								    return max(i for i, name in enumerate(axis_env.names) if name == axis_name)
 								  except ValueError:
 								    raise NameError(f"unbound axis name: {axis_name}") from None
 								def axis_groups(axis_env: sharding_impls.AxisEnv, name) -> tuple[tuple[int, ...]]:
 								  if not isinstance(name, (list, tuple)):
 								    name = (name,)
 								  mesh_axes = tuple(unsafe_map(partial(_axis_read, axis_env), name))
 								  trailing_size, ragged = divmod(axis_env.nreps, math.prod(axis_env.sizes))
 								  assert not ragged
 								  mesh_spec = axis_env.sizes + (trailing_size,)
 								  return _axis_groups(mesh_spec, mesh_axes)
 								def _axis_groups(mesh_spec, mesh_axes):
 								  """Computes replica group ids for a collective performed over a subset of the mesh.
 								  Args:
 								    mesh_spec: A sequence of integers representing the mesh shape.
 								    mesh_axes: A sequence of integers between 0 and `len(mesh_spec)` (exclusive)
 								      indicating over which axes the collective is performed.
 								  Returns:
 								    A tuple of replica groups (i.e. tuples containing replica ids).
 								  """
 								  iota = np.arange(math.prod(mesh_spec)).reshape(mesh_spec)
 								  groups = np.reshape(
 								      np.moveaxis(iota, mesh_axes, np.arange(len(mesh_axes))),
 								      (math.prod(np.take(mesh_spec, mesh_axes)), -1))
 								  return tuple(unsafe_map(tuple, groups.T))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								# TODO(b/110096942): more efficient gather
-												Cleanup lowering rule for hlo_unshard, to remove platform dependence.

PiperOrigin-RevId: 572997889

											
										
										
											2023-10-12 13:32:47 -07:00
+								def _hlo_unshard(ctx: mlir.LoweringRuleContext, aval, axis_env, out_axis, xs):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  if aval is core.abstract_token:
 								    return xs
 								  elif isinstance(aval, core.ShapedArray):
 								    x, = xs
 								    dims = list(aval.shape)
 								    padded_aval = aval.update(shape=[axis_env.sizes[-1]] + dims)
 								    padded = mlir.full_like_aval(ctx, 0, padded_aval)
 								    zero = mlir.ir_constant(np.zeros((), dtype=np.uint32))
 								    idxs = [_unravel_index_hlo(axis_env)] + [zero] * len(dims)
-												Integrate StableHLO at openxla/stablehlo@ab709fe4

PiperOrigin-RevId: 589908773

											
										
										
											2023-12-11 12:29:57 -08:00
+								    broadcast_result = hlo.broadcast(x, mlir.dense_int_array([1]))
-												Use MLIR generated convenience functions athing(...) instead of writing AThingOp(...).result.

In most cases these are more succinct.

This change does not update Pallas/Mosaic.

PiperOrigin-RevId: 583448254

											
										
										
											2023-11-17 11:46:24 -08:00
+								    padded = hlo.dynamic_update_slice(padded, broadcast_result, idxs)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    replica_groups = mlir.dense_int_elements(
-												Move functions out of xla.py closer to their users.

Refactoring only, no changes intended. The goal is to shrink xla.py down to only its HLO-compatibility role, and remove things that aren't related to HLO compatibility.

Remove an unused top_k translation rule as well.

PiperOrigin-RevId: 554946059

											
										
										
											2023-08-08 14:39:57 -07:00
+								      axis_groups(axis_env, axis_env.names[-1]))
-												Use MLIR generated convenience functions athing(...) instead of writing AThingOp(...).result.

In most cases these are more succinct.

This change does not update Pallas/Mosaic.

PiperOrigin-RevId: 583448254

											
										
										
											2023-11-17 11:46:24 -08:00
+								    out = hlo.cross_replica_sum(padded, replica_groups)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    if out_axis != 0:
 								      # TODO(apaszke,mattjj): Change the indices to DynamicUpdateSlice instead
 								      perm = list(range(1, len(dims)))
 								      perm.insert(out_axis, 0)
 								      transposed_dims = list(dims)
 								      transposed_dims.insert(out_axis, axis_env.sizes[-1])
-												Integrate StableHLO at openxla/stablehlo@ab709fe4

PiperOrigin-RevId: 589908773

											
										
										
											2023-12-11 12:29:57 -08:00
+								      out = hlo.transpose(out, mlir.dense_int_array(perm))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								    return out
 								  else:
 								    raise TypeError(aval)
-												Move functions out of xla.py closer to their users.

Refactoring only, no changes intended. The goal is to shrink xla.py down to only its HLO-compatibility role, and remove things that aren't related to HLO compatibility.

Remove an unused top_k translation rule as well.

PiperOrigin-RevId: 554946059

											
										
										
											2023-08-08 14:39:57 -07:00
+								def _extend_axis_env(env: sharding_impls.AxisEnv, name, size: int):
 								  return sharding_impls.AxisEnv(env.nreps, env.names + (name,),
 								                                env.sizes + (size,))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								def _pmap_lowering(ctx, *in_nodes, axis_name,
 								                   axis_size, global_axis_size, devices, name,
 								                   call_jaxpr, backend=None, in_axes, out_axes,
-												Remove global_arg_shapes from pmap since it was only used for sharded_jit and sharded_jit was removed from JAX a long time ago

PiperOrigin-RevId: 520356179

											
										
										
											2023-03-29 09:22:34 -07:00
+								                   donated_invars, is_explicit_global_axis_size):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  del donated_invars  # Unused.
-												Cleanup the handling of single- and multi-platform lowering in ModuleContext

Previously, we introduced support for multi-platform lowering, by
adding a new LoweringParameters object that can be used to specify
a cross-lowering platform or even multiple platforms. But we had
kept the ModuleContext.platform in place because some lowering rules
were still referencing it. Now we replace ModuleContext.platform with
ModuleContext.platforms, which removes the redundancy, simplifies
the code, and makes it clearer that the lowering rules should not
simply assume single-platform lowering.

PiperOrigin-RevId: 576575376

											
										
										
											2023-10-25 10:39:47 -07:00
+								  mlir.check_backend_matches(backend, ctx.module_context.platforms)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  # We in-line here rather than generating a Call HLO as in the xla_call
 								  # translation rule just because the extra tuple stuff is a pain.
 								  if ctx.module_context.axis_env.names and devices is not None:
 								    raise ValueError("Nested pmap with explicit devices argument.")
-												Move functions out of xla.py closer to their users.

Refactoring only, no changes intended. The goal is to shrink xla.py down to only its HLO-compatibility role, and remove things that aren't related to HLO compatibility.

Remove an unused top_k translation rule as well.

PiperOrigin-RevId: 554946059

											
										
										
											2023-08-08 14:39:57 -07:00
+								  new_env = _extend_axis_env(ctx.module_context.axis_env, axis_name,
 								                             global_axis_size)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  # Shard the in_nodes that are mapped
 								  in_avals = [v.aval for v in call_jaxpr.invars]
 								  in_nodes_sharded = (
 								    _hlo_shard(aval, new_env, mlir.wrap_singleton_ir_values(in_node), in_axis)
 								    if in_axis is not None else mlir.wrap_singleton_ir_values(in_node)
 								    for aval, in_node, in_axis in zip(in_avals, in_nodes, in_axes))
 								  with maybe_extend_axis_env(axis_name, global_axis_size, None):  # type: ignore
 								    sub_ctx = ctx.module_context.replace(
-												Split name_stack out of mlir.ModuleContext.

A unique name_stack is built for every equation, which means that we're constantly rebuilding ModuleContext objects, even though the lifetime of almost everything else (naturally) is the Module scope. Split name_stack into an object that is threaded separately, including as part of mlir.LoweringRuleContext.

PiperOrigin-RevId: 608594374

											
										
										
											2024-02-20 07:16:38 -08:00
+								        axis_context=sharding_impls.ReplicaAxisContext(new_env))
 								    sharded_outs, _ = mlir.jaxpr_subcomp(
 								        sub_ctx, call_jaxpr,
 								        ctx.name_stack.extend(util.wrap_name(name, 'pmap')),
 								        mlir.TokenSet(), (), *in_nodes_sharded,
 								        dim_var_values=ctx.dim_var_values)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  out_avals = [v.aval for v in call_jaxpr.outvars]
-												Cleanup lowering rule for hlo_unshard, to remove platform dependence.

PiperOrigin-RevId: 572997889

											
										
										
											2023-10-12 13:32:47 -07:00
+								  outs = [_hlo_unshard(ctx, aval, new_env, out_axis, shard)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								          for aval, out_axis, shard in zip(out_avals, out_axes, sharded_outs)]
 								  return outs
 								mlir.register_lowering(xla_pmap_p, _pmap_lowering)
 								# ------------------- xmap -------------------
 								def tile_aval_nd(axis_sizes, in_axes: ArrayMapping, aval):
 								  assert isinstance(aval, ShapedArray)
 								  shape = list(aval.shape)
 								  named_shape = dict(aval.named_shape)
 								  for name, axis in in_axes.items():
 								    assert shape[axis] % axis_sizes[name] == 0
 								    assert name not in named_shape
 								    named_shape[name] = axis_sizes[name]
 								    shape[axis] //= axis_sizes[name]
 								  return aval.update(shape=tuple(shape), named_shape=named_shape)
 								def untile_aval_nd(axis_sizes, out_axes: ArrayMapping, aval):
 								  assert isinstance(aval, ShapedArray)
 								  shape = list(aval.shape)
 								  named_shape = dict(aval.named_shape)
 								  for name, axis in out_axes.items():
 								    shape[axis] *= axis_sizes[name]
 								    named_shape.pop(name, None)  # The name might be missing --- it's a broadcast.
 								  return aval.update(shape=tuple(shape), named_shape=named_shape)
-												Split Mesh and ResourceEnv into a new module jax._src.mesh.

This work is an effort to reduce cyclic dependencies in JAX internals.

Move the _global_to_local and _local_to_global methods out of Mesh and into pxla as free functions. This removes the need for jax._src.mesh to depend on things like avals.

PiperOrigin-RevId: 515667671

											
										
										
											2023-03-10 10:07:37 -08:00
+								def mesh_local_to_global(mesh, axes: ArrayMapping, aval):
 								  return untile_aval_nd(mesh.shape, axes,
 								                        tile_aval_nd(mesh.local_mesh.shape, axes, aval))
 								def mesh_global_to_local(mesh, axes: ArrayMapping, aval):
 								  return untile_aval_nd(mesh.local_mesh.shape, axes,
 								                        tile_aval_nd(mesh.shape, axes, aval))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								class SPMDBatchTrace(batching.BatchTrace):
 								  def get_axis_primitive_batcher(self, primitive, frame):
 								    if primitive in spmd_primitive_batchers:
 								      return partial(spmd_primitive_batchers[primitive],
 								          frame.size, frame.name, frame.main_trace.trace_type)
 								    return super().get_axis_primitive_batcher(primitive, frame)
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								spmd_primitive_batchers: dict[core.Primitive, Callable] = {}
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								def vtile_by_mesh(fun: lu.WrappedFun,
 								                  mesh: Mesh,
 								                  in_axes: Sequence[ArrayMapping],
 								                  out_axes: Sequence[ArrayMapping]):
 								  # We vectorize in reversed order, because vmap is often biased towards
 								  # moving the batch axis to the front, and this way of stacking transforms
 								  # will order the batch axes according to the mesh axis order.
 								  # Not strictly necessary, but seems nicer than reversing it?
 								  for name, size in reversed(mesh.shape.items()):
 								    fun = batching.vtile(fun,
 								                         tuple(a.get(name, None) for a in in_axes),
 								                         tuple(a.get(name, None) for a in out_axes),
 								                         tile_size=size,
 								                         axis_name=name,
 								                         main_type=SPMDBatchTrace)
 								  return fun
 								full_to_shard_p = core.Primitive('full_to_shard')
 								@full_to_shard_p.def_abstract_eval
 								def _full_to_shard_abstract_eval(x, axes, mesh, **_):
 								  # TODO: Assert x is a global aval! Or ideally check that it's global in dims from axes!
 								  return tile_aval_nd(mesh.shape, axes, x)
-												Split sharding_impls into its own Bazel target.

* Move dependencies of sharding_impls into sharding_impls to avoid creating cyclic dependencies.
* Fix a handful of new pytype errors.

PiperOrigin-RevId: 523146076

											
										
										
											2023-04-10 10:15:08 -07:00
+								def manual_proto(
 								    aval: core.ShapedArray,
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								    manual_axes_set: frozenset[sharding_impls.MeshAxisName], mesh: Mesh):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  """Create an OpSharding proto that declares all mesh axes from `axes` as manual
 								  and all others as replicated.
 								  """
 								  named_mesh_shape = mesh.shape
 								  mesh_shape = list(named_mesh_shape.values())
 								  axis_order = {axis: i for i, axis in enumerate(mesh.axis_names)}
-												Switch from flake8 to Ruff

											
										
										
											2023-11-14 23:34:30 -05:00
+								  manual_axes = sorted(manual_axes_set, key=str)
 								  replicated_axes = [axis for axis in mesh.axis_names
 								                     if axis not in manual_axes_set]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								  tad_perm = ([axis_order[a] for a in replicated_axes] +
 								              [axis_order[a] for a in manual_axes])
 								  tad_shape = [1] * aval.ndim
-												Change np.prod->math.prod

Why? This is generally used for static operations on shapes, but np.prod
has an unfortunate corner-case behavior that np.prod([]) returns a float.
math.prod is available as of Python 3.8, and is a better solution here.

											
										
										
											2023-04-13 11:48:11 -07:00
+								  tad_shape.append(math.prod([named_mesh_shape[a] for a in replicated_axes]))
 								  tad_shape.append(math.prod([named_mesh_shape[a] for a in manual_axes]))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Change np.prod->math.prod

Why? This is generally used for static operations on shapes, but np.prod
has an unfortunate corner-case behavior that np.prod([]) returns a float.
math.prod is available as of Python 3.8, and is a better solution here.

											
										
										
											2023-04-13 11:48:11 -07:00
+								  raw_mesh = np.arange(math.prod(mesh_shape)).reshape(mesh_shape)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  proto = xc.OpSharding()
 								  proto.type = xc.OpSharding.Type.OTHER
 								  proto.tile_assignment_dimensions = tad_shape
 								  proto.tile_assignment_devices = list(raw_mesh.transpose(tad_perm).reshape(tad_shape).flat)
 								  proto.last_tile_dims = [xc.OpSharding.Type.REPLICATED, xc.OpSharding.Type.MANUAL]
 								  return proto
 								@partial(mlir.register_lowering, full_to_shard_p)
-												Split Mesh and ResourceEnv into a new module jax._src.mesh.

This work is an effort to reduce cyclic dependencies in JAX internals.

Move the _global_to_local and _local_to_global methods out of Mesh and into pxla as free functions. This removes the need for jax._src.mesh to depend on things like avals.

PiperOrigin-RevId: 515667671

											
										
										
											2023-03-10 10:07:37 -08:00
+								def _full_to_shard_lowering(ctx, x, *, axes: ArrayMapping, mesh: Mesh,
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								                            manual_axes: frozenset[sharding_impls.MeshAxisName]):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  # TODO: Can we short-circuit for replicated values? Probably not.
 								  aval_in, = ctx.avals_in
 								  aval_out, = ctx.avals_out
-												Delete `mesh_sharding_specs` from JAX

PiperOrigin-RevId: 595164505

											
										
										
											2024-01-02 11:13:57 -08:00
+								  sharding_proto = (
 								      sharding_impls.NamedSharding(mesh, array_mapping_to_axis_resources(axes))
 								      ._to_xla_hlo_sharding(aval_in.ndim).to_proto())
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  unspecified_dims = set(range(aval_in.ndim)) - set(axes.values())
-												Delete `mesh_sharding_specs` from JAX

PiperOrigin-RevId: 595164505

											
										
										
											2024-01-02 11:13:57 -08:00
+								  sx = mlir.wrap_with_sharding_op(ctx, x, aval_in, sharding_proto,
 								                                  unspecified_dims=unspecified_dims)
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								  proto = manual_proto(aval_in, manual_axes, mesh)
-												Delete `mesh_sharding_specs` from JAX

PiperOrigin-RevId: 595164505

											
										
										
											2024-01-02 11:13:57 -08:00
+								  return (mlir.wrap_with_full_to_shard_op(ctx, sx, aval_out, proto,
 								                                          unspecified_dims=unspecified_dims),)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								shard_to_full_p = core.Primitive('shard_to_full')
 								@shard_to_full_p.def_abstract_eval
 								def _shard_to_full_abstract_eval(x, axes, mesh, **_):
 								  # TODO: Assert x is a global aval! Or ideally check that it's global in dims from axes!
 								  return untile_aval_nd(mesh.shape, axes, x)
 								@partial(mlir.register_lowering, shard_to_full_p)
-												[shape_poly] Lowering sharding annotations in presence of dynamic shapes

Sharding annotations are lowered to custom calls, and in presence of dynamic shapes
we must use the `indices_of_shape_operands` attribute to hlo.CustomCall.
In order to be able to generate the code to compute the result shapes
we must pass the `LoweringRuleContext` and the result abstract value
to the lowering helpers that generate the custom calls.

The above is easy everywhere, except for the sharding annotations for
the inputs and outputs for a function, because we do not yet have
a LoweringRuleContext available.

This code is tested by tests that are still disabled in sharding_test.
They can be enabled once StableHLO improves the support for
dynamic shapes for custom calls: https://github.com/openxla/stablehlo/issues/1367

											
										
										
											2023-04-05 09:38:37 +02:00
+								def _shard_to_full_lowering(ctx: mlir.LoweringRuleContext, x, *, axes: ArrayMapping, mesh: Mesh,
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								                            manual_axes: frozenset[sharding_impls.MeshAxisName]):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  aval_in, = ctx.avals_in
 								  aval_out, = ctx.avals_out
-												[shape_poly] Lowering sharding annotations in presence of dynamic shapes

Sharding annotations are lowered to custom calls, and in presence of dynamic shapes
we must use the `indices_of_shape_operands` attribute to hlo.CustomCall.
In order to be able to generate the code to compute the result shapes
we must pass the `LoweringRuleContext` and the result abstract value
to the lowering helpers that generate the custom calls.

The above is easy everywhere, except for the sharding annotations for
the inputs and outputs for a function, because we do not yet have
a LoweringRuleContext available.

This code is tested by tests that are still disabled in sharding_test.
They can be enabled once StableHLO improves the support for
dynamic shapes for custom calls: https://github.com/openxla/stablehlo/issues/1367

											
										
										
											2023-04-05 09:38:37 +02:00
+								  proto = manual_proto(aval_in, manual_axes, mesh)  # type: ignore
 								  unspecified_dims = set(range(aval_in.ndim)) - set(axes.values())  # type: ignore
-												Delete `mesh_sharding_specs` from JAX

PiperOrigin-RevId: 595164505

											
										
										
											2024-01-02 11:13:57 -08:00
+								  sx = mlir.wrap_with_sharding_op(ctx, x, aval_in, proto,
 								                                  unspecified_dims=unspecified_dims)
 								  sharding_proto = (
 								      sharding_impls.NamedSharding(mesh, array_mapping_to_axis_resources(axes))
 								      ._to_xla_hlo_sharding(aval_out.ndim).to_proto())
 								  return (mlir.wrap_with_shard_to_full_op(ctx, sx, aval_out, sharding_proto,
 								                                          unspecified_dims),)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								@lu.transformation
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								def vtile_manual(manual_axes: frozenset[sharding_impls.MeshAxisName],
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								                 mesh: Mesh,
 								                 in_axes: Sequence[ArrayMapping],
 								                 out_axes: Sequence[ArrayMapping],
 								                 *args):
 								  tiled_args = [full_to_shard_p.bind(arg, axes=axes, mesh=mesh, manual_axes=manual_axes)
 								                for arg, axes in zip(args, in_axes)]
 								  tiled_outs = yield tiled_args, {}
 								  outs = [shard_to_full_p.bind(out, axes=axes, mesh=mesh, manual_axes=manual_axes)
 								          for out, axes in zip(tiled_outs, out_axes)]
 								  yield outs
 								@dataclasses.dataclass(frozen=True)
 								class TileVectorize:
 								  pass
 								@dataclasses.dataclass(frozen=True)
 								class TileManual:
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								  manual_axes: frozenset[sharding_impls.MeshAxisName]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								TilingMethod = Union[TileVectorize, TileManual]
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								def check_if_any_auto(
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								    shardings: Iterable[(sharding_impls.XLACompatibleSharding |
 								                              AUTO | UnspecifiedValue)]) -> bool:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  for s in shardings:
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								    if is_auto(s):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      return True
 								  return False
-												Improve the error message which is raised from `_get_and_check_device_assignment`.

Before:

```
ValueError: Devices of all `Array` inputs and outputs should be the same. Got array device ids [0] on platform CPU and another array's device ids [0, 1, 2, 3] on platform CPU
```

After:

```
ValueError: Received incompatible devices for jitted computation. Got argument inp of ArrayPjitTest.test_jit_with_sharding_constraint_committed_inp_error.<locals>.sharded_inp with bfloat16[8,2] and device ids [0] on platform CPU and with_sharding_constraint or nested pjit or shard_map with device ids [0, 1, 2, 3] on platform CPU at jax/tests/pjit_test.py:2509 (sharded_inp)
```
PiperOrigin-RevId: 508746961

											
										
										
											2023-02-10 13:53:43 -08:00
+								class MismatchType(enum.Enum):
 								  ARG_SHARDING = 0
 								  OUT_SHARDING = 1
 								  SHARDING_INSIDE_COMPUTATION = 2
 								  CONTEXT_DEVICES = 3
 								  IN_SHARDING = 4
 								  def __str__(self):
 								    if self.name == 'IN_SHARDING':
 								      return 'explicit input sharding'
 								    elif self.name == 'OUT_SHARDING':
 								      return 'explicit output sharding'
 								    elif self.name == 'CONTEXT_DEVICES':
 								      return 'devices'
 								    return f'{self.name}'
 								@dataclasses.dataclass
 								class DeviceAssignmentMismatch:
 								  da: Sequence[xc.Device]
 								  m_type: MismatchType
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								  source_info: dispatch.SourceInfo | None
-												Improve the error message which is raised from `_get_and_check_device_assignment`.

Before:

```
ValueError: Devices of all `Array` inputs and outputs should be the same. Got array device ids [0] on platform CPU and another array's device ids [0, 1, 2, 3] on platform CPU
```

After:

```
ValueError: Received incompatible devices for jitted computation. Got argument inp of ArrayPjitTest.test_jit_with_sharding_constraint_committed_inp_error.<locals>.sharded_inp with bfloat16[8,2] and device ids [0] on platform CPU and with_sharding_constraint or nested pjit or shard_map with device ids [0, 1, 2, 3] on platform CPU at jax/tests/pjit_test.py:2509 (sharded_inp)
```
PiperOrigin-RevId: 508746961

											
										
										
											2023-02-10 13:53:43 -08:00
 								  @property
 								  def device_ids(self) -> Sequence[int]:
 								    return [d.id for d in self.da]
 								  @property
 								  def platform(self) -> str:
 								    return self.da[0].platform.upper()
 								  def _maybe_api_name(self, api_name) -> str:
 								    return f" {api_name}'s" if self.m_type == MismatchType.CONTEXT_DEVICES else ""
 								  @property
 								  def source_info_str(self):
-												Summarize equations in sharding mismatch errors lazily.

Don't spend time building error output unless there is an error.

PiperOrigin-RevId: 606812191

											
										
										
											2024-02-13 18:26:41 -08:00
+								    return (
 								        "" if self.source_info is None
 								        else f" at {source_info_util.summarize(self.source_info.source_info)}"
 								    )
-												Improve the error message which is raised from `_get_and_check_device_assignment`.

Before:

```
ValueError: Devices of all `Array` inputs and outputs should be the same. Got array device ids [0] on platform CPU and another array's device ids [0, 1, 2, 3] on platform CPU
```

After:

```
ValueError: Received incompatible devices for jitted computation. Got argument inp of ArrayPjitTest.test_jit_with_sharding_constraint_committed_inp_error.<locals>.sharded_inp with bfloat16[8,2] and device ids [0] on platform CPU and with_sharding_constraint or nested pjit or shard_map with device ids [0, 1, 2, 3] on platform CPU at jax/tests/pjit_test.py:2509 (sharded_inp)
```
PiperOrigin-RevId: 508746961

											
										
										
											2023-02-10 13:53:43 -08:00
 								  @property
 								  def _dev_ids_plat_str(self):
 								    return f"device ids {self.device_ids} on platform {self.platform}"
-												Point to the exact primitive name nested under jit/pjit instead of mentioning all possible ones.

PiperOrigin-RevId: 508770290

											
										
										
											2023-02-10 15:36:04 -08:00
+								  def m_type_str(self, api_name):
-												Fix type failures under an upcoming pytype change.

PiperOrigin-RevId: 522591195

											
										
										
											2023-04-07 07:09:44 -07:00
+								    return (f'{self.source_info and self.source_info.eqn_name} inside {api_name}'
-												Point to the exact primitive name nested under jit/pjit instead of mentioning all possible ones.

PiperOrigin-RevId: 508770290

											
										
										
											2023-02-10 15:36:04 -08:00
+								            if self.m_type == MismatchType.SHARDING_INSIDE_COMPUTATION else self.m_type)
-												Improve the error message which is raised from `_get_and_check_device_assignment`.

Before:

```
ValueError: Devices of all `Array` inputs and outputs should be the same. Got array device ids [0] on platform CPU and another array's device ids [0, 1, 2, 3] on platform CPU
```

After:

```
ValueError: Received incompatible devices for jitted computation. Got argument inp of ArrayPjitTest.test_jit_with_sharding_constraint_committed_inp_error.<locals>.sharded_inp with bfloat16[8,2] and device ids [0] on platform CPU and with_sharding_constraint or nested pjit or shard_map with device ids [0, 1, 2, 3] on platform CPU at jax/tests/pjit_test.py:2509 (sharded_inp)
```
PiperOrigin-RevId: 508746961

											
										
										
											2023-02-10 13:53:43 -08:00
+								  def _str(self, api_name):
-												Point to the exact primitive name nested under jit/pjit instead of mentioning all possible ones.

PiperOrigin-RevId: 508770290

											
										
										
											2023-02-10 15:36:04 -08:00
+								    return (f"{self._maybe_api_name(api_name)} {self.m_type_str(api_name)} with "
-												Improve the error message which is raised from `_get_and_check_device_assignment`.

Before:

```
ValueError: Devices of all `Array` inputs and outputs should be the same. Got array device ids [0] on platform CPU and another array's device ids [0, 1, 2, 3] on platform CPU
```

After:

```
ValueError: Received incompatible devices for jitted computation. Got argument inp of ArrayPjitTest.test_jit_with_sharding_constraint_committed_inp_error.<locals>.sharded_inp with bfloat16[8,2] and device ids [0] on platform CPU and with_sharding_constraint or nested pjit or shard_map with device ids [0, 1, 2, 3] on platform CPU at jax/tests/pjit_test.py:2509 (sharded_inp)
```
PiperOrigin-RevId: 508746961

											
										
										
											2023-02-10 13:53:43 -08:00
+								            f"{self._dev_ids_plat_str}{self.source_info_str}")
 								class DeviceAssignmentMismatchError(Exception):
 								  pass
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								ShardingInfo = tuple[
-												Allow pjit.AUTO to be used with jax.jit. This introduces an API change which requires a mesh to be provided to pjit.AUTO(mesh).

`with mesh:` is no longer required with pjit to use the auto spmd pass of GSPMD.

PiperOrigin-RevId: 533801596

											
										
										
											2023-05-20 22:59:52 -07:00
+								    Union[sharding_impls.XLACompatibleSharding, UnspecifiedValue, AUTO],
-												Upgrade remaining sources to Python 3.9

This PR is a follow up to #18881.

The changes were generated by adding

    from __future__ import annotations

to the files which did not already have them and running

    pyupgrade --py39-plus --keep-percent-format {jax,tests,jaxlib,examples,benchmarks}/**/*.py

											
										
										
											2023-12-11 13:59:29 +00:00
+								    MismatchType,
 								    Union[Any, None],  # Any is dispatch.SourceInfo to avoid circular imports
 								]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Use shard_args and global_result_handlers since the `aval_to_result_handler` and `dispatch.device_put` will be removed soon.

PiperOrigin-RevId: 515471662

											
										
										
											2023-03-09 16:18:31 -08:00
 								def _get_default_device() -> xc.Device:
-												Migrate a subset of internal modules to use state objects

The motivation here is to gradually replace all dynamic lookups on `jax.config`
with statically-typed state objects, which are more type checker/IDE friendly.

PiperOrigin-RevId: 571932143

											
										
										
											2023-10-09 07:28:18 -07:00
+								  return config.default_device.value or xb.local_devices()[0]
-												Use shard_args and global_result_handlers since the `aval_to_result_handler` and `dispatch.device_put` will be removed soon.

PiperOrigin-RevId: 515471662

											
										
										
											2023-03-09 16:18:31 -08:00
-												Make device_put of non_addressable_jax_array and non_addressable_sharding (with different device order) thread safe by making _get_and_check_device_assignment thread local.

PiperOrigin-RevId: 568358309

											
										
										
											2023-09-25 16:41:43 -07:00
+								class _thread_local_decorator(threading.local):
 								  def __init__(self, fn):
 								    self.fn = fn
 								  def __call__(self, *args, **kwargs):
 								    return self.fn(*args, **kwargs)
 								@_thread_local_decorator
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								def _get_and_check_device_assignment(
-												Improve the error message which is raised from `_get_and_check_device_assignment`.

Before:

```
ValueError: Devices of all `Array` inputs and outputs should be the same. Got array device ids [0] on platform CPU and another array's device ids [0, 1, 2, 3] on platform CPU
```

After:

```
ValueError: Received incompatible devices for jitted computation. Got argument inp of ArrayPjitTest.test_jit_with_sharding_constraint_committed_inp_error.<locals>.sharded_inp with bfloat16[8,2] and device ids [0] on platform CPU and with_sharding_constraint or nested pjit or shard_map with device ids [0, 1, 2, 3] on platform CPU at jax/tests/pjit_test.py:2509 (sharded_inp)
```
PiperOrigin-RevId: 508746961

											
										
										
											2023-02-10 13:53:43 -08:00
+								    shardings: Iterable[ShardingInfo],
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								    devices: Sequence[xc.Device] | None,
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								) -> tuple[xc.Client, tuple[xc.Device, ...]]:
-												Improve the error message which is raised from `_get_and_check_device_assignment`.

Before:

```
ValueError: Devices of all `Array` inputs and outputs should be the same. Got array device ids [0] on platform CPU and another array's device ids [0, 1, 2, 3] on platform CPU
```

After:

```
ValueError: Received incompatible devices for jitted computation. Got argument inp of ArrayPjitTest.test_jit_with_sharding_constraint_committed_inp_error.<locals>.sharded_inp with bfloat16[8,2] and device ids [0] on platform CPU and with_sharding_constraint or nested pjit or shard_map with device ids [0, 1, 2, 3] on platform CPU at jax/tests/pjit_test.py:2509 (sharded_inp)
```
PiperOrigin-RevId: 508746961

											
										
										
											2023-02-10 13:53:43 -08:00
+								  first_sharding_info = None
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  if devices is None:
-												Make _device_assignment a `Tuple[Device]` so that we don't convert a list to a tuple and vice-versa everywhere

PiperOrigin-RevId: 524002310

											
										
										
											2023-04-13 08:02:53 -07:00
+								    devices = ()
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  else:
-												Make _device_assignment a `Tuple[Device]` so that we don't convert a list to a tuple and vice-versa everywhere

PiperOrigin-RevId: 524002310

											
										
										
											2023-04-13 08:02:53 -07:00
+								    devices = tuple(devices)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Improve the error message which is raised from `_get_and_check_device_assignment`.

Before:

```
ValueError: Devices of all `Array` inputs and outputs should be the same. Got array device ids [0] on platform CPU and another array's device ids [0, 1, 2, 3] on platform CPU
```

After:

```
ValueError: Received incompatible devices for jitted computation. Got argument inp of ArrayPjitTest.test_jit_with_sharding_constraint_committed_inp_error.<locals>.sharded_inp with bfloat16[8,2] and device ids [0] on platform CPU and with_sharding_constraint or nested pjit or shard_map with device ids [0, 1, 2, 3] on platform CPU at jax/tests/pjit_test.py:2509 (sharded_inp)
```
PiperOrigin-RevId: 508746961

											
										
										
											2023-02-10 13:53:43 -08:00
+								  for i, s_type, source_info in shardings:
-												Allow pjit.AUTO to be used with jax.jit. This introduces an API change which requires a mesh to be provided to pjit.AUTO(mesh).

`with mesh:` is no longer required with pjit to use the auto spmd pass of GSPMD.

PiperOrigin-RevId: 533801596

											
										
										
											2023-05-20 22:59:52 -07:00
+								    if is_unspecified(i):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      continue
-												Allow pjit.AUTO to be used with jax.jit. This introduces an API change which requires a mesh to be provided to pjit.AUTO(mesh).

`with mesh:` is no longer required with pjit to use the auto spmd pass of GSPMD.

PiperOrigin-RevId: 533801596

											
										
										
											2023-05-20 22:59:52 -07:00
-												Improve the error message which is raised from `_get_and_check_device_assignment`.

Before:

```
ValueError: Devices of all `Array` inputs and outputs should be the same. Got array device ids [0] on platform CPU and another array's device ids [0, 1, 2, 3] on platform CPU
```

After:

```
ValueError: Received incompatible devices for jitted computation. Got argument inp of ArrayPjitTest.test_jit_with_sharding_constraint_committed_inp_error.<locals>.sharded_inp with bfloat16[8,2] and device ids [0] on platform CPU and with_sharding_constraint or nested pjit or shard_map with device ids [0, 1, 2, 3] on platform CPU at jax/tests/pjit_test.py:2509 (sharded_inp)
```
PiperOrigin-RevId: 508746961

											
										
										
											2023-02-10 13:53:43 -08:00
+								    if first_sharding_info is None:
-												Allow pjit.AUTO to be used with jax.jit. This introduces an API change which requires a mesh to be provided to pjit.AUTO(mesh).

`with mesh:` is no longer required with pjit to use the auto spmd pass of GSPMD.

PiperOrigin-RevId: 533801596

											
										
										
											2023-05-20 22:59:52 -07:00
+								      first_sharding_info = (
 								          (i.mesh._flat_devices_tuple, s_type, source_info) if is_auto(i)  # type: ignore
 								          else (i._device_assignment, s_type, source_info))  # type: ignore
 								    arr_device_assignment = i.mesh._flat_devices_tuple if is_auto(i) else i._device_assignment  # type: ignore
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    if not devices:
-												Improve the error message which is raised from `_get_and_check_device_assignment`.

Before:

```
ValueError: Devices of all `Array` inputs and outputs should be the same. Got array device ids [0] on platform CPU and another array's device ids [0, 1, 2, 3] on platform CPU
```

After:

```
ValueError: Received incompatible devices for jitted computation. Got argument inp of ArrayPjitTest.test_jit_with_sharding_constraint_committed_inp_error.<locals>.sharded_inp with bfloat16[8,2] and device ids [0] on platform CPU and with_sharding_constraint or nested pjit or shard_map with device ids [0, 1, 2, 3] on platform CPU at jax/tests/pjit_test.py:2509 (sharded_inp)
```
PiperOrigin-RevId: 508746961

											
										
										
											2023-02-10 13:53:43 -08:00
+								      if first_sharding_info[0] != arr_device_assignment:
 								        raise DeviceAssignmentMismatchError([
 								            DeviceAssignmentMismatch(*first_sharding_info),
 								            DeviceAssignmentMismatch(arr_device_assignment, s_type, source_info)])
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    else:
 								      if devices != arr_device_assignment:
-												Improve the error message which is raised from `_get_and_check_device_assignment`.

Before:

```
ValueError: Devices of all `Array` inputs and outputs should be the same. Got array device ids [0] on platform CPU and another array's device ids [0, 1, 2, 3] on platform CPU
```

After:

```
ValueError: Received incompatible devices for jitted computation. Got argument inp of ArrayPjitTest.test_jit_with_sharding_constraint_committed_inp_error.<locals>.sharded_inp with bfloat16[8,2] and device ids [0] on platform CPU and with_sharding_constraint or nested pjit or shard_map with device ids [0, 1, 2, 3] on platform CPU at jax/tests/pjit_test.py:2509 (sharded_inp)
```
PiperOrigin-RevId: 508746961

											
										
										
											2023-02-10 13:53:43 -08:00
+								        raise DeviceAssignmentMismatchError([
 								            DeviceAssignmentMismatch(devices, MismatchType.CONTEXT_DEVICES, None),
 								            DeviceAssignmentMismatch(arr_device_assignment, s_type, source_info)])
 								  if first_sharding_info is None and devices:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    final_device_assignment = devices
-												Improve the error message which is raised from `_get_and_check_device_assignment`.

Before:

```
ValueError: Devices of all `Array` inputs and outputs should be the same. Got array device ids [0] on platform CPU and another array's device ids [0, 1, 2, 3] on platform CPU
```

After:

```
ValueError: Received incompatible devices for jitted computation. Got argument inp of ArrayPjitTest.test_jit_with_sharding_constraint_committed_inp_error.<locals>.sharded_inp with bfloat16[8,2] and device ids [0] on platform CPU and with_sharding_constraint or nested pjit or shard_map with device ids [0, 1, 2, 3] on platform CPU at jax/tests/pjit_test.py:2509 (sharded_inp)
```
PiperOrigin-RevId: 508746961

											
										
										
											2023-02-10 13:53:43 -08:00
+								  elif first_sharding_info is None:
-												Make _device_assignment a `Tuple[Device]` so that we don't convert a list to a tuple and vice-versa everywhere

PiperOrigin-RevId: 524002310

											
										
										
											2023-04-13 08:02:53 -07:00
+								    final_device_assignment = (_get_default_device(),)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  else:
-												Improve the error message which is raised from `_get_and_check_device_assignment`.

Before:

```
ValueError: Devices of all `Array` inputs and outputs should be the same. Got array device ids [0] on platform CPU and another array's device ids [0, 1, 2, 3] on platform CPU
```

After:

```
ValueError: Received incompatible devices for jitted computation. Got argument inp of ArrayPjitTest.test_jit_with_sharding_constraint_committed_inp_error.<locals>.sharded_inp with bfloat16[8,2] and device ids [0] on platform CPU and with_sharding_constraint or nested pjit or shard_map with device ids [0, 1, 2, 3] on platform CPU at jax/tests/pjit_test.py:2509 (sharded_inp)
```
PiperOrigin-RevId: 508746961

											
										
										
											2023-02-10 13:53:43 -08:00
+								    final_device_assignment = first_sharding_info[0]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  return xb.get_device_backend(final_device_assignment[0]), final_device_assignment
-												[shard-map] fix eager shmap+prngs, revise phys aval/sharding logic

Co-authored-by: Yash Katariya <yashkatariya@google.com>

											
										
										
											2023-04-05 14:09:46 -07:00
+								MaybeSharding = Union[sharding_impls.XLACompatibleSharding, UnspecifiedValue]
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
-												Move some utilities out of dispatch.py next to their users, add more types.

Internal cleanups only, no user-visible changes intended.

PiperOrigin-RevId: 554876522

											
										
										
											2023-08-08 10:51:38 -07:00
+								def prune_unused_inputs(
 								    jaxpr: core.Jaxpr,
 								) -> tuple[core.Jaxpr, set[int], set[int]]:
 								  used_outputs = [True] * len(jaxpr.outvars)
 								  new_jaxpr, used_consts, used_inputs = pe.dce_jaxpr_consts(jaxpr, used_outputs)
 								  kept_const_idx = {i for i, b in enumerate(used_consts) if b}
 								  kept_var_idx = {i for i, b in enumerate(used_inputs) if b}
 								  return new_jaxpr, kept_const_idx, kept_var_idx
-												Simply lower_sharding_computation signature by always taking a closed jaxpr as input. For apply_primitive do the tracing to jaxpr in dispatch.py

PiperOrigin-RevId: 585810475

											
										
										
											2023-11-27 18:00:22 -08:00
+								@weakref_lru_cache
 								def _dce_jaxpr(closed_jaxpr, global_in_avals, api_name, fun_name,
 								               keep_unused, donated_invars, auto_spmd_lowering):
-												Remove circular dependency between source_info_util and util.

Move util.new_name_stack into source_info_util. Replace uses of util.extend_name_stack with stack.extend().

PiperOrigin-RevId: 512685810

											
										
										
											2023-02-27 11:37:10 -08:00
+								  name_stack = source_info_util.new_name_stack(wrap_name(fun_name, api_name))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Simply lower_sharding_computation signature by always taking a closed jaxpr as input. For apply_primitive do the tracing to jaxpr in dispatch.py

PiperOrigin-RevId: 585810475

											
										
										
											2023-11-27 18:00:22 -08:00
+								  assert isinstance(closed_jaxpr, core.ClosedJaxpr)
 								  jaxpr = closed_jaxpr.jaxpr
 								  global_out_avals = closed_jaxpr.out_avals
 								  consts = closed_jaxpr.consts
-												Pass the `jaxpr` from `pjit` since there is no need to trace it again in lower_sharding_computation. It also helps in preserving debug_info that already exists on the jaxpr to surface it in MHLO eventually.

PiperOrigin-RevId: 513268085

											
										
										
											2023-03-01 10:04:59 -08:00
-												Allow pjit.AUTO to be used with jax.jit. This introduces an API change which requires a mesh to be provided to pjit.AUTO(mesh).

`with mesh:` is no longer required with pjit to use the auto spmd pass of GSPMD.

PiperOrigin-RevId: 533801596

											
										
										
											2023-05-20 22:59:52 -07:00
+								  if (keep_unused or auto_spmd_lowering or
-												DCE as early as possible so that `committed` is not dependent on DCE's vars

PiperOrigin-RevId: 521879918

											
										
										
											2023-04-04 15:20:32 -07:00
+								      any(hasattr(a, "shape") and not core.is_constant_shape(a.shape)
 								          for a in global_in_avals)):
 								    kept_var_idx = set(range(len(global_in_avals)))
 								  else:
-												Move some utilities out of dispatch.py next to their users, add more types.

Internal cleanups only, no user-visible changes intended.

PiperOrigin-RevId: 554876522

											
										
										
											2023-08-08 10:51:38 -07:00
+								    jaxpr, kept_const_idx, kept_var_idx = prune_unused_inputs(jaxpr)
-												DCE as early as possible so that `committed` is not dependent on DCE's vars

PiperOrigin-RevId: 521879918

											
										
										
											2023-04-04 15:20:32 -07:00
+								    consts = [c for i, c in enumerate(consts) if i in kept_const_idx]
 								    global_in_avals = tuple(a for i, a in enumerate(global_in_avals) if i in kept_var_idx)
 								    donated_invars = tuple(x for i, x in enumerate(donated_invars) if i in kept_var_idx)
 								    del kept_const_idx
 								  jaxpr = dispatch.apply_outfeed_rewriter(jaxpr)
 								  closed_jaxpr = core.ClosedJaxpr(jaxpr, consts)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  return (closed_jaxpr, global_in_avals, tuple(global_out_avals), donated_invars,
 								          kept_var_idx, name_stack)
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								class MutationData(NamedTuple):
 								  in_mut: list[core.MutableArray]
 								  out_mut: list[int | None]
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
+								@weakref_lru_cache
 								def _discharge_refs(
 								    jaxpr: core.ClosedJaxpr
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								) -> tuple[core.ClosedJaxpr, Sequence[int | None], MutationData]:
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
+								  from jax._src.state.discharge import discharge_state
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								  jaxpr, in_mut = _move_mutable_consts(jaxpr)
-												[mutable-arrays] move MutableArray, add eager, improve tests, fix bug

1. move MutableArray to core.py, and some handlers to their respective files
2. fix a bug in aliasing setup (it was just broken before, now better test coverage)
3. add eager support by enabling get_p, swap_p, and addupdate_p impls
4. improve tests slightly

											
										
										
											2024-03-01 11:07:45 -08:00
+								  new_jaxpr = core.ClosedJaxpr(*discharge_state(jaxpr.jaxpr, jaxpr.consts))
 								  count = it.count(len(jaxpr.out_avals))  # new outputs are appended to the end
 								  inout_map = {i: next(count) for i, a in enumerate(jaxpr.in_avals)
 								               if isinstance(a, AbstractRef)}
 								  outin_map = {j: i for i, j in inout_map.items()}
 								  inout_aliases = tuple(map(inout_map.get, range(len(new_jaxpr.in_avals))))
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								  out_mut = list(map(outin_map.get, range(len(new_jaxpr.out_avals))))
 								  return new_jaxpr, inout_aliases, MutationData(in_mut, out_mut)
 								@weakref_lru_cache
 								def _move_mutable_consts(
 								    closed_jaxpr: core.ClosedJaxpr,
 								) -> tuple[core.ClosedJaxpr, list[core.MutableArray]]:
 								  jaxpr = closed_jaxpr.jaxpr
 								  hoist = [isinstance(c, core.MutableArray) for c in closed_jaxpr.consts]
 								  consts, in_mut = partition_list(hoist, closed_jaxpr.consts)
 								  constvars, mutvars = partition_list(hoist, jaxpr.constvars)
 								  invars = (*jaxpr.invars, *mutvars)
 								  effects = pe.make_jaxpr_effects(constvars, invars, jaxpr.outvars, jaxpr.eqns)
 								  jaxpr = core.Jaxpr(constvars, invars, jaxpr.outvars, jaxpr.eqns,
 								                     effects, None)
 								  return core.ClosedJaxpr(jaxpr, consts), in_mut
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
 								@dataclasses.dataclass(frozen=True)
 								class SemanticallyEqualShardings:
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								  shardings: tuple[sharding_impls.GSPMDSharding | UnspecifiedValue, ...]
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
 								  def __hash__(self):
 								    return hash(tuple(
-												Make sure that if gspmd_sharding1 == gspmd_sharding2, then their hash also is equal.

PiperOrigin-RevId: 613009976

											
										
										
											2024-03-05 16:35:57 -08:00
+								        (s._hlo_sharding_hash, s.memory_kind)  # type: ignore
 								        if isinstance(s, sharding_impls.GSPMDSharding) else s
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								        for s in self.shardings))
 								  def __eq__(self, other):
 								    if not isinstance(other, SemanticallyEqualShardings):
 								      return False
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								    return all(
 								        (op_shardings.are_op_shardings_equal(s._hlo_sharding, o._hlo_sharding)
-												Canonicalize to default memory in init of Shardings only on the backends that support memories right now.

PiperOrigin-RevId: 553942534

											
										
										
											2023-08-04 16:26:31 -07:00
+								         and s.memory_kind == o.memory_kind)
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								        if (isinstance(s, sharding_impls.GSPMDSharding) and
 								            isinstance(o, sharding_impls.GSPMDSharding))
 								        else s == o
 								        for s, o in zip(self.shardings, other.shardings)
 								    )
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
-												Move some utilities out of dispatch.py next to their users, add more types.

Internal cleanups only, no user-visible changes intended.

PiperOrigin-RevId: 554876522

											
										
										
											2023-08-08 10:51:38 -07:00
+								def _raise_warnings_or_errors_for_jit_of_pmap(
 								    nreps: int, backend: xc.Client, name: str, jaxpr: core.Jaxpr) -> None:
 								  if nreps > 1:
 								    warnings.warn(
 								        f"The jitted function {name} includes a pmap. Using "
 								         "jit-of-pmap can lead to inefficient data movement, as the outer jit "
 								         "does not preserve sharded data representations and instead collects "
 								         "input and output arrays onto a single device. "
 								         "Consider removing the outer jit unless you know what you're doing. "
 								         "See https://github.com/google/jax/issues/2926.")
 								  if nreps > xb.device_count(backend):
 								    raise ValueError(
 								        f"compiling computation `{name}` that requires {nreps} replicas, but "
 								        f"only {xb.device_count(backend)} XLA devices are available.")
 								  if xb.process_count() > 1 and (
 								      nreps > 1 or dispatch.jaxpr_has_primitive(jaxpr, "xla_pmap")
 								  ):
 								    raise NotImplementedError(
 								        "jit of multi-host pmap not implemented (and jit-of-pmap can cause "
 								        "extra data movement anyway, so maybe you don't want it after all).")
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								@weakref_lru_cache
 								def _cached_lowering_to_hlo(closed_jaxpr, api_name, fun_name, backend,
 								                            semantic_in_shardings, semantic_out_shardings,
-												Make lowering oblivious to real physical devices. Instead cache lowering on HloSharding only (which is based on logical device numbers)

Make an exception for callbacks and custom_partitioning because they need access to device_assignment during lowering.

PiperOrigin-RevId: 589244695

											
										
										
											2023-12-08 14:35:27 -08:00
+								                            in_layouts, out_layouts, num_devices, device_assignment,
-												Set the jax_enable_memories flag to True.

If all memory_kinds in the jaxpr are the default memory kind, then annotate_device_placement custom calls are not inserted. This allows for existing code to work without any changes.

If non-default memory kind is present in the jaxpr, then we allow custom calls to be inserted.

PiperOrigin-RevId: 564457393

											
										
										
											2023-09-11 11:54:29 -07:00
+								                            donated_invars, name_stack, all_default_mem_kind,
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
+								                            inout_aliases: None | tuple[None | int, ...],
-												Introduce a LoweringParameters dataclass for easier plumbing

There are currently two parameters that are used to configure
lowering: lowering_platform (for cross-platform lowering), and
override_lowering_rules. Each of them are passed as separate arguments
through several layers of lowering internal functions. This is tedious,
and error prone. In fact, override_lowering_rules was not plumbed
in all places, and due to using default arguments in all places,
this leads to silent errors.

We foresee introducing other parameters for lowering: for multi-platform
lowering, for controlling the lowering of effects.

Here is pack all such parameters into a `mlir.LoweringParameters`
dataclass and we plumb that through.

											
										
										
											2023-09-28 12:44:14 +02:00
+								                            lowering_parameters: mlir.LoweringParameters):
-												DCE as early as possible so that `committed` is not dependent on DCE's vars

PiperOrigin-RevId: 521879918

											
										
										
											2023-04-04 15:20:32 -07:00
+								  jaxpr = closed_jaxpr.jaxpr
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  in_shardings = semantic_in_shardings.shardings
 								  out_shardings = semantic_out_shardings.shardings
 								  global_in_avals = closed_jaxpr.in_avals
 								  global_out_avals = closed_jaxpr.out_avals
-												DCE as early as possible so that `committed` is not dependent on DCE's vars

PiperOrigin-RevId: 521879918

											
										
										
											2023-04-04 15:20:32 -07:00
-												Migrate a subset of internal modules to use state objects

The motivation here is to gradually replace all dynamic lookups on `jax.config`
with statically-typed state objects, which are more type checker/IDE friendly.

PiperOrigin-RevId: 571932143

											
										
										
											2023-10-09 07:28:18 -07:00
+								  log_priority = logging.WARNING if config.log_compiles.value else logging.DEBUG
-												[Micro-optimization] Only log the avals and shardings if logging is enabled for that level.

PiperOrigin-RevId: 524845969

											
										
										
											2023-04-17 07:52:56 -07:00
+								  if logger.isEnabledFor(log_priority):
 								    logger.log(log_priority,
 								               "Compiling %s for with global shapes and types %s. "
 								               "Argument mapping: %s.",
 								               fun_name, global_in_avals, in_shardings)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
 								  # Look at the number of replcas present in the jaxpr. In
 								  # lower_sharding_computation, nreps > 1 during `jit(pmap)` cases. This is
 								  # handled here so as to deprecate the lower_xla_callable codepath when
 								  # `jax.Array` is turned on by default.
 								  # TODO(yashkatariya): Remove this when `jit(pmap)` is removed.
 								  nreps = dispatch.jaxpr_replicas(jaxpr)
-												Move some utilities out of dispatch.py next to their users, add more types.

Internal cleanups only, no user-visible changes intended.

PiperOrigin-RevId: 554876522

											
										
										
											2023-08-08 10:51:38 -07:00
+								  _raise_warnings_or_errors_for_jit_of_pmap(nreps, backend, fun_name, jaxpr)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								  in_mlir_shardings: list[sharding_impls.XLACompatibleSharding | None] | None
 								  out_mlir_shardings: list[sharding_impls.XLACompatibleSharding | None] | None
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  axis_ctx: mlir.AxisContext
 								  if nreps == 1:
-												Add is_fully_replicated method to Shardings. This allows to scrub the usage of is_op_sharding_replicated from JAX because we can just query it on Shardings and save an expensive round trip to OpSharding creation.

PiperOrigin-RevId: 524379122

											
										
										
											2023-04-14 13:55:52 -07:00
+								    in_mlir_shardings = map(_to_logical_sharding, global_in_avals, in_shardings)
 								    out_mlir_shardings = map(_to_logical_sharding, global_out_avals, out_shardings)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								    replicated_args = [False] * len(global_in_avals)
-												Make lowering oblivious to real physical devices. Instead cache lowering on HloSharding only (which is based on logical device numbers)

Make an exception for callbacks and custom_partitioning because they need access to device_assignment during lowering.

PiperOrigin-RevId: 589244695

											
										
										
											2023-12-08 14:35:27 -08:00
+								    axis_ctx = sharding_impls.ShardingContext(num_devices, device_assignment)
 								    num_partitions = num_devices
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  else:
 								    # This path is triggered for `jit(pmap)` cases.
 								    replicated_args = None
-												Add is_fully_replicated method to Shardings. This allows to scrub the usage of is_op_sharding_replicated from JAX because we can just query it on Shardings and save an expensive round trip to OpSharding creation.

PiperOrigin-RevId: 524379122

											
										
										
											2023-04-14 13:55:52 -07:00
+								    in_mlir_shardings = None
 								    out_mlir_shardings = None
-												Split sharding_impls into its own Bazel target.

* Move dependencies of sharding_impls into sharding_impls to avoid creating cyclic dependencies.
* Fix a handful of new pytype errors.

PiperOrigin-RevId: 523146076

											
										
										
											2023-04-10 10:15:08 -07:00
+								    axis_env = sharding_impls.AxisEnv(nreps, (), ())
 								    axis_ctx = sharding_impls.ReplicaAxisContext(axis_env)
-												Add unregistered mhlo.num_replicas and mhlo.num_partitions attributes to HLO output.

These are to allow PJRT plugin developers an inline way to determine the number of replicas/partitions to which the module is targeted. There are no stability guarantees on these attributes at the moment.

PiperOrigin-RevId: 524013922

											
										
										
											2023-04-13 08:55:01 -07:00
+								    num_partitions = 1
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
 								  module_name = f"{api_name}_{fun_name}"
-												Make lowering oblivious to real physical devices. Instead cache lowering on HloSharding only (which is based on logical device numbers)

Make an exception for callbacks and custom_partitioning because they need access to device_assignment during lowering.

PiperOrigin-RevId: 589244695

											
										
										
											2023-12-08 14:35:27 -08:00
+								  if num_devices > 1:
-												[callbacks] Add support for shardable ordered effects.

Ordered effects currently are not allowed in multi-device computations.
This is too restrictive sometimes, e.g., `io_callback(ordered=True)` uses
maximal sharding on one device and the callback would be issued only
once even in multi-device computations.

Here we add support for ordered shardable effects, which behave like
ordered effects except they are allowed in SPMD computations.
Currently, only `callback.IOOrderedEffect` is declared shardable.

In general, if the sharding of the side-effecting operation is not
maximal, then such effects would appear in a partial order, with
effects appearing ordered by program point and unordered among
the different devices at a given program point.

We also generalize the mechanism for tracking runtime tokens and
token buffers to work with multiple devices.

PiperOrigin-RevId: 566242557

											
										
										
											2023-09-18 02:49:53 -07:00
+								    unsupported_effects = effects.ordered_effects.filter_in(closed_jaxpr.effects)
 								    unsupported_effects = effects.shardable_ordered_effects.filter_not_in(
 								        unsupported_effects)
 								    if len(unsupported_effects) > 0:
 								      raise ValueError(
 								        "The following ordered effects are not supported for "
 								        f"more than 1 device: {unsupported_effects}")
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  ordered_effects = list(effects.ordered_effects.filter_in(closed_jaxpr.effects))
-												Log the time it takes to lower from jaxpr to stableHLO

PiperOrigin-RevId: 532115098

											
										
										
											2023-05-15 08:07:31 -07:00
 								  with dispatch.log_elapsed_time(
-												Remove the f-string evaluation during logging the elapsed time by passing in fun_name to log_elapsed_time

PiperOrigin-RevId: 532132574

											
										
										
											2023-05-15 09:15:22 -07:00
+								        "Finished jaxpr to MLIR module conversion {fun_name} in {elapsed_time} sec",
 								        fun_name=str(name_stack), event=dispatch.JAXPR_TO_MLIR_MODULE_EVENT):
-												Log the time it takes to lower from jaxpr to stableHLO

PiperOrigin-RevId: 532115098

											
										
										
											2023-05-15 08:07:31 -07:00
+								    lowering_result = mlir.lower_jaxpr_to_module(
 								        module_name,
 								        closed_jaxpr,
-												Introduce a LoweringParameters dataclass for easier plumbing

There are currently two parameters that are used to configure
lowering: lowering_platform (for cross-platform lowering), and
override_lowering_rules. Each of them are passed as separate arguments
through several layers of lowering internal functions. This is tedious,
and error prone. In fact, override_lowering_rules was not plumbed
in all places, and due to using default arguments in all places,
this leads to silent errors.

We foresee introducing other parameters for lowering: for multi-platform
lowering, for controlling the lowering of effects.

Here is pack all such parameters into a `mlir.LoweringParameters`
dataclass and we plumb that through.

											
										
										
											2023-09-28 12:44:14 +02:00
+								        ordered_effects=ordered_effects,
 								        backend_or_name=backend,
-												Log the time it takes to lower from jaxpr to stableHLO

PiperOrigin-RevId: 532115098

											
										
										
											2023-05-15 08:07:31 -07:00
+								        # Optionally, override the lowering platform
-												Cleanup the handling of single- and multi-platform lowering in ModuleContext

Previously, we introduced support for multi-platform lowering, by
adding a new LoweringParameters object that can be used to specify
a cross-lowering platform or even multiple platforms. But we had
kept the ModuleContext.platform in place because some lowering rules
were still referencing it. Now we replace ModuleContext.platform with
ModuleContext.platforms, which removes the redundancy, simplifies
the code, and makes it clearer that the lowering rules should not
simply assume single-platform lowering.

PiperOrigin-RevId: 576575376

											
										
										
											2023-10-25 10:39:47 -07:00
+								        platforms=lowering_parameters.platforms or (backend.platform,),
-												Introduce a LoweringParameters dataclass for easier plumbing

There are currently two parameters that are used to configure
lowering: lowering_platform (for cross-platform lowering), and
override_lowering_rules. Each of them are passed as separate arguments
through several layers of lowering internal functions. This is tedious,
and error prone. In fact, override_lowering_rules was not plumbed
in all places, and due to using default arguments in all places,
this leads to silent errors.

We foresee introducing other parameters for lowering: for multi-platform
lowering, for controlling the lowering of effects.

Here is pack all such parameters into a `mlir.LoweringParameters`
dataclass and we plumb that through.

											
										
										
											2023-09-28 12:44:14 +02:00
+								        axis_context=axis_ctx,
 								        name_stack=name_stack,
 								        donated_args=donated_invars,
-												Log the time it takes to lower from jaxpr to stableHLO

PiperOrigin-RevId: 532115098

											
										
										
											2023-05-15 08:07:31 -07:00
+								        replicated_args=replicated_args,
 								        arg_shardings=in_mlir_shardings,
 								        result_shardings=out_mlir_shardings,
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								        in_layouts=in_layouts,
 								        out_layouts=out_layouts,
-												Log the time it takes to lower from jaxpr to stableHLO

PiperOrigin-RevId: 532115098

											
										
										
											2023-05-15 08:07:31 -07:00
+								        arg_names=jaxpr.debug_info and jaxpr.debug_info.arg_names,
 								        result_names=jaxpr.debug_info and jaxpr.debug_info.result_paths,
 								        num_replicas=nreps,
-												Makes it possible to lower primitives with user-defined lowering rules.

PiperOrigin-RevId: 547228102

											
										
										
											2023-07-11 10:23:48 -07:00
+								        num_partitions=num_partitions,
-												Set the jax_enable_memories flag to True.

If all memory_kinds in the jaxpr are the default memory kind, then annotate_device_placement custom calls are not inserted. This allows for existing code to work without any changes.

If non-default memory kind is present in the jaxpr, then we allow custom calls to be inserted.

PiperOrigin-RevId: 564457393

											
										
										
											2023-09-11 11:54:29 -07:00
+								        all_default_mem_kind=all_default_mem_kind,
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
+								        input_output_aliases=inout_aliases,
-												Introduce a LoweringParameters dataclass for easier plumbing

There are currently two parameters that are used to configure
lowering: lowering_platform (for cross-platform lowering), and
override_lowering_rules. Each of them are passed as separate arguments
through several layers of lowering internal functions. This is tedious,
and error prone. In fact, override_lowering_rules was not plumbed
in all places, and due to using default arguments in all places,
this leads to silent errors.

We foresee introducing other parameters for lowering: for multi-platform
lowering, for controlling the lowering of effects.

Here is pack all such parameters into a `mlir.LoweringParameters`
dataclass and we plumb that through.

											
										
										
											2023-09-28 12:44:14 +02:00
+								        lowering_parameters=lowering_parameters)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  tuple_args = dispatch.should_tuple_args(len(global_in_avals), backend.platform)
-												Remove `unordered_effects` from `lower_jaxpr_to_module` since it is unused

PiperOrigin-RevId: 524139972

											
										
										
											2023-04-13 16:57:03 -07:00
+								  unordered_effects = list(
 								      effects.ordered_effects.filter_not_in(closed_jaxpr.effects))
-												Small cleanups to pxla.py.

Remove stale references to XlaComputation and code left over from handling both XlaComputations and ir.Modules.

No functional changes intended.

PiperOrigin-RevId: 526139679

											
										
										
											2023-04-21 14:37:52 -07:00
+								  return (lowering_result.module, lowering_result.keepalive,
 								          lowering_result.host_callbacks, unordered_effects, ordered_effects,
-												[shape_poly] Keep track of whether a lowering contains shape polymorphism

Previously, we kept the `dim_vars` in the `mlir.ModuleContext`. Now we
replace that with a mutable `ShapePolyLoweringState` that also tracks
whether we encounter shape polymorphism anywhere in the lowering.
For this purpose, we also add `shape_poly_state` to the lowering.compile_args.

We need to keep track of whether a module contains dimension variables
because such modules need shape refinement before they can be converted
to MHLO and compiled. For now, we just test that we set the
`Exported.module_uses_dim_vars` correctly.

											
										
										
											2023-05-31 11:00:08 +03:00
+								          nreps, tuple_args, lowering_result.shape_poly_state)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
 								@lru_cache(maxsize=2048)
-												[JAX] Introduce `DeviceList` backed by C++ `xla::ifrt::DeviceList`

This change adds `xla_client.DeviceList` that is implemented in C++
`jax::PyDeviceList`. `jax::PyDeviceList` implements the features of
`pxla._DeviceAssignment` as a functional drop-in replacement.
`jax::PyDeviceList` internally has `xla::ifrt::DeviceList`, which will be used
when using IFRT APIs without having to construct a new copy of a potentially
large device list.

`pxla._DeviceAssignment`'s interface is changed slightly to encourage avoiding
conversion to tuple.

Note that for the backward compatibility (and fast `xla_client.Device`
conversion), `jax::PyDeviceList` still uses a Python tuple whose element can be
any Python object matches `xla_client.Device` interface with duck typing. This
duck typing support will be removed when such use case is deprecated.
Eventually, we can try to avoid any type conversion to remove a shadow copy of
device list in JAX.

PiperOrigin-RevId: 555317152

											
										
										
											2023-08-09 16:57:28 -07:00
+								def _create_da_object(  # pytype: disable=invalid-annotation
-												Replace _DeviceAssignment with xc.DeviceList

PiperOrigin-RevId: 599597226

											
										
										
											2024-01-18 12:54:54 -08:00
+								    device_assignment: tuple[xc.Device, ...]) -> xc.DeviceList:  # type: ignore
 								  return xc.DeviceList(device_assignment)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
-												Set the jax_enable_memories flag to True.

If all memory_kinds in the jaxpr are the default memory kind, then annotate_device_placement custom calls are not inserted. This allows for existing code to work without any changes.

If non-default memory kind is present in the jaxpr, then we allow custom calls to be inserted.

PiperOrigin-RevId: 564457393

											
										
										
											2023-09-11 11:54:29 -07:00
+								def jaxpr_transfer_mem_kinds(
 								    jaxpr: core.Jaxpr) -> Iterator[sharding_impls.TransferToMemoryKind]:
-												Add `TransferToMemoryKind` as a private API to allow device_put to transfer to different memories without specifying the sharding and allowing the SPMD partitioner to choose the sharding for the intermediate.

Exposing it as a public API can be done later.

PiperOrigin-RevId: 559314369

											
										
										
											2023-08-22 22:07:24 -07:00
+								  for eqn in jaxpr.eqns:
 								    if (eqn.primitive is dispatch.device_put_p and
 								        isinstance(eqn.params['device'], sharding_impls.TransferToMemoryKind)):
-												Set the jax_enable_memories flag to True.

If all memory_kinds in the jaxpr are the default memory kind, then annotate_device_placement custom calls are not inserted. This allows for existing code to work without any changes.

If non-default memory kind is present in the jaxpr, then we allow custom calls to be inserted.

PiperOrigin-RevId: 564457393

											
										
										
											2023-09-11 11:54:29 -07:00
+								      yield eqn.params['device']
-												Add `TransferToMemoryKind` as a private API to allow device_put to transfer to different memories without specifying the sharding and allowing the SPMD partitioner to choose the sharding for the intermediate.

Exposing it as a public API can be done later.

PiperOrigin-RevId: 559314369

											
										
										
											2023-08-22 22:07:24 -07:00
+								  for subjaxpr in core.subjaxprs(jaxpr):
-												Set the jax_enable_memories flag to True.

If all memory_kinds in the jaxpr are the default memory kind, then annotate_device_placement custom calls are not inserted. This allows for existing code to work without any changes.

If non-default memory kind is present in the jaxpr, then we allow custom calls to be inserted.

PiperOrigin-RevId: 564457393

											
										
										
											2023-09-11 11:54:29 -07:00
+								    yield from jaxpr_transfer_mem_kinds(subjaxpr)
 								def are_all_shardings_default_mem_kind(da_object, shardings):
 								  try:
 								    default_mem_kind = da_object.default_memory_kind
 								  except:
 								    return True
 								  for i in shardings:
 								    if is_unspecified_or_auto(i):
 								      continue
 								    if i.memory_kind != default_mem_kind:
 								      return False
 								  return True
-												Add `TransferToMemoryKind` as a private API to allow device_put to transfer to different memories without specifying the sharding and allowing the SPMD partitioner to choose the sharding for the intermediate.

Exposing it as a public API can be done later.

PiperOrigin-RevId: 559314369

											
										
										
											2023-08-22 22:07:24 -07:00
-												Upgrade remaining sources to Python 3.9

This PR is a follow up to #18881.

The changes were generated by adding

    from __future__ import annotations

to the files which did not already have them and running

    pyupgrade --py39-plus --keep-percent-format {jax,tests,jaxlib,examples,benchmarks}/**/*.py

											
										
										
											2023-12-11 13:59:29 +00:00
+								MaybeLayout = Sequence[Union[XLACompatibleLayout, LayoutRequest, None]]
-												Add `TransferToMemoryKind` as a private API to allow device_put to transfer to different memories without specifying the sharding and allowing the SPMD partitioner to choose the sharding for the intermediate.

Exposing it as a public API can be done later.

PiperOrigin-RevId: 559314369

											
										
										
											2023-08-22 22:07:24 -07:00
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
 								class AllArgsInfo(NamedTuple):
 								  """Avals, shardings, layouts and debug_info for all arguments prior to DCE."""
 								  in_avals: Sequence[core.ShapedArray]
 								  in_shardings: Any
 								  debug_info: core.JaxprDebugInfo | None
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								@profiler.annotate_function
 								def lower_sharding_computation(
-												Simply lower_sharding_computation signature by always taking a closed jaxpr as input. For apply_primitive do the tracing to jaxpr in dispatch.py

PiperOrigin-RevId: 585810475

											
										
										
											2023-11-27 18:00:22 -08:00
+								    closed_jaxpr: core.ClosedJaxpr,
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								    api_name: str,
 								    fun_name: str,
 								    in_shardings: Sequence[MaybeSharding],
-												Use `trace_to_jaxpr_dynamic` for the apply_primitive path. `trace_to_jaxpr_final` is only for final style primitives. Also do some cleanup.

PiperOrigin-RevId: 586106427

											
										
										
											2023-11-28 14:35:00 -08:00
+								    out_shardings: Sequence[MaybeSharding],
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								    donated_invars: Sequence[bool],
 								    global_in_avals: Sequence[core.ShapedArray],
 								    *,
 								    keep_unused: bool,
-												Default jax_spmd_mode to allow_jit which will allow explicit jax.jit to not raise the multihost error (since jit and pjit have been merged).

Implicit jit and apply_primitive will still raise an error though (which is recognized via inline parameter). Majority of jnp operations in JAX should be inlined.

PiperOrigin-RevId: 527398394

											
										
										
											2023-04-26 15:54:50 -07:00
+								    inline: bool,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								    devices_from_context: Sequence[xc.Device] | None = None,
-												Introduce a LoweringParameters dataclass for easier plumbing

There are currently two parameters that are used to configure
lowering: lowering_platform (for cross-platform lowering), and
override_lowering_rules. Each of them are passed as separate arguments
through several layers of lowering internal functions. This is tedious,
and error prone. In fact, override_lowering_rules was not plumbed
in all places, and due to using default arguments in all places,
this leads to silent errors.

We foresee introducing other parameters for lowering: for multi-platform
lowering, for controlling the lowering of effects.

Here is pack all such parameters into a `mlir.LoweringParameters`
dataclass and we plumb that through.

											
										
										
											2023-09-28 12:44:14 +02:00
+								    lowering_parameters: mlir.LoweringParameters,
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								    in_layouts: MaybeLayout,
-												Use `trace_to_jaxpr_dynamic` for the apply_primitive path. `trace_to_jaxpr_final` is only for final style primitives. Also do some cleanup.

PiperOrigin-RevId: 586106427

											
										
										
											2023-11-28 14:35:00 -08:00
+								    out_layouts: MaybeLayout,
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								) -> MeshComputation:
 								  """Lowers a computation to XLA. It can take arbitrary shardings as input.
-												Split sharding_impls into its own Bazel target.

* Move dependencies of sharding_impls into sharding_impls to avoid creating cyclic dependencies.
* Fix a handful of new pytype errors.

PiperOrigin-RevId: 523146076

											
										
										
											2023-04-10 10:15:08 -07:00
+								  The caller of this code can pass in a singleton UNSPECIFIED because the
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  number of out_avals might not be known at that time and
 								  lower_sharding_computation calculates the number of out_avals so it can apply
-												Split sharding_impls into its own Bazel target.

* Move dependencies of sharding_impls into sharding_impls to avoid creating cyclic dependencies.
* Fix a handful of new pytype errors.

PiperOrigin-RevId: 523146076

											
										
										
											2023-04-10 10:15:08 -07:00
+								  the singleton UNSPECIFIED to all out_avals.
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  """
 								  # 1. Trace to jaxpr and preprocess/verify it
-												Use `trace_to_jaxpr_dynamic` for the apply_primitive path. `trace_to_jaxpr_final` is only for final style primitives. Also do some cleanup.

PiperOrigin-RevId: 586106427

											
										
										
											2023-11-28 14:35:00 -08:00
+								  auto_spmd_lowering = check_if_any_auto(
 								      it.chain.from_iterable([in_shardings, out_shardings]))  # type: ignore
-												Allow pjit.AUTO to be used with jax.jit. This introduces an API change which requires a mesh to be provided to pjit.AUTO(mesh).

`with mesh:` is no longer required with pjit to use the auto spmd pass of GSPMD.

PiperOrigin-RevId: 533801596

											
										
										
											2023-05-20 22:59:52 -07:00
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
+								  all_args_info = AllArgsInfo(global_in_avals, in_shardings,
 								                              closed_jaxpr.jaxpr.debug_info)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  (closed_jaxpr, global_in_avals, global_out_avals, donated_invars,
-												Simply lower_sharding_computation signature by always taking a closed jaxpr as input. For apply_primitive do the tracing to jaxpr in dispatch.py

PiperOrigin-RevId: 585810475

											
										
										
											2023-11-27 18:00:22 -08:00
+								   kept_var_idx, name_stack) = _dce_jaxpr(
 								      closed_jaxpr, global_in_avals, api_name, fun_name, keep_unused,
-												Allow pjit.AUTO to be used with jax.jit. This introduces an API change which requires a mesh to be provided to pjit.AUTO(mesh).

`with mesh:` is no longer required with pjit to use the auto spmd pass of GSPMD.

PiperOrigin-RevId: 533801596

											
										
										
											2023-05-20 22:59:52 -07:00
+								      donated_invars, auto_spmd_lowering)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  in_shardings = tuple(s for i, s in enumerate(in_shardings) if i in kept_var_idx)
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								  in_layouts = tuple(l for i, l in enumerate(in_layouts) if i in kept_var_idx)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
+								  if any(isinstance(e, RefEffect) for e in closed_jaxpr.effects):
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								    closed_jaxpr, inout_aliases, mut = _discharge_refs(closed_jaxpr)
 								    in_shardings = (*in_shardings,) + (UNSPECIFIED,) * len(mut.in_mut)
 								    in_layouts = (*in_layouts,) + (None,) * len(mut.in_mut)
 								    donated_invars = (*donated_invars,) + (False,) * len(mut.in_mut)
 								    out_layouts_ = iter(zip(out_shardings, out_layouts))
 								    out_shardings, out_layouts = unzip2(
 								        next(out_layouts_) if i is None else (in_shardings[i], in_layouts[i])
 								        for i in mut.out_mut)
 								    assert next(out_layouts_, None) is None
 								    # TODO(yashkatariya): remove global_in_avals / global_out_avals
 								    global_in_avals = closed_jaxpr.in_avals
 								    global_out_avals = closed_jaxpr.out_avals
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
+								  else:
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								    inout_aliases = mut = None
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
 								  jaxpr = closed_jaxpr.jaxpr
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								  assert len(out_shardings) == len(out_layouts) == len(global_out_avals), (
 								      len(out_shardings), len(out_layouts), len(global_out_avals))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								  # Device assignment across all inputs, outputs and shardings inside jaxpr
 								  # should be the same.
 								  jaxpr_sharding = list(dispatch.jaxpr_shardings(jaxpr))
-												Improve the error message which is raised from `_get_and_check_device_assignment`.

Before:

```
ValueError: Devices of all `Array` inputs and outputs should be the same. Got array device ids [0] on platform CPU and another array's device ids [0, 1, 2, 3] on platform CPU
```

After:

```
ValueError: Received incompatible devices for jitted computation. Got argument inp of ArrayPjitTest.test_jit_with_sharding_constraint_committed_inp_error.<locals>.sharded_inp with bfloat16[8,2] and device ids [0] on platform CPU and with_sharding_constraint or nested pjit or shard_map with device ids [0, 1, 2, 3] on platform CPU at jax/tests/pjit_test.py:2509 (sharded_inp)
```
PiperOrigin-RevId: 508746961

											
										
										
											2023-02-10 13:53:43 -08:00
+								  backend, device_assignment = _get_and_check_device_assignment(
-												Dedupe shardings before passing them to _get_and_check_device_assignment

In practice, the number of different shardings is usually much smaller then
the number of inputs/output.

PiperOrigin-RevId: 600558309

											
										
										
											2024-01-22 13:44:34 -08:00
+								      it.chain(
 								          ((i, MismatchType.ARG_SHARDING, None) for i in util.stable_unique(in_shardings)),
 								          ((o, MismatchType.OUT_SHARDING, None) for o in util.stable_unique(out_shardings)),
 								          ((js, MismatchType.SHARDING_INSIDE_COMPUTATION, source_info)
 								           for js, source_info in util.stable_unique(jaxpr_sharding))),
-												Improve the error message which is raised from `_get_and_check_device_assignment`.

Before:

```
ValueError: Devices of all `Array` inputs and outputs should be the same. Got array device ids [0] on platform CPU and another array's device ids [0, 1, 2, 3] on platform CPU
```

After:

```
ValueError: Received incompatible devices for jitted computation. Got argument inp of ArrayPjitTest.test_jit_with_sharding_constraint_committed_inp_error.<locals>.sharded_inp with bfloat16[8,2] and device ids [0] on platform CPU and with_sharding_constraint or nested pjit or shard_map with device ids [0, 1, 2, 3] on platform CPU at jax/tests/pjit_test.py:2509 (sharded_inp)
```
PiperOrigin-RevId: 508746961

											
										
										
											2023-02-10 13:53:43 -08:00
+								      devices_from_context)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Support activation offloading to host in JAX!

Currently on TPU support works. GPU support is being added.

PiperOrigin-RevId: 604482085

											
										
										
											2024-02-05 17:27:32 -08:00
+								  # TODO(yashkatariya): Enable this when offload APIs are stable.
 								  # transfer_mem_kind_in_jaxpr = list(jaxpr_transfer_mem_kinds(jaxpr))
-												Set the jax_enable_memories flag to True.

If all memory_kinds in the jaxpr are the default memory kind, then annotate_device_placement custom calls are not inserted. This allows for existing code to work without any changes.

If non-default memory kind is present in the jaxpr, then we allow custom calls to be inserted.

PiperOrigin-RevId: 564457393

											
										
										
											2023-09-11 11:54:29 -07:00
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  committed = bool(
 								      devices_from_context or
 								      len(device_assignment) > 1 or
-												Split sharding_impls into its own Bazel target.

* Move dependencies of sharding_impls into sharding_impls to avoid creating cyclic dependencies.
* Fix a handful of new pytype errors.

PiperOrigin-RevId: 523146076

											
										
										
											2023-04-10 10:15:08 -07:00
+								      any(not is_unspecified(i) for i in in_shardings) or
 								      any(not is_unspecified(js) for js, _ in jaxpr_sharding) or
-												Support activation offloading to host in JAX!

Currently on TPU support works. GPU support is being added.

PiperOrigin-RevId: 604482085

											
										
										
											2024-02-05 17:27:32 -08:00
+								      any(not is_unspecified(o) for o in out_shardings))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								  gs = GSPMDSharding.get_replicated(device_assignment)
-												[XLA:MHLO->HLO] Allow partially-set parameter tuple sharding to exist by filling in the missing sharding elements with replicated sharding. (This is what is done for the missing shardings in the result tuple.)

Before this change, if an element of a tuple parameter did not have a sharding, MHLO->HLO conversion dropped the existing annotations on the parameter. This issue caused the disappearing of the parameter sharding for a model, which then resulted in an OOM.

PiperOrigin-RevId: 615917181

											
										
										
											2024-03-14 15:09:07 -07:00
+								  if xla_extension_version < 241 or hasattr(backend, "compile_replicated"):
 								    in_shardings = tuple(gs if is_unspecified(i) else i for i in in_shardings)
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  da_object = _create_da_object(tuple(device_assignment))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Set the jax_enable_memories flag to True.

If all memory_kinds in the jaxpr are the default memory kind, then annotate_device_placement custom calls are not inserted. This allows for existing code to work without any changes.

If non-default memory kind is present in the jaxpr, then we allow custom calls to be inserted.

PiperOrigin-RevId: 564457393

											
										
										
											2023-09-11 11:54:29 -07:00
+								  all_default_mem_kind = are_all_shardings_default_mem_kind(
 								      da_object,
-												Support activation offloading to host in JAX!

Currently on TPU support works. GPU support is being added.

PiperOrigin-RevId: 604482085

											
										
										
											2024-02-05 17:27:32 -08:00
+								      it.chain(in_shardings, out_shardings, [js for js, _ in jaxpr_sharding]))  # type: ignore
-												Set the jax_enable_memories flag to True.

If all memory_kinds in the jaxpr are the default memory kind, then annotate_device_placement custom calls are not inserted. This allows for existing code to work without any changes.

If non-default memory kind is present in the jaxpr, then we allow custom calls to be inserted.

PiperOrigin-RevId: 564457393

											
										
										
											2023-09-11 11:54:29 -07:00
-												[JAX] Introduce `DeviceList` backed by C++ `xla::ifrt::DeviceList`

This change adds `xla_client.DeviceList` that is implemented in C++
`jax::PyDeviceList`. `jax::PyDeviceList` implements the features of
`pxla._DeviceAssignment` as a functional drop-in replacement.
`jax::PyDeviceList` internally has `xla::ifrt::DeviceList`, which will be used
when using IFRT APIs without having to construct a new copy of a potentially
large device list.

`pxla._DeviceAssignment`'s interface is changed slightly to encourage avoiding
conversion to tuple.

Note that for the backward compatibility (and fast `xla_client.Device`
conversion), `jax::PyDeviceList` still uses a Python tuple whose element can be
any Python object matches `xla_client.Device` interface with duck typing. This
duck typing support will be removed when such use case is deprecated.
Eventually, we can try to avoid any type conversion to remove a shadow copy of
device list in JAX.

PiperOrigin-RevId: 555317152

											
										
										
											2023-08-09 16:57:28 -07:00
+								  if not da_object.is_fully_addressable:  # type: ignore
-												Migrate a subset of internal modules to use state objects

The motivation here is to gradually replace all dynamic lookups on `jax.config`
with statically-typed state objects, which are more type checker/IDE friendly.

PiperOrigin-RevId: 571932143

											
										
										
											2023-10-09 07:28:18 -07:00
+								    if inline and config.spmd_mode.value != 'allow_all':
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      raise RuntimeError(
 								          "Running operations on `Array`s that are not fully addressable by this "
 								          "process (i.e. `Array`s with data sharded across multiple devices and "
 								          "processes.) is dangerous. It’s very important that all processes run "
 								          "the same cross-process computations in the same order otherwise it "
 								          "can lead to hangs. "
 								          "If you’re not already familiar with JAX’s multi-process "
 								          "programming model, please read "
 								          "https://jax.readthedocs.io/en/latest/multi_process.html. "
 								          "To fix this error, run your `jitted` computation inside "
 								          "`with jax.spmd_mode('allow_all'):` context manager.")
 								  # 2. Build up the HLO
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  semantic_in_shardings = SemanticallyEqualShardings(in_shardings)  # type: ignore
-												Use `trace_to_jaxpr_dynamic` for the apply_primitive path. `trace_to_jaxpr_final` is only for final style primitives. Also do some cleanup.

PiperOrigin-RevId: 586106427

											
										
										
											2023-11-28 14:35:00 -08:00
+								  semantic_out_shardings = SemanticallyEqualShardings(out_shardings)  # type: ignore
-												Add a registry for primitives that require device_assignment during lowering

PiperOrigin-RevId: 589272990

											
										
										
											2023-12-08 16:31:11 -08:00
+								  prim_requires_devices = dispatch.jaxpr_has_prim_requiring_devices(jaxpr)
-												Add a flag `jax_require_devices_during_lowering` to control if physical devices are passed during lowering to stablehlo. This is temporary to unblock nvidia.

PiperOrigin-RevId: 590318918

											
										
										
											2023-12-12 13:33:27 -08:00
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  (module, keepalive, host_callbacks, unordered_effects, ordered_effects,
-												[shape_poly] Keep track of whether a lowering contains shape polymorphism

Previously, we kept the `dim_vars` in the `mlir.ModuleContext`. Now we
replace that with a mutable `ShapePolyLoweringState` that also tracks
whether we encounter shape polymorphism anywhere in the lowering.
For this purpose, we also add `shape_poly_state` to the lowering.compile_args.

We need to keep track of whether a module contains dimension variables
because such modules need shape refinement before they can be converted
to MHLO and compiled. For now, we just test that we set the
`Exported.module_uses_dim_vars` correctly.

											
										
										
											2023-05-31 11:00:08 +03:00
+								   nreps, tuple_args, shape_poly_state) = _cached_lowering_to_hlo(
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								       closed_jaxpr, api_name, fun_name, backend, semantic_in_shardings,
-												Make lowering oblivious to real physical devices. Instead cache lowering on HloSharding only (which is based on logical device numbers)

Make an exception for callbacks and custom_partitioning because they need access to device_assignment during lowering.

PiperOrigin-RevId: 589244695

											
										
										
											2023-12-08 14:35:27 -08:00
+								       semantic_out_shardings, in_layouts, out_layouts, len(da_object),
-												[roll forward 2] Remove the `jax_require_devices_during_lowering flag since it was temporary. Added the semi-breaking change to Changelog.md.

Reverts b52bcc1639368069075284eefc763f824ca155f1

PiperOrigin-RevId: 590959383

											
										
										
											2023-12-14 09:13:43 -08:00
+								       tuple(da_object) if prim_requires_devices else None, donated_invars,
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
+								       name_stack, all_default_mem_kind, inout_aliases,
 								       lowering_parameters=lowering_parameters)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								  # backend and device_assignment is passed through to MeshExecutable because
 								  # if keep_unused=False and all in_shardings are pruned, then there is no way
 								  # to get the device_assignment and backend. So pass it to MeshExecutable
 								  # because we calculate the device_assignment and backend before in_shardings,
 								  # etc are pruned.
 								  return MeshComputation(
 								      str(name_stack),
 								      module,
 								      donated_invars,
 								      global_in_avals=global_in_avals,
 								      global_out_avals=global_out_avals,
 								      in_shardings=in_shardings,
 								      out_shardings=out_shardings,
 								      spmd_lowering=True,
 								      tuple_args=tuple_args,
-												Allow pjit.AUTO to be used with jax.jit. This introduces an API change which requires a mesh to be provided to pjit.AUTO(mesh).

`with mesh:` is no longer required with pjit to use the auto spmd pass of GSPMD.

PiperOrigin-RevId: 533801596

											
										
										
											2023-05-20 22:59:52 -07:00
+								      auto_spmd_lowering=auto_spmd_lowering,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      unordered_effects=unordered_effects,
 								      ordered_effects=ordered_effects,
 								      host_callbacks=host_callbacks,
 								      keepalive=keepalive,
 								      kept_var_idx=kept_var_idx,
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								      mut=mut,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      backend=backend,
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								      device_assignment=da_object,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      committed=committed,
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								      in_layouts=in_layouts,
 								      out_layouts=out_layouts,
-												Plumb debug_info to meshExecutable as a optional arg to raise better error messages.

PiperOrigin-RevId: 525521694

											
										
										
											2023-04-19 12:35:15 -07:00
+								      pmap_nreps=nreps,
-												Set the jax_enable_memories flag to True.

If all memory_kinds in the jaxpr are the default memory kind, then annotate_device_placement custom calls are not inserted. This allows for existing code to work without any changes.

If non-default memory kind is present in the jaxpr, then we allow custom calls to be inserted.

PiperOrigin-RevId: 564457393

											
										
										
											2023-09-11 11:54:29 -07:00
+								      shape_poly_state=shape_poly_state,
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
+								      all_default_mem_kind=all_default_mem_kind,
 								      all_args_info=all_args_info)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
-												Add is_fully_replicated method to Shardings. This allows to scrub the usage of is_op_sharding_replicated from JAX because we can just query it on Shardings and save an expensive round trip to OpSharding creation.

PiperOrigin-RevId: 524379122

											
										
										
											2023-04-14 13:55:52 -07:00
+								def _to_logical_sharding(
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								    aval: core.AbstractValue, sharding: MaybeSharding | AUTO
 								) -> sharding_impls.XLACompatibleSharding | None:
-												Split sharding_impls into its own Bazel target.

* Move dependencies of sharding_impls into sharding_impls to avoid creating cyclic dependencies.
* Fix a handful of new pytype errors.

PiperOrigin-RevId: 523146076

											
										
										
											2023-04-10 10:15:08 -07:00
+								  if is_unspecified(sharding) or is_auto(sharding):
-												[shard-map] fix eager shmap+prngs, revise phys aval/sharding logic

Co-authored-by: Yash Katariya <yashkatariya@google.com>

											
										
										
											2023-04-05 14:09:46 -07:00
+								    return None
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
+								  elif isinstance(aval, (ShapedArray, DShapedArray, AbstractRef)):
-												[shard-map] fix eager shmap+prngs, revise phys aval/sharding logic

Co-authored-by: Yash Katariya <yashkatariya@google.com>

											
										
										
											2023-04-05 14:09:46 -07:00
+								    assert isinstance(sharding, sharding_impls.XLACompatibleSharding)
-												Add is_fully_replicated method to Shardings. This allows to scrub the usage of is_op_sharding_replicated from JAX because we can just query it on Shardings and save an expensive round trip to OpSharding creation.

PiperOrigin-RevId: 524379122

											
										
										
											2023-04-14 13:55:52 -07:00
+								    return sharding
-												[shard-map] fix eager shmap+prngs, revise phys aval/sharding logic

Co-authored-by: Yash Katariya <yashkatariya@google.com>

											
										
										
											2023-04-05 14:09:46 -07:00
+								  elif isinstance(aval, core.AbstractToken):
 								    return None
 								  else:
 								    raise TypeError(aval)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								@profiler.annotate_function
 								def lower_mesh_computation(
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								    fun_or_jaxpr: lu.WrappedFun | core.ClosedJaxpr,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    api_name: str,
 								    fun_name: str,
 								    mesh: Mesh,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								    in_shardings: Sequence[sharding_impls.NamedSharding | AUTO],
 								    out_shardings: Sequence[(sharding_impls.NamedSharding | AUTO |
 								                                  UnspecifiedValue)],
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    donated_invars: Sequence[bool],
 								    spmd_lowering: bool,
 								    global_in_avals: Sequence[core.ShapedArray],
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								    tiling_method: TilingMethod | None,
-												Introduce a LoweringParameters dataclass for easier plumbing

There are currently two parameters that are used to configure
lowering: lowering_platform (for cross-platform lowering), and
override_lowering_rules. Each of them are passed as separate arguments
through several layers of lowering internal functions. This is tedious,
and error prone. In fact, override_lowering_rules was not plumbed
in all places, and due to using default arguments in all places,
this leads to silent errors.

We foresee introducing other parameters for lowering: for multi-platform
lowering, for controlling the lowering of effects.

Here is pack all such parameters into a `mlir.LoweringParameters`
dataclass and we plumb that through.

											
										
										
											2023-09-28 12:44:14 +02:00
+								    lowering_parameters: mlir.LoweringParameters) -> MeshComputation:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  assert not mesh.empty
 								  backend = xb.get_device_backend(mesh.devices.flat[0])
-												Remove circular dependency between source_info_util and util.

Move util.new_name_stack into source_info_util. Replace uses of util.extend_name_stack with stack.extend().

PiperOrigin-RevId: 512685810

											
										
										
											2023-02-27 11:37:10 -08:00
+								  name_stack = source_info_util.new_name_stack(wrap_name(fun_name, api_name))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								  global_axis_sizes = mesh.shape
-												Migrate a subset of internal modules to use state objects

The motivation here is to gradually replace all dynamic lookups on `jax.config`
with statically-typed state objects, which are more type checker/IDE friendly.

PiperOrigin-RevId: 571932143

											
										
										
											2023-10-09 07:28:18 -07:00
+								  log_priority = logging.WARNING if config.log_compiles.value else logging.DEBUG
-												[Micro-optimization] Only log the avals and shardings if logging is enabled for that level.

PiperOrigin-RevId: 524845969

											
										
										
											2023-04-17 07:52:56 -07:00
+								  if logger.isEnabledFor(log_priority):
 								    logger.log(log_priority,
 								               "Compiling %s for %s mesh with global shapes and types %s. "
 								               "Argument mapping: %s.",
 								               fun_name, tuple(global_axis_sizes.items()), global_in_avals,
 								               in_shardings)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								  # 1. Trace to jaxpr and preprocess/verify it
 								  if spmd_lowering:
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								    manual_axes: frozenset[MeshAxisName] = frozenset()
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    # TODO: Consider handling xmap's 'vectorize' in here. We can vmap once instead of vtile twice!
 								    if tiling_method is not None:
 								      if isinstance(tiling_method, TileVectorize):
 								        tiling_transform = vtile_by_mesh
 								      elif isinstance(tiling_method, TileManual):
 								        tiling_transform = lambda f, *args: vtile_manual(f, tiling_method.manual_axes, *args)  # type: ignore
 								        manual_axes = tiling_method.manual_axes
 								      else:
 								        raise NotImplementedError(f"Unrecognized tiling method: {tiling_method}")
 								      assert not callable(out_shardings)
-												Pass the `jaxpr` from `pjit` since there is no need to trace it again in lower_sharding_computation. It also helps in preserving debug_info that already exists on the jaxpr to surface it in MHLO eventually.

PiperOrigin-RevId: 513268085

											
										
										
											2023-03-01 10:04:59 -08:00
+								      assert isinstance(fun_or_jaxpr, lu.WrappedFun)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      # This is the xmap path where there is no `AUTO` or `UNSPECIFIED`, which
 								      # is why `.spec` can be accessed.
-												Pass the `jaxpr` from `pjit` since there is no need to trace it again in lower_sharding_computation. It also helps in preserving debug_info that already exists on the jaxpr to surface it in MHLO eventually.

PiperOrigin-RevId: 513268085

											
										
										
											2023-03-01 10:04:59 -08:00
+								      fun_or_jaxpr = tiling_transform(
 								          fun_or_jaxpr, mesh, [get_array_mapping(i.spec) for i in in_shardings],  # type: ignore
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								          [get_array_mapping(o.spec) for o in out_shardings])  # type: ignore
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    in_jaxpr_avals = global_in_avals
 								  else:
 								    assert isinstance(tiling_method, TileVectorize)
 								    # In non-spmd lowering path, there is no `AUTO` or `UNSPECIFIED`, which is
 								    # why `.spec` can be accessed.
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								    in_tiled_avals = [tile_aval_nd(global_axis_sizes, get_array_mapping(i.spec), aval)  # type: ignore
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								                      for aval, i in safe_zip(global_in_avals, in_shardings)]
 								    in_jaxpr_avals = in_tiled_avals
-												Pass the `jaxpr` from `pjit` since there is no need to trace it again in lower_sharding_computation. It also helps in preserving debug_info that already exists on the jaxpr to surface it in MHLO eventually.

PiperOrigin-RevId: 513268085

											
										
										
											2023-03-01 10:04:59 -08:00
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  with core.extend_axis_env_nd(mesh.shape.items()):
-												Pass the `jaxpr` from `pjit` since there is no need to trace it again in lower_sharding_computation. It also helps in preserving debug_info that already exists on the jaxpr to surface it in MHLO eventually.

PiperOrigin-RevId: 513268085

											
										
										
											2023-03-01 10:04:59 -08:00
+								    if isinstance(fun_or_jaxpr, lu.WrappedFun):
 								      with dispatch.log_elapsed_time(
-												Remove the f-string evaluation during logging the elapsed time by passing in fun_name to log_elapsed_time

PiperOrigin-RevId: 532132574

											
										
										
											2023-05-15 09:15:22 -07:00
+								          "Finished tracing + transforming {fun_name} in {elapsed_time} sec",
 								          fun_name=str(name_stack), event=dispatch.JAXPR_TRACE_EVENT):
-												Pass the `jaxpr` from `pjit` since there is no need to trace it again in lower_sharding_computation. It also helps in preserving debug_info that already exists on the jaxpr to surface it in MHLO eventually.

PiperOrigin-RevId: 513268085

											
										
										
											2023-03-01 10:04:59 -08:00
+								        jaxpr, out_jaxpr_avals, consts = pe.trace_to_jaxpr_final(
 								            fun_or_jaxpr, in_jaxpr_avals)
 								    else:
 								      assert isinstance(fun_or_jaxpr, core.ClosedJaxpr)
 								      jaxpr = fun_or_jaxpr.jaxpr
 								      out_jaxpr_avals = fun_or_jaxpr.out_avals
 								      consts = fun_or_jaxpr.consts
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
+								  all_args_info = AllArgsInfo(global_in_avals, in_shardings, jaxpr.debug_info)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  assert len(out_shardings) == len(out_jaxpr_avals)
 								  if spmd_lowering:
 								    global_out_avals = out_jaxpr_avals
 								  else:
 								    # In non-spmd lowering path, there is no `AUTO` or `UNSPECIFIED`, which is
 								    # why `.spec` can be accessed.
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								    global_out_avals = [untile_aval_nd(global_axis_sizes, get_array_mapping(o.spec), aval)  # type: ignore
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								                        for aval, o in safe_zip(out_jaxpr_avals, out_shardings)]
-												Fix the case where debug_info was not attached when a xmap was present in the computation.

PiperOrigin-RevId: 513718785

											
										
										
											2023-03-02 20:49:51 -08:00
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  _sanitize_mesh_jaxpr(jaxpr)
 								  jaxpr = dispatch.apply_outfeed_rewriter(jaxpr)
 								  # 2. Build up the HLO
 								  tuple_args = dispatch.should_tuple_args(len(in_jaxpr_avals), backend.platform)
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								  in_partitions: list[sharding_impls.XLACompatibleSharding | None] | None
 								  out_partitions: list[sharding_impls.XLACompatibleSharding | None] | None
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  axis_ctx: mlir.AxisContext
 								  if spmd_lowering:
-												Add is_fully_replicated method to Shardings. This allows to scrub the usage of is_op_sharding_replicated from JAX because we can just query it on Shardings and save an expensive round trip to OpSharding creation.

PiperOrigin-RevId: 524379122

											
										
										
											2023-04-14 13:55:52 -07:00
+								    in_partitions = map(_to_logical_sharding, global_in_avals, in_shardings)
 								    out_partitions = map(_to_logical_sharding, global_out_avals, out_shardings)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    replicated_args = [False] * len(in_jaxpr_avals)
-												Split sharding_impls into its own Bazel target.

* Move dependencies of sharding_impls into sharding_impls to avoid creating cyclic dependencies.
* Fix a handful of new pytype errors.

PiperOrigin-RevId: 523146076

											
										
										
											2023-04-10 10:15:08 -07:00
+								    axis_ctx = sharding_impls.SPMDAxisContext(mesh, manual_axes)
-												Add unregistered mhlo.num_replicas and mhlo.num_partitions attributes to HLO output.

These are to allow PJRT plugin developers an inline way to determine the number of replicas/partitions to which the module is targeted. There are no stability guarantees on these attributes at the moment.

PiperOrigin-RevId: 524013922

											
										
										
											2023-04-13 08:55:01 -07:00
+								    num_replicas = 1
 								    num_partitions = mesh.devices.size
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  else:
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								    replicated_args = [not get_array_mapping(i.spec) for i in in_shardings]  # type: ignore
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    in_partitions = None
 								    out_partitions = None
-												Split sharding_impls into its own Bazel target.

* Move dependencies of sharding_impls into sharding_impls to avoid creating cyclic dependencies.
* Fix a handful of new pytype errors.

PiperOrigin-RevId: 523146076

											
										
										
											2023-04-10 10:15:08 -07:00
+								    axis_env = sharding_impls.AxisEnv(
 								        nreps=mesh.size,
 								        names=tuple(global_axis_sizes.keys()),
 								        sizes=tuple(global_axis_sizes.values()))
 								    axis_ctx = sharding_impls.ReplicaAxisContext(axis_env)
-												Add unregistered mhlo.num_replicas and mhlo.num_partitions attributes to HLO output.

These are to allow PJRT plugin developers an inline way to determine the number of replicas/partitions to which the module is targeted. There are no stability guarantees on these attributes at the moment.

PiperOrigin-RevId: 524013922

											
										
										
											2023-04-13 08:55:01 -07:00
+								    num_replicas = mesh.devices.size
 								    num_partitions = 1
-												Axis names are now tracked via an effect

This allows propagating the names bottom up -- from equations to the jaxpr,
instead of "discovering" them top-down by traversing (and rebuilding) the
jaxpr via core.subst_axis_names.

PiperOrigin-RevId: 612416803

											
										
										
											2024-03-04 05:41:29 -08:00
+								  jaxpr = core.remove_named_axis_effects(jaxpr, mesh.axis_names)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  closed_jaxpr = core.ClosedJaxpr(jaxpr, consts)
 								  module_name = f"{api_name}_{fun_name}"
 								  with core.extend_axis_env_nd(mesh.shape.items()):
-												Refactor effects system to use effect types, not objects

											
										
										
											2023-02-01 17:50:00 -08:00
+								    if any(effects.ordered_effects.contains(eff) for eff
 								           in closed_jaxpr.effects):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      raise ValueError("Ordered effects not supported in mesh computations.")
-												Refactor effects system to use effect types, not objects

											
										
										
											2023-02-01 17:50:00 -08:00
+								    unordered_effects = list(effects.ordered_effects.filter_not_in(
 								      closed_jaxpr.effects))
 								    ordered_effects = list(effects.ordered_effects.filter_in(
 								      closed_jaxpr.effects))
-												Log the time it takes to lower from jaxpr to stableHLO

PiperOrigin-RevId: 532115098

											
										
										
											2023-05-15 08:07:31 -07:00
+								    with dispatch.log_elapsed_time(
-												Remove the f-string evaluation during logging the elapsed time by passing in fun_name to log_elapsed_time

PiperOrigin-RevId: 532132574

											
										
										
											2023-05-15 09:15:22 -07:00
+								        "Finished jaxpr to MLIR module conversion {fun_name} in {elapsed_time} sec",
 								        fun_name=str(name_stack), event=dispatch.JAXPR_TO_MLIR_MODULE_EVENT):
-												Log the time it takes to lower from jaxpr to stableHLO

PiperOrigin-RevId: 532115098

											
										
										
											2023-05-15 08:07:31 -07:00
+								      lowering_result = mlir.lower_jaxpr_to_module(
 								          module_name,
 								          closed_jaxpr,
-												Introduce a LoweringParameters dataclass for easier plumbing

There are currently two parameters that are used to configure
lowering: lowering_platform (for cross-platform lowering), and
override_lowering_rules. Each of them are passed as separate arguments
through several layers of lowering internal functions. This is tedious,
and error prone. In fact, override_lowering_rules was not plumbed
in all places, and due to using default arguments in all places,
this leads to silent errors.

We foresee introducing other parameters for lowering: for multi-platform
lowering, for controlling the lowering of effects.

Here is pack all such parameters into a `mlir.LoweringParameters`
dataclass and we plumb that through.

											
										
										
											2023-09-28 12:44:14 +02:00
+								          ordered_effects=ordered_effects,
 								          backend_or_name=backend,
-												Cleanup the handling of single- and multi-platform lowering in ModuleContext

Previously, we introduced support for multi-platform lowering, by
adding a new LoweringParameters object that can be used to specify
a cross-lowering platform or even multiple platforms. But we had
kept the ModuleContext.platform in place because some lowering rules
were still referencing it. Now we replace ModuleContext.platform with
ModuleContext.platforms, which removes the redundancy, simplifies
the code, and makes it clearer that the lowering rules should not
simply assume single-platform lowering.

PiperOrigin-RevId: 576575376

											
										
										
											2023-10-25 10:39:47 -07:00
+								          platforms=lowering_parameters.platforms or (backend.platform,),
-												Introduce a LoweringParameters dataclass for easier plumbing

There are currently two parameters that are used to configure
lowering: lowering_platform (for cross-platform lowering), and
override_lowering_rules. Each of them are passed as separate arguments
through several layers of lowering internal functions. This is tedious,
and error prone. In fact, override_lowering_rules was not plumbed
in all places, and due to using default arguments in all places,
this leads to silent errors.

We foresee introducing other parameters for lowering: for multi-platform
lowering, for controlling the lowering of effects.

Here is pack all such parameters into a `mlir.LoweringParameters`
dataclass and we plumb that through.

											
										
										
											2023-09-28 12:44:14 +02:00
+								          axis_context=axis_ctx,
 								          name_stack=name_stack,
 								          donated_args=donated_invars,
-												Log the time it takes to lower from jaxpr to stableHLO

PiperOrigin-RevId: 532115098

											
										
										
											2023-05-15 08:07:31 -07:00
+								          replicated_args=replicated_args,
 								          arg_shardings=in_partitions,
 								          result_shardings=out_partitions,
 								          arg_names=jaxpr.debug_info and jaxpr.debug_info.arg_names,
 								          result_names=jaxpr.debug_info and jaxpr.debug_info.result_paths,
 								          num_replicas=num_replicas,
-												Introduce a LoweringParameters dataclass for easier plumbing

There are currently two parameters that are used to configure
lowering: lowering_platform (for cross-platform lowering), and
override_lowering_rules. Each of them are passed as separate arguments
through several layers of lowering internal functions. This is tedious,
and error prone. In fact, override_lowering_rules was not plumbed
in all places, and due to using default arguments in all places,
this leads to silent errors.

We foresee introducing other parameters for lowering: for multi-platform
lowering, for controlling the lowering of effects.

Here is pack all such parameters into a `mlir.LoweringParameters`
dataclass and we plumb that through.

											
										
										
											2023-09-28 12:44:14 +02:00
+								          num_partitions=num_partitions,
 								          lowering_parameters=lowering_parameters)
-												Log the time it takes to lower from jaxpr to stableHLO

PiperOrigin-RevId: 532115098

											
										
										
											2023-05-15 08:07:31 -07:00
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  return MeshComputation(
 								      str(name_stack),
-												Small cleanups to pxla.py.

Remove stale references to XlaComputation and code left over from handling both XlaComputations and ir.Modules.

No functional changes intended.

PiperOrigin-RevId: 526139679

											
										
										
											2023-04-21 14:37:52 -07:00
+								      lowering_result.module,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      donated_invars,
 								      global_in_avals=global_in_avals,
 								      global_out_avals=global_out_avals,
 								      in_shardings=in_shardings,
 								      out_shardings=out_shardings,
 								      spmd_lowering=spmd_lowering,
 								      tuple_args=tuple_args,
-												Allow pjit.AUTO to be used with jax.jit. This introduces an API change which requires a mesh to be provided to pjit.AUTO(mesh).

`with mesh:` is no longer required with pjit to use the auto spmd pass of GSPMD.

PiperOrigin-RevId: 533801596

											
										
										
											2023-05-20 22:59:52 -07:00
+								      auto_spmd_lowering=False,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      unordered_effects=unordered_effects,
 								      ordered_effects=ordered_effects,
-												Small cleanups to pxla.py.

Remove stale references to XlaComputation and code left over from handling both XlaComputations and ir.Modules.

No functional changes intended.

PiperOrigin-RevId: 526139679

											
										
										
											2023-04-21 14:37:52 -07:00
+								      host_callbacks=lowering_result.host_callbacks,
 								      keepalive=lowering_result.keepalive,
-												Always default keep_unused to True if going via lower_mesh_computation

PiperOrigin-RevId: 513729988

											
										
										
											2023-03-02 22:12:53 -08:00
+								      kept_var_idx=set(range(len(global_in_avals))),
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      backend=backend,
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								      device_assignment=_create_da_object(tuple(mesh.devices.flat)),
-												Plumb debug_info to meshExecutable as a optional arg to raise better error messages.

PiperOrigin-RevId: 525521694

											
										
										
											2023-04-19 12:35:15 -07:00
+								      committed=True,
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								      in_layouts=(None,) * len(global_in_avals),
 								      out_layouts=(None,) * len(global_out_avals),
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
+								      shape_poly_state=lowering_result.shape_poly_state,
 								      all_args_info=all_args_info)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								class MeshComputation(stages.XlaLowering):
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								  _hlo: ir.Module | None
 								  _executable: MeshExecutable | None
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								  def __init__(self, name: str, hlo: ir.Module | None,
-												Remove trivial execution from jax since it leads to 100x slower dispatch time.

Trivial computations were added for a pre-omnistaging world. After omnistaging, JAX produces less trivial computations, so there is need for this to exist.

In the future, if we want to support forwarding of inputs to outputs, there would need to be a different way which the C++ dispatch path knows about.

```
jit_trivial_dispatch                                   246µs ± 3%                4µs ± 1%  -98.52%          (p=0.008 n=5+5)
jit_trivial                                            250µs ± 3%                5µs ± 1%  -98.19%          (p=0.008 n=5+5)
```

PiperOrigin-RevId: 560141018

											
										
										
											2023-08-25 10:59:10 -07:00
+								               donated_invars: Sequence[bool], **compile_args):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    self._name = name
 								    self._hlo = hlo
 								    self._donated_invars = donated_invars
 								    self.compile_args = compile_args
 								    self._executable = None
 								  # -- stages.XlaLowering overrides
 								  def stablehlo(self) -> ir.Module:
 								    return self._hlo
-												Remove trivial execution from jax since it leads to 100x slower dispatch time.

Trivial computations were added for a pre-omnistaging world. After omnistaging, JAX produces less trivial computations, so there is need for this to exist.

In the future, if we want to support forwarding of inputs to outputs, there would need to be a different way which the C++ dispatch path knows about.

```
jit_trivial_dispatch                                   246µs ± 3%                4µs ± 1%  -98.52%          (p=0.008 n=5+5)
jit_trivial                                            250µs ± 3%                5µs ± 1%  -98.19%          (p=0.008 n=5+5)
```

PiperOrigin-RevId: 560141018

											
										
										
											2023-08-25 10:59:10 -07:00
+								  def compile(self, compiler_options=None) -> MeshExecutable:
-												expose `compiler_options` on `compile()`

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 520782460

											
										
										
											2023-03-30 17:13:46 -07:00
+								    if self._executable is None or compiler_options is not None:
-												Remove trivial execution from jax since it leads to 100x slower dispatch time.

Trivial computations were added for a pre-omnistaging world. After omnistaging, JAX produces less trivial computations, so there is need for this to exist.

In the future, if we want to support forwarding of inputs to outputs, there would need to be a different way which the C++ dispatch path knows about.

```
jit_trivial_dispatch                                   246µs ± 3%                4µs ± 1%  -98.52%          (p=0.008 n=5+5)
jit_trivial                                            250µs ± 3%                5µs ± 1%  -98.19%          (p=0.008 n=5+5)
```

PiperOrigin-RevId: 560141018

											
										
										
											2023-08-25 10:59:10 -07:00
+								      executable = UnloadedMeshExecutable.from_hlo(
 								          self._name, self._hlo, **self.compile_args,
 								          compiler_options=compiler_options)
-												expose `compiler_options` on `compile()`

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 520782460

											
										
										
											2023-03-30 17:13:46 -07:00
+								      if compiler_options is None:
 								        self._executable = executable
 								      return executable
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    return self._executable
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								  def cost_analysis(self) -> dict[str, float]:
-												MeshComputation.cost_analysis() isn't implemented with PJRT C API.

This was caught via PJitTest.testLowerCostAnalysis
(https://github.com/google/jax/blob/e74852f79695c9f2fcf06b4c8400b176bb899766/tests/pjit_test.py#L998). We
don't need to change the test because NotImplementedError is already
caught in Lowered.cost_analysis:
https://github.com/google/jax/blob/e74852f79695c9f2fcf06b4c8400b176bb899766/jax/_src/stages.py#L659-L660

											
										
										
											2023-02-15 01:49:55 +00:00
+								    backend = self.compile_args["backend"]
 								    if xb.using_pjrt_c_api(backend):
 								      raise NotImplementedError(
 								          "Lowered.cost_analysis not implemented on platform "
-												[mutable-arrays] allow state effects in jit by building in run_state

with help from @sharadmv, @yashkatariya, @dougalm, and others

The basic strategy is to apply discharge_state when lowering a jaxpr with state
effects to HLO, and update the dispatch path accordingly. Specifically:
1. in tests only for now, introduce a MutableArray data type;
2. teach jit to abstract it to a Ref(ShapedArray) type, register an input
   handler, etc;
3. call discharge_state in `lower_sharding_computation` to lower a jaxpr with
   refs to a jaxpr (and then to an HLO) with extra outputs, and set up aliasing;
4. teach the output side of the dispatch path to drop those outputs.

As an alternative to (3), we could potentially lower away the effects at a
higher level, like in _pjit_lower_cached. They are similar because
_pjit_lower_cached is the only (non-xmap) caller of lower_sharding_computation.
I decided to do it in lower_sharding_computation mainly because that's closer
to where we set up aliases, and I wanted to make mutable arrays correspond to
aliased inputs/outputs on the XLA computation.

											
										
										
											2024-02-26 14:46:05 -08:00
+								          f"'{backend.platform}'. Use compile().cost_analysis() for "  # type: ignore
-												MeshComputation.cost_analysis() isn't implemented with PJRT C API.

This was caught via PJitTest.testLowerCostAnalysis
(https://github.com/google/jax/blob/e74852f79695c9f2fcf06b4c8400b176bb899766/tests/pjit_test.py#L998). We
don't need to change the test because NotImplementedError is already
caught in Lowered.cost_analysis:
https://github.com/google/jax/blob/e74852f79695c9f2fcf06b4c8400b176bb899766/jax/_src/stages.py#L659-L660

											
										
										
											2023-02-15 01:49:55 +00:00
+								          "post-compilation cost estimates.")
 								    return xe.hlo_module_cost_analysis(backend, self.hlo().as_hlo_module())
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								if xla_extension_version < 229:
 								  def _get_input_indices(
 								      avals: Sequence[ShapedArray],
 								      shardings: Sequence[sharding_impls.XLACompatibleSharding],
-												Replace _DeviceAssignment with xc.DeviceList

PiperOrigin-RevId: 599597226

											
										
										
											2024-01-18 12:54:54 -08:00
+								      da_object: xc.DeviceList | Sequence[xc.Device],  # type: ignore
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								  ) -> Sequence[tuple[Index | None, ...]]:
-												Don't loop to create replicated indices tuple consisting of slice(None). Use multiplication instead.

PiperOrigin-RevId: 524992811

											
										
										
											2023-04-17 17:21:41 -07:00
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								    input_indices = []
-												Replace _DeviceAssignment with xc.DeviceList

PiperOrigin-RevId: 599597226

											
										
										
											2024-01-18 12:54:54 -08:00
+								    if not isinstance(da_object, xc.DeviceList):
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								      da_object = _create_da_object(tuple(da_object))
 								    num_addressable_devices = len(da_object.addressable_device_list)
-												Don't loop to create replicated indices tuple consisting of slice(None). Use multiplication instead.

PiperOrigin-RevId: 524992811

											
										
										
											2023-04-17 17:21:41 -07:00
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								    def _get_replicated_slices(num_addressable_devices: int, ndim: int | None):
 								      if ndim is None:
 								        return ((slice(None),),) * num_addressable_devices
 								      else:
 								        return ((slice(None),) * ndim,) * num_addressable_devices
-												Do the sharding.addressable_devices check only once in _get_input_indices since all shardings should have the same device_assignment.

That check happens at the start of lower_sharding_computation. Also use the optimized DeviceAssignment object which has all the calculations cached if this path is hit multiple times.

Also remove `device_assignment` from MeshExecutable since it is not used anywhere in that class

PiperOrigin-RevId: 523182028

											
										
										
											2023-04-10 12:22:45 -07:00
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								    for aval, sharding in zip(avals, shardings):
 								      if aval is core.abstract_token:
 								        index = _get_replicated_slices(num_addressable_devices, None)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      else:
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								        if sharding.is_fully_replicated:
 								          index = _get_replicated_slices(num_addressable_devices, aval.ndim)
 								        else:
 								          index = tuple(
 								              sharding.addressable_devices_indices_map(aval.shape).values())  # type: ignore
 								      input_indices.append(index)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								    return input_indices
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								def get_out_shardings_from_executable(
-												Set the jax_enable_memories flag to True.

If all memory_kinds in the jaxpr are the default memory kind, then annotate_device_placement custom calls are not inserted. This allows for existing code to work without any changes.

If non-default memory kind is present in the jaxpr, then we allow custom calls to be inserted.

PiperOrigin-RevId: 564457393

											
										
										
											2023-09-11 11:54:29 -07:00
+								    xla_executable,
 								    device_assignment: Sequence[xc.Device],
 								    num_out_avals: int,
 								    num_ordered_effects: int,
 								    all_default_mem_kind: bool,
-												Contrain the trailing dims of prng key array to REPLICATED and keep other dims as unconstrained.

PiperOrigin-RevId: 611232967

											
										
										
											2024-02-28 14:36:20 -08:00
+								) -> Sequence[sharding_impls.GSPMDSharding] | None:
-												Prune some exports from jax.experimental.pjit.

jax.experimental.pjit is deprecated in its entirety (use "jit" instead), and experimental APIs have no stability promises.

PiperOrigin-RevId: 552903601

											
										
										
											2023-08-01 13:26:43 -07:00
+								  from jax._src import pjit
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Only call executale.get_output_memory_kinds() if jax_enable_memories is True

PiperOrigin-RevId: 584087022

											
										
										
											2023-11-20 11:43:41 -08:00
+								  if config.enable_memories.value:
 								    if all_default_mem_kind:
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								      omk = [None] * num_out_avals
-												Only call executale.get_output_memory_kinds() if jax_enable_memories is True

PiperOrigin-RevId: 584087022

											
										
										
											2023-11-20 11:43:41 -08:00
+								    else:
 								      try:
 								        omk = xla_executable.get_output_memory_kinds()[0]
 								        if num_ordered_effects > 0:
 								          omk = omk[num_ordered_effects:]
 								      except:
 								        omk = [None] * num_out_avals
 								  else:
 								    omk = [None] * num_out_avals
-												Set the jax_enable_memories flag to True.

If all memory_kinds in the jaxpr are the default memory kind, then annotate_device_placement custom calls are not inserted. This allows for existing code to work without any changes.

If non-default memory kind is present in the jaxpr, then we allow custom calls to be inserted.

PiperOrigin-RevId: 564457393

											
										
										
											2023-09-11 11:54:29 -07:00
 								  assert len(omk) == num_out_avals, (len(omk), num_out_avals)
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  # When the device assignment only has 1 device, SPMD partitioner will not run.
-												Preserve single device NamedSharding/PositionalSharding on the output instead of always return SingleDeviceShardings.

Fixes https://github.com/google/jax/issues/19459

PiperOrigin-RevId: 600999853

											
										
										
											2024-01-23 21:28:33 -08:00
+								  # Hence the op shardings will not be set on the `hlo_module`.
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  if len(device_assignment) == 1:
-												Preserve single device NamedSharding/PositionalSharding on the output instead of always return SingleDeviceShardings.

Fixes https://github.com/google/jax/issues/19459

PiperOrigin-RevId: 600999853

											
										
										
											2024-01-23 21:28:33 -08:00
+								    return [sharding_impls.GSPMDSharding.get_replicated(device_assignment, memory_kind=mk)
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								            for mk in omk]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								  _, out_op_shardings = pjit.get_op_sharding_from_executable(xla_executable)
-												Make looking up shardings from executable consistent. If `out_shardings` are specified on `jit`, always check it against the `get_output_shardings` from the executable.

PiperOrigin-RevId: 583456869

											
										
										
											2023-11-17 12:18:46 -08:00
+								  if not out_op_shardings:
 								    return None
 								  if num_ordered_effects > 0:
 								    out_op_shardings = out_op_shardings[num_ordered_effects:]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												If a function returns no output, xla_executable.get_output_shardings() returns 1 sharding because for XLA the output is an empty tuple which has a tuple sharding.

PiperOrigin-RevId: 583555384

											
										
										
											2023-11-17 20:48:22 -08:00
+								  # This means that there are no outputs for JAX but for XLA there is an empty
 								  # tuple output which gets a replicated sharding.
 								  if num_out_avals == 0 and len(out_op_shardings) == 1:
 								    return None
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  # This condition happens when all the elements in the output tuple have the
 								  # same sharding, so XLA decides to run the `FusionTupleDeduplicator` to
 								  # put the sharding on ROOT instead of the tuple.
 								  # TODO(b/245667823): Remove this when XLA fixes this.
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								  if len(out_op_shardings) == 1 and len(out_op_shardings) < num_out_avals:
 								    out_op_shardings = out_op_shardings * num_out_avals  # type: ignore
 								  assert len(out_op_shardings) == num_out_avals == len(omk), (
 								      len(out_op_shardings), num_out_avals, len(omk))
 								  return [sharding_impls.GSPMDSharding(device_assignment, os, memory_kind=mk)
 								          for os, mk in safe_zip(out_op_shardings, omk)]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								def _get_in_shardings_from_xla(
 								    xla_executable, device_assignment: Sequence[xc.Device], num_in_avals: int,
 								    num_ordered_effects: int
-												Contrain the trailing dims of prng key array to REPLICATED and keep other dims as unconstrained.

PiperOrigin-RevId: 611232967

											
										
										
											2024-02-28 14:36:20 -08:00
+								  ) -> Sequence[GSPMDSharding] | None:
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								  """Returns input shardings from XLA."""
 								  from jax._src import pjit
 								  # When the device assignment only has 1 device, SPMD partitioner will not run.
 								  # Hence the op shardings will not be set on the `hlo_module`.
 								  if len(device_assignment) == 1:
-												Contrain the trailing dims of prng key array to REPLICATED and keep other dims as unconstrained.

PiperOrigin-RevId: 611232967

											
										
										
											2024-02-28 14:36:20 -08:00
+								    return [GSPMDSharding.get_replicated(device_assignment)] * num_in_avals
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
 								  in_op_shardings, _ = pjit.get_op_sharding_from_executable(xla_executable)
 								  if not in_op_shardings:
 								    return None
 								  if num_ordered_effects > 0:
 								    in_op_shardings = in_op_shardings[num_ordered_effects:]
 								  assert len(in_op_shardings) == num_in_avals, (
 								      len(in_op_shardings), num_in_avals)
-												Contrain the trailing dims of prng key array to REPLICATED and keep other dims as unconstrained.

PiperOrigin-RevId: 611232967

											
										
										
											2024-02-28 14:36:20 -08:00
+								  return [GSPMDSharding(device_assignment, os)
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								          for os in in_op_shardings]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								# TODO(yashkatariya): Remove this function after `AUTO` can return shardings
 								# without mesh.
 								def _get_mesh_pspec_shardings_from_executable(
 								    xla_executable, mesh: Mesh
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								) -> tuple[Sequence[sharding_impls.NamedSharding],
-												Split Sharding subclasses out of _src/sharding.py into _src/sharding_impls.py

By defining the Sharding base class in its own module, we can pull it out into a separate Bazel submodule, which will help pytype inference when defining Array.

PiperOrigin-RevId: 516223009

											
										
										
											2023-03-13 08:49:39 -07:00
+								           Sequence[sharding_impls.NamedSharding]]:
-												Prune some exports from jax.experimental.pjit.

jax.experimental.pjit is deprecated in its entirety (use "jit" instead), and experimental APIs have no stability promises.

PiperOrigin-RevId: 552903601

											
										
										
											2023-08-01 13:26:43 -07:00
+								  from jax._src import pjit
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Prune some exports from jax.experimental.pjit.

jax.experimental.pjit is deprecated in its entirety (use "jit" instead), and experimental APIs have no stability promises.

PiperOrigin-RevId: 552903601

											
										
										
											2023-08-01 13:26:43 -07:00
+								  in_pspec, out_pspec = pjit.get_pspec_from_executable(xla_executable, mesh)
-												Split Sharding subclasses out of _src/sharding.py into _src/sharding_impls.py

By defining the Sharding base class in its own module, we can pull it out into a separate Bazel submodule, which will help pytype inference when defining Array.

PiperOrigin-RevId: 516223009

											
										
										
											2023-03-13 08:49:39 -07:00
+								  return ([sharding_impls.NamedSharding(mesh, i) for i in in_pspec],
 								          [sharding_impls.NamedSharding(mesh, o) for o in out_pspec])
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Removed the type annotation of `orig_out_sharding_handlers`

The annotation was a generic type alias which are not well supported for
pytype and can, in some cases, lead to crashes.

It was used "raw" and was thus implicitly specialized to `Any`. So, the
constraint of `SubClassT` being the same in the key, parameter and return
types was not actually enforced. It is now enforced through the `_register_out_sharding_handler` function.

PiperOrigin-RevId: 545460402

											
										
										
											2023-07-04 09:00:06 -07:00
+								_orig_out_sharding_handlers = {}
 								_ShardingT = TypeVar("_ShardingT", bound=sharding_impls.XLACompatibleSharding)
 								def _register_out_sharding_handler(
 								    sharding_cls: type[_ShardingT],
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								    handler: Callable[[sharding_impls.GSPMDSharding, _ShardingT], _ShardingT],
-												Removed the type annotation of `orig_out_sharding_handlers`

The annotation was a generic type alias which are not well supported for
pytype and can, in some cases, lead to crashes.

It was used "raw" and was thus implicitly specialized to `Any`. So, the
constraint of `SubClassT` being the same in the key, parameter and return
types was not actually enforced. It is now enforced through the `_register_out_sharding_handler` function.

PiperOrigin-RevId: 545460402

											
										
										
											2023-07-04 09:00:06 -07:00
+								) -> None:
 								  _orig_out_sharding_handlers[sharding_cls] = handler
-												Preserve PositionalSharding on the output of pjits if the inputs had PositionalSharding on them by converting GSPMDSharding to PositionalSharding

PiperOrigin-RevId: 523535581

											
										
										
											2023-04-11 16:27:08 -07:00
-												Convert in_shardings to physical shardings in cpp dispatch path because the same happens with prng arrays.

Also comment out key reuse check in cpp dispatch since it's True for jax tests which prevent prng keys from taking Cpp dispatch.

PiperOrigin-RevId: 613289252

											
										
										
											2024-03-06 11:41:34 -08:00
+								def _gspmd_to_named_sharding_via_mesh(
 								    out_s: sharding_impls.GSPMDSharding,
 								    mesh: Mesh) -> sharding_impls.NamedSharding:
 								  parsed_pspec = sharding_impls.parse_flatten_op_sharding(
 								      out_s._hlo_sharding, mesh)[0]
 								  return create_mesh_pspec_sharding(
 								      mesh, parsed_pspec.get_partition_spec(), parsed_pspec,
 								      out_s.memory_kind)
-												Preserve PositionalSharding on the output of pjits if the inputs had PositionalSharding on them by converting GSPMDSharding to PositionalSharding

PiperOrigin-RevId: 523535581

											
										
										
											2023-04-11 16:27:08 -07:00
+								def _gspmd_to_named_sharding(
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								    out_s: sharding_impls.GSPMDSharding,
 								    orig_in_s: sharding_impls.NamedSharding) -> sharding_impls.NamedSharding:
-												Reuse the utility `_gspmd_to_named_sharding_via_mesh` in other places

PiperOrigin-RevId: 613686995

											
										
										
											2024-03-07 13:33:13 -08:00
+								  return _gspmd_to_named_sharding_via_mesh(out_s, orig_in_s.mesh)
-												Removed the type annotation of `orig_out_sharding_handlers`

The annotation was a generic type alias which are not well supported for
pytype and can, in some cases, lead to crashes.

It was used "raw" and was thus implicitly specialized to `Any`. So, the
constraint of `SubClassT` being the same in the key, parameter and return
types was not actually enforced. It is now enforced through the `_register_out_sharding_handler` function.

PiperOrigin-RevId: 545460402

											
										
										
											2023-07-04 09:00:06 -07:00
 								_register_out_sharding_handler(
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								    sharding_impls.NamedSharding, _gspmd_to_named_sharding)
-												Preserve PositionalSharding on the output of pjits if the inputs had PositionalSharding on them by converting GSPMDSharding to PositionalSharding

PiperOrigin-RevId: 523535581

											
										
										
											2023-04-11 16:27:08 -07:00
 								def _gspmd_to_positional_sharding(
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								    out_s: sharding_impls.GSPMDSharding,
-												Preserve single device NamedSharding/PositionalSharding on the output instead of always return SingleDeviceShardings.

Fixes https://github.com/google/jax/issues/19459

PiperOrigin-RevId: 600999853

											
										
										
											2024-01-23 21:28:33 -08:00
+								    orig_in_s: sharding_impls.PositionalSharding
 								    ) -> sharding_impls.PositionalSharding:
-												Return PositionalSharding instead of GSPMDSharding in custom_partitioning when mesh is not defined

PiperOrigin-RevId: 539719517

											
										
										
											2023-06-12 11:51:47 -07:00
+								  return sharding_impls._op_sharding_to_pos_sharding(
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								      out_s._hlo_sharding, orig_in_s._device_assignment, out_s.memory_kind)
-												Removed the type annotation of `orig_out_sharding_handlers`

The annotation was a generic type alias which are not well supported for
pytype and can, in some cases, lead to crashes.

It was used "raw" and was thus implicitly specialized to `Any`. So, the
constraint of `SubClassT` being the same in the key, parameter and return
types was not actually enforced. It is now enforced through the `_register_out_sharding_handler` function.

PiperOrigin-RevId: 545460402

											
										
										
											2023-07-04 09:00:06 -07:00
 								_register_out_sharding_handler(
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								    sharding_impls.PositionalSharding, _gspmd_to_positional_sharding)
-												Preserve PositionalSharding on the output of pjits if the inputs had PositionalSharding on them by converting GSPMDSharding to PositionalSharding

PiperOrigin-RevId: 523535581

											
										
										
											2023-04-11 16:27:08 -07:00
-												Preserve single device NamedSharding/PositionalSharding on the output instead of always return SingleDeviceShardings.

Fixes https://github.com/google/jax/issues/19459

PiperOrigin-RevId: 600999853

											
										
										
											2024-01-23 21:28:33 -08:00
+								def _gspmd_to_single_device_sharding(
 								    out_s: GSPMDSharding, orig_in_s: SingleDeviceSharding) -> SingleDeviceSharding:
 								  assert isinstance(orig_in_s, SingleDeviceSharding)
 								  return SingleDeviceSharding(
 								      out_s._device_assignment[0], memory_kind=out_s.memory_kind)
 								_register_out_sharding_handler(
 								    SingleDeviceSharding, _gspmd_to_single_device_sharding)
-												Preserve PositionalSharding on the output of pjits if the inputs had PositionalSharding on them by converting GSPMDSharding to PositionalSharding

PiperOrigin-RevId: 523535581

											
										
										
											2023-04-11 16:27:08 -07:00
 								def _get_out_sharding_from_orig_sharding(
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								    out_shardings, out_avals, orig_in_s, orig_aval):
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  out = []
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								  orig_handler = _orig_out_sharding_handlers[type(orig_in_s)]
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								  for o, out_aval in safe_zip(out_shardings, out_avals):
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								    if isinstance(o, sharding_impls.GSPMDSharding):
 								      try:
-												Only return the same input Sharding object is the original aval's ndim and out_aval's ndim are the same.

This is because if both the OpShardings are replicated then the ndim is not encoded in the OpSharding and it will return True even if the Sharding is incompatible with the output's ndim. Concretely `NamedSharding({'x': 1, y: '2'}, P('x'))` is not compatible with a input with `ndim == 0`.

PiperOrigin-RevId: 528621971

											
										
										
											2023-05-01 17:39:16 -07:00
+								        # Only return the same input sharding object if the OpShardings and
 								        # in_aval.ndim and out_aval.ndim match. This is because if OpSharding is
 								        # replicated then, it doesn't encode the ndim in it. The devices
 								        # will be the same at this point because those checks happen before.
 								        if (orig_aval is not None and out_aval is not None and
-												Canonicalize to default memory in init of Shardings only on the backends that support memories right now.

PiperOrigin-RevId: 553942534

											
										
										
											2023-08-04 16:26:31 -07:00
+								            out_aval.ndim == orig_aval.ndim
 								            and sharding_impls.are_op_shardings_equal(
 								                o._hlo_sharding, orig_in_s._to_xla_hlo_sharding(orig_aval.ndim))
 								            and o.memory_kind == orig_in_s.memory_kind):
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								          out.append(orig_in_s)
-												Return the same sharding object if the output OpSharding matches the input OpSharding.

Fixes https://github.com/google/jax/issues/15782

PiperOrigin-RevId: 528531594

											
										
										
											2023-05-01 11:46:19 -07:00
+								        else:
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								          out.append(orig_handler(o, orig_in_s))
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								      except:
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								        out.append(o)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								    else:
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								      out.append(o)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  return out
 								def maybe_get_orig_out_sharding(
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								    in_shardings, out_shardings, in_avals, out_avals):
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  if all(hasattr(o, '_original_sharding') for o in out_shardings):
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								    return [o._original_sharding for o in out_shardings]
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								  orig_in_s = None
-												Return the same sharding object if the output OpSharding matches the input OpSharding.

Fixes https://github.com/google/jax/issues/15782

PiperOrigin-RevId: 528531594

											
										
										
											2023-05-01 11:46:19 -07:00
+								  orig_aval = None
 								  for i, aval in safe_zip(in_shardings, in_avals):
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								    oi = getattr(i, '_original_sharding', None)
-												Removed the type annotation of `orig_out_sharding_handlers`

The annotation was a generic type alias which are not well supported for
pytype and can, in some cases, lead to crashes.

It was used "raw" and was thus implicitly specialized to `Any`. So, the
constraint of `SubClassT` being the same in the key, parameter and return
types was not actually enforced. It is now enforced through the `_register_out_sharding_handler` function.

PiperOrigin-RevId: 545460402

											
										
										
											2023-07-04 09:00:06 -07:00
+								    if type(oi) in _orig_out_sharding_handlers:
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								      orig_in_s = oi
-												Return the same sharding object if the output OpSharding matches the input OpSharding.

Fixes https://github.com/google/jax/issues/15782

PiperOrigin-RevId: 528531594

											
										
										
											2023-05-01 11:46:19 -07:00
+								      orig_aval = aval
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								      break
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								  if orig_in_s is not None:
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								    return _get_out_sharding_from_orig_sharding(
 								        out_shardings, out_avals, orig_in_s, orig_aval)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								  return out_shardings
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								def _get_layouts_from_executable(
-												Remove DefaultLayout and make `None` same as `DefaultLayout`

PiperOrigin-RevId: 583221970

											
										
										
											2023-11-16 18:00:49 -08:00
+								    xla_executable, in_layouts, out_layouts, num_ordered_effects
 								) -> tuple[Sequence[SpecifiedLayout | None], Sequence[SpecifiedLayout | None]]:
 								  try:
 								    in_layouts_xla = xla_executable.get_parameter_layouts()
 								    out_layouts_xla = xla_executable.get_output_layouts()
 								  except:
 								    return (None,) * len(in_layouts), (None,) * len(out_layouts)
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
-												Remove DefaultLayout and make `None` same as `DefaultLayout`

PiperOrigin-RevId: 583221970

											
										
										
											2023-11-16 18:00:49 -08:00
+								  if num_ordered_effects > 0:
 								    in_layouts_xla = in_layouts_xla[num_ordered_effects:]
 								    out_layouts_xla = out_layouts_xla[num_ordered_effects:]
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
 								  new_in_layouts = []
 								  for x, i in safe_zip(in_layouts_xla, in_layouts):
-												Make the SpecifiedLayout class opaque.

Also need to enabling pickling to xc.Layout so that AOT serialization continues to work.

PiperOrigin-RevId: 583684299

											
										
										
											2023-11-18 15:16:31 -08:00
+								    x = SpecifiedLayout(x)
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								    if isinstance(i, SpecifiedLayout):
 								      if i != x:
 								        raise AssertionError(
 								            f"Unexpected XLA layout override: (XLA) {x} != {i} (User sharding)")
 								      new_in_layouts.append(i)
 								    else:
 								      new_in_layouts.append(x)
 								  new_out_layouts = []
 								  for x, o in safe_zip(out_layouts_xla, out_layouts):
-												Make the SpecifiedLayout class opaque.

Also need to enabling pickling to xc.Layout so that AOT serialization continues to work.

PiperOrigin-RevId: 583684299

											
										
										
											2023-11-18 15:16:31 -08:00
+								    x = SpecifiedLayout(x)
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								    if isinstance(o, SpecifiedLayout):
 								      if o != x:
 								        raise AssertionError(
 								            f"Unexpected XLA layout override: (XLA) {x} != {o} (User sharding)")
 								      new_out_layouts.append(o)
 								    else:
 								      new_out_layouts.append(x)
 								  assert all(isinstance(i, SpecifiedLayout) for i in new_in_layouts)
 								  assert all(isinstance(o, SpecifiedLayout) for o in new_out_layouts)
-												Remove DefaultLayout and make `None` same as `DefaultLayout`

PiperOrigin-RevId: 583221970

											
										
										
											2023-11-16 18:00:49 -08:00
+								  return new_in_layouts, new_out_layouts  # type: ignore
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
-												Delete sharding spec to HloSharding conversion since it's not used anymore.

PiperOrigin-RevId: 595192496

											
										
										
											2024-01-02 13:12:44 -08:00
+								def get_logical_mesh_ids(mesh_shape):
 								  return np.arange(math.prod(mesh_shape)).reshape(mesh_shape)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								@weakref_lru_cache
-												Remove _allow_propagation_to_outputs from `compile` in MeshComputation since after jax.Array it is not required and can just default to being set to True if a sharding is unspecified.

PiperOrigin-RevId: 523851611

											
										
										
											2023-04-12 17:37:52 -07:00
+								def _cached_compilation(computation, name, mesh, spmd_lowering,
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								                        tuple_args, auto_spmd_lowering, allow_prop_to_inputs,
 								                        allow_prop_to_outputs, host_callbacks, backend,
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								                        da, pmap_nreps, compiler_options_keys,
 								                        compiler_options_values):
-												Allow pjit.AUTO to be used with jax.jit. This introduces an API change which requires a mesh to be provided to pjit.AUTO(mesh).

`with mesh:` is no longer required with pjit to use the auto spmd pass of GSPMD.

PiperOrigin-RevId: 533801596

											
										
										
											2023-05-20 22:59:52 -07:00
+								  # TODO(phawkins): One would normally just write:
 								  # dev = np.array(device_assignment)
 								  # The formulation below is substantially faster if there are many devices.
 								  # If we were to optimize __getattr__ on xc.Device we might not need this
 								  # workaround.
-												[JAX] Introduce `DeviceList` backed by C++ `xla::ifrt::DeviceList`

This change adds `xla_client.DeviceList` that is implemented in C++
`jax::PyDeviceList`. `jax::PyDeviceList` implements the features of
`pxla._DeviceAssignment` as a functional drop-in replacement.
`jax::PyDeviceList` internally has `xla::ifrt::DeviceList`, which will be used
when using IFRT APIs without having to construct a new copy of a potentially
large device list.

`pxla._DeviceAssignment`'s interface is changed slightly to encourage avoiding
conversion to tuple.

Note that for the backward compatibility (and fast `xla_client.Device`
conversion), `jax::PyDeviceList` still uses a Python tuple whose element can be
any Python object matches `xla_client.Device` interface with duck typing. This
duck typing support will be removed when such use case is deprecated.
Eventually, we can try to avoid any type conversion to remove a shadow copy of
device list in JAX.

PiperOrigin-RevId: 555317152

											
										
										
											2023-08-09 16:57:28 -07:00
+								  dev = np.vectorize(lambda i: da[i], otypes=[object])(
 								    np.arange(len(da))
-												Allow pjit.AUTO to be used with jax.jit. This introduces an API change which requires a mesh to be provided to pjit.AUTO(mesh).

`with mesh:` is no longer required with pjit to use the auto spmd pass of GSPMD.

PiperOrigin-RevId: 533801596

											
										
										
											2023-05-20 22:59:52 -07:00
+								  )
 								  if pmap_nreps > 1:
 								    num_replicas, num_partitions = pmap_nreps, 1
 								  elif spmd_lowering:
 								    num_replicas, num_partitions = 1, dev.size
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  else:
-												Allow pjit.AUTO to be used with jax.jit. This introduces an API change which requires a mesh to be provided to pjit.AUTO(mesh).

`with mesh:` is no longer required with pjit to use the auto spmd pass of GSPMD.

PiperOrigin-RevId: 533801596

											
										
										
											2023-05-20 22:59:52 -07:00
+								    num_replicas, num_partitions = dev.size, 1
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
 								  if pmap_nreps > 1:
 								    # In `jit` device_assignment is set to None when num_replicas > 1. Do
 								    # the same thing here too.
 								    xla_device_assignment = None
 								  else:
 								    xla_device_assignment = dev.reshape((num_replicas, num_partitions))
 								  if compiler_options_keys is None:
 								    compiler_options = None
 								  else:
 								    compiler_options = dict(safe_zip(compiler_options_keys, compiler_options_values))
-												Enable passing fdo_profile in compiler_options in pxla.py

PiperOrigin-RevId: 549109629

											
										
										
											2023-07-18 14:17:56 -07:00
+								  fdo_profile = (None if compiler_options is None else
 								                 compiler_options.pop("fdo_profile", None))
-												Move compiler APIs out of dispatch.py and xla_bridge.py into a new jax._src.compiler module.

Refactoring only, no user-visible changes intended.

PiperOrigin-RevId: 557116160

											
										
										
											2023-08-15 06:38:56 -07:00
+								  compile_options = compiler.get_compile_options(
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								      num_replicas=num_replicas,
 								      num_partitions=num_partitions,
 								      device_assignment=xla_device_assignment,
 								      use_spmd_partitioning=spmd_lowering,
 								      use_auto_spmd_partitioning=auto_spmd_lowering,
 								      env_options_overrides=compiler_options,
-												Enable passing fdo_profile in compiler_options in pxla.py

PiperOrigin-RevId: 549109629

											
										
										
											2023-07-18 14:17:56 -07:00
+								      fdo_profile=fdo_profile,
-												Restrict retrieving XLA-AutoFDO profile version to TPU workloads.

XLA-AutoFDO is supported only for TPUs, so requesting the latest
profile version for non-TPU workloads is unnecessary and can delay
the completion of initialization.

Testing: test workload.
PiperOrigin-RevId: 584148686

											
										
										
											2023-11-20 15:51:27 -08:00
+								      detailed_logging=compiler.use_detailed_logging(computation),
 								      backend=backend,
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  )
 								  opts = compile_options.executable_build_options
 								  if auto_spmd_lowering:
 								    assert mesh is not None
 								    opts.auto_spmd_partitioning_mesh_shape = list(mesh.shape.values())
 								    opts.auto_spmd_partitioning_mesh_ids = (
-												Delete sharding spec to HloSharding conversion since it's not used anymore.

PiperOrigin-RevId: 595192496

											
										
										
											2024-01-02 13:12:44 -08:00
+								        get_logical_mesh_ids(list(mesh.shape.values()))
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								        .reshape(-1))
 								  compile_options.parameter_is_tupled_arguments = tuple_args
-												[XLA:SPMD] Do not propagate sharding to parameter/output if it does not evenly partition the parameter/output.

PiperOrigin-RevId: 612998062

											
										
										
											2024-03-05 15:56:16 -08:00
+								  if xla_extension_version >= 241:
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								    opts.allow_spmd_sharding_propagation_to_parameters = list(allow_prop_to_inputs)
 								  opts.allow_spmd_sharding_propagation_to_output = list(allow_prop_to_outputs)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
 								  if hasattr(backend, "compile_replicated"):
 								    return None, compile_options
-												Remove the f-string evaluation during logging the elapsed time by passing in fun_name to log_elapsed_time

PiperOrigin-RevId: 532132574

											
										
										
											2023-05-15 09:15:22 -07:00
+								  with dispatch.log_elapsed_time(
 								      "Finished XLA compilation of {fun_name} in {elapsed_time} sec",
 								      fun_name=name, event=dispatch.BACKEND_COMPILE_EVENT):
-												Move compiler APIs out of dispatch.py and xla_bridge.py into a new jax._src.compiler module.

Refactoring only, no user-visible changes intended.

PiperOrigin-RevId: 557116160

											
										
										
											2023-08-15 06:38:56 -07:00
+								    xla_executable = compiler.compile_or_get_cached(
-												Include the device_kind in the compilation cache key.

PiperOrigin-RevId: 525726898

											
										
										
											2023-04-20 06:16:12 -07:00
+								        backend, computation, dev, compile_options, host_callbacks)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  return xla_executable, compile_options
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								def _maybe_get_and_check_in_shardings(
 								    xla_executable, in_shardings, device_assignment,
 								    global_in_avals, num_ordered_effects):
 								  """Returns in_shardings extracted from XLA or checks and returns original
 								  shardings.
 								  If in_shardings exist on `jit` or on `jax.Array`, then this function will
 								  check that sharding against what XLA returns as in_shardings. If they don't
 								  match, an error is raised.
 								  If in_sharding is unspecified, then the sharding returned by XLA is returned.
 								  """
 								  in_shardings_xla = _get_in_shardings_from_xla(  # type: ignore
 								      xla_executable, device_assignment, len(global_in_avals),
 								      num_ordered_effects)  # type: ignore
 								  if in_shardings_xla is None:
 								    return in_shardings
 								  new_in_shardings = []
 								  for xla_s, orig, aval in safe_zip(in_shardings_xla, in_shardings,
 								                                    global_in_avals):
 								    if is_unspecified(orig):
-												Contrain the trailing dims of prng key array to REPLICATED and keep other dims as unconstrained.

PiperOrigin-RevId: 611232967

											
										
										
											2024-02-28 14:36:20 -08:00
+								      if (aval is not core.abstract_token and
 								          dtypes.issubdtype(aval.dtype, dtypes.extended)):
-												Convert in_shardings to physical shardings in cpp dispatch path because the same happens with prng arrays.

Also comment out key reuse check in cpp dispatch since it's True for jax tests which prevent prng keys from taking Cpp dispatch.

PiperOrigin-RevId: 613289252

											
										
										
											2024-03-06 11:41:34 -08:00
+								        xla_s = aval.dtype._rules.logical_sharding(aval, xla_s)
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								      new_in_shardings.append(xla_s)
 								    else:
-												Partially rollback propagating sharding to inputs because SPMD chooses wrong shardings when shape is not divisble by shard_shape.

PiperOrigin-RevId: 611806938

											
										
										
											2024-03-01 08:39:49 -08:00
+								      # TODO(yashkatariya): Remove the if branch for abstract_token once
 								      # choosing input shardings by XLA is enabled again.
 								      if aval is core.abstract_token:
 								        new_in_shardings.append(orig)
 								      else:
 								        xla_hlo_s = xla_s._to_xla_hlo_sharding(aval.ndim)  # type: ignore
 								        orig_hlo_s = orig._to_xla_hlo_sharding(aval.ndim)  # type: ignore
 								        # MANUAL HloSharding comes from other partitioning frameworks.
 								        if (not dtypes.issubdtype(aval.dtype, dtypes.extended) and
 								            not xla_hlo_s.is_manual() and
-												Lower memory_kind on inputs/outputs to `mhlo.memory_kind` and then translate that to entry_computation_layout attached to HLO.

This is very similar to how layouts are plumbed from stablehlo to entry_computation_layout.

PiperOrigin-RevId: 615485286

											
										
										
											2024-03-13 11:23:12 -07:00
+								            (not op_shardings.are_op_shardings_equal(xla_hlo_s, orig_hlo_s))):
-												Partially rollback propagating sharding to inputs because SPMD chooses wrong shardings when shape is not divisble by shard_shape.

PiperOrigin-RevId: 611806938

											
										
										
											2024-03-01 08:39:49 -08:00
+								          raise AssertionError(
 								              f"Unexpected XLA sharding override: (XLA) {xla_s} != {orig} "
 								              "(User sharding)")
 								        new_in_shardings.append(orig)
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								  return new_in_shardings
-												Move the replicated trailing dims check inside logical_op_sharding

PiperOrigin-RevId: 611277405

											
										
										
											2024-02-28 17:03:04 -08:00
+								def _maybe_get_and_check_out_shardings(
-												Make looking up shardings from executable consistent. If `out_shardings` are specified on `jit`, always check it against the `get_output_shardings` from the executable.

PiperOrigin-RevId: 583456869

											
										
										
											2023-11-17 12:18:46 -08:00
+								    xla_executable, out_shardings, device_assignment, global_out_avals,
 								    num_ordered_effects, all_default_mem_kind
 								  ):
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								  out_shardings_xla = get_out_shardings_from_executable(  # type: ignore
-												Make looking up shardings from executable consistent. If `out_shardings` are specified on `jit`, always check it against the `get_output_shardings` from the executable.

PiperOrigin-RevId: 583456869

											
										
										
											2023-11-17 12:18:46 -08:00
+								      xla_executable, device_assignment, len(global_out_avals),
 								      num_ordered_effects, all_default_mem_kind)  # type: ignore
 								  if out_shardings_xla is None:
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								    return out_shardings
-												Make looking up shardings from executable consistent. If `out_shardings` are specified on `jit`, always check it against the `get_output_shardings` from the executable.

PiperOrigin-RevId: 583456869

											
										
										
											2023-11-17 12:18:46 -08:00
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								  new_out_shardings = []
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								  for xla_s, orig, aval in safe_zip(out_shardings_xla, out_shardings,
-												Make looking up shardings from executable consistent. If `out_shardings` are specified on `jit`, always check it against the `get_output_shardings` from the executable.

PiperOrigin-RevId: 583456869

											
										
										
											2023-11-17 12:18:46 -08:00
+								                                    global_out_avals):
 								    if is_unspecified(orig):
-												Contrain the trailing dims of prng key array to REPLICATED and keep other dims as unconstrained.

PiperOrigin-RevId: 611232967

											
										
										
											2024-02-28 14:36:20 -08:00
+								      if (aval is not core.abstract_token and
 								          dtypes.issubdtype(aval.dtype, dtypes.extended)):
-												Convert in_shardings to physical shardings in cpp dispatch path because the same happens with prng arrays.

Also comment out key reuse check in cpp dispatch since it's True for jax tests which prevent prng keys from taking Cpp dispatch.

PiperOrigin-RevId: 613289252

											
										
										
											2024-03-06 11:41:34 -08:00
+								        xla_s = aval.dtype._rules.logical_sharding(aval, xla_s)
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								      new_out_shardings.append(xla_s)
-												Make looking up shardings from executable consistent. If `out_shardings` are specified on `jit`, always check it against the `get_output_shardings` from the executable.

PiperOrigin-RevId: 583456869

											
										
										
											2023-11-17 12:18:46 -08:00
+								    else:
 								      xla_hlo_s = xla_s._to_xla_hlo_sharding(aval.ndim)  # type: ignore
 								      orig_hlo_s = orig._to_xla_hlo_sharding(aval.ndim)  # type: ignore
 								      # MANUAL HloSharding comes from other partitioning frameworks.
 								      if (not dtypes.issubdtype(aval.dtype, dtypes.extended) and
 								          not xla_hlo_s.is_manual() and
 								          (not op_shardings.are_op_shardings_equal(xla_hlo_s, orig_hlo_s) or
 								           xla_s.memory_kind != orig.memory_kind)):  # type: ignore
 								        raise AssertionError(
 								            f"Unexpected XLA sharding override: (XLA) {xla_s} != {orig} "
 								            "(User sharding)")
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								      new_out_shardings.append(orig)
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								  return new_out_shardings
-												Make looking up shardings from executable consistent. If `out_shardings` are specified on `jit`, always check it against the `get_output_shardings` from the executable.

PiperOrigin-RevId: 583456869

											
										
										
											2023-11-17 12:18:46 -08:00
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								def finalize_out_shardings(out_shardings, device_assignment):
-												Preserve single device NamedSharding/PositionalSharding on the output instead of always return SingleDeviceShardings.

Fixes https://github.com/google/jax/issues/19459

PiperOrigin-RevId: 600999853

											
										
										
											2024-01-23 21:28:33 -08:00
+								  if len(device_assignment) == 1:
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								    return [SingleDeviceSharding(device_assignment[0], memory_kind=o.memory_kind)
 								            if isinstance(o, GSPMDSharding) else o for o in out_shardings]
 								  return out_shardings
-												Preserve single device NamedSharding/PositionalSharding on the output instead of always return SingleDeviceShardings.

Fixes https://github.com/google/jax/issues/19459

PiperOrigin-RevId: 600999853

											
										
										
											2024-01-23 21:28:33 -08:00
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								@dataclasses.dataclass
 								class UnloadedMeshExecutable:
 								  xla_executable: Any
-												Replace _DeviceAssignment with xc.DeviceList

PiperOrigin-RevId: 599597226

											
										
										
											2024-01-18 12:54:54 -08:00
+								  device_assignment: xc.DeviceList | Sequence[xc.Device]  # type: ignore
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  backend: xb.XlaBackend
 								  input_avals: Sequence[ShapedArray]
-												Split Sharding subclasses out of _src/sharding.py into _src/sharding_impls.py

By defining the Sharding base class in its own module, we can pull it out into a separate Bazel submodule, which will help pytype inference when defining Array.

PiperOrigin-RevId: 516223009

											
										
										
											2023-03-13 08:49:39 -07:00
+								  input_shardings: Sequence[sharding_impls.XLACompatibleSharding]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  output_avals: Sequence[ShapedArray]
-												Split Sharding subclasses out of _src/sharding.py into _src/sharding_impls.py

By defining the Sharding base class in its own module, we can pull it out into a separate Bazel submodule, which will help pytype inference when defining Array.

PiperOrigin-RevId: 516223009

											
										
										
											2023-03-13 08:49:39 -07:00
+								  output_shardings: Sequence[sharding_impls.XLACompatibleSharding]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  committed: bool
 								  name: str
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								  unordered_effects: list[core.Effect]
 								  ordered_effects: list[core.Effect]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  keepalive: Sequence[Any]
 								  host_callbacks: Sequence[Any]
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								  kept_var_idx: set[int]
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								  mut: MutationData | None
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  auto_spmd_lowering: bool
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								  in_layouts: Sequence[SpecifiedLayout | None]
 								  out_layouts: Sequence[SpecifiedLayout | None]
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
+								  all_args_info: AllArgsInfo | None
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Redefine `compile_and_serialize` as `serialize(lowered.compile())`.

This has the downside of keeping around the UnloadedMeshComputation,
but it makes the serialize() API easier to understand.

PiperOrigin-RevId: 518715469

											
										
										
											2023-03-22 17:22:39 -07:00
+								  def build_unsafe_call(self):
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								    if xla_extension_version >= 229:
 								      handle_args = InputsHandler(self.input_shardings)
 								    else:
 								      input_indices = _get_input_indices(self.input_avals, self.input_shardings,
 								                                         self.device_assignment)
 								      handle_args = InputsHandler(
 								          self.input_shardings, self.xla_executable.local_devices(), input_indices)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    handle_outs = global_avals_to_results_handler(
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								        self.output_avals, self.output_shardings, self.committed)  # type: ignore  # arg-type
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Rollforward with fixes: Remove _execute_replicated from UnloadedMeshExecutable.load since it is not required anymore for jit(pmap) cases

PiperOrigin-RevId: 516317920

											
										
										
											2023-03-13 14:08:48 -07:00
+								    unsafe_call = ExecuteReplicated(  # type: ignore  # assignment
 								        self.xla_executable, self.name, self.backend, handle_args,
 								        handle_outs, self.unordered_effects, self.ordered_effects, self.keepalive,
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								        bool(self.host_callbacks), self.kept_var_idx, self.mut)
-												Redefine `compile_and_serialize` as `serialize(lowered.compile())`.

This has the downside of keeping around the UnloadedMeshComputation,
but it makes the serialize() API easier to understand.

PiperOrigin-RevId: 518715469

											
										
										
											2023-03-22 17:22:39 -07:00
+								    return unsafe_call
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Redefine `compile_and_serialize` as `serialize(lowered.compile())`.

This has the downside of keeping around the UnloadedMeshComputation,
but it makes the serialize() API easier to understand.

PiperOrigin-RevId: 518715469

											
										
										
											2023-03-22 17:22:39 -07:00
+								  def load(self) -> MeshExecutable:
 								    return MeshExecutable(self.xla_executable, self.build_unsafe_call,
-												Thread out_avals to MeshExecutable

PiperOrigin-RevId: 612037684

											
										
										
											2024-03-02 13:34:46 -08:00
+								                          self.input_avals, self.output_avals,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								                          self.input_shardings, self.output_shardings,
 								                          self.auto_spmd_lowering, self.kept_var_idx,
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								                          self.in_layouts, self.out_layouts,
-												Remove jaxpr_debug_info from MeshComputation since that information is available via AllArgsInfo

PiperOrigin-RevId: 586018345

											
										
										
											2023-11-28 10:04:29 -08:00
+								                          self.all_args_info, self)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								  # May return a MeshExecutable in the compile_replicated case.
 								  @staticmethod
 								  def from_hlo(name: str,
-												Small cleanups to pxla.py.

Remove stale references to XlaComputation and code left over from handling both XlaComputations and ir.Modules.

No functional changes intended.

PiperOrigin-RevId: 526139679

											
										
										
											2023-04-21 14:37:52 -07:00
+								               hlo: ir.Module,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								               global_in_avals: Sequence[ShapedArray],
 								               global_out_avals: Sequence[ShapedArray],
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								               in_shardings: Sequence[sharding_impls.XLACompatibleSharding | AUTO],
 								               out_shardings: Sequence[(sharding_impls.XLACompatibleSharding | AUTO |
-												Set out_mut to `None` as default on `from_hlo` instead of in `__init__` of `MeshComputation` and correct the types too.

PiperOrigin-RevId: 611814102

											
										
										
											2024-03-01 09:27:57 -08:00
+								                                        UnspecifiedValue)],
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								               spmd_lowering: bool,
 								               tuple_args: bool,
 								               auto_spmd_lowering: bool,
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								               unordered_effects: list[core.Effect],
 								               ordered_effects: list[core.Effect],
 								               host_callbacks: list[Any],
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								               keepalive: Any,
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								               kept_var_idx: set[int],
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								               backend: xb.XlaBackend,
-												Replace _DeviceAssignment with xc.DeviceList

PiperOrigin-RevId: 599597226

											
										
										
											2024-01-18 12:54:54 -08:00
+								               device_assignment: xc.DeviceList | Sequence[xc.Device],  # type: ignore
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								               committed: bool,
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								               in_layouts: MaybeLayout,
 								               out_layouts: MaybeLayout,
-												expose `compiler_options` on `compile()`

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 520782460

											
										
										
											2023-03-30 17:13:46 -07:00
+								               pmap_nreps: int = 1,
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								               mut: MutationData | None = None,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								               shape_poly_state: mlir.ShapePolyLoweringState | None = None,
-												Set the jax_enable_memories flag to True.

If all memory_kinds in the jaxpr are the default memory kind, then annotate_device_placement custom calls are not inserted. This allows for existing code to work without any changes.

If non-default memory kind is present in the jaxpr, then we allow custom calls to be inserted.

PiperOrigin-RevId: 564457393

											
										
										
											2023-09-11 11:54:29 -07:00
+								               all_default_mem_kind: bool = True,
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
+								               all_args_info: AllArgsInfo | None = None,
-												Set the jax_enable_memories flag to True.

If all memory_kinds in the jaxpr are the default memory kind, then annotate_device_placement custom calls are not inserted. This allows for existing code to work without any changes.

If non-default memory kind is present in the jaxpr, then we allow custom calls to be inserted.

PiperOrigin-RevId: 564457393

											
										
										
											2023-09-11 11:54:29 -07:00
+								               compiler_options=None,
-												Redefine `compile_and_serialize` as `serialize(lowered.compile())`.

This has the downside of keeping around the UnloadedMeshComputation,
but it makes the serialize() API easier to understand.

PiperOrigin-RevId: 518715469

											
										
										
											2023-03-22 17:22:39 -07:00
+								  ) -> MeshExecutable:
-												[shape_poly] Enable calling from JAX modules that use shape polymorphism.

If a JAX function uses shape polymorphism the resulting StableHLO contains
dynamic shapes and it is not directly compileable. However, in such modules
the dynamic shapes depend only on the input shapes, and in JAX jit the input
shapes are static. So, we run a shape refinement pass over the module to
resolve the dynamic shapes prior to compilation.

PiperOrigin-RevId: 538275268

											
										
										
											2023-06-06 13:26:35 -07:00
+								    if shape_poly_state is not None and shape_poly_state.uses_dim_vars:
-												Remove the monkey patch in jax2tf by moving the function to mlir.py

PiperOrigin-RevId: 539266562

											
										
										
											2023-06-09 23:46:45 -07:00
+								      hlo = mlir.refine_polymorphic_shapes(hlo)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								    compiler_options_keys = tuple(
 								        compiler_options.keys()) if compiler_options is not None else None
 								    compiler_options_values = tuple(
 								        compiler_options.values()) if compiler_options is not None else None
-												Replace _DeviceAssignment with xc.DeviceList

PiperOrigin-RevId: 599597226

											
										
										
											2024-01-18 12:54:54 -08:00
+								    if isinstance(device_assignment, xc.DeviceList):
-												[JAX] Introduce `DeviceList` backed by C++ `xla::ifrt::DeviceList`

This change adds `xla_client.DeviceList` that is implemented in C++
`jax::PyDeviceList`. `jax::PyDeviceList` implements the features of
`pxla._DeviceAssignment` as a functional drop-in replacement.
`jax::PyDeviceList` internally has `xla::ifrt::DeviceList`, which will be used
when using IFRT APIs without having to construct a new copy of a potentially
large device list.

`pxla._DeviceAssignment`'s interface is changed slightly to encourage avoiding
conversion to tuple.

Note that for the backward compatibility (and fast `xla_client.Device`
conversion), `jax::PyDeviceList` still uses a Python tuple whose element can be
any Python object matches `xla_client.Device` interface with duck typing. This
duck typing support will be removed when such use case is deprecated.
Eventually, we can try to avoid any type conversion to remove a shadow copy of
device list in JAX.

PiperOrigin-RevId: 555317152

											
										
										
											2023-08-09 16:57:28 -07:00
+								      da = device_assignment
 								    else:
 								      da = _create_da_object(tuple(device_assignment))
-												Do the sharding.addressable_devices check only once in _get_input_indices since all shardings should have the same device_assignment.

That check happens at the start of lower_sharding_computation. Also use the optimized DeviceAssignment object which has all the calculations cached if this path is hit multiple times.

Also remove `device_assignment` from MeshExecutable since it is not used anywhere in that class

PiperOrigin-RevId: 523182028

											
										
										
											2023-04-10 12:22:45 -07:00
+								    del device_assignment
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
 								    allow_prop_to_inputs = tuple(is_unspecified(i) for i in in_shardings)
-												Remove _allow_propagation_to_outputs from `compile` in MeshComputation since after jax.Array it is not required and can just default to being set to True if a sharding is unspecified.

PiperOrigin-RevId: 523851611

											
										
										
											2023-04-12 17:37:52 -07:00
+								    allow_prop_to_outputs = tuple(is_unspecified(o) for o in out_shardings)
-												Allow pjit.AUTO to be used with jax.jit. This introduces an API change which requires a mesh to be provided to pjit.AUTO(mesh).

`with mesh:` is no longer required with pjit to use the auto spmd pass of GSPMD.

PiperOrigin-RevId: 533801596

											
										
										
											2023-05-20 22:59:52 -07:00
 								    mesh = None
 								    if auto_spmd_lowering:
 								      for i in it.chain.from_iterable([in_shardings, out_shardings]):
 								        if is_auto(i):
 								          mesh = i.mesh  # type: ignore
 								          break
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								    xla_executable, compile_options = _cached_compilation(
-												Small cleanups to pxla.py.

Remove stale references to XlaComputation and code left over from handling both XlaComputations and ir.Modules.

No functional changes intended.

PiperOrigin-RevId: 526139679

											
										
										
											2023-04-21 14:37:52 -07:00
+								        hlo, name, mesh, spmd_lowering,
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								        tuple_args, auto_spmd_lowering, allow_prop_to_inputs,
 								        allow_prop_to_outputs, tuple(host_callbacks), backend, da, pmap_nreps,
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								        compiler_options_keys, compiler_options_values)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								    if hasattr(backend, "compile_replicated"):
 								      semantics_in_shardings = SemanticallyEqualShardings(in_shardings)  # type: ignore
 								      semantics_out_shardings = SemanticallyEqualShardings(out_shardings)  # type: ignore
 								      return _compile_replicated_mesh_executable_from_hlo(
-												Small cleanups to pxla.py.

Remove stale references to XlaComputation and code left over from handling both XlaComputations and ir.Modules.

No functional changes intended.

PiperOrigin-RevId: 526139679

											
										
										
											2023-04-21 14:37:52 -07:00
+								          hlo, name, tuple(global_in_avals), tuple(global_out_avals),
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								          semantics_in_shardings, semantics_out_shardings, auto_spmd_lowering,
 								          compile_options, tuple(host_callbacks), bool(unordered_effects),
 								          tuple(ordered_effects), tuple(kept_var_idx), backend, da, committed,
-												Remove jaxpr_debug_info from MeshComputation since that information is available via AllArgsInfo

PiperOrigin-RevId: 586018345

											
										
										
											2023-11-28 10:04:29 -08:00
+								          pmap_nreps)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								    if auto_spmd_lowering:
 								      assert mesh is not None
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								      in_shardings_xla, out_shardings_xla = _get_mesh_pspec_shardings_from_executable(
 								          xla_executable, mesh)
-												Allow pjit.AUTO to be used with jax.jit. This introduces an API change which requires a mesh to be provided to pjit.AUTO(mesh).

`with mesh:` is no longer required with pjit to use the auto spmd pass of GSPMD.

PiperOrigin-RevId: 533801596

											
										
										
											2023-05-20 22:59:52 -07:00
+								      in_shardings = [x if is_auto(i) else getattr(i, '_original_sharding', i)  # type: ignore
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								                      for x, i in safe_zip(in_shardings_xla, in_shardings)]
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								      out_shardings = [x if is_auto(o) else o
 								                       for x, o in safe_zip(out_shardings_xla, out_shardings)]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    else:
-												Make looking up shardings from executable consistent. If `out_shardings` are specified on `jit`, always check it against the `get_output_shardings` from the executable.

PiperOrigin-RevId: 583456869

											
										
										
											2023-11-17 12:18:46 -08:00
+								      if pmap_nreps == 1:
 								        assert mesh is None
-												[XLA:SPMD] Do not propagate sharding to parameter/output if it does not evenly partition the parameter/output.

PiperOrigin-RevId: 612998062

											
										
										
											2024-03-05 15:56:16 -08:00
+								        if xla_extension_version >= 241:
-												Let XLA choose in_shardings for inputs who sharding is unspecified.

This is a strict improvement over the current state where JAX always chooses replicated sharding.

PiperOrigin-RevId: 610771289

											
										
										
											2024-02-27 09:06:21 -08:00
+								          in_shardings = _maybe_get_and_check_in_shardings(
 								              xla_executable, in_shardings, tuple(da), global_in_avals,
 								              len(ordered_effects))
-												Move the replicated trailing dims check inside logical_op_sharding

PiperOrigin-RevId: 611277405

											
										
										
											2024-02-28 17:03:04 -08:00
+								        out_shardings = _maybe_get_and_check_out_shardings(
-												Make looking up shardings from executable consistent. If `out_shardings` are specified on `jit`, always check it against the `get_output_shardings` from the executable.

PiperOrigin-RevId: 583456869

											
										
										
											2023-11-17 12:18:46 -08:00
+								            xla_executable, out_shardings, tuple(da), global_out_avals,
 								            len(ordered_effects), all_default_mem_kind)
 								      else:
 								        in_shardings, out_shardings, committed, da = _get_metadata_jit_pmap(
 								            xla_executable.local_devices(), len(in_shardings), len(out_shardings))
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
-												Make the SpecifiedLayout class opaque.

Also need to enabling pickling to xc.Layout so that AOT serialization continues to work.

PiperOrigin-RevId: 583684299

											
										
										
											2023-11-18 15:16:31 -08:00
+								    if xla_extension_version >= 217:
-												Remove DefaultLayout and make `None` same as `DefaultLayout`

PiperOrigin-RevId: 583221970

											
										
										
											2023-11-16 18:00:49 -08:00
+								      in_layouts, out_layouts = _get_layouts_from_executable(
 								          xla_executable, in_layouts, out_layouts, len(ordered_effects))
 								    else:
 								      assert all(i is None for i in in_layouts)
 								      assert all(o is None for o in out_layouts)
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								    out_shardings = maybe_get_orig_out_sharding(
 								        in_shardings, out_shardings, global_in_avals, global_out_avals)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
-												Allow sharding propagation to input for prng keys whose sharding is not specified.

Convert shardings returned by XLA (when propagation is on for input and output) for extended dtypes to user shardings which allows to remove `are_out_shardings_from_xla`.

PiperOrigin-RevId: 611246986

											
										
										
											2024-02-28 15:21:50 -08:00
+								    out_shardings = finalize_out_shardings(out_shardings, da)
-												Preserve single device NamedSharding/PositionalSharding on the output instead of always return SingleDeviceShardings.

Fixes https://github.com/google/jax/issues/19459

PiperOrigin-RevId: 600999853

											
										
										
											2024-01-23 21:28:33 -08:00
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								    return UnloadedMeshExecutable(
 								        xla_executable=xla_executable,
-												Do the sharding.addressable_devices check only once in _get_input_indices since all shardings should have the same device_assignment.

That check happens at the start of lower_sharding_computation. Also use the optimized DeviceAssignment object which has all the calculations cached if this path is hit multiple times.

Also remove `device_assignment` from MeshExecutable since it is not used anywhere in that class

PiperOrigin-RevId: 523182028

											
										
										
											2023-04-10 12:22:45 -07:00
+								        device_assignment=da,  # type: ignore
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								        backend=backend,
 								        input_avals=global_in_avals,
 								        input_shardings=in_shardings,  # type: ignore
 								        output_avals=global_out_avals,
 								        output_shardings=out_shardings,  # type: ignore # arg-type
 								        committed=committed,
 								        name=name,
 								        unordered_effects=unordered_effects,
 								        ordered_effects=ordered_effects,
 								        keepalive=keepalive,
 								        host_callbacks=host_callbacks,
 								        kept_var_idx=kept_var_idx,
-												[mutable-arrays] support closed-over mutable arrays in jit

											
										
										
											2024-03-05 16:20:24 -08:00
+								        mut=mut,
-												Plumb debug_info to meshExecutable as a optional arg to raise better error messages.

PiperOrigin-RevId: 525521694

											
										
										
											2023-04-19 12:35:15 -07:00
+								        auto_spmd_lowering=auto_spmd_lowering,
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								        in_layouts=in_layouts,  # type: ignore
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
+								        out_layouts=out_layouts,  # type: ignore
 								        all_args_info=all_args_info).load()  # type: ignore
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								class MeshExecutableFastpathData(NamedTuple):
-												Remove more exported names from jax.interpreters.xla.

None of these appear to have public users, and this module is not included in the deprecation policy.

Also:
* shorten a number of alias chains.
* move make_op_metadata() into its only caller in jax2tf
* delete the unused function dtype_to_primitive_type.
PiperOrigin-RevId: 510205315

											
										
										
											2023-02-16 11:54:25 -08:00
+								  xla_executable: xc.LoadedExecutable
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  out_pytree_def: Any
-												Split Sharding subclasses out of _src/sharding.py into _src/sharding_impls.py

By defining the Sharding base class in its own module, we can pull it out into a separate Bazel submodule, which will help pytype inference when defining Array.

PiperOrigin-RevId: 516223009

											
										
										
											2023-03-13 08:49:39 -07:00
+								  in_shardings: Sequence[sharding_impls.XLACompatibleSharding]
 								  out_shardings: Sequence[sharding_impls.XLACompatibleSharding]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  out_avals: Sequence[ShapedArray]
 								  out_committed: Sequence[bool]
 								  kept_var_bitvec: Iterable[bool]
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								  # TODO(yashkatariya): Remove once minimum jaxlib version is 0.4.24
-												Call shard_arg fallback in pjit's cpp fast path instead of dropping out completely.

PiperOrigin-RevId: 592344105

											
										
										
											2023-12-19 14:25:25 -08:00
+								  arg_handler_devices: Sequence[xc.Device]
 								  arg_handler_indices: Sequence[tuple[Index | None, ...]]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												fast dispatch for functions over typed PRNG key arrays

Before this change, JAX could dispatch compiled functions over new-style (typed)
RNG key arrays, but it would always do so off of the fast (C++-based) dispatch
path. In other words, switching from old-style `uint32` RNG keys to new-style
keys would regress dispatch times. With this change, dispatch happens on the
fast path again and performance regressions ought to be minimal.

We currently maintain only one pytree registry, for all registered pytree node
types. We want RNG key arrays to also be treated as pytree leaves everywhere
*except* during dispatch. In other words: we want operations on (typed) RNG key
arrays to appear in Jaxpr, but we want to unravel those arrays into their
underlying `uint32` arrays only during dispatch.

To do this, we add a new internal pytree registry that dispatch respects
uniquely. This registry includes all items in the default registry, but also the
RNG key array type.

Co-authored-by: Matthew Johnson <mattjj@google.com>
PiperOrigin-RevId: 565077758

											
										
										
											2023-09-13 09:43:14 -07:00
+								def reflatten_outputs_for_dispatch(out_tree, out_flat):
 								  # We arrive at dispatch having flattened according to the default
 								  # pytree registry, but we want to re-flatten according to our
 								  # dispatch-specific registry.
 								  out_unflat = tree_util.tree_unflatten(out_tree, out_flat)
 								  return tree_util.dispatch_registry.flatten(out_unflat, None)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								class MeshExecutable(stages.XlaExecutable):
 								  __slots__ = [
-												Do the sharding.addressable_devices check only once in _get_input_indices since all shardings should have the same device_assignment.

That check happens at the start of lower_sharding_computation. Also use the optimized DeviceAssignment object which has all the calculations cached if this path is hit multiple times.

Also remove `device_assignment` from MeshExecutable since it is not used anywhere in that class

PiperOrigin-RevId: 523182028

											
										
										
											2023-04-10 12:22:45 -07:00
+								      "xla_executable", "_unsafe_call", "build_unsafe_call", "in_avals",
-												Thread out_avals to MeshExecutable

PiperOrigin-RevId: 612037684

											
										
										
											2024-03-02 13:34:46 -08:00
+								      "out_avals", "_in_shardings", "_out_shardings", "_auto_spmd_lowering",
 								      "_kept_var_idx", "_in_layouts", "_out_layouts", "_all_args_info",
 								      "_unloaded_executable",
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  ]
-												Thread out_avals to MeshExecutable

PiperOrigin-RevId: 612037684

											
										
										
											2024-03-02 13:34:46 -08:00
+								  def __init__(self, xla_executable, build_unsafe_call, in_avals, out_avals,
 								               in_shardings, out_shardings, auto_spmd_lowering, kept_var_idx,
-												Remove jaxpr_debug_info from MeshComputation since that information is available via AllArgsInfo

PiperOrigin-RevId: 586018345

											
										
										
											2023-11-28 10:04:29 -08:00
+								               in_layouts, out_layouts,
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
+								               all_args_info: AllArgsInfo | None = None,
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								               unloaded_executable=None):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    self.xla_executable = xla_executable
-												Redefine `compile_and_serialize` as `serialize(lowered.compile())`.

This has the downside of keeping around the UnloadedMeshComputation,
but it makes the serialize() API easier to understand.

PiperOrigin-RevId: 518715469

											
										
										
											2023-03-22 17:22:39 -07:00
+								    self.build_unsafe_call = build_unsafe_call
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    # in_avals is a list of global and local avals. Aval is global if input
 								    # is a GDA or jax.Array else local.
 								    self.in_avals = in_avals
-												Thread out_avals to MeshExecutable

PiperOrigin-RevId: 612037684

											
										
										
											2024-03-02 13:34:46 -08:00
+								    self.out_avals = out_avals
-												Redefine `compile_and_serialize` as `serialize(lowered.compile())`.

This has the downside of keeping around the UnloadedMeshComputation,
but it makes the serialize() API easier to understand.

PiperOrigin-RevId: 518715469

											
										
										
											2023-03-22 17:22:39 -07:00
+								    self._unsafe_call = None
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    self._in_shardings = in_shardings
 								    self._out_shardings = out_shardings
 								    self._auto_spmd_lowering = auto_spmd_lowering
 								    self._kept_var_idx = kept_var_idx
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								    self._in_layouts = in_layouts
 								    self._out_layouts = out_layouts
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
+								    self._all_args_info = all_args_info
-												Redefine `compile_and_serialize` as `serialize(lowered.compile())`.

This has the downside of keeping around the UnloadedMeshComputation,
but it makes the serialize() API easier to understand.

PiperOrigin-RevId: 518715469

											
										
										
											2023-03-22 17:22:39 -07:00
+								    self._unloaded_executable = unloaded_executable
 								  @property
 								  def unsafe_call(self) -> Callable[..., Any]:
 								    if self._unsafe_call is None:
 								      self._unsafe_call = self.build_unsafe_call()
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
+								    return self._unsafe_call  # type: ignore
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								  # -- stages.XlaExecutable overrides
 								  def xla_extension_executable(self):
 								    return self.xla_executable
 								  def call(self, *args):
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
+								    if self._all_args_info is None:
 								      kept_args = [a for i, a in enumerate(args) if i in self._kept_var_idx]
 								      ref_avals = self.in_avals
 								      in_shardings = self._in_shardings
-												Remove jaxpr_debug_info from MeshComputation since that information is available via AllArgsInfo

PiperOrigin-RevId: 586018345

											
										
										
											2023-11-28 10:04:29 -08:00
+								      debug_info = None
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
+								    else:
 								      kept_args = args
 								      ref_avals = self._all_args_info.in_avals
 								      iter_in_shardings = iter(self._in_shardings)
 								      in_shardings = [next(iter_in_shardings) if i in self._kept_var_idx else s
 								                      for i, s in enumerate(self._all_args_info.in_shardings)]
 								      debug_info = self._all_args_info.debug_info
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    arg_avals = map(xla.abstractify, kept_args)
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
+								    check_arg_avals_for_call(ref_avals, arg_avals, debug_info)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    # Check the GDA sharding and the input sharding.
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
+								    check_gda_or_array_xla_sharding_match(kept_args, in_shardings, debug_info)
-												Redefine `compile_and_serialize` as `serialize(lowered.compile())`.

This has the downside of keeping around the UnloadedMeshComputation,
but it makes the serialize() API easier to understand.

PiperOrigin-RevId: 518715469

											
										
										
											2023-03-22 17:22:39 -07:00
+								    return self.unsafe_call(*args)  # pylint: disable=not-callable
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Split Sharding subclasses out of _src/sharding.py into _src/sharding_impls.py

By defining the Sharding base class in its own module, we can pull it out into a separate Bazel submodule, which will help pytype inference when defining Array.

PiperOrigin-RevId: 516223009

											
										
										
											2023-03-13 08:49:39 -07:00
+								  def input_shardings(self) -> Sequence[sharding_impls.XLACompatibleSharding]:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    return self._in_shardings
-												Split Sharding subclasses out of _src/sharding.py into _src/sharding_impls.py

By defining the Sharding base class in its own module, we can pull it out into a separate Bazel submodule, which will help pytype inference when defining Array.

PiperOrigin-RevId: 516223009

											
										
										
											2023-03-13 08:49:39 -07:00
+								  def output_shardings(self) -> Sequence[sharding_impls.XLACompatibleSharding]:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    return self._out_shardings
-												Add a private API to allow setting layouts on jitted computations.

We expose 3 modes:

* `SpecifiedLayout`: User specifies the `minor_to_major` field of the layout. Tiling not exposed yet.

* `DefaultLayout`: PJRT chooses the layout. It defaults to the current behavior.

* `AUTO`: Compiler chooses the layout. This field is not a layout per se. It's a request to get the layout from the compiler. This field cannot be on an Array or other data types. It can only be on jit.

Public API coming soon.

Co-authored-by: Roy Frostig <frostig@google.com>
PiperOrigin-RevId: 582692036

											
										
										
											2023-11-15 08:48:17 -08:00
+								  def input_layouts(self):
 								    return self._in_layouts
 								  def output_layouts(self):
 								    return self._out_layouts
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  def create_cpp_call(self, no_kwargs, in_tree, out_tree):
 								    if not (isinstance(self.unsafe_call, ExecuteReplicated) and
 								            not self.unsafe_call.has_unordered_effects and
 								            not self.unsafe_call.has_host_callbacks):
 								      return None
 								    def aot_cache_miss(*args, **kwargs):
 								      params = stages.CompiledCallParams(self, no_kwargs, in_tree, out_tree)
 								      outs, out_flat, args_flat = stages.Compiled.call(params, *args, **kwargs)
-												fast dispatch for functions over typed PRNG key arrays

Before this change, JAX could dispatch compiled functions over new-style (typed)
RNG key arrays, but it would always do so off of the fast (C++-based) dispatch
path. In other words, switching from old-style `uint32` RNG keys to new-style
keys would regress dispatch times. With this change, dispatch happens on the
fast path again and performance regressions ought to be minimal.

We currently maintain only one pytree registry, for all registered pytree node
types. We want RNG key arrays to also be treated as pytree leaves everywhere
*except* during dispatch. In other words: we want operations on (typed) RNG key
arrays to appear in Jaxpr, but we want to unravel those arrays into their
underlying `uint32` arrays only during dispatch.

To do this, we add a new internal pytree registry that dispatch respects
uniquely. This registry includes all items in the default registry, but also the
RNG key array type.

Co-authored-by: Matthew Johnson <mattjj@google.com>
PiperOrigin-RevId: 565077758

											
										
										
											2023-09-13 09:43:14 -07:00
+								      out_flat, out_tree_dispatch = reflatten_outputs_for_dispatch(
 								          out_tree, out_flat)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      use_fastpath = (all(isinstance(x, xc.ArrayImpl) for x in out_flat))
 								      if use_fastpath:
 								        out_avals = [o.aval for o in out_flat]
 								        out_committed = [o._committed for o in out_flat]
 								        kept_var_bitvec = [i in self._kept_var_idx
 								                           for i in range(len(args_flat))]
-												Convert in_shardings to physical shardings in cpp dispatch path because the same happens with prng arrays.

Also comment out key reuse check in cpp dispatch since it's True for jax tests which prevent prng keys from taking Cpp dispatch.

PiperOrigin-RevId: 613289252

											
										
										
											2024-03-06 11:41:34 -08:00
+								        in_shardings = [
 								            a.dtype._rules.physical_sharding(a, s)
 								            if a is not core.abstract_token and dtypes.issubdtype(a.dtype, dtypes.extended)
 								            else s
 								            for s, a in zip(self._in_shardings, self.in_avals)
 								        ]
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								        fastpath_data = MeshExecutableFastpathData(
-												Convert in_shardings to physical shardings in cpp dispatch path because the same happens with prng arrays.

Also comment out key reuse check in cpp dispatch since it's True for jax tests which prevent prng keys from taking Cpp dispatch.

PiperOrigin-RevId: 613289252

											
										
										
											2024-03-06 11:41:34 -08:00
+								            self.xla_executable, out_tree_dispatch, in_shardings,
-												Call shard_arg fallback in pjit's cpp fast path instead of dropping out completely.

PiperOrigin-RevId: 592344105

											
										
										
											2023-12-19 14:25:25 -08:00
+								            self._out_shardings, out_avals, out_committed, kept_var_bitvec,
 								            self.unsafe_call.in_handler.local_devices,
 								            self.unsafe_call.in_handler.input_indices)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      else:
 								        fastpath_data = None
 								      return outs, fastpath_data
-												Call shard_arg fallback in pjit's cpp fast path instead of dropping out completely.

PiperOrigin-RevId: 592344105

											
										
										
											2023-12-19 14:25:25 -08:00
+								    if xla_extension_version >= 226:
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								      return xc._xla.pjit(
 								          self.unsafe_call.name, None, aot_cache_miss, [], [], [],
 								          tree_util.dispatch_registry,
 								          shard_arg if xla_extension_version >= 229 else temp_shard_arg)  # type: ignore
-												Call shard_arg fallback in pjit's cpp fast path instead of dropping out completely.

PiperOrigin-RevId: 592344105

											
										
										
											2023-12-19 14:25:25 -08:00
+								    else:
 								      return xc._xla.pjit(self.unsafe_call.name, None, aot_cache_miss, [], [], [],  # type: ignore
 								                          tree_util.dispatch_registry)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								# TODO(yashkatariya): Remove once minimum jaxlib version is 0.4.24
 								def temp_shard_arg(arg, devices, arg_indices, sharding, canonicalize=True):
 								  return shard_arg(arg, sharding)
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								def check_arg_avals_for_call(ref_avals, arg_avals,
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								                             jaxpr_debug_info: core.JaxprDebugInfo | None = None):
-												[Jax cleanup]

* Remove lower_xla_callable and all related functions
* Remove pxla.device_put
* Remove dispatch.device_put_handlers

PiperOrigin-RevId: 517249345

											
										
										
											2023-03-16 15:46:57 -07:00
+								  if len(ref_avals) != len(arg_avals):
 								    raise TypeError(
 								        f"Computation compiled for {len(ref_avals)} inputs "
 								        f"but called with {len(arg_avals)}")
-												AOT: better error messages on call signature mismatch

Also update error example in AOT docs.

											
										
										
											2023-07-10 18:28:50 -07:00
 								  if jaxpr_debug_info is not None:
 								    arg_names = [f"'{name}'" for name in jaxpr_debug_info.arg_names]
 								  else:
 								    num_args = len(ref_avals)
 								    arg_names = [f"{i + 1}/{num_args}" for i in range(num_args)]
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								  errors = []
 								  for ref_aval, arg_aval, name in safe_zip(ref_avals, arg_avals, arg_names):
-												[Jax cleanup]

* Remove lower_xla_callable and all related functions
* Remove pxla.device_put
* Remove dispatch.device_put_handlers

PiperOrigin-RevId: 517249345

											
										
										
											2023-03-16 15:46:57 -07:00
+								    if not core.typematch(ref_aval, arg_aval):
-												AOT: better error messages on call signature mismatch

Also update error example in AOT docs.

											
										
										
											2023-07-10 18:28:50 -07:00
+								      errors.append(
 								          f"Argument {name} compiled with {ref_aval.str_short()} and called "
 								          f"with {arg_aval.str_short()}")
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								  if errors:
-												AOT: better error messages on call signature mismatch

Also update error example in AOT docs.

											
										
										
											2023-07-10 18:28:50 -07:00
+								    max_num_errors = 5
 								    str_errors = "\n".join(errors[:max_num_errors])
 								    if len(errors) >= max_num_errors:
 								      num_mismatch_str = f"The first {max_num_errors} of {len(errors)}"
 								    else:
 								      num_mismatch_str = "The"
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								    raise TypeError(
-												AOT: better error messages on call signature mismatch

Also update error example in AOT docs.

											
										
										
											2023-07-10 18:28:50 -07:00
+								        "Argument types differ from the types for which this computation was "
 								        f"compiled. {num_mismatch_str} mismatches are:\n{str_errors}")
-												[Jax cleanup]

* Remove lower_xla_callable and all related functions
* Remove pxla.device_put
* Remove dispatch.device_put_handlers

PiperOrigin-RevId: 517249345

											
										
										
											2023-03-16 15:46:57 -07:00
-												Do the sharding.addressable_devices check only once in _get_input_indices since all shardings should have the same device_assignment.

That check happens at the start of lower_sharding_computation. Also use the optimized DeviceAssignment object which has all the calculations cached if this path is hit multiple times.

Also remove `device_assignment` from MeshExecutable since it is not used anywhere in that class

PiperOrigin-RevId: 523182028

											
										
										
											2023-04-10 12:22:45 -07:00
+								def _get_metadata_jit_pmap(local_devices, num_in_shardings, num_out_shardings):
 								  # Create replicated shardings for jit(pmap) path with local devices
 								  # because multihost jit(pmap) is not allowed.
-												Create same Sharding objects wherever possible to get maximum cache hits

PiperOrigin-RevId: 524116574

											
										
										
											2023-04-13 15:18:56 -07:00
+								  gs = sharding_impls.GSPMDSharding.get_replicated(local_devices)
 								  in_shardings = [gs] * num_in_shardings
 								  out_shardings = [gs] * num_out_shardings
-												Do the sharding.addressable_devices check only once in _get_input_indices since all shardings should have the same device_assignment.

That check happens at the start of lower_sharding_computation. Also use the optimized DeviceAssignment object which has all the calculations cached if this path is hit multiple times.

Also remove `device_assignment` from MeshExecutable since it is not used anywhere in that class

PiperOrigin-RevId: 523182028

											
										
										
											2023-04-10 12:22:45 -07:00
+								  # jit(pmap) will generate Arrays with multi-device sharding.
-												Fix typos across the package

											
										
										
											2023-09-22 14:54:31 -07:00
+								  # It is unsupported for these shardings to be uncommitted, so force
-												Do the sharding.addressable_devices check only once in _get_input_indices since all shardings should have the same device_assignment.

That check happens at the start of lower_sharding_computation. Also use the optimized DeviceAssignment object which has all the calculations cached if this path is hit multiple times.

Also remove `device_assignment` from MeshExecutable since it is not used anywhere in that class

PiperOrigin-RevId: 523182028

											
										
										
											2023-04-10 12:22:45 -07:00
+								  # the outputs to be committed.
 								  committed = True
 								  return in_shardings, out_shardings, committed, tuple(local_devices)
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								@weakref_lru_cache
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								def _compile_replicated_mesh_executable_from_hlo(
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								    computation, name, global_in_avals, global_out_avals, semantics_in_shardings,
 								    semantics_out_shardings, auto_spmd_lowering, compile_options,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								    host_callbacks, has_unordered_effects, ordered_effects, kept_var_idx,
-												Remove jaxpr_debug_info from MeshComputation since that information is available via AllArgsInfo

PiperOrigin-RevId: 586018345

											
										
										
											2023-11-28 10:04:29 -08:00
+								    backend, da, committed, pmap_nreps):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  assert not auto_spmd_lowering
-												Preserve shardings on the output of pjit that were provided on the arguments.

Following are the changes:

* Make _pjit_lower_cached depend on exact sharding equality if `_original_sharding` exists. This top level cache should fill up eventually if users are passing different shardings into the pjit function.
* Split lower_sharding_computation into 3 caches:
  * _trace_to_jaxpr_and_dce cache -- This will return a closed jaxpr which is DCE'd
  * _cached_lowering_to_hlo cache -- This will cache the generation of MHLO. This cache is dependent on the semantic equality of shardings i.e. if 2 shardings lower to the same OpSharding, then there will be a cache hit
  * _cached_compilation cache -- This caches the compilation so that we don't recompile if the shardings are semantically equal.

The way this works is the out_handlers are created again if we pass in different shardings to pjit (but there is no recompilation). This allows us to maintain the shardings passed by the user.

For ops like `jnp.squeeze` where we infer the sharding from the executable, we try to recreate a NamedSharding (right now, more support will be added in following CLs) from the GSPMDSharding since it will be available on the input.

PiperOrigin-RevId: 522991145

											
										
										
											2023-04-09 15:41:32 -07:00
+								  in_shardings = semantics_in_shardings.shardings
 								  out_shardings = semantics_out_shardings.shardings
 								  kept_var_idx = set(kept_var_idx)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  # Will compute out_handler with executable information.
 								  unsafe_call = backend.compile_replicated(
 								      is_trivial=False, name=name, computation=computation,
 								      compile_options=compile_options, host_callbacks=host_callbacks,
 								      has_unordered_effects=has_unordered_effects,
-												Remove indices and devices from shard_arg_handlers and shard_args.

This only affects python dispatch path. This has no impact on the speed of cpp dispatch (which is why benchmarks are **not** regressing).

If your code ends up taking the python dispatch, then something is going wrong anyways.

PiperOrigin-RevId: 596081987

											
										
										
											2024-01-05 14:16:32 -08:00
+								      device_assignment=da, ordered_effects=ordered_effects,
 								      in_avals=global_in_avals,
 								      in_shardings=in_shardings, kept_var_idx=kept_var_idx,
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      out_avals=global_out_avals, out_shardings=out_shardings,
-												Ensure that the sharding specs are always set to match the number of
addressable devices when pmap_nreps is set.

PiperOrigin-RevId: 514537352

											
										
										
											2023-03-06 15:01:48 -08:00
+								      committed=committed, pmap_nreps=pmap_nreps)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  xla_executable = None
-												Redefine `compile_and_serialize` as `serialize(lowered.compile())`.

This has the downside of keeping around the UnloadedMeshComputation,
but it makes the serialize() API easier to understand.

PiperOrigin-RevId: 518715469

											
										
										
											2023-03-22 17:22:39 -07:00
+								  return MeshExecutable(xla_executable, lambda: unsafe_call, global_in_avals,
-												Thread out_avals to MeshExecutable

PiperOrigin-RevId: 612037684

											
										
										
											2024-03-02 13:34:46 -08:00
+								                        global_out_avals, in_shardings, out_shardings,
 								                        auto_spmd_lowering, kept_var_idx,
 								                        (None,) * len(global_in_avals),
-												Remove jaxpr_debug_info from MeshComputation since that information is available via AllArgsInfo

PiperOrigin-RevId: 586018345

											
										
										
											2023-11-28 10:04:29 -08:00
+								                        (None,) * len(global_out_avals))
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								@lru_cache
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								def create_mesh_pspec_sharding(
-												Upgrade most .py sources to 3.9

This commit was generated by running

    pyupgrade --py39-plus --keep-percent-format {jax,tests,jaxlib,examples,benchmarks}/**/*.py

											
										
										
											2023-12-08 12:09:04 +00:00
+								    mesh: Mesh, pspec: PartitionSpec | None, parsed_pspec=None,
 								    memory_kind: str | None = None) -> sharding_impls.NamedSharding:
-												Create a proper NamedSharding without `None` as the pspec. This happens when users pass `None` as the out_shardings/in_shardings and pjit should convert it to a proper PartitionSpec.

PiperOrigin-RevId: 523125287

											
										
										
											2023-04-10 08:42:18 -07:00
+								  if pspec is None:
-												Remove None's from initialization of ParsedPartitionSpec so that we are consistent across jax. This also makes accessing `.user_spec` return the normalized value.

PiperOrigin-RevId: 523155411

											
										
										
											2023-04-10 10:48:26 -07:00
+								    pspec, parsed_pspec = PartitionSpec(), None
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								  return sharding_impls.NamedSharding(mesh, pspec, _parsed_pspec=parsed_pspec,
 								                                      memory_kind=memory_kind)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								def check_device_backend_on_shardings(shardings) -> bool:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  for i in shardings:
-												Split sharding_impls into its own Bazel target.

* Move dependencies of sharding_impls into sharding_impls to avoid creating cyclic dependencies.
* Fix a handful of new pytype errors.

PiperOrigin-RevId: 523146076

											
										
										
											2023-04-10 10:15:08 -07:00
+								    if is_unspecified(i) or is_auto(i):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      continue
 								    if hasattr(i, '_original_sharding') and getattr(
 								        i._original_sharding, '_device_backend', False):
 								      return True
 								  return False
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								def check_gda_or_array_xla_sharding_match(
-												Plumb debug_info to meshExecutable as a optional arg to raise better error messages.

PiperOrigin-RevId: 525521694

											
										
										
											2023-04-19 12:35:15 -07:00
+								    args, in_xla_shardings: Sequence[sharding_impls.XLACompatibleSharding],
-												Apply pyupgrade --py39-plus.

Notable changes:
* use PEP 585 type names
* use PEP 604 type union syntax where `from __future__ import annotations` is present.
* use f-strings in more places.
* remove redundant arguments to open().

											
										
										
											2023-07-21 14:20:39 -04:00
+								    jaxpr_debug_info: core.JaxprDebugInfo | None) -> None:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  from jax._src.array import ArrayImpl
-												Plumb debug_info to meshExecutable as a optional arg to raise better error messages.

PiperOrigin-RevId: 525521694

											
										
										
											2023-04-19 12:35:15 -07:00
+								  arg_names = ([''] * len(args) if jaxpr_debug_info is None else
 								               jaxpr_debug_info.arg_names)
 								  errors = []
 								  num_errors = 5
 								  for arg, xs, name in safe_zip(args, in_xla_shardings, arg_names):
-												Remove GDA from JAX since jax.Array is the default type and cannot be disabled anymore as per https://jax.readthedocs.io/en/latest/jax_array_migration.html#how-can-i-disable-jax-array-for-now

PiperOrigin-RevId: 516905931

											
										
										
											2023-03-15 12:59:33 -07:00
+								    if not isinstance(arg, ArrayImpl):
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								      continue
-												Typecheck avals and sharding for arguments that were DCE'd.

This keeps the promise of AOT that recompilation is guaranteed.

Fixes https://github.com/google/jax/issues/18686

PiperOrigin-RevId: 585855658

											
										
										
											2023-11-27 22:38:46 -08:00
+								    if is_unspecified_or_auto(xs):
 								      continue
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												AOT sharding mismatch error shouldn't have GSPMDSharding in it.

PiperOrigin-RevId: 576668290

											
										
										
											2023-10-25 15:47:17 -07:00
+								    db_xs = check_device_backend_on_shardings([xs])
 								    if not db_xs:
 								      xs = getattr(xs, '_original_sharding', xs)
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								    # Raise memory kind mismatch error even if the arg is uncommitted.
-												Canonicalize to default memory in init of Shardings only on the backends that support memories right now.

PiperOrigin-RevId: 553942534

											
										
										
											2023-08-04 16:26:31 -07:00
+								    if arg.sharding.memory_kind != xs.memory_kind:
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
+								      errors.append(
-												AOT sharding mismatch error shouldn't have GSPMDSharding in it.

PiperOrigin-RevId: 576668290

											
										
										
											2023-10-25 15:47:17 -07:00
+								          "Got input sharding(s) that compiled object was called with: "
 								          f"{arg.sharding} and sharding(s) the computation was compiled "
 								          f"with: {xs} for arg {name} with shape: {arg.aval.str_short()}")
-												[Memories] Add Memories support to jax.jit and jax.device_put!

These are the following changes:

* Add a temporary flag (`JAX_FETCH_MEMORY_KIND_ON_EXECUTABLE`) (should not be used by user but needed in C++ in pjrt-ifrt code) on whether to fetch memory kinds from executable. If it is set to True, the host runtime dep needs to be linked in and should also work in OSS (more work needs to happen for that). So only the test sets it to True for now until jax memories is under development.

* Add with_memory_kind method on Sharding to allow for easier creation of shardings with different memory kind.

* Add lowering rules for device_put and jax.jit.
  * For device_put, we always add the annotation that describes a transfer to a memory and a sharding annotation.
  * For jax.jit, if the argument is on host memory, it will have an extra attribute _xla_buffer_placement.

* Handle the correct output sharding in pxla.py by extracting the memory kind from the executable.

* Handle the caching of pjit caches by canonicalizing the memory_kinds so that `NS(mesh, pspec) == NS(mesh, pspec, memory_kind='tpu_hbm')`. Also canonicalize memory_kind in `__hash__` and `__eq__` of shardings.
  * This is to not change the StableHLO to include device placement annotations right now since the host aware passes are not enabled by default and the work is under progress to make it work everywhere.

PiperOrigin-RevId: 553833344

											
										
										
											2023-08-04 09:43:39 -07:00
-												AOT sharding mismatch error shouldn't have GSPMDSharding in it.

PiperOrigin-RevId: 576668290

											
										
										
											2023-10-25 15:47:17 -07:00
+								    if (not db_xs and arg._committed and
-												Rename jax._src.sharding_utils to jax._src.op_shardings.

Move some more op_sharding related helpers to that module.

PiperOrigin-RevId: 522343010

											
										
										
											2023-04-06 08:31:47 -07:00
+								        not op_shardings.are_op_shardings_equal(
-												Use `_to_xla_hlo_sharding` everywhere in JAX. Remove `_to_xla_op_sharding` in favor of `_to_xla_hlo_sharding` since constructing a C++ class is faster than protos and will help with further changes coming to HloSharding.

PiperOrigin-RevId: 537969500

											
										
										
											2023-06-05 13:40:59 -07:00
+								            arg.sharding._to_xla_hlo_sharding(arg.ndim),
 								            xs._to_xla_hlo_sharding(arg.ndim))):
-												Plumb debug_info to meshExecutable as a optional arg to raise better error messages.

PiperOrigin-RevId: 525521694

											
										
										
											2023-04-19 12:35:15 -07:00
+								      errors.append(
-												AOT sharding mismatch error shouldn't have GSPMDSharding in it.

PiperOrigin-RevId: 576668290

											
										
										
											2023-10-25 15:47:17 -07:00
+								          "Got input sharding(s) that compiled object was called with: "
 								          f"{arg.sharding} and sharding(s) the computation was compiled "
 								          f"with: {xs} for arg {name} with shape: {arg.aval.str_short()}")
-												Plumb debug_info to meshExecutable as a optional arg to raise better error messages.

PiperOrigin-RevId: 525521694

											
										
										
											2023-04-19 12:35:15 -07:00
 								  if errors:
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								    str_errors = '\n'.join(errors[:num_errors])
 								    num_mismatch_str = (
 								        f'the {len(errors)} mismatches' if len(errors) < num_errors else
 								        f"{num_errors} mismatches out of {len(errors)}")
-												Plumb debug_info to meshExecutable as a optional arg to raise better error messages.

PiperOrigin-RevId: 525521694

											
										
										
											2023-04-19 12:35:15 -07:00
+								    raise ValueError(
-												AOT sharding mismatch error shouldn't have GSPMDSharding in it.

PiperOrigin-RevId: 576668290

											
										
										
											2023-10-25 15:47:17 -07:00
+								          "Compiled object called with input sharding(s) does not match the "
 								          "sharding(s) the computation was compiled with. "
-												Add arg_names to aval mismatch error raised during AOT compilation to raise better error messages

PiperOrigin-RevId: 525561905

											
										
										
											2023-04-19 15:08:21 -07:00
+								          f"Here are {num_mismatch_str}:\n{str_errors}")
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								def get_array_mapping(pspec: PartitionSpec) -> ArrayMappingOrAutoOrUnspecified:
-												Split sharding_impls into its own Bazel target.

* Move dependencies of sharding_impls into sharding_impls to avoid creating cyclic dependencies.
* Fix a handful of new pytype errors.

PiperOrigin-RevId: 523146076

											
										
										
											2023-04-10 10:15:08 -07:00
+								  parsed_pspec, _, _ = sharding_impls.prepare_axis_resources(
 								      pspec, "pspec to array_mapping")
-												Prune accidental exports from jax.interpreters.pxla.

These imports do not appear to have users outside JAX itself.

PiperOrigin-RevId: 507835295

											
										
										
											2023-02-07 11:16:01 -08:00
+								  return _get_array_mapping(parsed_pspec)
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								_forbidden_primitives = {
 								  'xla_pmap': 'pmap',
 								}
 								def _sanitize_mesh_jaxpr(jaxpr):
 								  if isinstance(jaxpr, core.ClosedJaxpr):
 								    jaxpr = jaxpr.jaxpr
 								  for eqn in jaxpr.eqns:
 								    if eqn.primitive.name in _forbidden_primitives:
 								      raise RuntimeError(f"Nesting {_forbidden_primitives[eqn.primitive.name]} "
 								                         f"inside xmaps not supported!")
 								    core.traverse_jaxpr_params(_sanitize_mesh_jaxpr, eqn.params)
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								custom_resource_typing_rules: dict[core.Primitive, Callable] = {}
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
 								def resource_typecheck(jaxpr, resource_env, axis_resources, what_jaxpr_thunk):
 								  if isinstance(jaxpr, core.ClosedJaxpr):
 								    jaxpr = jaxpr.jaxpr
 								  def _check_aval(aval, what_thunk):
 								    if not hasattr(aval, 'named_shape'):
 								      return
 								    resource_to_axis = {}
 								    for axis in aval.named_shape:
 								      if axis_resources:
 								        for resource in axis_resources[axis]:
 								          if resource in resource_to_axis:
 								            other_axis = resource_to_axis[resource]
 								            axis, other_axis = sorted([str(axis), str(other_axis)])
 								            raise JAXTypeError(
 								                f"Axes `{axis}` and `{other_axis}` are both mapped to the "
 								                f"resource `{resource}`, but they coincide in the named_shape "
 								                f"of {what_thunk()}")
 								          resource_to_axis[resource] = axis
 								  what_thunk = lambda: (f"an input to {what_jaxpr_thunk()}")
 								  for v in jaxpr.constvars:
 								    _check_aval(v.aval, what_thunk)
 								  for v in jaxpr.invars:
 								    _check_aval(v.aval, what_thunk)
 								  what_thunk = lambda: (f"a value returned from a primitive {eqn.primitive} created "
 								                        f"at {source_info_util.summarize(eqn.source_info)}")
 								  rec_what_jaxpr_thunk = lambda: (f"a primitive {eqn.primitive} created at"
 								                                  f"{source_info_util.summarize(eqn.source_info)}")
 								  for eqn in jaxpr.eqns:
 								    typing_rule = custom_resource_typing_rules.get(eqn.primitive, None)
 								    if typing_rule:
 								      typing_rule([v.aval for v in eqn.invars], eqn.params, eqn.source_info,
 								                  resource_env, axis_resources)
 								    else:
 								      core.traverse_jaxpr_params(partial(resource_typecheck,
 								                                         resource_env=resource_env,
 								                                         axis_resources=axis_resources,
 								                                         what_jaxpr_thunk=rec_what_jaxpr_thunk),
 								                                 eqn.params)
 								    for v in eqn.outvars:
 								      _check_aval(v.aval, what_thunk)
 								@contextmanager
 								def maybe_extend_axis_env(*args, **kwargs):
 								  with core.extend_axis_env(*args, **kwargs):
 								    yield
-												Buffer -> Array in some pxla type annotations.

PiperOrigin-RevId: 520975371

											
										
										
											2023-03-31 11:41:49 -07:00
+								def device_put(x, devices: Sequence[xc.ArrayImpl],
-												Use lower-case PEP 585 names for types.

Issue https://github.com/google/jax/issues/16537

PiperOrigin-RevId: 542969282

											
										
										
											2023-06-23 15:11:37 -07:00
+								               replicate: bool=False) -> list[xc.ArrayImpl]:
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  """Call device_put on a sequence of devices and return a flat sequence of buffers."""
 								  if replicate:
-												[Jax cleanup]

* Remove lower_xla_callable and all related functions
* Remove pxla.device_put
* Remove dispatch.device_put_handlers

PiperOrigin-RevId: 517249345

											
										
										
											2023-03-16 15:46:57 -07:00
+								    return [jax.device_put(x, device) for device in devices]
-												Move jax.interpreters.pxla to jax._src.interpreters.pxla.

Make jax.interpreters.pxla a shim that at the moment re-exports everything in the implementation, with the goal of reducing it over time.

PiperOrigin-RevId: 507584264

											
										
										
											2023-02-06 14:28:36 -08:00
+								  else:
-												[Jax cleanup]

* Remove lower_xla_callable and all related functions
* Remove pxla.device_put
* Remove dispatch.device_put_handlers

PiperOrigin-RevId: 517249345

											
										
										
											2023-03-16 15:46:57 -07:00
+								    return [jax.device_put(val, device) for val, device in safe_zip(x, devices)]