# Copyright 2018 The JAX Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Shims that allow the XLA CPU backend to call scipy-provided LAPACK kernels
# via CustomCallWithLayout.

from collections.abc import Sequence
from enum import Enum
from typing import Optional

import numpy as np

import jaxlib.mlir.ir as ir  # pylint: disable=consider-using-from-import
import jaxlib.mlir.dialects.stablehlo as hlo

from jaxlib import xla_client

from .cpu import _lapack
from .cpu._lapack import eig
from .hlo_helpers import (
    custom_call, hlo_u8, hlo_s32, ensure_hlo_s32, hlo_add,
    DimensionSize, ShapeTypePair, mk_result_types_and_shapes,
)

for _name, _value in _lapack.registrations().items():
  xla_client.register_custom_call_target(
      _name,
      _value,
      platform="cpu",
      api_version=(1 if _name.endswith("_ffi") else 0),
  )


def _char_attr(c):
  return ir.IntegerAttr.get(ir.IntegerType.get_unsigned(8), ord(c))


def _lapack_int_attr(value):
  return ir.IntegerAttr.get(ir.IntegerType.get_signless(32), value)


def _enum_to_char_attr(e: Enum):
  return ir.IntegerAttr.get(ir.IntegerType.get_unsigned(8), e.value)


def _matrix_side_attr(*, left_side: bool):
  return _char_attr("L" if left_side else "R")


def _matrix_uplo_attr(*, lower: bool):
  return _char_attr("L" if lower else "U")


def _matrix_transpose_attr(*, transpose: bool, conjugate: bool):
  return _char_attr(("C" if conjugate else "T") if transpose else "N")


def _matrix_diagonal_attr(*, unit_diag: bool):
  return _char_attr("U" if unit_diag else "N")


def _svd_computation_attr(
    *, compute_uv: bool, full_matrices: Optional[bool] = True
):
  mode = "A"
  if full_matrices is None:
    full_matrices = True
  if not compute_uv:
    # Ideally we would assert that `full_matrices` is False here, since full
    # matrices can only be computed when `compute_uv` is True. However, at this
    # point too many tests rely on the current behavior.
    mode = "N"
  elif not full_matrices:
    mode = "S"
  return _char_attr(mode)


LAPACK_DTYPE_PREFIX = {
    np.float32: "s",
    np.float64: "d",
    np.complex64: "c",
    np.complex128: "z",
}


def prepare_lapack_call(fn_base, dtype):
  """Initializes the LAPACK library and returns the LAPACK target name."""
  _lapack.initialize()
  return build_lapack_fn_target(fn_base, dtype)


def build_lapack_fn_target(fn_base: str, dtype) -> str:
  """Builds the target name for a LAPACK function custom call."""
  try:
    prefix = (
        LAPACK_DTYPE_PREFIX.get(dtype, None)
        or LAPACK_DTYPE_PREFIX[dtype.type]
    )
    return f"lapack_{prefix}{fn_base}"
  except KeyError as err:
    raise NotImplementedError(err, f"Unsupported dtype {dtype}.") from err


# TODO(phawkins): it would be nice to avoid duplicating code for each type.
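
# Example of the resulting target names (illustrative only; these calls are
# not made at import time): the target is "lapack_" + type prefix + base name,
# e.g.
#
#   build_lapack_fn_target("potrf", np.dtype(np.float32))     # "lapack_spotrf"
#   build_lapack_fn_target("gesdd", np.dtype(np.complex128))  # "lapack_zgesdd"
#
# Dtypes outside LAPACK_DTYPE_PREFIX (e.g. float16) raise NotImplementedError.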


# ?trsm(left_side, lower, trans_a, diag, m, n, alpha, a, b): triangular solve
def trsm_hlo(dtype, alpha, a, b,
             left_side=False, lower=False, trans_a=False,
             conj_a=False, diag=False, *,
             b_shape_vals: tuple[DimensionSize, ...]):
  _lapack.initialize()
  b_type = ir.RankedTensorType(b.type)

  m, n = b_shape_vals[-2:]
  batch_dims_vals = b_shape_vals[:-2]
  num_bd = len(batch_dims_vals)
  batch_size_val = hlo_s32(1)
  for b_v in batch_dims_vals:
    batch_size_val = hlo.multiply(batch_size_val, ensure_hlo_s32(b_v))

  if dtype == np.float32:
    fn = "blas_strsm"
  elif dtype == np.float64:
    fn = "blas_dtrsm"
  elif dtype == np.complex64:
    fn = "blas_ctrsm"
  elif dtype == np.complex128:
    fn = "blas_ztrsm"
  else:
    raise NotImplementedError(f"Unsupported dtype {dtype}")

  if conj_a and not trans_a:
    raise NotImplementedError("Conjugation without transposition not supported")
  scalar_layout = []
  layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))
  result_types, result_shapes = mk_result_types_and_shapes(
      [(b_shape_vals, b_type.element_type)])
  return custom_call(
      fn,
      result_types=result_types,
      operands=[hlo_s32(int(left_side)), hlo_s32(int(lower)),
                hlo_s32((2 if conj_a else 1) if trans_a else 0),
                hlo_s32(int(diag)),
                ensure_hlo_s32(m), ensure_hlo_s32(n), batch_size_val,
                alpha, a, b],
      operand_layouts=[scalar_layout] * 8 + [layout] * 2,
      result_layouts=[layout],
      operand_output_aliases={9: 0},
      result_shapes=result_shapes,
  ).results


# ?potrf: Cholesky decomposition
def potrf_hlo(ctx, dtype, a: ir.Value, *, lower=False,
              a_shape_vals: tuple[DimensionSize, ...]):
  a_type = ir.RankedTensorType(a.type)
  fn_base = prepare_lapack_call(fn_base="potrf", dtype=dtype)
  batch_dims_vals = a_shape_vals[:-2]
  num_bd = len(batch_dims_vals)
  layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))
  info_layout = tuple(range(num_bd - 1, -1, -1))

  shape_type_pairs: Sequence[ShapeTypePair] = [
      (a_shape_vals, a_type.element_type),
      (batch_dims_vals, ir.IntegerType.get_signless(32)),
  ]
  result_types, result_shapes = mk_result_types_and_shapes(shape_type_pairs)
  if ctx.is_forward_compat():
    fn = fn_base
    scalar_layout = []
    n = a_shape_vals[-1]
    batch_size_val = hlo_s32(1)
    for b_v in batch_dims_vals:
      batch_size_val = hlo.multiply(batch_size_val, ensure_hlo_s32(b_v))
    out = custom_call(
        fn,
        result_types=result_types,
        operands=[hlo_s32(int(lower)), batch_size_val, ensure_hlo_s32(n), a],
        operand_layouts=[scalar_layout] * 3 + [layout],
        result_layouts=[layout, info_layout],
        operand_output_aliases={3: 0},
        result_shapes=result_shapes,
    ).results
  else:
    fn = fn_base + "_ffi"
    out = custom_call(
        fn,
        result_types=result_types,
        operands=[a],
        operand_layouts=[layout],
        result_layouts=[layout, info_layout],
        operand_output_aliases={0: 0},
        result_shapes=result_shapes,
        backend_config={
            "uplo": _matrix_uplo_attr(lower=lower),
        },
        api_version=4,
    ).results
  return out[:2]


# ?gesdd: Singular value decomposition
def gesdd_hlo(ctx, dtype, a: ir.Value, *, full_matrices=True, compute_uv=True,
              a_shape_vals: tuple[DimensionSize, ...]):
  a_type = ir.RankedTensorType(a.type)
  assert len(a_shape_vals) >= 2
  m, n = a_shape_vals[-2:]
  assert type(m) is int
  assert type(n) is int
  batch_dims_vals = a_shape_vals[:-2]
  num_bd = len(batch_dims_vals)
  fn_base = prepare_lapack_call(fn_base="gesdd", dtype=dtype)

  i32_type = ir.IntegerType.get_signless(32)
  workspace: list[ShapeTypePair]

  # TODO(b/344892332): Remove the old kernel after the compatibility period.
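  # Both branches below compute the same logical results and differ only in how
  # the kernel is invoked. The forward-compatibility branch emits the legacy
  # custom call: sizes, flags, and LAPACK scratch workspaces are passed as
  # explicit operands/results, using the work-size helpers from _lapack. The
  # other branch emits the "_ffi" variant, which takes only the input array and
  # carries its options (the SVD "mode" character) in backend_config.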
  if ctx.is_forward_compat():
    fn = fn_base
    batch_size_val = hlo_s32(1)
    for b_v in batch_dims_vals:
      batch_size_val = hlo.multiply(batch_size_val, ensure_hlo_s32(b_v))
    if dtype == np.float32:
      singular_vals_type = ir.F32Type.get()
      lwork = _lapack.sgesdd_work_size(m, n, compute_uv, full_matrices)
      workspace = [
          ([_lapack.gesdd_iwork_size(m, n)], i32_type),
          ([lwork], a_type.element_type),
      ]
      workspace_layouts = [[0], [0]]
    elif dtype == np.float64:
      singular_vals_type = ir.F64Type.get()
      lwork = _lapack.dgesdd_work_size(m, n, compute_uv, full_matrices)
      workspace = [
          ([_lapack.gesdd_iwork_size(m, n)], i32_type),
          ([lwork], a_type.element_type),
      ]
      workspace_layouts = [[0], [0]]
    elif dtype == np.complex64:
      singular_vals_type = ir.F32Type.get()
      lwork = _lapack.cgesdd_work_size(m, n, compute_uv, full_matrices)
      workspace = [
          ([_lapack.gesdd_iwork_size(m, n)], i32_type),
          ([_lapack.cgesdd_rwork_size(m, n, int(compute_uv))], ir.F32Type.get()),
          ([lwork], a_type.element_type),
      ]
      workspace_layouts = [[0], [0], [0]]
    elif dtype == np.complex128:
      singular_vals_type = ir.F64Type.get()
      lwork = _lapack.zgesdd_work_size(m, n, compute_uv, full_matrices)
      workspace = [
          ([_lapack.gesdd_iwork_size(m, n)], i32_type),
          ([_lapack.cgesdd_rwork_size(m, n, int(compute_uv))], ir.F64Type.get()),
          ([lwork], a_type.element_type),
      ]
      workspace_layouts = [[0], [0], [0]]
    else:
      raise NotImplementedError(f"Unsupported dtype {dtype}")

    scalar_layout = []
    layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))
    shape_type_pairs: Sequence[ShapeTypePair] = [
        (a_shape_vals, a_type.element_type),
        (batch_dims_vals + (min(m, n),), singular_vals_type),
        (batch_dims_vals + (m, m if full_matrices else min(m, n)),
         a_type.element_type),
        (batch_dims_vals + (n if full_matrices else min(m, n), n),
         a_type.element_type),
        (batch_dims_vals, i32_type),
    ] + workspace
    result_types, result_shapes = mk_result_types_and_shapes(shape_type_pairs)
    return custom_call(
        fn,
        result_types=result_types,
        operands=[hlo_s32(int(full_matrices)), hlo_s32(int(compute_uv)),
                  batch_size_val, hlo_s32(m), hlo_s32(n), hlo_s32(lwork), a],
        operand_layouts=[scalar_layout] * 6 + [layout],
        result_layouts=[
            layout,
            (num_bd,) + tuple(range(num_bd - 1, -1, -1)),
            layout,
            layout,
            tuple(range(num_bd - 1, -1, -1)),
        ] + workspace_layouts,
        operand_output_aliases={6: 0},
        result_shapes=result_shapes,
    ).results[1:5]

  fn = fn_base + "_ffi"
  mode_attr = _svd_computation_attr(
      compute_uv=compute_uv, full_matrices=full_matrices
  )
  if dtype == np.float32 or dtype == np.complex64:
    singular_vals_type = ir.F32Type.get()
  elif dtype == np.float64 or dtype == np.complex128:
    singular_vals_type = ir.F64Type.get()
  else:
    raise NotImplementedError(f"Unsupported dtype {dtype}")

  layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))
  a_elem_type = a_type.element_type
  shape_type_pairs: Sequence[ShapeTypePair] = [
      (a_shape_vals, a_elem_type),
      (batch_dims_vals + (min(m, n),), singular_vals_type),
      (batch_dims_vals + (m, m if full_matrices else min(m, n)), a_elem_type),
      (batch_dims_vals + (n if full_matrices else min(m, n), n), a_elem_type),
      (batch_dims_vals, i32_type),
  ]
  result_types, result_shapes = mk_result_types_and_shapes(shape_type_pairs)
  return custom_call(
      fn,
      result_types=result_types,
      operands=[a],
      operand_layouts=[layout],
      result_layouts=[
          layout,
          (num_bd,) + tuple(range(num_bd - 1, -1, -1)),
          layout,
          layout,
          tuple(range(num_bd - 1, -1, -1)),
      ],
      operand_output_aliases={0: 0},
      result_shapes=result_shapes,
      backend_config={
          "mode": mode_attr,
      },
      api_version=4,
  ).results[1:]
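

# A note on the gesdd result slicing above, with an illustrative shape example
# (the concrete numbers are hypothetical, not taken from a real call): result
# slot 0 aliases the input operand and is dropped, so the caller receives
# (s, u, vt, info); the legacy branch additionally drops its workspace outputs
# via the [1:5] slice. For an unbatched `a` of shape (5, 3):
#
#   full_matrices=True  -> s: (3,), u: (5, 5), vt: (3, 3)
#   full_matrices=False -> s: (3,), u: (5, 3), vt: (3, 3)
#
# info is one int32 per batch element.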


# geev: Nonsymmetric eigendecomposition (eig)
def geev_hlo(ctx, dtype, input, *,
             input_shape_vals: tuple[DimensionSize, ...],  # input.shape as ir.Values
             jobvl=True, jobvr=True):
  # input_shape_vals are used when the input has dynamic shapes.
  _lapack.initialize()
  input_shape = ir.RankedTensorType(input.type).shape
  assert len(input_shape) >= 2
  n = input_shape_vals[-1]
  batch_dims_vals = input_shape_vals[:-2]
  num_bd = len(batch_dims_vals)

  layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))

  compute_left = (
      eig.ComputationMode.kComputeEigenvectors
      if jobvl
      else eig.ComputationMode.kNoEigenvectors
  )
  compute_right = (
      eig.ComputationMode.kComputeEigenvectors
      if jobvr
      else eig.ComputationMode.kNoEigenvectors
  )
  fn_base = build_lapack_fn_target(fn_base="geev", dtype=dtype)

  i32_type = ir.IntegerType.get_signless(32)
  f32_type = ir.F32Type.get()
  f64_type = ir.F64Type.get()
  c64_type = ir.ComplexType.get(ir.F32Type.get())
  c128_type = ir.ComplexType.get(ir.F64Type.get())

  if ctx.is_forward_compat():
    fn = fn_base
    workspaces: list[ShapeTypePair]
    eigvals: list[ShapeTypePair]
    if dtype == np.float32:
      real = True
      eigvecs_type = c64_type
      workspaces = [([n, n], f32_type)] * 3
      workspace_layouts = [[0, 1]] * 3
      eigvals = [(batch_dims_vals + (n,), f32_type)] * 2
      eigvals_layouts = [tuple(range(num_bd, -1, -1))] * 2
    elif dtype == np.float64:
      real = True
      eigvecs_type = c128_type
      workspaces = [([n, n], f64_type)] * 3
      workspace_layouts = [[0, 1]] * 3
      eigvals = [(batch_dims_vals + (n,), f64_type)] * 2
      eigvals_layouts = [tuple(range(num_bd, -1, -1))] * 2
    elif dtype == np.complex64:
      real = False
      eigvecs_type = c64_type
      workspaces = [([n, n], c64_type), ([hlo_add(n, n)], f32_type)]
      workspace_layouts = [[0, 1], [0]]
      eigvals = [(batch_dims_vals + (n,), c64_type)]
      eigvals_layouts = [tuple(range(num_bd, -1, -1))]
    elif dtype == np.complex128:
      real = False
      eigvecs_type = c128_type
      workspaces = [([n, n], c128_type), ([hlo_add(n, n)], f64_type)]
      workspace_layouts = [[0, 1], [0]]
      eigvals = [(batch_dims_vals + (n,), c128_type)]
      eigvals_layouts = [tuple(range(num_bd, -1, -1))]
    else:
      raise NotImplementedError(f"Unsupported dtype {dtype}")

    scalar_layout = []
    info_layout = tuple(range(num_bd - 1, -1, -1))

    batch_size_val = hlo_s32(1)
    for b_v in batch_dims_vals:
      batch_size_val = hlo.multiply(batch_size_val, ensure_hlo_s32(b_v))

    shape_type_pairs: Sequence[ShapeTypePair] = workspaces + eigvals + [
        (input_shape_vals, eigvecs_type),
        (input_shape_vals, eigvecs_type),
        (batch_dims_vals, i32_type)]
    result_types, result_shapes = mk_result_types_and_shapes(shape_type_pairs)
    out = custom_call(
        fn,
        result_types=result_types,
        operands=[batch_size_val, ensure_hlo_s32(n),
                  hlo_u8(compute_left.value),
                  hlo_u8(compute_right.value),
                  input],
        operand_layouts=[scalar_layout] * 4 + [layout],
        result_layouts=(workspace_layouts + eigvals_layouts + [layout] * 2 +
                        [info_layout]),
        result_shapes=result_shapes,
    ).results
    if real:
      return (hlo.complex(out[3], out[4]), out[5], out[6], out[7])
    else:
      return out[2:6]

  fn = fn_base + "_ffi"
  real = dtype == np.float32 or dtype == np.float64
  eigvecs_type = (
      c64_type if dtype == np.float32 or dtype == np.complex64 else c128_type
  )
  input_type = ir.RankedTensorType(input.type)
  eigvals = [(batch_dims_vals + (n,), input_type.element_type)]
  eigvals_layouts = [tuple(range(num_bd, -1, -1))]
  if real:
    eigvals = eigvals * 2
    eigvals_layouts = eigvals_layouts * 2

  info_layout = tuple(range(num_bd - 1, -1, -1))

  shape_type_pairs: Sequence[ShapeTypePair] = [
      *eigvals,
      (input_shape_vals, eigvecs_type),
      (input_shape_vals, eigvecs_type),
      (batch_dims_vals, i32_type),
  ]
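  # Result order for the FFI kernel: for real input dtypes the eigenvalues
  # arrive as two real arrays (real and imaginary parts) that are recombined
  # with hlo.complex below; for complex dtypes they arrive as one complex
  # array. These are followed by the two eigenvector arrays (left and right,
  # in LAPACK's VL/VR order) and the per-batch LAPACK info value.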
  result_types, result_shapes = mk_result_types_and_shapes(shape_type_pairs)
  out = custom_call(
      fn,
      result_types=result_types,
      operands=[input],
      operand_layouts=[layout],
      result_layouts=(
          *eigvals_layouts,
          layout,
          layout,
          info_layout,
      ),
      result_shapes=result_shapes,
      backend_config={
          "compute_left": _enum_to_char_attr(compute_left),
          "compute_right": _enum_to_char_attr(compute_right),
      },
      api_version=4,
  ).results
  if real:
    return (hlo.complex(out[0], out[1]), out[2], out[3], out[4])
  else:
    return out[:4]


# gees: Schur factorization
def gees_hlo(dtype, a, *, jobvs=True, sort=False, select=None,
             a_shape_vals: tuple[DimensionSize, ...]):
  _lapack.initialize()
  a_type = ir.RankedTensorType(a.type)
  etype = a_type.element_type
  assert len(a_shape_vals) >= 2
  n = a_shape_vals[-1]
  batch_dims_vals = a_shape_vals[:-2]
  num_bd = len(batch_dims_vals)
  layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))

  if sort:
    raise NotImplementedError(
        "The sort feature of LAPACK's gees routine is not implemented.")

  jobvs = ord('V' if jobvs else 'N')
  sort = ord('S' if sort else 'N')

  if dtype == np.float32:
    fn = "lapack_sgees"
  elif dtype == np.float64:
    fn = "lapack_dgees"
  elif dtype == np.complex64:
    fn = "lapack_cgees"
  elif dtype == np.complex128:
    fn = "lapack_zgees"
  else:
    raise NotImplementedError(f"Unsupported dtype {dtype}")

  workspaces: list[ShapeTypePair]
  eigvals: list[ShapeTypePair]
  if not np.issubdtype(dtype, np.complexfloating):
    workspaces = [(a_shape_vals, etype)]
    workspace_layouts = [layout]
    eigvals = [(batch_dims_vals + (n,), etype)] * 2
    eigvals_layouts = [tuple(range(num_bd, -1, -1))] * 2
  else:
    workspaces = [(a_shape_vals, etype),
                  ([n], ir.ComplexType(etype).element_type),
                  ]
    workspace_layouts = [layout, [0]]
    eigvals = [(batch_dims_vals + (n,), etype)]
    eigvals_layouts = [tuple(range(num_bd, -1, -1))]

  i32_type = ir.IntegerType.get_signless(32)

  scalar_layout = []
  batch_size_val = hlo_s32(1)
  for b_v in batch_dims_vals:
    batch_size_val = hlo.multiply(batch_size_val, ensure_hlo_s32(b_v))
  shape_type_pairs = workspaces + eigvals + [
      (a_shape_vals, etype),
      (batch_dims_vals, i32_type),
      (batch_dims_vals, i32_type)]
  result_types, result_shapes = mk_result_types_and_shapes(shape_type_pairs)
  out = custom_call(
      fn,
      result_types=result_types,
      operands=[
          batch_size_val,
          ensure_hlo_s32(n),
          hlo_u8(jobvs),
          hlo_u8(sort),
          # TODO: figure out how to put the callable select function here
          a
      ],
      operand_layouts=[scalar_layout] * 4 + [layout],
      result_layouts=workspace_layouts + eigvals_layouts + [
          layout,
          tuple(range(num_bd - 1, -1, -1)),
          tuple(range(num_bd - 1, -1, -1)),
      ],
      operand_output_aliases={4: 0},
      result_shapes=result_shapes,
  ).results
  if sort == ord('S'):
    return (out[0], out[3], out[4], out[5])
  else:
    return (out[0], out[3], out[5])
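

# Reading the gees results above: result 0 aliases the input and holds the
# Schur form T, result 3 holds the Schur vectors, and the last result is the
# per-batch LAPACK info. The eigenvalue outputs (wr/wi for real dtypes, w for
# complex ones) are materialized but not returned, and since sort=True is
# rejected earlier, the sorted branch that would also return sdim is currently
# unreachable.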


# gehrd: Reduction of a non-symmetric square matrix to upper Hessenberg form.
def gehrd_hlo(ctx, dtype, a):
  fn_base = prepare_lapack_call(fn_base="gehrd", dtype=dtype)
  a_type = ir.RankedTensorType(a.type)
  dims = a_type.shape
  assert len(dims) >= 2
  m, n = dims[-2:]
  assert m == n, (m, n)
  batch_dims = tuple(dims[:-2])
  num_bd = len(batch_dims)

  if ctx.is_forward_compat():
    fn = fn_base
    b = 1
    for d in batch_dims:
      b *= d

    if dtype == np.float32:
      lwork = _lapack.lapack_sgehrd_workspace(n, n, 1, n)
    elif dtype == np.float64:
      lwork = _lapack.lapack_dgehrd_workspace(n, n, 1, n)
    elif dtype == np.complex64:
      lwork = _lapack.lapack_cgehrd_workspace(n, n, 1, n)
    elif dtype == np.complex128:
      lwork = _lapack.lapack_zgehrd_workspace(n, n, 1, n)
    else:
      raise NotImplementedError(f"Unsupported dtype {dtype}")

    layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))
    i32_type = ir.IntegerType.get_signless(32)
    return custom_call(
        fn,
        result_types=[
            a.type,
            ir.RankedTensorType.get(batch_dims + (n - 1,), a_type.element_type),
            ir.RankedTensorType.get(batch_dims, i32_type),
            ir.RankedTensorType.get([lwork], a_type.element_type),
        ],
        operands=[hlo_s32(n), hlo_s32(1), hlo_s32(n), hlo_s32(n), hlo_s32(b),
                  hlo_s32(lwork), a],
        operand_layouts=[[]] * 6 + [layout],
        result_layouts=[
            layout,
            (num_bd,) + tuple(range(num_bd - 1, -1, -1)),
            tuple(range(num_bd - 1, -1, -1)),
            [0],
        ],
        operand_output_aliases={6: 0},
    ).results[:3]

  fn = fn_base + "_ffi"
  layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))
  i32_type = ir.IntegerType.get_signless(32)
  return custom_call(
      fn,
      result_types=[
          a.type,
          ir.RankedTensorType.get(batch_dims + (n - 1,), a_type.element_type),
          ir.RankedTensorType.get(batch_dims, i32_type),
      ],
      operands=[a],
      operand_layouts=[layout],
      result_layouts=[
          layout,
          (num_bd,) + tuple(range(num_bd - 1, -1, -1)),
          tuple(range(num_bd - 1, -1, -1)),
      ],
      operand_output_aliases={0: 0},
      backend_config={
          "low": _lapack_int_attr(1),
          "high": _lapack_int_attr(n),
      },
      api_version=4,
  ).results
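

# The gehrd results are (a, tau, info): a is overwritten with the upper
# Hessenberg form (with the elementary reflectors stored below the first
# subdiagonal, following LAPACK's ?gehrd convention), tau holds the n - 1
# reflector scalars, and info is one int32 per batch element. The legacy path
# also allocates an lwork-sized scratch buffer, which the [:3] slice drops.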


# sytrd: Reduction of a symmetric (Hermitian) matrix to tridiagonal form.
def sytrd_hlo(dtype, a, *, lower):
  _lapack.initialize()
  a_type = ir.RankedTensorType(a.type)
  dims = a_type.shape
  assert len(dims) >= 2
  m, n = dims[-2:]
  assert m == n, (m, n)
  batch_dims = tuple(dims[:-2])
  num_bd = len(batch_dims)
  b = 1
  for d in batch_dims:
    b *= d

  if dtype == np.float32:
    fn = "lapack_ssytrd"
    lwork = _lapack.lapack_ssytrd_workspace(n, n)
    diag_type = a_type.element_type
  elif dtype == np.float64:
    fn = "lapack_dsytrd"
    lwork = _lapack.lapack_dsytrd_workspace(n, n)
    diag_type = a_type.element_type
  elif dtype == np.complex64:
    fn = "lapack_chetrd"
    lwork = _lapack.lapack_chetrd_workspace(n, n)
    diag_type = ir.F32Type.get()
  elif dtype == np.complex128:
    fn = "lapack_zhetrd"
    lwork = _lapack.lapack_zhetrd_workspace(n, n)
    diag_type = ir.F64Type.get()
  else:
    raise NotImplementedError(f"Unsupported dtype {dtype}")

  layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))
  i32_type = ir.IntegerType.get_signless(32)
  out = custom_call(
      fn,
      result_types=[
          a.type,
          ir.RankedTensorType.get(batch_dims + (n,), diag_type),
          ir.RankedTensorType.get(batch_dims + (n - 1,), diag_type),
          ir.RankedTensorType.get(batch_dims + (n - 1,), a_type.element_type),
          ir.RankedTensorType.get(batch_dims, i32_type),
          ir.RankedTensorType.get([lwork], a_type.element_type),
      ],
      operands=[hlo_s32(n), hlo_s32(1 if lower else 0), hlo_s32(max(1, n)),
                hlo_s32(b), hlo_s32(lwork), a],
      operand_layouts=[[]] * 5 + [layout],
      result_layouts=[
          layout,
          (num_bd,) + tuple(range(num_bd - 1, -1, -1)),
          (num_bd,) + tuple(range(num_bd - 1, -1, -1)),
          (num_bd,) + tuple(range(num_bd - 1, -1, -1)),
          tuple(range(num_bd - 1, -1, -1)),
          [0],
      ],
      operand_output_aliases={5: 0},
  ).results
  return out[:5]
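

# The sytrd/hetrd results are (a, d, e, tau, info): d (length n) and e
# (length n - 1) are the diagonal and off-diagonal of the tridiagonal form,
# tau holds the n - 1 reflector scalars, and info is one int32 per batch
# element. For complex inputs d and e are real (hence the diag_type selection
# above) while tau keeps the input element type. The trailing lwork-sized
# workspace result is dropped by the out[:5] slice.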