mirror of
https://github.com/ROCm/jax.git
synced 2025-04-15 19:36:06 +00:00

The unbatched Jacobi solver is faster for small-moderate matrices, and the unbatched kernel doesn't have size restrictions. Timings on T4 GPU: Before: ------------------------------------------------------------ Benchmark Time CPU Iterations ------------------------------------------------------------ svd/m:1/n:1 263587 ns 242274 ns 2780 svd/m:2/n:1 335561 ns 298238 ns 2303 svd/m:5/n:1 337784 ns 299841 ns 2304 svd/m:10/n:1 339184 ns 300703 ns 2311 svd/m:100/n:1 359826 ns 320088 ns 2159 svd/m:500/n:1 376124 ns 338660 ns 2076 svd/m:800/n:1 375779 ns 335590 ns 2060 svd/m:1000/n:1 419171 ns 341487 ns 2072 svd/m:1/n:2 307564 ns 270663 ns 2544 svd/m:2/n:2 320928 ns 283601 ns 2487 svd/m:5/n:2 377373 ns 344228 ns 2035 svd/m:10/n:2 380557 ns 349412 ns 1953 svd/m:100/n:2 435465 ns 403496 ns 1722 svd/m:500/n:2 444610 ns 410913 ns 1680 svd/m:800/n:2 454493 ns 416495 ns 1665 svd/m:1000/n:2 492110 ns 420539 ns 1665 svd/m:1/n:5 307316 ns 275833 ns 2531 svd/m:2/n:5 374318 ns 341432 ns 2086 svd/m:5/n:5 512928 ns 470293 ns 1361 svd/m:10/n:5 589330 ns 537070 ns 1353 svd/m:100/n:5 620164 ns 580166 ns 1193 svd/m:500/n:5 636424 ns 593692 ns 1180 svd/m:800/n:5 635545 ns 595016 ns 1181 svd/m:1000/n:5 672443 ns 597387 ns 1115 svd/m:1/n:10 310013 ns 273998 ns 2520 svd/m:2/n:10 370451 ns 334489 ns 2105 svd/m:5/n:10 560037 ns 522223 ns 1274 svd/m:10/n:10 572868 ns 535388 ns 1304 svd/m:100/n:10 959802 ns 918258 ns 765 svd/m:500/n:10 955958 ns 909778 ns 758 svd/m:800/n:10 924104 ns 879512 ns 777 svd/m:1000/n:10 950140 ns 883493 ns 775 svd/m:1/n:100 351237 ns 315554 ns 2198 svd/m:2/n:100 426883 ns 390089 ns 1792 svd/m:5/n:100 601557 ns 564493 ns 1255 svd/m:10/n:100 920819 ns 880011 ns 787 svd/m:100/n:100 7902281 ns 7229220 ns 95 svd/m:500/n:100 9720727 ns 9040679 ns 79 svd/m:800/n:100 9856378 ns 8998050 ns 79 svd/m:1000/n:100 9721017 ns 9086414 ns 79 svd/m:1/n:500 371171 ns 334217 ns 2117 svd/m:2/n:500 449165 ns 411499 ns 1700 svd/m:5/n:500 620354 ns 581866 ns 1185 svd/m:10/n:500 892375 ns 
847239 ns 833 svd/m:100/n:500 9564810 ns 8867540 ns 79 svd/m:500/n:500 111924035 ns 104078023 ns 7 svd/m:800/n:500 147777319 ns 142730412 ns 5 svd/m:1000/n:500 154205084 ns 149740209 ns 5 svd/m:1/n:800 372122 ns 334212 ns 2119 svd/m:2/n:800 456672 ns 419260 ns 1680 svd/m:5/n:800 691208 ns 626003 ns 1190 svd/m:10/n:800 1017694 ns 941480 ns 730 svd/m:100/n:800 9892683 ns 9091043 ns 76 svd/m:500/n:800 144134235 ns 139129722 ns 5 svd/m:800/n:800 342790246 ns 333299774 ns 2 svd/m:1000/n:800 432820082 ns 427978978 ns 2 svd/m:1/n:1000 372785 ns 335745 ns 1805 svd/m:2/n:1000 451946 ns 413341 ns 1668 svd/m:5/n:1000 618475 ns 577213 ns 1169 svd/m:10/n:1000 907729 ns 863335 ns 808 svd/m:100/n:1000 9868543 ns 9116870 ns 76 svd/m:500/n:1000 156777811 ns 152042065 ns 5 svd/m:800/n:1000 429704070 ns 424677592 ns 2 svd/m:1000/n:1000 654864311 ns 642693162 ns 1 After: ------------------------------------------------------------ Benchmark Time CPU Iterations ------------------------------------------------------------ svd/m:1/n:1 265980 ns 245433 ns 2791 svd/m:2/n:1 340203 ns 302783 ns 2288 svd/m:5/n:1 337807 ns 301916 ns 2286 svd/m:10/n:1 338064 ns 302441 ns 2297 svd/m:100/n:1 335444 ns 298440 ns 2327 svd/m:500/n:1 338025 ns 302096 ns 2272 svd/m:800/n:1 328382 ns 291740 ns 2252 svd/m:1000/n:1 397494 ns 310905 ns 2239 svd/m:1/n:2 310464 ns 274507 ns 2535 svd/m:2/n:2 319999 ns 284247 ns 2515 svd/m:5/n:2 373435 ns 335919 ns 2069 svd/m:10/n:2 376327 ns 339327 ns 2056 svd/m:100/n:2 385061 ns 349258 ns 2003 svd/m:500/n:2 392352 ns 355735 ns 1932 svd/m:800/n:2 410736 ns 370677 ns 1881 svd/m:1000/n:2 494326 ns 405603 ns 1721 svd/m:1/n:5 316735 ns 277292 ns 2538 svd/m:2/n:5 383748 ns 342218 ns 2077 svd/m:5/n:5 494204 ns 454309 ns 1476 svd/m:10/n:5 547017 ns 508184 ns 1371 svd/m:100/n:5 514537 ns 476761 ns 1460 svd/m:500/n:5 544656 ns 504877 ns 1381 svd/m:800/n:5 642590 ns 599314 ns 1159 svd/m:1000/n:5 706166 ns 621209 ns 1106 svd/m:1/n:10 310825 ns 274374 ns 2511 svd/m:2/n:10 381316 ns 
344202 ns 2094 svd/m:5/n:10 565469 ns 526759 ns 1266 svd/m:10/n:10 576111 ns 537286 ns 1299 svd/m:100/n:10 653250 ns 613392 ns 1137 svd/m:500/n:10 690532 ns 645828 ns 1080 svd/m:800/n:10 763924 ns 723677 ns 959 svd/m:1000/n:10 940342 ns 855517 ns 818 svd/m:1/n:100 306134 ns 271533 ns 2526 svd/m:2/n:100 374680 ns 339298 ns 2071 svd/m:5/n:100 576926 ns 539062 ns 1228 svd/m:10/n:100 656806 ns 615171 ns 1123 svd/m:100/n:100 3295164 ns 3138621 ns 223 svd/m:500/n:100 4269347 ns 4166000 ns 168 svd/m:800/n:100 4656541 ns 4522247 ns 154 svd/m:1000/n:100 6479223 ns 6354578 ns 112 svd/m:1/n:500 329966 ns 289083 ns 2440 svd/m:2/n:500 407535 ns 366794 ns 1947 svd/m:5/n:500 567367 ns 522809 ns 1336 svd/m:10/n:500 712307 ns 657608 ns 1065 svd/m:100/n:500 4262986 ns 4169907 ns 167 svd/m:500/n:500 28824720 ns 28650258 ns 25 svd/m:800/n:500 29330139 ns 28677269 ns 25 svd/m:1000/n:500 30848037 ns 30089216 ns 23 svd/m:1/n:800 328620 ns 289181 ns 2329 svd/m:2/n:800 419052 ns 379483 ns 1876 svd/m:5/n:800 587366 ns 546979 ns 1269 svd/m:10/n:800 830762 ns 787923 ns 893 svd/m:100/n:800 4763633 ns 4595738 ns 152 svd/m:500/n:800 30447861 ns 29949714 ns 24 svd/m:800/n:800 94188958 ns 93488372 ns 8 svd/m:1000/n:800 94701529 ns 93394677 ns 7 svd/m:1/n:1000 351102 ns 313099 ns 2218 svd/m:2/n:1000 446543 ns 407807 ns 1708 svd/m:5/n:1000 661152 ns 616174 ns 1129 svd/m:10/n:1000 915743 ns 873397 ns 802 svd/m:100/n:1000 6434730 ns 6282779 ns 113 svd/m:500/n:1000 30244321 ns 29684290 ns 24 svd/m:800/n:1000 92727423 ns 91477078 ns 8 svd/m:1000/n:1000 169500709 ns 168358420 ns 4 PiperOrigin-RevId: 582041508
556 lines
19 KiB
Python
556 lines
19 KiB
Python
# Copyright 2019 The JAX Authors.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# https://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
from collections.abc import Sequence
|
|
from functools import partial
|
|
import importlib
|
|
import math
|
|
|
|
import jaxlib.mlir.ir as ir
|
|
import jaxlib.mlir.dialects.stablehlo as hlo
|
|
|
|
import numpy as np
|
|
|
|
from .gpu_common_utils import GpuLibNotLinkedError
|
|
|
|
from jaxlib import xla_client
|
|
|
|
from .hlo_helpers import (
|
|
DimensionSize, ShapeTypePair, mk_result_types_and_shapes,
|
|
custom_call, ensure_hlo_s32, hlo_s32)
|
|
|
|
# Locate the cuBLAS kernel module: first the copy bundled under `.cuda`, then
# each CUDA plugin package in the order listed. `_cublas` ends up as None when
# none of them can be imported.
try:
  from .cuda import _blas as _cublas  # pytype: disable=import-error
except ImportError:
  _cublas = None
  for cuda_module_name in ("jax_cuda12_plugin", "jax_cuda11_plugin"):
    try:
      _cublas = importlib.import_module(cuda_module_name + "._blas")
    except ImportError:
      continue
    # First plugin that imports successfully wins.
    break

# Expose every kernel the module advertises as a CUDA custom-call target.
if _cublas is not None:
  for _name, _value in _cublas.registrations().items():
    xla_client.register_custom_call_target(_name, _value, platform="CUDA")
|
|
|
|
# Locate the cuSOLVER kernel module, trying the bundled `.cuda` package
# (resolved relative to `jaxlib`) before the CUDA plugin packages. `_cusolver`
# stays None when every candidate fails to import.
_cusolver = None
for cuda_module_name in (".cuda", "jax_cuda12_plugin", "jax_cuda11_plugin"):
  try:
    _cusolver = importlib.import_module(
        cuda_module_name + "._solver", package="jaxlib")
  except ImportError:
    continue
  # First candidate that imports successfully wins.
  break

# Expose every kernel the module advertises as a CUDA custom-call target.
if _cusolver is not None:
  for _name, _value in _cusolver.registrations().items():
    xla_client.register_custom_call_target(_name, _value, platform="CUDA")
|
|
|
|
|
|
# Import the ROCm BLAS kernels if they were built into this jaxlib. Only the
# import itself is guarded: an error raised while registering the kernels must
# surface instead of being mistaken for "module not available".
try:
  from .rocm import _blas as _hipblas  # pytype: disable=import-error
except ImportError:
  _hipblas = None
else:
  for _name, _value in _hipblas.registrations().items():
    xla_client.register_custom_call_target(_name, _value, platform="ROCM")
|
|
|
|
# Import the ROCm solver kernels if they were built into this jaxlib. Only the
# import itself is guarded: an error raised while registering the kernels must
# surface instead of being mistaken for "module not available".
try:
  from .rocm import _solver as _hipsolver  # pytype: disable=import-error
except ImportError:
  _hipsolver = None
else:
  for _name, _value in _hipsolver.registrations().items():
    xla_client.register_custom_call_target(_name, _value, platform="ROCM")
|
|
|
|
|
|
def _real_type(dtype):
|
|
"""Returns the real equivalent of 'dtype'."""
|
|
return np.finfo(dtype).dtype
|
|
|
|
|
|
def _getrf_hlo(platform, gpu_blas, gpu_solver, dtype, a):
  """LU decomposition.

  Emits a custom call to either the batched BLAS getrf kernel or the solver
  getrf kernel, chosen from the static shape of `a`.

  Args:
    platform: kernel-name prefix ("cu" or "hip" in this module).
    gpu_blas: extension module providing `build_getrf_batched_descriptor`, or
      None when the BLAS library is not linked.
    gpu_solver: extension module providing `build_getrf_descriptor`, or None
      when the solver library is not linked.
    dtype: element dtype; normalized with `np.dtype` for the descriptor.
    a: IR value whose ranked tensor type has shape `batch_dims + (m, n)`.

  Returns:
    The first three custom-call results: a tensor of the same type as `a`
    (aliased to the operand), an i32 tensor of shape
    `batch_dims + (min(m, n),)` (presumably the pivots) and an i32 tensor of
    shape `batch_dims` (presumably the per-matrix status). The trailing
    workspace result is dropped.

  Raises:
    GpuLibNotLinkedError: if `gpu_blas` is missing, or if the solver kernel is
      selected and `gpu_solver` is missing.
  """
  a_type = ir.RankedTensorType(a.type)
  dims = a_type.shape
  assert len(dims) >= 2
  m, n = dims[-2:]
  batch_dims = tuple(dims[:-2])
  num_bd = len(batch_dims)
  batch = math.prod(batch_dims)

  if not gpu_blas:
    raise GpuLibNotLinkedError()

  if batch > 1 and m == n and m // batch <= 128:
    # Batches of square matrices that are small relative to the batch size go
    # to the batched BLAS kernel, whose workspace is a raw byte buffer.
    lwork, opaque = gpu_blas.build_getrf_batched_descriptor(
        np.dtype(dtype), batch, m)
    workspace = ir.RankedTensorType.get([lwork], ir.IntegerType.get_signless(8))
    kernel = f"{platform}blas_getrf_batched"
  else:
    # Fail with the same error as the BLAS path rather than an AttributeError
    # on None when the solver library is not linked.
    if not gpu_solver:
      raise GpuLibNotLinkedError()
    lwork, opaque = gpu_solver.build_getrf_descriptor(
        np.dtype(dtype), batch, m, n)
    workspace = ir.RankedTensorType.get([lwork], a_type.element_type)
    kernel = f"{platform}solver_getrf"

  # Matrix dimensions first, then the batch dimensions in reverse order
  # (presumably a minor-to-major layout, i.e. column-major matrices).
  layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))
  i32_type = ir.IntegerType.get_signless(32)
  out = custom_call(
      kernel,
      result_types=[
        a.type,
        ir.RankedTensorType.get(batch_dims + (min(m, n),), i32_type),
        ir.RankedTensorType.get(batch_dims, i32_type),
        workspace,
      ],
      operands=[a],
      backend_config=opaque,
      operand_layouts=[layout],
      result_layouts=[
        layout,
        tuple(range(num_bd, -1, -1)),
        tuple(range(num_bd - 1, -1, -1)),
        [0],
      ],
      operand_output_aliases={0: 0}).results
  # Drop the workspace buffer.
  return out[:3]

cuda_getrf = partial(_getrf_hlo, "cu", _cublas, _cusolver)
rocm_getrf = partial(_getrf_hlo, "hip", _hipblas, _hipsolver)
|
|
|
|
|
|
def _geqrf_hlo(platform, gpu_solver, dtype, a):
  """QR decomposition.

  Args:
    platform: kernel-name prefix ("cu" or "hip" in this module).
    gpu_solver: extension module providing `build_geqrf_descriptor`, or None
      when the solver library is not linked.
    dtype: element dtype; normalized with `np.dtype` for the descriptor.
    a: IR value whose ranked tensor type has shape `batch_dims + (m, n)`.

  Returns:
    The first three custom-call results: a tensor of the same type as `a`
    (aliased to the operand), a tensor of shape `batch_dims + (min(m, n),)`
    with the element type of `a` (presumably the Householder scalars) and an
    i32 tensor of shape `batch_dims` (presumably the per-matrix status). The
    trailing workspace result is dropped.

  Raises:
    GpuLibNotLinkedError: if `gpu_solver` is missing.
  """
  a_type = ir.RankedTensorType(a.type)
  dims = a_type.shape
  assert len(dims) >= 2
  m, n = dims[-2:]
  batch_dims = tuple(dims[:-2])
  num_bd = len(batch_dims)
  batch = math.prod(batch_dims)

  # Match the error raised by the BLAS-backed kernels instead of failing with
  # an AttributeError on None.
  if not gpu_solver:
    raise GpuLibNotLinkedError()

  lwork, opaque = gpu_solver.build_geqrf_descriptor(
      np.dtype(dtype), batch, m, n)

  # Matrix dimensions first, then the batch dimensions in reverse order.
  layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))
  i32_type = ir.IntegerType.get_signless(32)
  out = custom_call(
      f"{platform}solver_geqrf",
      result_types=[
        a.type,
        ir.RankedTensorType.get(batch_dims + (min(m, n),), a_type.element_type),
        ir.RankedTensorType.get(batch_dims, i32_type),
        ir.RankedTensorType.get([lwork], a_type.element_type),
      ],
      operands=[a],
      backend_config=opaque,
      operand_layouts=[layout],
      result_layouts=[
        layout,
        tuple(range(num_bd, -1, -1)),
        tuple(range(num_bd - 1, -1, -1)),
        [0],
      ],
      operand_output_aliases={0: 0}).results
  # Drop the workspace buffer.
  return out[:3]

cuda_geqrf = partial(_geqrf_hlo, "cu", _cusolver)
rocm_geqrf = partial(_geqrf_hlo, "hip", _hipsolver)
|
|
|
|
def _geqrf_batched_hlo(platform, gpu_blas, dtype, a):
  """Batched QR decomposition.

  Emits the `{platform}blas_geqrf_batched` custom call on `a`, whose shape is
  `batch_dims + (m, n)`, and returns its first two results: a tensor of the
  same type as `a` (aliased to the operand) and a tensor of shape
  `batch_dims + (min(m, n),)` with the element type of `a`. The two trailing
  byte buffers are dropped. Raises GpuLibNotLinkedError when `gpu_blas` is
  missing.
  """
  operand_type = ir.RankedTensorType(a.type)
  shape = operand_type.shape
  assert len(shape) >= 2
  rows, cols = shape[-2:]
  batch_shape = tuple(shape[:-2])
  batch_rank = len(batch_shape)
  batch_count = math.prod(batch_shape)

  if not gpu_blas:
    raise GpuLibNotLinkedError()

  lwork, opaque = gpu_blas.build_geqrf_batched_descriptor(
      np.dtype(dtype), batch_count, rows, cols)

  # Matrix dimensions first, then the batch dimensions in reverse order.
  reversed_batch = tuple(reversed(range(batch_rank)))
  matrix_layout = (batch_rank, batch_rank + 1) + reversed_batch
  tau_layout = tuple(reversed(range(batch_rank + 1)))

  tau_type = ir.RankedTensorType.get(
      batch_shape + (min(rows, cols),), operand_type.element_type)
  i8_type = ir.IntegerType.get_signless(8)
  # Two byte buffers of `lwork` elements each are requested from the kernel.
  first_buffer_type = ir.RankedTensorType.get([lwork], i8_type)
  second_buffer_type = ir.RankedTensorType.get([lwork], i8_type)

  call = custom_call(
      f"{platform}blas_geqrf_batched",
      result_types=[a.type, tau_type, first_buffer_type, second_buffer_type],
      operands=[a],
      backend_config=opaque,
      operand_layouts=[matrix_layout],
      result_layouts=[matrix_layout, tau_layout, [0], [0]],
      operand_output_aliases={0: 0},
  )
  return call.results[:2]

cuda_geqrf_batched = partial(_geqrf_batched_hlo, "cu", _cublas)
rocm_geqrf_batched = partial(_geqrf_batched_hlo, "hip", _hipblas)
|
|
|
|
|
|
def _csrlsvqr_hlo(platform, gpu_solver, dtype, data,
                  indices, indptr, b, tol, reorder):
  """Sparse solver via QR decomposition. CUDA only.

  Args:
    platform: kernel-name prefix ("cu" in this module).
    gpu_solver: extension module providing `build_csrlsvqr_descriptor`, or
      None when the solver library is not linked.
    dtype: element dtype; normalized with `np.dtype` for the descriptor.
    data: IR value; the size of its leading dimension is used as `nnz`.
    indices: IR value (presumably the CSR column indices).
    indptr: IR value (presumably the CSR row pointers).
    b: IR value; the size of its leading dimension is used as `n` and its
      type is the result type.
    tol: forwarded unchanged to the descriptor builder.
    reorder: forwarded unchanged to the descriptor builder.

  Returns:
    The results of the custom call: a single value with the type of `b`.

  Raises:
    GpuLibNotLinkedError: if `gpu_solver` is missing.
  """
  b_type = ir.RankedTensorType(b.type)
  data_type = ir.RankedTensorType(data.type)

  n = b_type.shape[0]
  nnz = data_type.shape[0]

  # Match the error raised by the BLAS-backed kernels instead of failing with
  # an AttributeError on None.
  if not gpu_solver:
    raise GpuLibNotLinkedError()

  opaque = gpu_solver.build_csrlsvqr_descriptor(
      np.dtype(dtype), n, nnz, reorder, tol
  )

  out = custom_call(
      f"{platform}solver_csrlsvqr",  # call_target_name
      result_types=[b.type],
      # Note: the kernel receives indptr before indices.
      operands=[data, indptr, indices, b],
      backend_config=opaque,  # backend_config
      operand_layouts=[(0,), (0,), (0,), (0,)],  # operand_layouts
      result_layouts=[(0,)]  # result_layouts
  ).results
  return out

cuda_csrlsvqr = partial(_csrlsvqr_hlo, "cu", _cusolver)
|
|
|
|
|
|
def _orgqr_hlo(platform, gpu_solver, dtype, a, tau):
  """Product of elementary Householder reflections.

  Args:
    platform: kernel-name prefix ("cu" or "hip" in this module).
    gpu_solver: extension module providing `build_orgqr_descriptor`, or None
      when the solver library is not linked.
    dtype: element dtype; normalized with `np.dtype` for the descriptor.
    a: IR value whose ranked tensor type has shape `batch_dims + (m, n)`.
    tau: IR value whose ranked tensor type has shape `batch_dims + (k,)`; the
      batch dimensions must equal those of `a`.

  Returns:
    The first two custom-call results: a tensor of the same type as `a`
    (aliased to the operand) and an i32 tensor of shape `batch_dims`
    (presumably the per-matrix status). The trailing workspace result is
    dropped.

  Raises:
    GpuLibNotLinkedError: if `gpu_solver` is missing.
  """
  a_type = ir.RankedTensorType(a.type)
  dims = a_type.shape
  assert len(dims) >= 2
  m, n = dims[-2:]
  batch_dims = tuple(dims[:-2])
  num_bd = len(batch_dims)
  batch = math.prod(batch_dims)

  tau_dims = ir.RankedTensorType(tau.type).shape
  assert tau_dims[:-1] == dims[:-2]
  k = tau_dims[-1]

  # Match the error raised by the BLAS-backed kernels instead of failing with
  # an AttributeError on None.
  if not gpu_solver:
    raise GpuLibNotLinkedError()

  lwork, opaque = gpu_solver.build_orgqr_descriptor(
      np.dtype(dtype), batch, m, n, k)

  # Matrix dimensions first, then the batch dimensions in reverse order.
  layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))
  i32_type = ir.IntegerType.get_signless(32)
  out = custom_call(
      f"{platform}solver_orgqr",
      result_types=[
        a.type,
        ir.RankedTensorType.get(batch_dims, i32_type),
        ir.RankedTensorType.get([lwork], a_type.element_type),
      ],
      operands=[a, tau],
      backend_config=opaque,
      operand_layouts=[
        layout,
        tuple(range(num_bd, -1, -1)),
      ],
      result_layouts=[
        layout,
        tuple(range(num_bd - 1, -1, -1)),
        [0],
      ],
      operand_output_aliases={0: 0}).results
  # Drop the workspace buffer.
  return out[:2]

cuda_orgqr = partial(_orgqr_hlo, "cu", _cusolver)
rocm_orgqr = partial(_orgqr_hlo, "hip", _hipsolver)
|
|
|
|
|
|
def _syevd_hlo(platform, gpu_solver, have_jacobi_solver, dtype, a, *,
               a_shape_vals: tuple[DimensionSize, ...], lower=False):
  """Symmetric (Hermitian) eigendecomposition.

  Args:
    platform: kernel-name prefix ("cu" or "hip" in this module).
    gpu_solver: extension module providing `build_syevj_descriptor` and
      `build_syevd_descriptor`, or None when the solver library is not linked.
    have_jacobi_solver: whether the `syevj` kernel may be selected.
    dtype: element dtype; normalized with `np.dtype` for the descriptor.
    a: IR value of ranked tensor type.
    a_shape_vals: shape of `a` as `batch_dims + (n, n)`. The two matrix
      dimensions must be equal Python ints; batch dimensions may be non-int
      (dynamic) values.
    lower: forwarded to the descriptor builder.

  Returns:
    The first three custom-call results: a tensor shaped like `a` (aliased to
    the operand), a real-valued tensor of shape `batch_dims + (n,)`
    (presumably the eigenvalues) and an i32 tensor of shape `batch_dims`
    (presumably the per-matrix status). The trailing workspace result is
    dropped.

  Raises:
    GpuLibNotLinkedError: if `gpu_solver` is missing.
  """
  a_type = ir.RankedTensorType(a.type)
  assert len(a_shape_vals) >= 2
  m, n = a_shape_vals[-2:]
  assert type(m) is int and type(n) is int and m == n, a_shape_vals
  batch_dims_vals = a_shape_vals[:-2]

  num_bd = len(batch_dims_vals)
  # Matrix dimensions first, then the batch dimensions in reverse order.
  layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))

  dynamic_batch_dims = any(type(d) is not int for d in batch_dims_vals)
  if dynamic_batch_dims:
    batch_int = -1  # Signals to the kernel that the batch is an operand.
  else:
    batch_int = math.prod(batch_dims_vals)

  # Match the error raised by the BLAS-backed kernels instead of failing with
  # an AttributeError on None.
  if not gpu_solver:
    raise GpuLibNotLinkedError()

  if have_jacobi_solver and n <= 32 and not dynamic_batch_dims:
    # We cannot use syevj for dynamic shapes because the workspace size
    # depends on the batch size.
    kernel = f"{platform}solver_syevj"
    lwork, opaque = gpu_solver.build_syevj_descriptor(
        np.dtype(dtype), lower, batch_int, n)
  else:
    kernel = f"{platform}solver_syevd"
    lwork, opaque = gpu_solver.build_syevd_descriptor(
        np.dtype(dtype), lower, batch_int, n)
    assert lwork > 0

  # The second result always uses the real component type of the input.
  if ir.ComplexType.isinstance(a_type.element_type):
    eigvals_type = ir.ComplexType(a_type.element_type).element_type
  else:
    eigvals_type = a_type.element_type

  i32_type = ir.IntegerType.get_signless(32)
  operands = [a]
  operand_layouts = [layout]
  if dynamic_batch_dims:
    # Pass the total batch size as an extra scalar s32 operand, computed as
    # the product of all batch dimensions.
    batch_size_val = hlo_s32(1)
    for b_v in batch_dims_vals:
      batch_size_val = hlo.MulOp(batch_size_val, ensure_hlo_s32(b_v)).result
    operands.append(batch_size_val)
    operand_layouts.append(())

  shape_type_pairs: Sequence[ShapeTypePair] = [
      (a_shape_vals, a_type.element_type),
      (batch_dims_vals + (n,), eigvals_type),
      (batch_dims_vals, i32_type),
      ([lwork], a_type.element_type)]
  result_types, result_shapes = mk_result_types_and_shapes(shape_type_pairs)
  out = custom_call(
      kernel,
      result_types=result_types,
      operands=operands,
      backend_config=opaque,
      operand_layouts=operand_layouts,
      result_layouts=[
          layout,
          tuple(range(num_bd, -1, -1)),
          tuple(range(num_bd - 1, -1, -1)),
          [0],
      ],
      operand_output_aliases={0: 0},
      result_shapes=result_shapes).results
  # Drop the workspace buffer.
  return out[:3]

cuda_syevd = partial(_syevd_hlo, "cu", _cusolver, True)
rocm_syevd = partial(_syevd_hlo, "hip", _hipsolver, True)
|
|
|
|
|
|
def _gesvd_hlo(platform, gpu_solver, have_jacobi_solver, dtype, a,
               full_matrices=True, compute_uv=True):
  """Singular value decomposition.

  Selects between the Jacobi kernel (`gesvdj`) and the `gesvd` kernel based on
  the static shape of `a`. When `gesvd` is used with m < n, the input is
  described with swapped matrix dimensions and the roles of `u` and `vt` are
  exchanged.

  Args:
    platform: kernel-name prefix ("cu" or "hip" in this module).
    gpu_solver: extension module providing `build_gesvdj_descriptor` and
      `build_gesvd_descriptor`, or None when the solver library is not linked.
    have_jacobi_solver: whether the `gesvdj` kernel may be selected.
    dtype: element dtype; normalized with `np.dtype` for the descriptor.
    a: IR value whose ranked tensor type has shape `batch_dims + (m, n)`.
    full_matrices: forwarded to the `gesvd` descriptor; on the Jacobi path it
      decides between "econ" mode, post-hoc slicing, or full outputs.
    compute_uv: forwarded to the descriptor builder.

  Returns:
    A tuple `(s, u, vt, info)` of IR values, where `s` has shape
    `batch_dims + (min(m, n),)` with the real component type of `a` and
    `info` is an i32 tensor of shape `batch_dims`.

  Raises:
    GpuLibNotLinkedError: if `gpu_solver` is missing.
  """
  a_type = ir.RankedTensorType(a.type)
  dims = a_type.shape
  assert len(dims) >= 2
  m, n = dims[-2:]
  batch_dims = tuple(dims[:-2])
  num_bd = len(batch_dims)
  b = math.prod(batch_dims)
  if ir.ComplexType.isinstance(a_type.element_type):
    singular_vals_type = ir.ComplexType(a_type.element_type).element_type
  else:
    singular_vals_type = a_type.element_type

  scalar_layout = tuple(range(num_bd - 1, -1, -1))
  vector_layout = (num_bd,) + tuple(range(num_bd - 1, -1, -1))
  i32_type = ir.IntegerType.get_signless(32)

  # Match the error raised by the BLAS-backed kernels instead of failing with
  # an AttributeError on None.
  if not gpu_solver:
    raise GpuLibNotLinkedError()

  # NVIDIA's batched Jacobi solver supports a maximum matrix size of 32x32, but
  # the unbatched solver has no such limit. The unbatched solver appears to
  # outperform gesvd for small-moderate matrices, e.g., see:
  # https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9226-fast-singular-value-decomposition-on-gpus-v2.pdf
  # slide 5.
  if have_jacobi_solver and (
      (b == 1 and m <= 1024 and n <= 1024) or (m <= 32 and n <= 32)
  ):
    # The batched kernel doesn't support "econ" mode.
    econ = not full_matrices and b == 1
    lwork, opaque = gpu_solver.build_gesvdj_descriptor(
        np.dtype(dtype), b, m, n, compute_uv, 1 if econ else 0)
    k = min(m, n)
    matrix_layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))
    _, s, u, v, info, _ = custom_call(
        f"{platform}solver_gesvdj",
        result_types=[
          a.type,
          ir.RankedTensorType.get(batch_dims + (min(m, n),), singular_vals_type),
          ir.RankedTensorType.get(batch_dims + (m, k if econ else m),
                                  a_type.element_type),
          ir.RankedTensorType.get(batch_dims + (n, k if econ else n),
                                  a_type.element_type),
          ir.RankedTensorType.get(batch_dims, i32_type),
          ir.RankedTensorType.get([lwork], a_type.element_type),
        ],
        operands=[a],
        backend_config=opaque,
        operand_layouts=[matrix_layout],
        result_layouts=[
            matrix_layout,
            vector_layout,
            matrix_layout,
            matrix_layout,
            scalar_layout,
            [0],
        ],
        operand_output_aliases={0: 0}).results
    # The kernel yields `v`; swap its last two dimensions and, for complex
    # inputs, conjugate it to obtain `vt`.
    vt = hlo.TransposeOp(
        v,
        ir.DenseIntElementsAttr.get(np.array(tuple(range(num_bd)) + (num_bd + 1, num_bd)))).result
    if np.issubdtype(dtype, np.complexfloating):
      vt = hlo.ComplexOp(hlo.RealOp(vt), hlo.NegOp(hlo.ImagOp(vt))).result
    if not full_matrices and not econ:
      # Reduced outputs were requested but "econ" mode was unavailable, so
      # slice the full-size factors down to min(m, n) columns/rows.
      u = hlo.SliceOp(
          u,
          ir.DenseIntElementsAttr.get(np.zeros([len(dims)], np.int64)),
          ir.DenseIntElementsAttr.get(np.array(batch_dims + (m, min(m, n)))),
          ir.DenseIntElementsAttr.get(np.ones([len(dims)], np.int64))).result
      vt = hlo.SliceOp(
          vt,
          ir.DenseIntElementsAttr.get(np.zeros([len(dims)], np.int64)),
          ir.DenseIntElementsAttr.get(np.array(batch_dims + (min(m, n), n))),
          ir.DenseIntElementsAttr.get(np.ones([len(dims)], np.int64))).result
  elif m < n:
    # Describe the problem with swapped matrix dimensions and the two matrix
    # axes exchanged in the layout; the kernel's outputs are then read back as
    # (vt, u) instead of (u, vt).
    lwork, opaque = gpu_solver.build_gesvd_descriptor(
        np.dtype(dtype), b, n, m, compute_uv, full_matrices)
    k = n if full_matrices else m
    matrix_layout = (num_bd + 1, num_bd) + tuple(range(num_bd - 1, -1, -1))
    _, s, vt, u, info, _ = custom_call(
        f"{platform}solver_gesvd",
        result_types=[
          a.type,
          ir.RankedTensorType.get(batch_dims + (min(m, n),), singular_vals_type),
          ir.RankedTensorType.get(batch_dims + (k, n), a_type.element_type),
          ir.RankedTensorType.get(batch_dims + (m, m), a_type.element_type),
          ir.RankedTensorType.get(batch_dims, i32_type),
          ir.RankedTensorType.get([lwork], a_type.element_type),
        ],
        operands=[a],
        backend_config=opaque,
        operand_layouts=[matrix_layout],
        result_layouts=[
            matrix_layout,
            vector_layout,
            matrix_layout,
            matrix_layout,
            scalar_layout,
            [0],
        ],
        operand_output_aliases={0: 0}).results
  else:
    lwork, opaque = gpu_solver.build_gesvd_descriptor(
        np.dtype(dtype), b, m, n, compute_uv, full_matrices)
    k = m if full_matrices else n
    matrix_layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))
    _, s, u, vt, info, _ = custom_call(
        f"{platform}solver_gesvd",
        result_types=[
          a.type,
          ir.RankedTensorType.get(batch_dims + (min(m, n),), singular_vals_type),
          ir.RankedTensorType.get(batch_dims + (m, k), a_type.element_type),
          ir.RankedTensorType.get(batch_dims + (n, n), a_type.element_type),
          ir.RankedTensorType.get(batch_dims, i32_type),
          ir.RankedTensorType.get([lwork], a_type.element_type),
        ],
        operands=[a],
        backend_config=opaque,
        operand_layouts=[matrix_layout],
        result_layouts=[
            matrix_layout,
            vector_layout,
            matrix_layout,
            matrix_layout,
            scalar_layout,
            [0],
        ],
        operand_output_aliases={0: 0}).results
  return s, u, vt, info

cuda_gesvd = partial(_gesvd_hlo, "cu", _cusolver, True)
rocm_gesvd = partial(_gesvd_hlo, "hip", _hipsolver, False)
|
|
|
|
|
|
def _sytrd_hlo(platform, gpu_solver, dtype, a, *, lower):
  """sytrd: Reduction of a symmetric (Hermitian) matrix to tridiagonal form.

  Args:
    platform: kernel-name prefix ("cu" or "hip" in this module).
    gpu_solver: extension module providing `build_sytrd_descriptor`, or None
      when the solver library is not linked.
    dtype: element dtype; must be a real floating dtype, complex64 or
      complex128.
    a: IR value whose ranked tensor type has shape `batch_dims + (n, n)`.
    lower: forwarded to the descriptor builder; also controls whether the
      CUDA superdiagonal workaround below is applied.

  Returns:
    A tuple `(a, d, e, taus, info)` of IR values: the updated matrix, two
    real-valued tensors of shapes `batch_dims + (n,)` and
    `batch_dims + (n - 1,)`, a tensor of shape `batch_dims + (n - 1,)` with
    the element type of `a`, and an i32 tensor of shape `batch_dims`.

  Raises:
    GpuLibNotLinkedError: if `gpu_solver` is missing.
    NotImplementedError: if `dtype` is not one of the supported dtypes.
  """
  a_type = ir.RankedTensorType(a.type)
  dims = a_type.shape
  assert len(dims) >= 2
  m, n = dims[-2:]
  assert m == n, (m, n)
  batch_dims = tuple(dims[:-2])
  num_bd = len(batch_dims)
  b = math.prod(batch_dims)

  # Match the error raised by the BLAS-backed kernels instead of failing with
  # an AttributeError on None.
  if not gpu_solver:
    raise GpuLibNotLinkedError()

  # Normalize the dtype like every other descriptor builder call in this file.
  lwork, opaque = gpu_solver.build_sytrd_descriptor(
      np.dtype(dtype), lower, b, n)
  if np.issubdtype(dtype, np.floating):
    diag_type = a_type.element_type
  elif dtype == np.complex64:
    diag_type = ir.F32Type.get()
  elif dtype == np.complex128:
    diag_type = ir.F64Type.get()
  else:
    raise NotImplementedError(f"Unsupported dtype {dtype}")

  # Matrix dimensions first, then the batch dimensions in reverse order.
  layout = (num_bd, num_bd + 1) + tuple(range(num_bd - 1, -1, -1))
  i32_type = ir.IntegerType.get_signless(32)
  a, d, e, taus, info, _ = custom_call(
      f"{platform}solver_sytrd",
      result_types=[
        a.type,
        ir.RankedTensorType.get(batch_dims + (n,), diag_type),
        ir.RankedTensorType.get(batch_dims + (n - 1,), diag_type),
        ir.RankedTensorType.get(batch_dims + (n - 1,), a_type.element_type),
        ir.RankedTensorType.get(batch_dims, i32_type),
        ir.RankedTensorType.get([lwork], a_type.element_type),
      ],
      operands=[a],
      backend_config=opaque,
      operand_layouts=[layout],
      result_layouts=[
        layout,
        (num_bd,) + tuple(range(num_bd - 1, -1, -1)),
        (num_bd,) + tuple(range(num_bd - 1, -1, -1)),
        (num_bd,) + tuple(range(num_bd - 1, -1, -1)),
        tuple(range(num_bd - 1, -1, -1)),
        [0],
      ],
      operand_output_aliases={0: 0},
  ).results

  def intattr(xs):
    # Builds a dense int64 elements attribute from a Python sequence.
    return ir.DenseIntElementsAttr.get(np.asarray(xs, np.int64))

  # Workaround for NVIDIA partners bug #3865118: sytrd returns an incorrect "1"
  # in the first element of the superdiagonal in the `a` matrix in the
  # lower=False case. The correct result is returned in the `e` vector so we can
  # simply copy it back to where it needs to be:
  if not lower and platform == "cu" and m > 1:
    start = (0,) * len(batch_dims) + (0,)
    end = batch_dims + (1,)
    s = hlo.SliceOp(e, intattr(start), intattr(end), intattr([1] * len(start)))
    s_type = ir.RankedTensorType.get(batch_dims + (1, 1), diag_type)
    s = hlo.BroadcastInDimOp(s_type, s, intattr(range(len(dims) - 1)))
    # The diagonals are always real; convert to complex if needed.
    s = hlo.ConvertOp(
        ir.RankedTensorType.get(s_type.shape, a_type.element_type), s)
    # Write the corrected value at matrix position (0, 1) of every batch
    # element.
    offsets = tuple(hlo.ConstantOp(intattr(i))
                    for i in ((0,) * len(batch_dims) + (0, 1)))
    a = hlo.DynamicUpdateSliceOp(a, s, offsets).result

  return a, d, e, taus, info

cuda_sytrd = partial(_sytrd_hlo, "cu", _cusolver)
rocm_sytrd = partial(_sytrd_hlo, "hip", _hipsolver)
|