rocm_jax/tests/nn_test.py
Kaixi Hou df6080f346 PR #21371: [NVIDIA] Add new SDPA API to jax.nn
Imported from GitHub PR https://github.com/google/jax/pull/21371

Attention plays a crucial role in modern transformer-based models. Although many variants exist, they generally follow the same workflow. Examples include the typical multi-head attention (MHA), grouped-query attention (GQA), and multi-query attention (MQA). Additionally, new implementations like the Flash Attention algorithm aim to improve the utilization of accelerator devices. For instance, NVIDIA cuDNN supports Flash Attention and, through its API, can by itself yield a 1.3x end-to-end speedup for training GPT-based large language models.

This PR proposes introducing a new API in the `jax.nn` module to handle attention. It first tries to use the cuDNN flash attention execution path when the configuration is compatible; otherwise, it falls back to a JAX implementation.

cc. @nluehr @Cjkkkk @cliffwoolley

Copybara import of the project:

--
39a11d91632aab1af5aeec1e92990a7aaeea0cca by kaixih <kaixih@nvidia.com>:

Add new SDPA API to jax.nn

Merging this change closes #21371

COPYBARA_INTEGRATE_REVIEW=https://github.com/google/jax/pull/21371 from kaixih:jax_sdpa_dev 39a11d91632aab1af5aeec1e92990a7aaeea0cca
PiperOrigin-RevId: 650225872
2024-07-08 06:16:04 -07:00

585 lines
21 KiB
Python

# Copyright 2019 The JAX Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for nn module."""
import collections
from functools import partial
import itertools
import unittest
from absl.testing import absltest
from absl.testing import parameterized
import scipy.stats
from jax._src import config
from jax._src import core
from jax._src import test_util as jtu
from jax._src import ad_checkpoint
from jax._src.interpreters import mlir
from jax._src.lib import cuda_versions
from jax._src.nn.functions import _get_causal_mask
from jax.test_util import check_grads
from jax import nn
from jax import random
import jax
import jax.numpy as jnp
# NOTE(review): presumably wires absl command-line flags into the JAX config
# at import time, before any test class below is defined — confirm against
# jax._src.config.
config.parse_flags_with_absl()
def _is_required_cudnn_version_satisfied():
  """Return True when the cuDNN attention tests can run on this machine.

  All three conditions must hold, checked in this order:
    * the CUDA compute capability is at least 8.0,
    * the ``cuda_versions`` module is available (not None),
    * the cuDNN version it reports is at least 8904.
  """
  if not jtu.is_cuda_compute_capability_at_least("8.0"):
    return False
  if cuda_versions is None:
    return False
  return cuda_versions.cudnn_get_version() >= 8904
def _get_causal_mask(T, S):
  """Build a boolean lower-triangular mask of shape (1, 1, T, S).

  Entry [0, 0, t, s] is True when s <= t and False otherwise.

  NOTE(review): this definition shadows the `_get_causal_mask` imported from
  `jax._src.nn.functions` at the top of this file; confirm which one is meant
  to be used.
  """
  lower_triangular = jnp.tril(jnp.ones((T, S), dtype=jnp.bool_))
  # Prepend two singleton axes in front of the (T, S) mask.
  return lower_triangular[None, None, :, :]
@jtu.with_config(jax_legacy_prng_key="allow",
                 jax_numpy_dtype_promotion="standard")
class NNFunctionsTest(jtu.JaxTestCase):
  """Tests for the attention, activation, softmax and one-hot functions of jax.nn."""

  # Forward pass of nn.dot_product_attention: the requested implementation is
  # compared against implementation=None on identical random inputs.
  @parameterized.product(
      dtype=[jnp.float32, jnp.bfloat16, jnp.float16],
      use_bias=(False, True),
      causal_mode=(None, 'is_causal', 'is_mask'),
      impl=('xla', 'cudnn'),
  )
  def testDotProductAttentionInfer(self, dtype, use_bias, causal_mode, impl):
    if impl == 'cudnn' and not _is_required_cudnn_version_satisfied():
      raise unittest.SkipTest("CUDA or cuDNN versions are not compatible.")
    if impl == 'cudnn' and dtype == jnp.float32:
      raise unittest.SkipTest("cuDNN only supports fp16 or bf16.")

    sdpa = nn.dot_product_attention
    B, S, T, N, H = 2, 128, 128, 4, 32
    keys = random.split(random.PRNGKey(0), 4)
    Q = random.normal(keys[0], (B, T, N, H), dtype)
    K = random.normal(keys[1], (B, S, N, H), dtype)
    V = random.normal(keys[2], (B, S, N, H), dtype)
    if use_bias:
      bias = random.normal(keys[3], (1, N, T, S), dtype)
    else:
      bias = None

    # Causality is requested either through the is_causal flag or through an
    # explicit lower-triangular mask, never both.
    is_causal = causal_mode == 'is_causal'
    causal_mask = _get_causal_mask(T, S) if causal_mode == 'is_mask' else None

    sdpa_ref = partial(sdpa, is_causal=is_causal, implementation=None)
    sdpa_ans = partial(sdpa, is_causal=is_causal, implementation=impl)

    if impl == 'cudnn':
      # The lowered StableHLO text must mention the cuDNN fused-attention call.
      lowered = jax.jit(sdpa_ans).lower(Q, K, V, bias=bias, mask=causal_mask)
      hlo = mlir.module_to_string(lowered.compiler_ir('stablehlo'))
      self.assertIn('__cudnn$fmha', hlo)

    out_ref = sdpa_ref(Q, K, V, bias=bias, mask=causal_mask)
    out_ans = sdpa_ans(Q, K, V, bias=bias, mask=causal_mask)
    self.assertAllClose(out_ref, out_ans, atol=.01, rtol=.01)

  # Backward pass of nn.dot_product_attention: VJP results of the requested
  # implementation are compared against implementation=None.
  @parameterized.product(
      dtype=[jnp.float32, jnp.bfloat16, jnp.float16],
      use_bias=[False, True],
      causal_mode=[None, 'is_causal', 'is_mask'],
      impl=['xla', 'cudnn'],
  )
  def testDotProductAttentionTrain(self, dtype, use_bias, causal_mode, impl):
    if impl == 'cudnn' and not _is_required_cudnn_version_satisfied():
      raise unittest.SkipTest("CUDA or cuDNN versions are not compatible.")
    if impl == 'cudnn' and dtype == jnp.float32:
      raise unittest.SkipTest("cuDNN only supports fp16 or bf16.")

    sdpa = nn.dot_product_attention
    B, S, T, N, H = 2, 128, 128, 4, 32
    keys = random.split(random.PRNGKey(0), 5)
    Q = random.normal(keys[0], (B, T, N, H), dtype)
    K = random.normal(keys[1], (B, S, N, H), dtype)
    V = random.normal(keys[2], (B, S, N, H), dtype)
    # Cotangent fed to the VJP; same shape as Q.
    grad = random.normal(keys[3], (B, T, N, H), dtype)
    if use_bias:
      bias = random.normal(keys[4], (1, N, T, S), dtype)
    else:
      bias = None

    is_causal = causal_mode == 'is_causal'
    causal_mask = _get_causal_mask(T, S) if causal_mode == 'is_mask' else None

    sdpa_ref = partial(sdpa, is_causal=is_causal, implementation=None)
    fn_ref = lambda q, k, v, b, m: sdpa_ref(q, k, v, bias=b, mask=m)
    _, sdpa_vjp_ref = jax.vjp(fn_ref, Q, K, V, bias, causal_mask)
    dQ_ref, dK_ref, dV_ref, dbias_ref, _ = sdpa_vjp_ref(grad)

    sdpa_ans = partial(sdpa, is_causal=is_causal, implementation=impl)
    fn_ans = lambda q, k, v, b, m: sdpa_ans(q, k, v, bias=b, mask=m)
    _, sdpa_vjp_ans = jax.vjp(fn_ans, Q, K, V, bias, causal_mask)
    dQ_ans, dK_ans, dV_ans, dbias_ans, _ = sdpa_vjp_ans(grad)

    if impl == 'cudnn':
      # The lowered backward function must mention a cuDNN fmha Backward call.
      lowered = jax.jit(sdpa_vjp_ans).lower(grad)
      hlo = mlir.module_to_string(lowered.compiler_ir('stablehlo'))
      self.assertRegex(hlo, r'__cudnn\$fmha.*Backward\(')

    rtol, atol = (.01, .01)
    self.assertAllClose(dQ_ref, dQ_ans, rtol=rtol, atol=atol)
    self.assertAllClose(dK_ref, dK_ans, rtol=rtol, atol=atol)
    self.assertAllClose(dV_ref, dV_ans, rtol=rtol, atol=atol)
    # The bias gradient is compared with a looser tolerance than dQ/dK/dV.
    self.assertAllClose(dbias_ref, dbias_ans, rtol=.03, atol=.03)

  # --- softplus -------------------------------------------------------------

  # Checks softplus gradients up to order 4 at 1e-8.
  @jtu.skip_on_flag("jax_skip_slow_tests", True)
  def testSoftplusGrad(self):
    check_grads(nn.softplus, (1e-8,), order=4,
                rtol=1e-2 if jtu.test_device_matches(["tpu"]) else None)

  # Checks the first-order softplus gradient at exactly 0.
  def testSoftplusGradZero(self):
    check_grads(nn.softplus, (0.,), order=1,
                rtol=1e-2 if jtu.test_device_matches(["tpu"]) else None)

  # The softplus gradient at +inf is expected to be 1.
  def testSoftplusGradInf(self):
    self.assertAllClose(
        1., jax.grad(nn.softplus)(float('inf')))

  # Checks the first-order softplus gradient at -inf.
  def testSoftplusGradNegInf(self):
    check_grads(nn.softplus, (-float('inf'),), order=1,
                rtol=1e-2 if jtu.test_device_matches(["tpu"]) else None)

  # Checks the first-order softplus gradient at NaN.
  def testSoftplusGradNan(self):
    check_grads(nn.softplus, (float('nan'),), order=1,
                rtol=1e-2 if jtu.test_device_matches(["tpu"]) else None)

  # softplus(0) must equal log(2) for Python scalars and all tested dtypes.
  @parameterized.parameters([int, float] + jtu.dtypes.floating + jtu.dtypes.integer)
  def testSoftplusZero(self, dtype):
    self.assertEqual(jnp.log(dtype(2)), nn.softplus(dtype(0)))

  # --- sparse_plus / sparse_sigmoid -----------------------------------------

  # Checks the first-order sparse_plus gradient at -2.
  def testSparseplusGradZero(self):
    check_grads(nn.sparse_plus, (-2.,), order=1,
                rtol=1e-2 if jtu.test_device_matches(["tpu"]) else None)

  # Checks the first-order sparse_plus gradient at 0.
  def testSparseplusGrad(self):
    check_grads(nn.sparse_plus, (0.,), order=1,
                rtol=1e-2 if jtu.test_device_matches(["tpu"]) else None)

  # grad(sparse_plus) must agree with sparse_sigmoid at 0, 2 and -2.
  def testSparseplusAndSparseSigmoid(self):
    self.assertAllClose(
        jax.grad(nn.sparse_plus)(0.), nn.sparse_sigmoid(0.),
        check_dtypes=False)
    self.assertAllClose(
        jax.grad(nn.sparse_plus)(2.), nn.sparse_sigmoid(2.),
        check_dtypes=False)
    self.assertAllClose(
        jax.grad(nn.sparse_plus)(-2.), nn.sparse_sigmoid(-2.),
        check_dtypes=False)

  # --- squareplus -----------------------------------------------------------

  # Checks squareplus gradients up to order 4 at 1e-8.
  def testSquareplusGrad(self):
    check_grads(nn.squareplus, (1e-8,), order=4,
                rtol=1e-2 if jtu.test_device_matches(["tpu"]) else None)

  # Checks the first-order squareplus gradient at exactly 0.
  def testSquareplusGradZero(self):
    check_grads(nn.squareplus, (0.,), order=1,
                rtol=1e-2 if jtu.test_device_matches(["tpu"]) else None)

  # Checks the first-order squareplus gradient at -inf.
  def testSquareplusGradNegInf(self):
    check_grads(nn.squareplus, (-float('inf'),), order=1,
                rtol=1e-2 if jtu.test_device_matches(["tpu"]) else None)

  # Checks the first-order squareplus gradient at NaN.
  def testSquareplusGradNan(self):
    check_grads(nn.squareplus, (float('nan'),), order=1,
                rtol=1e-2 if jtu.test_device_matches(["tpu"]) else None)

  # squareplus(0, 4) must equal 1 for float and all tested floating dtypes.
  @parameterized.parameters([float] + jtu.dtypes.floating)
  def testSquareplusZero(self, dtype):
    self.assertEqual(dtype(1), nn.squareplus(dtype(0), dtype(4)))

  # --- mish -----------------------------------------------------------------

  # Checks mish gradients up to order 4 at 1e-8.
  def testMishGrad(self):
    check_grads(nn.mish, (1e-8,), order=4,
                rtol=1e-2 if jtu.test_device_matches(["tpu"]) else None)

  # Checks the first-order mish gradient at exactly 0.
  def testMishGradZero(self):
    check_grads(nn.mish, (0.,), order=1,
                rtol=1e-2 if jtu.test_device_matches(["tpu"]) else None)

  # Checks the first-order mish gradient at -inf.
  def testMishGradNegInf(self):
    check_grads(nn.mish, (-float('inf'),), order=1,
                rtol=1e-2 if jtu.test_device_matches(["tpu"]) else None)

  # Checks the first-order mish gradient at NaN.
  def testMishGradNan(self):
    check_grads(nn.mish, (float('nan'),), order=1,
                rtol=1e-2 if jtu.test_device_matches(["tpu"]) else None)

  # mish(0) must equal 0 for float and all tested floating dtypes.
  @parameterized.parameters([float] + jtu.dtypes.floating)
  def testMishZero(self, dtype):
    self.assertEqual(dtype(0), nn.mish(dtype(0)))

  # --- relu / relu6 ---------------------------------------------------------

  # Checks relu gradients up to order 3 at +/-1, and that the jaxpr of
  # grad(relu) at 0 has at least two equations.
  def testReluGrad(self):
    rtol = 1e-2 if jtu.test_device_matches(["tpu"]) else None
    check_grads(nn.relu, (1.,), order=3, rtol=rtol)
    check_grads(nn.relu, (-1.,), order=3, rtol=rtol)
    jaxpr = jax.make_jaxpr(jax.grad(nn.relu))(0.)
    self.assertGreaterEqual(len(jaxpr.jaxpr.eqns), 2)

  # Checks relu6 gradients up to order 3 at +/-1, and that the gradient is 0
  # at both kinks (0 and 6).
  def testRelu6Grad(self):
    rtol = 1e-2 if jtu.test_device_matches(["tpu"]) else None
    check_grads(nn.relu6, (1.,), order=3, rtol=rtol)
    check_grads(nn.relu6, (-1.,), order=3, rtol=rtol)
    self.assertAllClose(jax.grad(nn.relu6)(0.), 0., check_dtypes=False)
    self.assertAllClose(jax.grad(nn.relu6)(6.), 0., check_dtypes=False)

  # --- values at specific points --------------------------------------------

  # softplus(89) is expected to be (close to) 89.
  def testSoftplusValue(self):
    val = nn.softplus(89.)
    self.assertAllClose(val, 89., check_dtypes=False)

  # sparse_plus(89) is expected to be (close to) 89.
  def testSparseplusValue(self):
    val = nn.sparse_plus(89.)
    self.assertAllClose(val, 89., check_dtypes=False)

  # sparse_sigmoid is expected to give 0 at -2, 1 at 2 and 0.5 at 0.
  def testSparsesigmoidValue(self):
    self.assertAllClose(nn.sparse_sigmoid(-2.), 0., check_dtypes=False)
    self.assertAllClose(nn.sparse_sigmoid(2.), 1., check_dtypes=False)
    self.assertAllClose(nn.sparse_sigmoid(0.), .5, check_dtypes=False)

  # squareplus(1e3) is expected to be within 1e-3 of 1e3.
  def testSquareplusValue(self):
    val = nn.squareplus(1e3)
    self.assertAllClose(val, 1e3, check_dtypes=False, atol=1e-3)

  # mish(1e3) is expected to be within 1e-3 of 1e3.
  def testMishValue(self):
    val = nn.mish(1e3)
    self.assertAllClose(val, 1e3, check_dtypes=False, atol=1e-3)

  # Checks elu gradients up to order 4 at 1e4 with finite-difference eps=1.
  @jtu.skip_on_flag("jax_skip_slow_tests", True)
  def testEluGrad(self):
    check_grads(nn.elu, (1e4,), order=4, eps=1.)

  # elu(1e4) is expected to be (close to) 1e4.
  def testEluValue(self):
    val = nn.elu(1e4)
    self.assertAllClose(val, 1e4, check_dtypes=False)

  # glu on [1, 0] split along axis 0 is expected to give [0.5].
  def testGluValue(self):
    val = nn.glu(jnp.array([1.0, 0.0]), axis=0)
    self.assertAllClose(val, jnp.array([0.5]))

  # --- gelu -----------------------------------------------------------------

  # gelu must give the same result for an integer input as for the equal float.
  @parameterized.parameters(False, True)
  def testGeluIntType(self, approximate):
    val_float = nn.gelu(jnp.array(-1.0), approximate=approximate)
    val_int = nn.gelu(jnp.array(-1), approximate=approximate)
    self.assertAllClose(val_float, val_int)

  # gelu is compared against the reference x * Phi(x) computed with scipy.
  @parameterized.parameters(False, True)
  def testGelu(self, approximate):
    def gelu_reference(x):
      return x * scipy.stats.norm.cdf(x)
    rng = jtu.rand_default(self.rng())
    args_maker = lambda: [rng((4, 5, 6), jnp.float32)]
    # The approximate variant is only held to a 1e-3 tolerance.
    self._CheckAgainstNumpy(
        gelu_reference, partial(nn.gelu, approximate=approximate), args_maker,
        check_dtypes=False, tol=1e-3 if approximate else None)

  # Each listed activation must return the same dtype as its scalar input.
  @parameterized.parameters(*itertools.product(
      (jnp.float32, jnp.bfloat16, jnp.float16),
      (partial(nn.gelu, approximate=False),
       partial(nn.gelu, approximate=True),
       nn.relu, nn.softplus, nn.sparse_plus, nn.sigmoid, nn.squareplus, nn.mish)))
  def testDtypeMatchesInput(self, dtype, fn):
    x = jnp.zeros((), dtype=dtype)
    out = fn(x)
    self.assertEqual(out.dtype, dtype)

  # --- memory regressions ---------------------------------------------------

  def testEluMemory(self):
    # see https://github.com/google/jax/pull/1640
    with jax.enable_checks(False):  # With checks we materialize the array
      # jax.make_jaxpr only returns a tracing wrapper; it has to be called for
      # nn.elu to actually be traced on the 10**12-element input.
      jax.make_jaxpr(lambda: nn.elu(jnp.ones((10 ** 12,))))()  # don't oom

  def testHardTanhMemory(self):
    # see https://github.com/google/jax/pull/1640
    with jax.enable_checks(False):  # With checks we materialize the array
      # As above: invoke the wrapper so that nn.hard_tanh is really traced.
      jax.make_jaxpr(lambda: nn.hard_tanh(jnp.ones((10 ** 12,))))()  # don't oom

  # --- softmax / log_softmax ------------------------------------------------

  # An empty input must come back unchanged.
  @parameterized.parameters([nn.softmax, nn.log_softmax])
  def testSoftmaxEmptyArray(self, fn):
    x = jnp.array([], dtype=float)
    self.assertArraysEqual(fn(x), x)

  # With an all-False `where` mask the output is all 0 (softmax) or all -inf
  # (log_softmax).
  @parameterized.parameters([nn.softmax, nn.log_softmax])
  def testSoftmaxEmptyMask(self, fn):
    x = jnp.array([5.5, 1.3, -4.2, 0.9])
    m = jnp.zeros_like(x, dtype=bool)
    expected = jnp.full_like(x, 0.0 if fn is nn.softmax else -jnp.inf)
    self.assertArraysEqual(fn(x, where=m), expected)

  # Masked evaluation must match evaluating on the selected entries only, and
  # the resulting probabilities must sum to 1.
  @parameterized.parameters([nn.softmax, nn.log_softmax])
  def testSoftmaxWhereMask(self, fn):
    x = jnp.array([5.5, 1.3, -4.2, 0.9])
    m = jnp.array([True, False, True, True])

    out = fn(x, where=m)
    self.assertAllClose(out[m], fn(x[m]))

    probs = out if fn is nn.softmax else jnp.exp(out)
    self.assertAllClose(probs.sum(), 1.0)

    # TODO(mattjj): include log_softmax in these extra tests if/when we add a
    # custom_jvp rule for it (since otherwise it doesn't pass the numerical
    # checks below).
    if fn is nn.softmax and config.softmax_custom_jvp.value:
      g_fun = lambda x: jnp.take(fn(x, where=m, initial=-jnp.inf),
                                 jnp.array([0, 2, 3]))
      jtu.check_grads(g_fun, (x,), order=2)

  # The gradient must be all zeros when the mask keeps a single entry.
  @parameterized.parameters([nn.softmax, nn.log_softmax])
  def testSoftmaxWhereGrad(self, fn):
    # regression test for https://github.com/google/jax/issues/19490
    x = jnp.array([36., 10000.])
    mask = x < 1000

    f = lambda x, mask: fn(x, where=mask)[0]

    self.assertAllClose(jax.grad(f)(x, mask), jnp.zeros_like(x))

  # Checks softmax gradients up to order 2.
  def testSoftmaxGrad(self):
    x = jnp.array([5.5, 1.3, -4.2, 0.9])
    jtu.check_grads(nn.softmax, (x,), order=2, atol=5e-3)

  # With the custom JVP enabled, softmax must save exactly one residual.
  def testSoftmaxGradResiduals(self):
    if not config.softmax_custom_jvp.value:
      raise unittest.SkipTest("only applies when upgrade flag enabled")
    x = jnp.array([5.5, 1.3, -4.2, 0.9])
    res = ad_checkpoint.saved_residuals(nn.softmax, x)
    self.assertLen(res, 1)

  # Residual count and total size are checked with the custom-JVP flag both
  # off (3 residuals, 6 elements) and on (1 residual, 4 elements).
  def testSoftmaxGradFlag(self):
    x = jnp.array([5.5, 1.3, -4.2, 0.9])

    with jax.softmax_custom_jvp(False):
      res = ad_checkpoint.saved_residuals(nn.softmax, x)
    self.assertLen(res, 3)
    self.assertEqual(sum(a.size for a, _ in res), 6)

    with jax.softmax_custom_jvp(True):
      res = ad_checkpoint.saved_residuals(nn.softmax, x)
    self.assertLen(res, 1)
    self.assertEqual(sum(a.size for a, _ in res), 4)

  # --- standardize ----------------------------------------------------------

  # Masked standardize must match standardizing only the selected entries.
  def testStandardizeWhereMask(self):
    x = jnp.array([5.5, 1.3, -4.2, 0.9])
    m = jnp.array([True, False, True, True])
    x_filtered = jnp.take(x, jnp.array([0, 2, 3]))

    out_masked = jnp.take(nn.standardize(x, where=m), jnp.array([0, 2, 3]))
    out_filtered = nn.standardize(x_filtered)

    self.assertAllClose(out_masked, out_filtered)

  # --- one_hot --------------------------------------------------------------

  # Basic one-hot encoding for two different index orders.
  def testOneHot(self):
    actual = nn.one_hot(jnp.array([0, 1, 2]), 3)
    expected = jnp.array([[1., 0., 0.],
                          [0., 1., 0.],
                          [0., 0., 1.]])
    self.assertAllClose(actual, expected, check_dtypes=False)

    actual = nn.one_hot(jnp.array([1, 2, 0]), 3)
    expected = jnp.array([[0., 1., 0.],
                          [0., 0., 1.],
                          [1., 0., 0.]])
    self.assertAllClose(actual, expected, check_dtypes=False)

  # Indices outside [0, num_classes) must yield all-zero rows.
  def testOneHotOutOfBound(self):
    actual = nn.one_hot(jnp.array([-1, 3]), 3)
    expected = jnp.array([[0., 0., 0.],
                          [0., 0., 0.]])
    self.assertAllClose(actual, expected, check_dtypes=False)

  # A plain Python list is accepted as the index argument.
  def testOneHotNonArrayInput(self):
    actual = nn.one_hot([0, 1, 2], 3)
    expected = jnp.array([[1., 0., 0.],
                          [0., 1., 0.],
                          [0., 0., 1.]])
    self.assertAllClose(actual, expected, check_dtypes=False)

  # The dtype argument controls the output dtype (boolean here).
  def testOneHotCustomDtype(self):
    actual = nn.one_hot(jnp.array([0, 1, 2]), 3, dtype=jnp.bool_)
    expected = jnp.array([[True, False, False],
                          [False, True, False],
                          [False, False, True]])
    self.assertAllClose(actual, expected)

  # Passing num_classes as a traced value under jit must raise a
  # ConcretizationTypeError that names the offending argument.
  def testOneHotConcretizationError(self):
    # https://github.com/google/jax/issues/3654
    msg = r"in jax.nn.one_hot argument `num_classes`"
    with self.assertRaisesRegex(core.ConcretizationTypeError, msg):
      jax.jit(nn.one_hot)(3, 5)

  # axis=0 and axis=-2 must both produce the transposed encoding.
  def testOneHotAxis(self):
    expected = jnp.array([[0., 1., 0.],
                          [0., 0., 1.],
                          [1., 0., 0.]]).T

    actual = nn.one_hot(jnp.array([1, 2, 0]), 3, axis=0)
    self.assertAllClose(actual, expected, check_dtypes=False)

    actual = nn.one_hot(jnp.array([1, 2, 0]), 3, axis=-2)
    self.assertAllClose(actual, expected, check_dtypes=False)

  # --- miscellaneous --------------------------------------------------------

  # Only checks that the attribute lookup succeeds.
  def testTanhExists(self):
    nn.tanh  # doesn't crash

  # Runs a jitted scan over jax.nn.sigmoid under the tracer-leak checker.
  def testCustomJVPLeak(self):
    # https://github.com/google/jax/issues/8171
    @jax.jit
    def fwd():
      a = jnp.array(1.)

      def f(hx, _):
        hx = jax.nn.sigmoid(hx + a)
        return hx, None

      hx = jnp.array(0.)
      jax.lax.scan(f, hx, None, length=2)

    with jax.checking_leaks():
      fwd()  # doesn't crash

  # Same scenario as above, with a locally defined custom_jvp sigmoid.
  def testCustomJVPLeak2(self):
    # https://github.com/google/jax/issues/8171
    # The above test uses jax.nn.sigmoid, as in the original #8171, but that
    # function no longer actually has a custom_jvp! So we inline the old def.

    @jax.custom_jvp
    def sigmoid(x):
      one = jnp.float32(1)
      return jax.lax.div(one, jax.lax.add(one, jax.lax.exp(jax.lax.neg(x))))
    sigmoid.defjvps(lambda g, ans, x: g * ans * (jnp.float32(1) - ans))

    @jax.jit
    def fwd():
      a = jnp.array(1., 'float32')

      def f(hx, _):
        hx = sigmoid(hx + a)
        return hx, None

      hx = jnp.array(0., 'float32')
      jax.lax.scan(f, hx, None, length=2)

    with jax.checking_leaks():
      fwd()  # doesn't crash
# One entry per initializer under test: its display name, the initializer
# factory, and the shapes and dtypes it is exercised with.
InitializerRecord = collections.namedtuple(
    "InitializerRecord", ("name", "initializer", "shapes", "dtypes"))

# Candidate array shapes, from rank 1 up to rank 4.
ALL_SHAPES = [
    (2,),
    (2, 2), (2, 3), (3, 2),
    (2, 3, 4), (4, 3, 2),
    (2, 3, 4, 5),
]
def initializer_record(name, initializer, dtypes, min_dims=2, max_dims=4):
  """Build an InitializerRecord restricted to shapes of a given rank range.

  Args:
    name: label stored in the record.
    initializer: initializer factory stored in the record.
    dtypes: dtypes stored in the record.
    min_dims: smallest rank (inclusive) of the ALL_SHAPES entries to keep.
    max_dims: largest rank (inclusive) of the ALL_SHAPES entries to keep.

  Returns:
    An InitializerRecord whose `shapes` are the entries of ALL_SHAPES with
    min_dims <= rank <= max_dims, in their original order.
  """
  eligible_shapes = []
  for candidate in ALL_SHAPES:
    rank = len(candidate)
    if rank < min_dims or rank > max_dims:
      continue
    eligible_shapes.append(candidate)
  return InitializerRecord(
      name=name, initializer=initializer, shapes=eligible_shapes,
      dtypes=dtypes)
# Initializers covered by NNInitializersTest. Entries that do not pass
# min_dims/max_dims use the defaults of initializer_record (ranks 2 to 4).
INITIALIZER_RECS = [
    initializer_record(
        "uniform", nn.initializers.uniform, jtu.dtypes.floating, min_dims=1),
    initializer_record(
        "normal", nn.initializers.normal, jtu.dtypes.inexact, min_dims=1),
    initializer_record(
        "he_normal", nn.initializers.he_normal, jtu.dtypes.inexact),
    initializer_record(
        "he_uniform", nn.initializers.he_uniform, jtu.dtypes.inexact),
    initializer_record(
        "glorot_normal", nn.initializers.glorot_normal, jtu.dtypes.inexact),
    initializer_record(
        "glorot_uniform", nn.initializers.glorot_uniform, jtu.dtypes.inexact),
    initializer_record(
        "lecun_normal", nn.initializers.lecun_normal, jtu.dtypes.inexact),
    initializer_record(
        "lecun_uniform", nn.initializers.lecun_uniform, jtu.dtypes.inexact),
    # Only rank-2 shapes.
    initializer_record(
        "orthogonal", nn.initializers.orthogonal, jtu.dtypes.floating,
        min_dims=2, max_dims=2),
    initializer_record(
        "truncated_normal", nn.initializers.truncated_normal,
        jtu.dtypes.floating, min_dims=1),
    # Only rank-4 shapes.
    initializer_record(
        "delta_orthogonal", nn.initializers.delta_orthogonal,
        jtu.dtypes.floating, min_dims=4, max_dims=4),
]
@jtu.with_config(jax_legacy_prng_key="allow")
class NNInitializersTest(jtu.JaxTestCase):
  """Shape and dtype checks for the initializers in jax.nn.initializers."""

  # The initializer is built with default arguments; the dtype is supplied
  # when the initializer is called.
  @parameterized.parameters(itertools.chain.from_iterable(
      jtu.sample_product_testcases(
          [{"initializer": record.initializer()}],
          shape=record.shapes,
          dtype=record.dtypes,
      )
      for record in INITIALIZER_RECS))
  def testInitializer(self, initializer, shape, dtype):
    key = random.PRNGKey(0)
    sample = initializer(key, shape, dtype)

    self.assertEqual(shape, jnp.shape(sample))
    self.assertEqual(jax.dtypes.canonicalize_dtype(dtype), jnp.dtype(sample))

  # The dtype is supplied when the initializer is built, not when it is called.
  @parameterized.parameters(itertools.chain.from_iterable(
      jtu.sample_product_testcases(
          [{"initializer_provider": record.initializer}],
          shape=record.shapes,
          dtype=record.dtypes,
      )
      for record in INITIALIZER_RECS))
  def testInitializerProvider(self, initializer_provider, shape, dtype):
    key = random.PRNGKey(0)
    make_sample = initializer_provider(dtype=dtype)
    sample = make_sample(key, shape)

    self.assertEqual(shape, jnp.shape(sample))
    self.assertEqual(jax.dtypes.canonicalize_dtype(dtype), jnp.dtype(sample))

  # variance_scaling accepts tuples of axes for in_axis and out_axis.
  def testVarianceScalingMultiAxis(self):
    key = random.PRNGKey(0)
    shape = (2, 3, 4, 5)
    init_fn = nn.initializers.variance_scaling(
        scale=1.0,
        mode='fan_avg',
        distribution='truncated_normal',
        in_axis=(0, 1),
        out_axis=(-2, -1),
    )
    sample = init_fn(key, shape)

    self.assertEqual(shape, jnp.shape(sample))

  # variance_scaling accepts a batch_axis alongside in_axis and out_axis.
  def testVarianceScalingBatchAxis(self):
    key = random.PRNGKey(0)
    shape = (2, 3, 4, 5)
    init_fn = nn.initializers.variance_scaling(
        scale=1.0,
        mode='fan_avg',
        distribution='truncated_normal',
        in_axis=0,
        out_axis=(2, 3),
        batch_axis=1,
    )
    sample = init_fn(key, shape)

    self.assertEqual(shape, jnp.shape(sample))

  # A rank-1 shape must be rejected with a ValueError.
  def testVarianceScalingError(self):
    key = random.PRNGKey(0)
    shape = (5,)
    init_fn = nn.initializers.variance_scaling(
        scale=1.0, mode='fan_avg', distribution='truncated_normal')

    expected_message = (
        "Can't compute input and output sizes of a 1"
        "-dimensional weights tensor. Must be at least 2D."
    )
    with self.assertRaisesRegex(ValueError, expected_message):
      init_fn(key, shape)

  # A float32 scalar parameter must not upcast a bfloat16 initializer's output.
  def testAccidentalUpcasting(self):
    key = random.PRNGKey(0)
    shape = (4, 4)
    scalar_param = jnp.array(1.0, dtype=jnp.float32)
    init_fns = (
        nn.initializers.uniform(scalar_param, jnp.bfloat16),
        nn.initializers.normal(scalar_param, jnp.bfloat16),
        nn.initializers.truncated_normal(scalar_param, jnp.bfloat16),
    )
    for init_fn in init_fns:
      # Use a fresh subkey for every initializer.
      sub_key, key = random.split(key)
      sample = init_fn(sub_key, shape)
      self.assertEqual(sample.dtype, jnp.bfloat16)
if __name__ == "__main__":
  # Run the tests in this file through absl, using the test loader from jtu.
  absltest.main(testLoader=jtu.JaxTestLoader())