rocm_jax/jaxlib/rocm/rocm_plugin_extension.cc
Peter Hawkins 3f91b4b43a Move jaxlib/{cuda,rocm}_plugin_extension into jaxlib/{cuda/rocm}/
Move the common jaxlib/gpu_plugin_extension into jaxlib/gpu/

Cleanup only, no functional changes intended.

PiperOrigin-RevId: 738183402
2025-03-18 16:29:37 -07:00

90 lines
2.9 KiB
C++

/* Copyright 2024 The JAX Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <cstdint>
#include <string>
#include "nanobind/nanobind.h"
#include "absl/log/log.h"
#include "absl/strings/str_cat.h"
#include "rocm/include/hip/hip_runtime.h"
#include "jaxlib/gpu/gpu_plugin_extension.h"
namespace nb = nanobind;
namespace xla {
namespace {
// Returns a human-readable name for a HIP error code.
//
// Codes that have a case in the switch below map to a fixed symbolic string.
// Every other value (including hipSuccess, which has no case here) is rendered
// as "hipError_t(<numeric value>)".
std::string ToString(hipError_t result) {
// Expands to a case label for hipError<name> that returns "HIP_ERROR_<name>".
// The parameter is deliberately not spelled with a leading double underscore:
// such identifiers are reserved for the implementation.
#define OSTREAM_ROCM_ERROR(name) \
  case hipError##name:           \
    return "HIP_ERROR_" #name;

  switch (result) {
    OSTREAM_ROCM_ERROR(InvalidValue)
    OSTREAM_ROCM_ERROR(OutOfMemory)
    OSTREAM_ROCM_ERROR(NotInitialized)
    OSTREAM_ROCM_ERROR(Deinitialized)
    OSTREAM_ROCM_ERROR(NoDevice)
    OSTREAM_ROCM_ERROR(InvalidDevice)
    OSTREAM_ROCM_ERROR(InvalidImage)
    OSTREAM_ROCM_ERROR(InvalidContext)
    OSTREAM_ROCM_ERROR(InvalidHandle)
    OSTREAM_ROCM_ERROR(NotFound)
    OSTREAM_ROCM_ERROR(NotReady)
    OSTREAM_ROCM_ERROR(NoBinaryForGpu)
    // Encountered an uncorrectable ECC error during execution.
    OSTREAM_ROCM_ERROR(ECCNotCorrectable)
    // NOTE: the next two cases use raw numeric labels and a "ROCM_ERROR_"
    // prefix, unlike the macro-generated "HIP_ERROR_" strings. Both the labels
    // and the strings are kept exactly as they were.
    // Load/store on an invalid address. Must reboot all context.
    case 700:
      return "ROCM_ERROR_ILLEGAL_ADDRESS";
    // Passed too many / wrong arguments, too many threads for register count.
    case 701:
      return "ROCM_ERROR_LAUNCH_OUT_OF_RESOURCES";
    OSTREAM_ROCM_ERROR(ContextAlreadyInUse)
    OSTREAM_ROCM_ERROR(PeerAccessUnsupported)
    OSTREAM_ROCM_ERROR(Unknown)  // Unknown internal error to ROCM.
    default:
      // No symbolic name known here: fall back to the numeric value.
      return absl::StrCat("hipError_t(", static_cast<int>(result), ")");
  }
// The helper macro is only meaningful inside the switch above; undefine it so
// it does not leak into the rest of the translation unit.
#undef OSTREAM_ROCM_ERROR
}
} // namespace
// Python module definition for the ROCm plugin extension.
NB_MODULE(rocm_plugin_extension, m) {
  // Register the bindings provided by the common GPU plugin extension.
  BuildGpuPluginExtension(m);

  // get_device_ordinal(data_value) -> int
  //
  // Interprets `data_value` as a pointer value and asks HIP which device
  // ordinal it is associated with. A zero value short-circuits to ordinal 0
  // without calling into HIP. If the HIP query fails, the failure is logged at
  // FATAL severity together with the decoded error name.
  m.def(
      "get_device_ordinal",
      [](std::intptr_t data_value) -> int {
        if (data_value == 0) {
          return 0;
        }

        void* const address = reinterpret_cast<void*>(data_value);
        int ordinal;  // Written by hipPointerGetAttribute on success.
        const hipError_t status = hipPointerGetAttribute(
            static_cast<void*>(&ordinal), HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
            reinterpret_cast<hipDeviceptr_t>(address));

        if (status != hipSuccess) {
          LOG(FATAL) << "Not able to get the device_ordinal for ptr: "
                     << address << ". Error: " << ToString(status);
        }
        return ordinal;
      },
      nb::arg("data_value"));
}
} // namespace xla