rocm_jax/build/rocm/run_single_gpu.py

#!/usr/bin/env python3
# Copyright 2022 The JAX Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import csv
import json
import argparse
import threading
import subprocess
from concurrent.futures import ThreadPoolExecutor

GPU_LOCK = threading.Lock()
LAST_CODE = 0
base_dir = "./logs"

def extract_filename(path):
  base_name = os.path.basename(path)
  file_name, _ = os.path.splitext(base_name)
  return file_name


def combine_json_reports():
  all_json_files = [f for f in os.listdir(base_dir) if f.endswith('_log.json')]
  combined_data = []
  for json_file in all_json_files:
    with open(os.path.join(base_dir, json_file), 'r') as infile:
      data = json.load(infile)
      combined_data.append(data)
  combined_json_file = f"{base_dir}/final_compiled_report.json"
  with open(combined_json_file, 'w') as outfile:
    json.dump(combined_data, outfile, indent=4)


def combine_csv_reports():
  all_csv_files = [f for f in os.listdir(base_dir) if f.endswith('_log.csv')]
  combined_csv_file = f"{base_dir}/final_compiled_report.csv"
  with open(combined_csv_file, mode='w', newline='') as outfile:
    csv_writer = csv.writer(outfile)
    for i, csv_file in enumerate(all_csv_files):
      with open(os.path.join(base_dir, csv_file), mode='r') as infile:
        csv_reader = csv.reader(infile)
        if i == 0:
          # write headers only once
          csv_writer.writerow(next(csv_reader))
        for row in csv_reader:
          csv_writer.writerow(row)


def generate_final_report(shell=False, env_vars={}):
  env = os.environ
  env = {**env, **env_vars}
  cmd = ["pytest_html_merger", "-i", f'{base_dir}', "-o", f'{base_dir}/final_compiled_report.html']
  result = subprocess.run(cmd,
                          shell=shell,
                          capture_output=True,
                          env=env)
  if result.returncode != 0:
    print("FAILED - {}".format(" ".join(cmd)))
    print(result.stderr.decode())

  # Generate json reports.
  combine_json_reports()
  # Generate csv reports.
  combine_csv_reports()


def run_shell_command(cmd, shell=False, env_vars={}):
  env = os.environ
  env = {**env, **env_vars}
  result = subprocess.run(cmd,
                          shell=shell,
                          capture_output=True,
                          env=env)
  if result.returncode != 0:
    print("FAILED - {}".format(" ".join(cmd)))
    print(result.stderr.decode())

  return result.returncode, result.stderr.decode(), result.stdout.decode()


def parse_test_log(log_file):
  """Parses the test module log file to extract test modules and functions."""
  test_files = set()
  with open(log_file, "r") as f:
    for line in f:
      report = json.loads(line)
      if "nodeid" in report:
        module = report["nodeid"].split("::")[0]
        if module and ".py" in module:
          test_files.add(os.path.abspath(module))
  return test_files


def collect_testmodules():
  log_file = f"{base_dir}/collect_module_log.jsonl"
  return_code, stderr, stdout = run_shell_command(
      ["python3", "-m", "pytest", "--collect-only", "tests", f"--report-log={log_file}"])
  if return_code != 0:
    print("Test module discovery failed.")
    print("STDOUT:", stdout)
    print("STDERR:", stderr)
    exit(return_code)
  print("---------- collected test modules ----------")
  test_files = parse_test_log(log_file)
  print("Found %d test modules." % (len(test_files)))
  print("--------------------------------------------")
  print("\n".join(test_files))
  return test_files


def run_test(testmodule, gpu_tokens, continue_on_fail):
  global LAST_CODE
  with GPU_LOCK:
    if LAST_CODE != 0:
      return
    target_gpu = gpu_tokens.pop()
  env_vars = {
      "HIP_VISIBLE_DEVICES": str(target_gpu),
      "XLA_PYTHON_CLIENT_ALLOCATOR": "default",
  }
  testfile = extract_filename(testmodule)
  if continue_on_fail:
    cmd = ["python3", "-m", "pytest",
          "--json-report", f"--json-report-file={base_dir}/{testfile}_log.json",
          f"--csv={base_dir}/{testfile}_log.csv",
          "--csv-columns", "id,module,name,file,status,duration",
          f"--html={base_dir}/{testfile}_log.html",
          "--reruns", "3", "-v", testmodule]
  else:
    cmd = ["python3", "-m", "pytest",
          "--json-report", f"--json-report-file={base_dir}/{testfile}_log.json",
          f"--csv={base_dir}/{testfile}_log.csv",
          "--csv-columns", "id,module,name,file,status,duration",
          f"--html={base_dir}/{testfile}_log.html",
          "--reruns", "3", "-x", "-v", testmodule]

  return_code, stderr, stdout = run_shell_command(cmd, env_vars=env_vars)
  with GPU_LOCK:
    gpu_tokens.append(target_gpu)
    if LAST_CODE == 0:
      print("Running tests in module %s on GPU %d:" % (testmodule, target_gpu))
      print(stdout)
      print(stderr)
      if continue_on_fail == False:
          LAST_CODE = return_code


def run_parallel(all_testmodules, p, c):
  print(f"Running tests with parallelism = {p}")
  available_gpu_tokens = list(range(p))
  executor = ThreadPoolExecutor(max_workers=p)
  # walking through test modules.
  for testmodule in all_testmodules:
    executor.submit(run_test, testmodule, available_gpu_tokens, c)
  # waiting for all modules to finish.
  executor.shutdown(wait=True)


def find_num_gpus():
  cmd = [r"lspci|grep 'controller\|accel'|grep 'AMD/ATI'|wc -l"]
  _, _, stdout = run_shell_command(cmd, shell=True)
  return int(stdout)


def main(args):
  all_testmodules = collect_testmodules()
  run_parallel(all_testmodules, args.parallel, args.continue_on_fail)
  generate_final_report()
  exit(LAST_CODE)


if __name__ == '__main__':
  os.environ['HSA_TOOLS_LIB'] = "libroctracer64.so"
  parser = argparse.ArgumentParser()
  parser.add_argument("-p",
                      "--parallel",
                      type=int,
                      help="number of tests to run in parallel")
  parser.add_argument("-c",
                      "--continue_on_fail",
                      action='store_true',
                      help="continue on failure")
  args = parser.parse_args()
  if args.continue_on_fail:
      print("continue on fail is set")
  if args.parallel is None:
    sys_gpu_count = find_num_gpus()
    args.parallel = sys_gpu_count
    print("%d GPUs detected." % sys_gpu_count)

  main(args)
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`#!/usr/bin/env python3`
Change JAX's copyright attribution from "Google LLC" to "The JAX Authors.". See https://opensource.google/documentation/reference/releasing/contributions#copyright for more details. PiperOrigin-RevId: 476167538 2022-09-22 12:26:48 -07:00			`# Copyright 2022 The JAX Authors.`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# https://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`import os`
[ROCm] improve gpu script 2024-07-31 15:04:58 -05:00			`import csv`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`import json`
			`import argparse`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`import threading`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`import subprocess`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`from concurrent.futures import ThreadPoolExecutor`

			`GPU_LOCK = threading.Lock()`
			`LAST_CODE = 0`
Fix pre-commit formatting errors It looks like #21916 introduced some whitespace issues that are causing our pre-commits to fail. 2024-06-26 06:38:51 -04:00			`base_dir = "./logs"`
[ROCM]: Generating pytest html logs from unit-tests. 2024-01-17 22:48:09 +00:00
			`def extract_filename(path):`
			`base_name = os.path.basename(path)`
			`file_name, _ = os.path.splitext(base_name)`
			`return file_name`

[ROCm] improve gpu script 2024-07-31 15:04:58 -05:00
			`def combine_json_reports():`
			`all_json_files = [f for f in os.listdir(base_dir) if f.endswith('_log.json')]`
			`combined_data = []`
			`for json_file in all_json_files:`
			`with open(os.path.join(base_dir, json_file), 'r') as infile:`
			`data = json.load(infile)`
			`combined_data.append(data)`
			`combined_json_file = f"{base_dir}/final_compiled_report.json"`
			`with open(combined_json_file, 'w') as outfile:`
			`json.dump(combined_data, outfile, indent=4)`


			`def combine_csv_reports():`
			`all_csv_files = [f for f in os.listdir(base_dir) if f.endswith('_log.csv')]`
			`combined_csv_file = f"{base_dir}/final_compiled_report.csv"`
			`with open(combined_csv_file, mode='w', newline='') as outfile:`
			`csv_writer = csv.writer(outfile)`
			`for i, csv_file in enumerate(all_csv_files):`
			`with open(os.path.join(base_dir, csv_file), mode='r') as infile:`
			`csv_reader = csv.reader(infile)`
			`if i == 0:`
			`# write headers only once`
			`csv_writer.writerow(next(csv_reader))`
			`for row in csv_reader:`
			`csv_writer.writerow(row)`


[ROCM]: Generating pytest html logs from unit-tests. 2024-01-17 22:48:09 +00:00			`def generate_final_report(shell=False, env_vars={}):`
			`env = os.environ`
			`env = {env, env_vars}`
Run `pyupgrade --py310-plus`. Also apply manual fixes to import sorting and unused imports. 2024-06-26 14:44:52 -04:00			`cmd = ["pytest_html_merger", "-i", f'{base_dir}', "-o", f'{base_dir}/final_compiled_report.html']`
[ROCM]: Generating pytest html logs from unit-tests. 2024-01-17 22:48:09 +00:00			`result = subprocess.run(cmd,`
			`shell=shell,`
			`capture_output=True,`
			`env=env)`
			`if result.returncode != 0:`
			`print("FAILED - {}".format(" ".join(cmd)))`
			`print(result.stderr.decode())`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00
[ROCm] improve gpu script 2024-07-31 15:04:58 -05:00			`# Generate json reports.`
			`combine_json_reports()`
			`# Generate csv reports.`
			`combine_csv_reports()`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00

			`def run_shell_command(cmd, shell=False, env_vars={}):`
			`env = os.environ`
			`env = {env, env_vars}`
			`result = subprocess.run(cmd,`
			`shell=shell,`
feat: refactor code using pyupgrade This PR upgrades legacy Python code to 3.7+ code using pyupgrade: ```sh pyupgrade --py37-plus --keep-runtime-typing **.py ``` a 2022-05-12 19:13:00 +01:00			`capture_output=True,`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`env=env)`
			`if result.returncode != 0:`
			`print("FAILED - {}".format(" ".join(cmd)))`
			`print(result.stderr.decode())`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`return result.returncode, result.stderr.decode(), result.stdout.decode()`


[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`def parse_test_log(log_file):`
			`"""Parses the test module log file to extract test modules and functions."""`
			`test_files = set()`
			`with open(log_file, "r") as f:`
			`for line in f:`
			`report = json.loads(line)`
			`if "nodeid" in report:`
			`module = report["nodeid"].split("::")[0]`
[ROCm] improve gpu script 2024-07-31 15:04:58 -05:00			`if module and ".py" in module:`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`test_files.add(os.path.abspath(module))`
			`return test_files`


Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`def collect_testmodules():`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`log_file = f"{base_dir}/collect_module_log.jsonl"`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`return_code, stderr, stdout = run_shell_command(`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`["python3", "-m", "pytest", "--collect-only", "tests", f"--report-log={log_file}"])`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`if return_code != 0:`
			`print("Test module discovery failed.")`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`print("STDOUT:", stdout)`
			`print("STDERR:", stderr)`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`exit(return_code)`
			`print("---------- collected test modules ----------")`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`test_files = parse_test_log(log_file)`
			`print("Found %d test modules." % (len(test_files)))`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`print("--------------------------------------------")`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`print("\n".join(test_files))`
			`return test_files`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00

[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`def run_test(testmodule, gpu_tokens, continue_on_fail):`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`global LAST_CODE`
			`with GPU_LOCK:`
			`if LAST_CODE != 0:`
			`return`
			`target_gpu = gpu_tokens.pop()`
			`env_vars = {`
			`"HIP_VISIBLE_DEVICES": str(target_gpu),`
			`"XLA_PYTHON_CLIENT_ALLOCATOR": "default",`
			`}`
[ROCM]: Generating pytest html logs from unit-tests. 2024-01-17 22:48:09 +00:00			`testfile = extract_filename(testmodule)`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`if continue_on_fail:`
[ROCm] improve gpu script 2024-07-31 15:04:58 -05:00			`cmd = ["python3", "-m", "pytest",`
			`"--json-report", f"--json-report-file={base_dir}/{testfile}_log.json",`
			`f"--csv={base_dir}/{testfile}_log.csv",`
			`"--csv-columns", "id,module,name,file,status,duration",`
			`f"--html={base_dir}/{testfile}_log.html",`
			`"--reruns", "3", "-v", testmodule]`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`else:`
[ROCm] improve gpu script 2024-07-31 15:04:58 -05:00			`cmd = ["python3", "-m", "pytest",`
			`"--json-report", f"--json-report-file={base_dir}/{testfile}_log.json",`
			`f"--csv={base_dir}/{testfile}_log.csv",`
			`"--csv-columns", "id,module,name,file,status,duration",`
			`f"--html={base_dir}/{testfile}_log.html",`
			`"--reruns", "3", "-x", "-v", testmodule]`

Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`return_code, stderr, stdout = run_shell_command(cmd, env_vars=env_vars)`
			`with GPU_LOCK:`
			`gpu_tokens.append(target_gpu)`
			`if LAST_CODE == 0:`
			`print("Running tests in module %s on GPU %d:" % (testmodule, target_gpu))`
			`print(stdout)`
			`print(stderr)`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`if continue_on_fail == False:`
			`LAST_CODE = return_code`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00

[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`def run_parallel(all_testmodules, p, c):`
[ROCm] improve gpu script 2024-07-31 15:04:58 -05:00			`print(f"Running tests with parallelism = {p}")`
Merge pull request #9584 from ROCmSoftwarePlatform:rocm_refactor_jaxlib PiperOrigin-RevId: 432236852 2022-03-03 11:11:02 -08:00			`available_gpu_tokens = list(range(p))`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`executor = ThreadPoolExecutor(max_workers=p)`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`# walking through test modules.`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`for testmodule in all_testmodules:`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`executor.submit(run_test, testmodule, available_gpu_tokens, c)`
			`# waiting for all modules to finish.`
			`executor.shutdown(wait=True)`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00

			`def find_num_gpus():`
Fix mypy error 2024-08-27 16:57:53 +01:00			`cmd = [r"lspci\|grep 'controller\\|accel'\|grep 'AMD/ATI'\|wc -l"]`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`_, _, stdout = run_shell_command(cmd, shell=True)`
			`return int(stdout)`


			`def main(args):`
			`all_testmodules = collect_testmodules()`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`run_parallel(all_testmodules, args.parallel, args.continue_on_fail)`
[ROCM]: Generating pytest html logs from unit-tests. 2024-01-17 22:48:09 +00:00			`generate_final_report()`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`exit(LAST_CODE)`


			`if __name__ == '__main__':`
[ROCm] Upgrade to ROCm 5.3 and associated enhancements 2022-10-01 03:39:55 +00:00			`os.environ['HSA_TOOLS_LIB'] = "libroctracer64.so"`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`parser = argparse.ArgumentParser()`
			`parser.add_argument("-p",`
			`"--parallel",`
			`type=int,`
			`help="number of tests to run in parallel")`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`parser.add_argument("-c",`
			`"--continue_on_fail",`
			`action='store_true',`
			`help="continue on failure")`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`args = parser.parse_args()`
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages 2024-01-30 21:01:12 +00:00			`if args.continue_on_fail:`
			`print("continue on fail is set")`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00			`if args.parallel is None:`
			`sys_gpu_count = find_num_gpus()`
			`args.parallel = sys_gpu_count`
Merge pull request #9584 from ROCmSoftwarePlatform:rocm_refactor_jaxlib PiperOrigin-RevId: 432236852 2022-03-03 11:11:02 -08:00			`print("%d GPUs detected." % sys_gpu_count)`
Update JAX to use new math libraries in ROCm-5.0. 2022-02-15 17:54:02 +00:00
			`main(args)`