[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages

Rahul Batra 2024-01-30 21:01:12 +00:00 committed by Ruturaj4
parent d54bf529cc
commit 7d6fa3c05b
3 changed files with 51 additions and 33 deletions


@@ -32,6 +32,6 @@ RUN git clone https://github.com/pyenv/pyenv.git /pyenv
ENV PYENV_ROOT /pyenv
ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
RUN pyenv install $PYTHON_VERSION
RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger pytest-rerunfailures matplotlib absl-py flatbuffers hypothesis
RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger pytest-reportlog pytest-rerunfailures cloudpickle portpicker matplotlib absl-py flatbuffers hypothesis


@@ -20,8 +20,8 @@ run_tests() {
local base_dir=./logs
local gpu_devices="$1"
export HIP_VISIBLE_DEVICES=$gpu_devices
python3 -m pytest --html=$base_dir/multi_gpu_pmap_test_log.html --reruns 3 -x tests/pmap_test.py
python3 -m pytest --html=$base_dir/multi_gpu_multi_device_test_log.html --reruns 3 -x tests/multi_device_test.py
python3 -m pytest --html=$base_dir/multi_gpu_pmap_test_log.html --reruns 3 tests/pmap_test.py
python3 -m pytest --html=$base_dir/multi_gpu_multi_device_test_log.html --reruns 3 tests/multi_device_test.py
python3 -m pytest_html_merger -i $base_dir/ -o $base_dir/final_compiled_report.html
}
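
Dropping -x from these two multi-GPU suites in the shell runner means a single failing test no longer aborts the whole module; --reruns 3 still retries flaky cases, and pytest_html_merger then stitches the per-suite HTML reports into final_compiled_report.html. For orientation, a minimal sketch (assuming a ROCm build of JAX is installed) of how the HIP_VISIBLE_DEVICES export above limits the devices those tests see:

# sketch: the variable must be set before JAX initializes its backend
import os
os.environ["HIP_VISIBLE_DEVICES"] = "0,1"
import jax
print(jax.device_count())  # expected to report the two visible ROCm GPUs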


@@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import re
import subprocess
import json
import argparse
import threading
import subprocess
from concurrent.futures import ThreadPoolExecutor
GPU_LOCK = threading.Lock()
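
GPU_LOCK guards the shared state used by the runner below: the list of free GPU indices handed out to worker threads and the LAST_CODE result flag. A minimal, self-contained sketch of that token scheme (names here are illustrative, not taken from the script):

import threading
from concurrent.futures import ThreadPoolExecutor

lock = threading.Lock()
tokens = [0, 1]  # pretend we have two GPUs

def run_module(name):
    with lock:
        gpu = tokens.pop()      # claim a free GPU index
    try:
        print(f"{name} runs pinned to GPU {gpu}")  # stand-in for the pytest call
    finally:
        with lock:
            tokens.append(gpu)  # return the token for the next module

with ThreadPoolExecutor(max_workers=len(tokens)) as ex:
    for name in ["mod_a", "mod_b", "mod_c"]:
        ex.submit(run_module, name)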
@@ -40,7 +40,7 @@ def generate_final_report(shell=False, env_vars={}):
if result.returncode != 0:
print("FAILED - {}".format(" ".join(cmd)))
print(result.stderr.decode())
# sys.exit(result.returncode)
return result.returncode, result.stderr.decode(), result.stdout.decode()
@@ -54,32 +54,42 @@ def run_shell_command(cmd, shell=False, env_vars={}):
if result.returncode != 0:
print("FAILED - {}".format(" ".join(cmd)))
print(result.stderr.decode())
# sys.exit(result.returncode)
return result.returncode, result.stderr.decode(), result.stdout.decode()
def parse_test_log(log_file):
"""Parses the test module log file to extract test modules and functions."""
test_files = set()
with open(log_file, "r") as f:
for line in f:
report = json.loads(line)
if "nodeid" in report:
module = report["nodeid"].split("::")[0]
if module:
test_files.add(os.path.abspath(module))
return test_files
def collect_testmodules():
all_test_files = []
log_file = f"{base_dir}/collect_module_log.jsonl"
return_code, stderr, stdout = run_shell_command(
["python3", "-m", "pytest", "--collect-only", "tests"])
["python3", "-m", "pytest", "--collect-only", "tests", f"--report-log={log_file}"])
if return_code != 0:
print(stdout)
print(stderr)
print("Test module discovery failed.")
print("STDOUT:", stdout)
print("STDERR:", stderr)
exit(return_code)
for line in stdout.split("\n"):
match = re.match("<Module (.*)>", line)
if match:
test_file = match.group(1)
all_test_files.append(test_file)
print("---------- collected test modules ----------")
print("Found %d test modules." % (len(all_test_files)))
print("\n".join(all_test_files))
test_files = parse_test_log(log_file)
print("Found %d test modules." % (len(test_files)))
print("--------------------------------------------")
return all_test_files
print("\n".join(test_files))
return test_files
def run_test(testmodule, gpu_tokens):
def run_test(testmodule, gpu_tokens, continue_on_fail):
global LAST_CODE
with GPU_LOCK:
if LAST_CODE != 0:
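
For reference, parse_test_log above consumes the JSONL file produced by --report-log: each line is one JSON object, only records carrying a nodeid matter, and the text before the first "::" names the test module. A hand-written illustration of that extraction (not captured from a real run):

import json, os
line = '{"$report_type": "CollectReport", "nodeid": "tests/example_test.py::ExampleTest::test_case"}'
report = json.loads(line)
module = report["nodeid"].split("::")[0]
print(os.path.abspath(module))  # path collected into the test_files set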
@@ -90,7 +100,10 @@ def run_test(testmodule, gpu_tokens):
"XLA_PYTHON_CLIENT_ALLOCATOR": "default",
}
testfile = extract_filename(testmodule)
cmd = ["python3", "-m", "pytest", f'--html={base_dir}/{testfile}_log.html', "--reruns", "3", "-x", testmodule]
if continue_on_fail:
cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-v", testmodule]
else:
cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-x", "-v", testmodule]
return_code, stderr, stdout = run_shell_command(cmd, env_vars=env_vars)
with GPU_LOCK:
gpu_tokens.append(target_gpu)
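
With the new --continue_on_fail flag the per-module pytest command simply drops -x, so the remaining tests in a module still run after a failure; both variants keep -v, --reruns 3, and the per-module HTML report. For a hypothetical module the two invocations built above differ only in that flag (paths and file names below are illustrative):

# illustrative only, for a hypothetical tests/example_test.py module
common = ["python3", "-m", "pytest", "--html=./logs/example_test_log.html", "--reruns", "3"]
cmd_continue = common + ["-v", "tests/example_test.py"]        # --continue_on_fail: keep going
cmd_stop     = common + ["-x", "-v", "tests/example_test.py"]  # default: stop module on first failure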
@@ -98,31 +111,30 @@ def run_test(testmodule, gpu_tokens):
print("Running tests in module %s on GPU %d:" % (testmodule, target_gpu))
print(stdout)
print(stderr)
LAST_CODE = return_code
return
if not continue_on_fail:
LAST_CODE = return_code
def run_parallel(all_testmodules, p):
print("Running tests with parallelism=", p)
def run_parallel(all_testmodules, p, c):
print(f"Running tests with parallelism=", p)
available_gpu_tokens = list(range(p))
executor = ThreadPoolExecutor(max_workers=p)
# walking through test modules
# walking through test modules.
for testmodule in all_testmodules:
executor.submit(run_test, testmodule, available_gpu_tokens)
# waiting for all modules to finish
executor.shutdown(wait=True) # wait for all jobs to finish
return
executor.submit(run_test, testmodule, available_gpu_tokens, c)
# waiting for all modules to finish.
executor.shutdown(wait=True)
def find_num_gpus():
cmd = ["lspci|grep 'controller'|grep 'AMD/ATI'|wc -l"]
cmd = ["lspci|grep 'controller\|accel'|grep 'AMD/ATI'|wc -l"]
_, _, stdout = run_shell_command(cmd, shell=True)
return int(stdout)
def main(args):
all_testmodules = collect_testmodules()
run_parallel(all_testmodules, args.parallel)
run_parallel(all_testmodules, args.parallel, args.continue_on_fail)
generate_final_report()
exit(LAST_CODE)
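
Extending the lspci filter to match 'accel' as well as 'controller' counts AMD devices that enumerate as processing accelerators rather than display controllers, so the auto-detected parallelism matches the real GPU count. The same count could be reproduced roughly like this (a sketch assuming lspci is available, not part of the commit):

import subprocess
out = subprocess.run(["lspci"], capture_output=True, text=True).stdout
count = sum(1 for line in out.splitlines()
            if "AMD/ATI" in line and ("controller" in line or "accel" in line))
print(count)  # mirrors find_num_gpus() above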
@@ -134,7 +146,13 @@ if __name__ == '__main__':
"--parallel",
type=int,
help="number of tests to run in parallel")
parser.add_argument("-c",
"--continue_on_fail",
action='store_true',
help="continue on failure")
args = parser.parse_args()
if args.continue_on_fail:
print("continue on fail is set")
if args.parallel is None:
sys_gpu_count = find_num_gpus()
args.parallel = sys_gpu_count
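
Putting it together: when --parallel is omitted the worker count falls back to the detected GPU count, and when --continue_on_fail is set LAST_CODE is never overwritten by a failing module (see run_test above), so failures surface in the HTML reports rather than the runner's exit status. A hypothetical invocation from CI (the runner's file name is a placeholder, not taken from this commit):

import subprocess
# -c: keep running after failures; -p: explicit parallelism, otherwise one worker per detected GPU
subprocess.run(["python3", "run_gpu_tests.py", "-p", "4", "-c"], check=True)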