From 7d6fa3c05b07cdd454bfaf6067f8539f367e44aa Mon Sep 17 00:00:00 2001
From: Rahul Batra
Date: Tue, 30 Jan 2024 21:01:12 +0000
Subject: [PATCH] [ROCm]: Add support to continue on fail, fix script paths and
 update Dockerfile to add necessary packages

---
 build/rocm/Dockerfile.ms     |  2 +-
 build/rocm/run_multi_gpu.sh  |  4 ++--
 build/rocm/run_single_gpu.py | 78 ++++++++++++++++++++++++--------------
 3 files changed, 51 insertions(+), 33 deletions(-)

diff --git a/build/rocm/Dockerfile.ms b/build/rocm/Dockerfile.ms
index 25251a9de..5f831f111 100644
--- a/build/rocm/Dockerfile.ms
+++ b/build/rocm/Dockerfile.ms
@@ -32,6 +32,6 @@ RUN git clone https://github.com/pyenv/pyenv.git /pyenv
 ENV PYENV_ROOT /pyenv
 ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
 RUN pyenv install $PYTHON_VERSION
-RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger pytest-rerunfailures matplotlib absl-py flatbuffers hypothesis
+RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger pytest-reportlog pytest-rerunfailures cloudpickle portpicker matplotlib absl-py flatbuffers hypothesis
diff --git a/build/rocm/run_multi_gpu.sh b/build/rocm/run_multi_gpu.sh
index ff186db76..b5d5798e7 100755
--- a/build/rocm/run_multi_gpu.sh
+++ b/build/rocm/run_multi_gpu.sh
@@ -20,8 +20,8 @@ run_tests() {
     local base_dir=./logs
     local gpu_devices="$1"
     export HIP_VISIBLE_DEVICES=$gpu_devices
-    python3 -m pytest --html=$base_dir/multi_gpu_pmap_test_log.html --reruns 3 -x tests/pmap_test.py
-    python3 -m pytest --html=$base_dir/multi_gpu_multi_device_test_log.html --reruns 3 -x tests/multi_device_test.py
+    python3 -m pytest --html=$base_dir/multi_gpu_pmap_test_log.html --reruns 3 tests/pmap_test.py
+    python3 -m pytest --html=$base_dir/multi_gpu_multi_device_test_log.html --reruns 3 tests/multi_device_test.py
     python3 -m pytest_html_merger -i $base_dir/ -o $base_dir/final_compiled_report.html
 }
diff --git a/build/rocm/run_single_gpu.py b/build/rocm/run_single_gpu.py
index add7ee3d8..852bed05e 100755
--- a/build/rocm/run_single_gpu.py
+++ b/build/rocm/run_single_gpu.py
@@ -13,11 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import argparse
 import os
-import re
-import subprocess
+import json
+import argparse
 import threading
+import subprocess
 from concurrent.futures import ThreadPoolExecutor
 
 GPU_LOCK = threading.Lock()
@@ -40,7 +40,7 @@ def generate_final_report(shell=False, env_vars={}):
   if result.returncode != 0:
     print("FAILED - {}".format(" ".join(cmd)))
     print(result.stderr.decode())
-    # sys.exit(result.returncode)
+
   return result.returncode, result.stderr.decode(), result.stdout.decode()
 
 
@@ -54,32 +54,42 @@ def run_shell_command(cmd, shell=False, env_vars={}):
   if result.returncode != 0:
     print("FAILED - {}".format(" ".join(cmd)))
     print(result.stderr.decode())
-    # sys.exit(result.returncode)
+
   return result.returncode, result.stderr.decode(), result.stdout.decode()
 
 
+def parse_test_log(log_file):
+  """Parses the test module log file to extract test modules and functions."""
+  test_files = set()
+  with open(log_file, "r") as f:
+    for line in f:
+      report = json.loads(line)
+      if "nodeid" in report:
+        module = report["nodeid"].split("::")[0]
+        if module:
+          test_files.add(os.path.abspath(module))
+  return test_files
+
+
 def collect_testmodules():
-  all_test_files = []
+  log_file = f"{base_dir}/collect_module_log.jsonl"
   return_code, stderr, stdout = run_shell_command(
-      ["python3", "-m", "pytest", "--collect-only", "tests"])
+      ["python3", "-m", "pytest", "--collect-only", "tests", f"--report-log={log_file}"])
   if return_code != 0:
-    print(stdout)
-    print(stderr)
     print("Test module discovery failed.")
+    print("STDOUT:", stdout)
+    print("STDERR:", stderr)
     exit(return_code)
-  for line in stdout.split("\n"):
-    match = re.match("<Module (.*)>", line)
-    if match:
-      test_file = match.group(1)
-      all_test_files.append(test_file)
   print("---------- collected test modules ----------")
-  print("Found %d test modules." % (len(all_test_files)))
-  print("\n".join(all_test_files))
+  test_files = parse_test_log(log_file)
+  print("Found %d test modules." % (len(test_files)))
   print("--------------------------------------------")
-  return all_test_files
+  print("\n".join(test_files))
+
+  return test_files
 
 
-def run_test(testmodule, gpu_tokens):
+def run_test(testmodule, gpu_tokens, continue_on_fail):
   global LAST_CODE
   with GPU_LOCK:
     if LAST_CODE != 0:
@@ -90,7 +100,10 @@ def run_test(testmodule, gpu_tokens):
       "XLA_PYTHON_CLIENT_ALLOCATOR": "default",
   }
   testfile = extract_filename(testmodule)
-  cmd = ["python3", "-m", "pytest", f'--html={base_dir}/{testfile}_log.html', "--reruns", "3", "-x", testmodule]
+  if continue_on_fail:
+    cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-v", testmodule]
+  else:
+    cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-x", "-v", testmodule]
   return_code, stderr, stdout = run_shell_command(cmd, env_vars=env_vars)
   with GPU_LOCK:
     gpu_tokens.append(target_gpu)
@@ -98,31 +111,30 @@
     print("Running tests in module %s on GPU %d:" % (testmodule, target_gpu))
     print(stdout)
     print(stderr)
-    LAST_CODE = return_code
-  return
+    if continue_on_fail == False:
+      LAST_CODE = return_code
 
 
-def run_parallel(all_testmodules, p):
-  print("Running tests with parallelism=", p)
+def run_parallel(all_testmodules, p, c):
+  print(f"Running tests with parallelism=", p)
   available_gpu_tokens = list(range(p))
   executor = ThreadPoolExecutor(max_workers=p)
-  # walking through test modules
+  # walking through test modules.
   for testmodule in all_testmodules:
-    executor.submit(run_test, testmodule, available_gpu_tokens)
-  # waiting for all modules to finish
-  executor.shutdown(wait=True)  # wait for all jobs to finish
-  return
+    executor.submit(run_test, testmodule, available_gpu_tokens, c)
+  # waiting for all modules to finish.
+  executor.shutdown(wait=True)
 
 
 def find_num_gpus():
-  cmd = ["lspci|grep 'controller'|grep 'AMD/ATI'|wc -l"]
+  cmd = ["lspci|grep 'controller\|accel'|grep 'AMD/ATI'|wc -l"]
   _, _, stdout = run_shell_command(cmd, shell=True)
   return int(stdout)
 
 
 def main(args):
   all_testmodules = collect_testmodules()
-  run_parallel(all_testmodules, args.parallel)
+  run_parallel(all_testmodules, args.parallel, args.continue_on_fail)
   generate_final_report()
   exit(LAST_CODE)
 
@@ -134,7 +146,13 @@ if __name__ == '__main__':
                       "--parallel",
                       type=int,
                       help="number of tests to run in parallel")
+  parser.add_argument("-c",
+                      "--continue_on_fail",
+                      action='store_true',
+                      help="continue on failure")
   args = parser.parse_args()
+  if args.continue_on_fail:
+    print("continue on fail is set")
   if args.parallel is None:
     sys_gpu_count = find_num_gpus()
     args.parallel = sys_gpu_count
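
Editorial note (not part of the patch): run_single_gpu.py schedules one test module per GPU by keeping a lock-guarded pool of integer GPU "tokens"; each worker thread borrows a token, pins its pytest invocation to that GPU via HIP_VISIBLE_DEVICES, and returns the token when the module finishes. The standalone sketch below illustrates that pattern and the -x vs. continue-on-fail behavior the patch adds. It is a minimal sketch only: the function names run_module_on_free_gpu and run_all are illustrative and do not exist in the patched script, and it omits the script's LAST_CODE tracking, pytest-html reports, and --report-log based module collection.

# Minimal sketch of the GPU-token scheduling pattern (illustrative names,
# not functions from build/rocm/run_single_gpu.py).
import os
import sys
import threading
import subprocess
from concurrent.futures import ThreadPoolExecutor

GPU_LOCK = threading.Lock()

def run_module_on_free_gpu(module, gpu_tokens, continue_on_fail):
  """Borrow a GPU token, run one test module pinned to that GPU, return its exit code."""
  with GPU_LOCK:
    gpu = gpu_tokens.pop()  # take a free GPU from the pool
  try:
    # Pin the child pytest process to the borrowed GPU.
    env = {**os.environ, "HIP_VISIBLE_DEVICES": str(gpu)}
    cmd = ["python3", "-m", "pytest", "-v", module]
    if not continue_on_fail:
      cmd.insert(3, "-x")  # stop this module at the first failure
    result = subprocess.run(cmd, env=env, capture_output=True)
    return result.returncode
  finally:
    with GPU_LOCK:
      gpu_tokens.append(gpu)  # hand the GPU back for the next module

def run_all(modules, parallelism, continue_on_fail=False):
  tokens = list(range(parallelism))  # one token per visible GPU
  with ThreadPoolExecutor(max_workers=parallelism) as pool:
    futures = [pool.submit(run_module_on_free_gpu, m, tokens, continue_on_fail)
               for m in modules]
  # The with-block waits for all modules; report the worst exit code seen.
  return max((f.result() for f in futures), default=0)

if __name__ == "__main__":
  sys.exit(run_all(["tests/pmap_test.py"], parallelism=1, continue_on_fail=True))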