[ROCM]: Generating pytest html logs from unit-tests.

2025-04-18 12:56:07 +00:00 · 2024-01-17 22:48:09 +00:00 · 2024-01-17 22:48:09 +00:00 · ef7694f26a
commit ef7694f26a
parent a63197fed8
3 changed files with 57 additions and 16 deletions
--- a/build/rocm/Dockerfile.ms
+++ b/build/rocm/Dockerfile.ms
@@ -32,6 +32,6 @@ RUN git clone https://github.com/pyenv/pyenv.git /pyenv
 ENV PYENV_ROOT /pyenv
 ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
 RUN pyenv install $PYTHON_VERSION
-RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-rerunfailures matplotlib absl-py flatbuffers hypothesis
+RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger  pytest-rerunfailures matplotlib absl-py flatbuffers hypothesis


--- a/build/rocm/run_multi_gpu.sh
+++ b/build/rocm/run_multi_gpu.sh
@@ -13,20 +13,39 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-set -eux
-# run test module with multi-gpu requirements. We currently do not have a way to filter tests.
-# this issue is also tracked in https://github.com/google/jax/issues/7323
-cmd=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
-echo $cmd
+set -eu

-if [[ $cmd -gt 8 ]]; then
-	export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && python3 -m pytest --reruns 3 -x tests/pmap_test.py 
-elif [[ $cmd -gt 4 ]]; then
-	export HIP_VISIBLE_DEVICES=0,1,2,3 && python3 -m pytest --reruns 3 -x tests/pmap_test.py
-elif [[ $cmd -gt 2 ]]; then
-	export HIP_VISIBLE_DEVICES=0,1 && python3 -m pytest --reruns 3 -x tests/pmap_test.py 
-else
-	export HIP_VISIBLE_DEVICES=0 && python3 -m pytest --reruns 3 -x tests/pmap_test.py
+# Function to run both multi-GPU test modules on the given devices and merge their pytest HTML logs
+run_tests() {
+    local base_dir=./logs  # directory receiving the per-module HTML logs and the merged report
+    local gpu_devices="$1"  # device list passed by the caller, e.g. "0,1,2,3"
+    export HIP_VISIBLE_DEVICES=$gpu_devices  # exported so the python3 child processes inherit it
+    python3 -m pytest --html=$base_dir/multi_gpu_pmap_test_log.html --reruns 3 -x tests/pmap_test.py
+    python3 -m pytest --html=$base_dir/multi_gpu_multi_device_test_log.html --reruns 3 -x tests/multi_device_test.py
+    python3 -m pytest_html_merger -i $base_dir/ -o  $base_dir/final_compiled_report.html  # NOTE(review): the script runs under `set -eu`, so a failing pytest run above exits before this merge step - confirm that is intended
+}
+
+# Check for required commands (lspci for GPU detection, python3 to run the tests) and abort early if either is missing
+if ! command -v lspci &> /dev/null; then
+    echo "lspci command not found, aborting."
+    exit 1
 fi

-python3 -m pytest --reruns 3 -x tests/multi_device_test.py
+if ! command -v python3 &> /dev/null; then
+    echo "Python3 is not available, aborting."
+    exit 1
+fi
+
+# GPU detection and test execution: count the lspci lines matching 'controller.*AMD/ATI' and choose the HIP device list from that count
+gpu_count=$(lspci | grep -c 'controller.*AMD/ATI' || true)  # grep -c prints 0 but exits 1 on zero matches; `|| true` keeps `set -e` from aborting so the else branch below still runs
+echo "Number of AMD/ATI GPUs detected: $gpu_count"
+
+if [[ $gpu_count -gt 8 ]]; then  # NOTE(review): thresholds are strict (>8, >4, >2), e.g. exactly 8 matches selects 4 devices - confirm this is intended
+    run_tests "0,1,2,3,4,5,6,7"
+elif [[ $gpu_count -gt 4 ]]; then
+    run_tests "0,1,2,3"
+elif [[ $gpu_count -gt 2 ]]; then
+    run_tests "0,1"
+else
+    run_tests "0"  # fallback for two or fewer matches, including none
+fi
--- a/build/rocm/run_single_gpu.py
+++ b/build/rocm/run_single_gpu.py
@@ -22,6 +22,26 @@ from concurrent.futures import ThreadPoolExecutor

 GPU_LOCK = threading.Lock()
 LAST_CODE = 0
+base_dir="./logs"  # directory used for the per-module pytest HTML logs and the merged final report
+
+def extract_filename(path):  # Return the last component of `path` with its final extension removed, e.g. "tests/pmap_test.py" -> "pmap_test".
+  base_name = os.path.basename(path)  # drop any directory part
+  file_name, _ = os.path.splitext(base_name)  # split off the last ".ext"; the extension itself is discarded
+  return file_name
+
+def generate_final_report(shell=False, env_vars={}):  # Merge the HTML logs under base_dir into base_dir/final_compiled_report.html by running pytest_html_merger; returns (returncode, stderr, stdout). NOTE(review): env_vars={} is a mutable default - it is only read here, never mutated.
+  env = os.environ
+  env = {**env, **env_vars}  # copy of the current environment with env_vars entries taking precedence
+  cmd = ["pytest_html_merger", "-i", '{}'.format(base_dir), "-o", '{}/final_compiled_report.html'.format(base_dir)]
+  result = subprocess.run(cmd,
+                          shell=shell,
+                          capture_output=True,
+                          env=env)
+  if result.returncode != 0:
+    print("FAILED - {}".format(" ".join(cmd)))
+    print(result.stderr.decode())
+    # sys.exit(result.returncode)  (exit is disabled: a merge failure is only printed and its code is handed back to the caller)
+  return result.returncode, result.stderr.decode(), result.stdout.decode()


 def run_shell_command(cmd, shell=False, env_vars={}):
@@ -69,7 +89,8 @@ def run_test(testmodule, gpu_tokens):
      "HIP_VISIBLE_DEVICES": str(target_gpu),
      "XLA_PYTHON_CLIENT_ALLOCATOR": "default",
  }
-  cmd = ["python3", "-m", "pytest", "--reruns", "3", "-x", testmodule]
+  testfile = extract_filename(testmodule)
+  cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-x", testmodule]
  return_code, stderr, stdout = run_shell_command(cmd, env_vars=env_vars)
  with GPU_LOCK:
    gpu_tokens.append(target_gpu)
@@ -102,6 +123,7 @@ def find_num_gpus():
 def main(args):
  all_testmodules = collect_testmodules()
  run_parallel(all_testmodules, args.parallel)
+  generate_final_report()  # merge the per-module HTML logs; the returned status is discarded, so the process still exits with LAST_CODE
  exit(LAST_CODE)