From 7d6fa3c05b07cdd454bfaf6067f8539f367e44aa Mon Sep 17 00:00:00 2001
From: Rahul Batra
Date: Tue, 30 Jan 2024 21:01:12 +0000
Subject: [PATCH] [ROCm]: Add support to continue on fail, fix script paths and
 update Dockerfile to add necessary packages

---
 build/rocm/Dockerfile.ms     |  2 +-
 build/rocm/run_multi_gpu.sh  |  4 ++--
 build/rocm/run_single_gpu.py | 78 ++++++++++++++++++++++++--------------
 3 files changed, 51 insertions(+), 33 deletions(-)

diff --git a/build/rocm/Dockerfile.ms b/build/rocm/Dockerfile.ms
index 25251a9de..5f831f111 100644
--- a/build/rocm/Dockerfile.ms
+++ b/build/rocm/Dockerfile.ms
@@ -32,6 +32,6 @@ RUN git clone https://github.com/pyenv/pyenv.git /pyenv
 ENV PYENV_ROOT /pyenv
 ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
 RUN pyenv install $PYTHON_VERSION
-RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger pytest-rerunfailures matplotlib absl-py flatbuffers hypothesis
+RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger pytest-reportlog pytest-rerunfailures cloudpickle portpicker matplotlib absl-py flatbuffers hypothesis
diff --git a/build/rocm/run_multi_gpu.sh b/build/rocm/run_multi_gpu.sh
index ff186db76..b5d5798e7 100755
--- a/build/rocm/run_multi_gpu.sh
+++ b/build/rocm/run_multi_gpu.sh
@@ -20,8 +20,8 @@ run_tests() {
     local base_dir=./logs
     local gpu_devices="$1"
     export HIP_VISIBLE_DEVICES=$gpu_devices
-    python3 -m pytest --html=$base_dir/multi_gpu_pmap_test_log.html --reruns 3 -x tests/pmap_test.py
-    python3 -m pytest --html=$base_dir/multi_gpu_multi_device_test_log.html --reruns 3 -x tests/multi_device_test.py
+    python3 -m pytest --html=$base_dir/multi_gpu_pmap_test_log.html --reruns 3 tests/pmap_test.py
+    python3 -m pytest --html=$base_dir/multi_gpu_multi_device_test_log.html --reruns 3 tests/multi_device_test.py
     python3 -m pytest_html_merger -i $base_dir/ -o $base_dir/final_compiled_report.html
 }
diff --git a/build/rocm/run_single_gpu.py b/build/rocm/run_single_gpu.py
index add7ee3d8..852bed05e 100755
--- a/build/rocm/run_single_gpu.py
+++ b/build/rocm/run_single_gpu.py
@@ -13,11 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import argparse
 import os
-import re
-import subprocess
+import json
+import argparse
 import threading
+import subprocess
 from concurrent.futures import ThreadPoolExecutor
 
 GPU_LOCK = threading.Lock()
@@ -40,7 +40,7 @@ def generate_final_report(shell=False, env_vars={}):
   if result.returncode != 0:
     print("FAILED - {}".format(" ".join(cmd)))
     print(result.stderr.decode())
-    # sys.exit(result.returncode)
+
   return result.returncode, result.stderr.decode(), result.stdout.decode()
 
 
@@ -54,32 +54,42 @@ def run_shell_command(cmd, shell=False, env_vars={}):
   if result.returncode != 0:
     print("FAILED - {}".format(" ".join(cmd)))
     print(result.stderr.decode())
-    # sys.exit(result.returncode)
+
   return result.returncode, result.stderr.decode(), result.stdout.decode()
 
 
+def parse_test_log(log_file):
+  """Parses the test module log file to extract test modules and functions."""
+  test_files = set()
+  with open(log_file, "r") as f:
+    for line in f:
+      report = json.loads(line)
+      if "nodeid" in report:
+        module = report["nodeid"].split("::")[0]
+        if module:
+          test_files.add(os.path.abspath(module))
+  return test_files
+
+
 def collect_testmodules():
-  all_test_files = []
+  log_file = f"{base_dir}/collect_module_log.jsonl"
   return_code, stderr, stdout = run_shell_command(
-      ["python3", "-m", "pytest", "--collect-only", "tests"])
+      ["python3", "-m", "pytest", "--collect-only", "tests", f"--report-log={log_file}"])
   if return_code != 0:
-    print(stdout)
-    print(stderr)
     print("Test module discovery failed.")
+    print("STDOUT:", stdout)
+    print("STDERR:", stderr)
     exit(return_code)
-  for line in stdout.split("\n"):
-    match = re.match("<Module (.*)>", line)
-    if match:
-      test_file = match.group(1)
-      all_test_files.append(test_file)
   print("---------- collected test modules ----------")
-  print("Found %d test modules." % (len(all_test_files)))
-  print("\n".join(all_test_files))
+  test_files = parse_test_log(log_file)
+  print("Found %d test modules." % (len(test_files)))
   print("--------------------------------------------")
-  return all_test_files
+  print("\n".join(test_files))
+
+  return test_files
 
 
-def run_test(testmodule, gpu_tokens):
+def run_test(testmodule, gpu_tokens, continue_on_fail):
   global LAST_CODE
   with GPU_LOCK:
     if LAST_CODE != 0:
@@ -90,7 +100,10 @@ def run_test(testmodule, gpu_tokens):
       "XLA_PYTHON_CLIENT_ALLOCATOR": "default",
   }
   testfile = extract_filename(testmodule)
-  cmd = ["python3", "-m", "pytest", f'--html={base_dir}/{testfile}_log.html', "--reruns", "3", "-x", testmodule]
+  if continue_on_fail:
+    cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-v", testmodule]
+  else:
+    cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-x", "-v", testmodule]
   return_code, stderr, stdout = run_shell_command(cmd, env_vars=env_vars)
   with GPU_LOCK:
     gpu_tokens.append(target_gpu)
@@ -98,31 +111,30 @@
     print("Running tests in module %s on GPU %d:" % (testmodule, target_gpu))
     print(stdout)
     print(stderr)
-    LAST_CODE = return_code
-  return
+    if continue_on_fail == False:
+      LAST_CODE = return_code
 
 
-def run_parallel(all_testmodules, p):
-  print("Running tests with parallelism=", p)
+def run_parallel(all_testmodules, p, c):
+  print(f"Running tests with parallelism=", p)
   available_gpu_tokens = list(range(p))
   executor = ThreadPoolExecutor(max_workers=p)
-  # walking through test modules
+  # walking through test modules.
   for testmodule in all_testmodules:
-    executor.submit(run_test, testmodule, available_gpu_tokens)
-  # waiting for all modules to finish
-  executor.shutdown(wait=True)  # wait for all jobs to finish
-  return
+    executor.submit(run_test, testmodule, available_gpu_tokens, c)
+  # waiting for all modules to finish.
+  executor.shutdown(wait=True)
 
 
 def find_num_gpus():
-  cmd = ["lspci|grep 'controller'|grep 'AMD/ATI'|wc -l"]
+  cmd = ["lspci|grep 'controller\|accel'|grep 'AMD/ATI'|wc -l"]
   _, _, stdout = run_shell_command(cmd, shell=True)
   return int(stdout)
 
 
 def main(args):
   all_testmodules = collect_testmodules()
-  run_parallel(all_testmodules, args.parallel)
+  run_parallel(all_testmodules, args.parallel, args.continue_on_fail)
   generate_final_report()
   exit(LAST_CODE)
 
@@ -134,7 +146,13 @@ if __name__ == '__main__':
                       "--parallel",
                       type=int,
                       help="number of tests to run in parallel")
+  parser.add_argument("-c",
+                      "--continue_on_fail",
+                      action='store_true',
+                      help="continue on failure")
   args = parser.parse_args()
+  if args.continue_on_fail:
+    print("continue on fail is set")
   if args.parallel is None:
     sys_gpu_count = find_num_gpus()
     args.parallel = sys_gpu_count
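
Editorial note (not part of the patch): run_single_gpu.py schedules one test module per GPU by keeping a lock-guarded pool of integer GPU "tokens"; each worker thread borrows a token, pins its pytest invocation to that GPU via HIP_VISIBLE_DEVICES, and returns the token when the module finishes. The standalone sketch below illustrates that pattern and the -x vs. continue-on-fail behavior the patch adds. It is a minimal sketch only: the function names run_module_on_free_gpu and run_all are illustrative and do not exist in the patched script, and it omits the script's LAST_CODE tracking, pytest-html reports, and --report-log based module collection.

# Minimal sketch of the GPU-token scheduling pattern (illustrative names,
# not functions from build/rocm/run_single_gpu.py).
import os
import sys
import threading
import subprocess
from concurrent.futures import ThreadPoolExecutor

GPU_LOCK = threading.Lock()

def run_module_on_free_gpu(module, gpu_tokens, continue_on_fail):
  """Borrow a GPU token, run one test module pinned to that GPU, return its exit code."""
  with GPU_LOCK:
    gpu = gpu_tokens.pop()  # take a free GPU from the pool
  try:
    # Pin the child pytest process to the borrowed GPU.
    env = {**os.environ, "HIP_VISIBLE_DEVICES": str(gpu)}
    cmd = ["python3", "-m", "pytest", "-v", module]
    if not continue_on_fail:
      cmd.insert(3, "-x")  # stop this module at the first failure
    result = subprocess.run(cmd, env=env, capture_output=True)
    return result.returncode
  finally:
    with GPU_LOCK:
      gpu_tokens.append(gpu)  # hand the GPU back for the next module

def run_all(modules, parallelism, continue_on_fail=False):
  tokens = list(range(parallelism))  # one token per visible GPU
  with ThreadPoolExecutor(max_workers=parallelism) as pool:
    futures = [pool.submit(run_module_on_free_gpu, m, tokens, continue_on_fail)
               for m in modules]
  # The with-block waits for all modules; report the worst exit code seen.
  return max((f.result() for f in futures), default=0)

if __name__ == "__main__":
  sys.exit(run_all(["tests/pmap_test.py"], parallelism=1, continue_on_fail=True))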