[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages

Rahul Batra 2024-01-30 21:01:12 +00:00 committed by Ruturaj4
parent d54bf529cc
commit 7d6fa3c05b
3 changed files with 51 additions and 33 deletions


@@ -32,6 +32,6 @@ RUN git clone https://github.com/pyenv/pyenv.git /pyenv
ENV PYENV_ROOT /pyenv
ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
RUN pyenv install $PYTHON_VERSION
RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger pytest-rerunfailures matplotlib absl-py flatbuffers hypothesis
RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger pytest-reportlog pytest-rerunfailures cloudpickle portpicker matplotlib absl-py flatbuffers hypothesis


@@ -20,8 +20,8 @@ run_tests() {
local base_dir=./logs
local gpu_devices="$1"
export HIP_VISIBLE_DEVICES=$gpu_devices
python3 -m pytest --html=$base_dir/multi_gpu_pmap_test_log.html --reruns 3 -x tests/pmap_test.py
python3 -m pytest --html=$base_dir/multi_gpu_multi_device_test_log.html --reruns 3 -x tests/multi_device_test.py
python3 -m pytest --html=$base_dir/multi_gpu_pmap_test_log.html --reruns 3 tests/pmap_test.py
python3 -m pytest --html=$base_dir/multi_gpu_multi_device_test_log.html --reruns 3 tests/multi_device_test.py
python3 -m pytest_html_merger -i $base_dir/ -o $base_dir/final_compiled_report.html
}
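
Dropping -x from these two multi-GPU suites in the shell runner means a single failing test no longer aborts the whole module; --reruns 3 still retries flaky cases, and pytest_html_merger then stitches the per-suite HTML reports into final_compiled_report.html. For orientation, a minimal sketch (assuming a ROCm build of JAX is installed) of how the HIP_VISIBLE_DEVICES export above limits the devices those tests see:

# sketch: the variable must be set before JAX initializes its backend
import os
os.environ["HIP_VISIBLE_DEVICES"] = "0,1"
import jax
print(jax.device_count())  # expected to report the two visible ROCm GPUs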


@@ -13,11 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import re
import subprocess
import json
import argparse
import threading
import subprocess
from concurrent.futures import ThreadPoolExecutor
GPU_LOCK = threading.Lock()
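
GPU_LOCK guards the shared state used by the runner below: the list of free GPU indices handed out to worker threads and the LAST_CODE result flag. A minimal, self-contained sketch of that token scheme (names here are illustrative, not taken from the script):

import threading
from concurrent.futures import ThreadPoolExecutor

lock = threading.Lock()
tokens = [0, 1]  # pretend we have two GPUs

def run_module(name):
    with lock:
        gpu = tokens.pop()      # claim a free GPU index
    try:
        print(f"{name} runs pinned to GPU {gpu}")  # stand-in for the pytest call
    finally:
        with lock:
            tokens.append(gpu)  # return the token for the next module

with ThreadPoolExecutor(max_workers=len(tokens)) as ex:
    for name in ["mod_a", "mod_b", "mod_c"]:
        ex.submit(run_module, name)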
@@ -40,7 +40,7 @@ def generate_final_report(shell=False, env_vars={}):
if result.returncode != 0:
print("FAILED - {}".format(" ".join(cmd)))
print(result.stderr.decode())
# sys.exit(result.returncode)
return result.returncode, result.stderr.decode(), result.stdout.decode()
@@ -54,32 +54,42 @@ def run_shell_command(cmd, shell=False, env_vars={}):
if result.returncode != 0:
print("FAILED - {}".format(" ".join(cmd)))
print(result.stderr.decode())
# sys.exit(result.returncode)
return result.returncode, result.stderr.decode(), result.stdout.decode()
def parse_test_log(log_file):
"""Parses the test module log file to extract test modules and functions."""
test_files = set()
with open(log_file, "r") as f:
for line in f:
report = json.loads(line)
if "nodeid" in report:
module = report["nodeid"].split("::")[0]
if module:
test_files.add(os.path.abspath(module))
return test_files
def collect_testmodules():
all_test_files = []
log_file = f"{base_dir}/collect_module_log.jsonl"
return_code, stderr, stdout = run_shell_command(
["python3", "-m", "pytest", "--collect-only", "tests"])
["python3", "-m", "pytest", "--collect-only", "tests", f"--report-log={log_file}"])
if return_code != 0:
print(stdout)
print(stderr)
print("Test module discovery failed.")
print("STDOUT:", stdout)
print("STDERR:", stderr)
exit(return_code)
for line in stdout.split("\n"):
match = re.match("<Module (.*)>", line)
if match:
test_file = match.group(1)
all_test_files.append(test_file)
print("---------- collected test modules ----------")
print("Found %d test modules." % (len(all_test_files)))
print("\n".join(all_test_files))
test_files = parse_test_log(log_file)
print("Found %d test modules." % (len(test_files)))
print("--------------------------------------------")
return all_test_files
print("\n".join(test_files))
return test_files
def run_test(testmodule, gpu_tokens):
def run_test(testmodule, gpu_tokens, continue_on_fail):
global LAST_CODE
with GPU_LOCK:
if LAST_CODE != 0:
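
For reference, parse_test_log above consumes the JSONL file produced by --report-log: each line is one JSON object, only records carrying a nodeid matter, and the text before the first "::" names the test module. A hand-written illustration of that extraction (not captured from a real run):

import json, os
line = '{"$report_type": "CollectReport", "nodeid": "tests/example_test.py::ExampleTest::test_case"}'
report = json.loads(line)
module = report["nodeid"].split("::")[0]
print(os.path.abspath(module))  # path collected into the test_files set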
@@ -90,7 +100,10 @@ def run_test(testmodule, gpu_tokens):
"XLA_PYTHON_CLIENT_ALLOCATOR": "default",
}
testfile = extract_filename(testmodule)
cmd = ["python3", "-m", "pytest", f'--html={base_dir}/{testfile}_log.html', "--reruns", "3", "-x", testmodule]
if continue_on_fail:
cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-v", testmodule]
else:
cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-x", "-v", testmodule]
return_code, stderr, stdout = run_shell_command(cmd, env_vars=env_vars)
with GPU_LOCK:
gpu_tokens.append(target_gpu)
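
With the new --continue_on_fail flag the per-module pytest command simply drops -x, so the remaining tests in a module still run after a failure; both variants keep -v, --reruns 3, and the per-module HTML report. For a hypothetical module the two invocations built above differ only in that flag (paths and file names below are illustrative):

# illustrative only, for a hypothetical tests/example_test.py module
common = ["python3", "-m", "pytest", "--html=./logs/example_test_log.html", "--reruns", "3"]
cmd_continue = common + ["-v", "tests/example_test.py"]        # --continue_on_fail: keep going
cmd_stop     = common + ["-x", "-v", "tests/example_test.py"]  # default: stop module on first failure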
@@ -98,31 +111,30 @@ def run_test(testmodule, gpu_tokens):
print("Running tests in module %s on GPU %d:" % (testmodule, target_gpu))
print(stdout)
print(stderr)
LAST_CODE = return_code
return
if not continue_on_fail:
LAST_CODE = return_code
def run_parallel(all_testmodules, p):
print("Running tests with parallelism=", p)
def run_parallel(all_testmodules, p, c):
print(f"Running tests with parallelism=", p)
available_gpu_tokens = list(range(p))
executor = ThreadPoolExecutor(max_workers=p)
# walking through test modules
# walking through test modules.
for testmodule in all_testmodules:
executor.submit(run_test, testmodule, available_gpu_tokens)
# waiting for all modules to finish
executor.shutdown(wait=True) # wait for all jobs to finish
return
executor.submit(run_test, testmodule, available_gpu_tokens, c)
# waiting for all modules to finish.
executor.shutdown(wait=True)
def find_num_gpus():
cmd = ["lspci|grep 'controller'|grep 'AMD/ATI'|wc -l"]
cmd = ["lspci|grep 'controller\|accel'|grep 'AMD/ATI'|wc -l"]
_, _, stdout = run_shell_command(cmd, shell=True)
return int(stdout)
def main(args):
all_testmodules = collect_testmodules()
run_parallel(all_testmodules, args.parallel)
run_parallel(all_testmodules, args.parallel, args.continue_on_fail)
generate_final_report()
exit(LAST_CODE)
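
Extending the lspci filter to match 'accel' as well as 'controller' counts AMD devices that enumerate as processing accelerators rather than display controllers, so the auto-detected parallelism matches the real GPU count. The same count could be reproduced roughly like this (a sketch assuming lspci is available, not part of the commit):

import subprocess
out = subprocess.run(["lspci"], capture_output=True, text=True).stdout
count = sum(1 for line in out.splitlines()
            if "AMD/ATI" in line and ("controller" in line or "accel" in line))
print(count)  # mirrors find_num_gpus() above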
@@ -134,7 +146,13 @@ if __name__ == '__main__':
"--parallel",
type=int,
help="number of tests to run in parallel")
parser.add_argument("-c",
"--continue_on_fail",
action='store_true',
help="continue on failure")
args = parser.parse_args()
if args.continue_on_fail:
print("continue on fail is set")
if args.parallel is None:
sys_gpu_count = find_num_gpus()
args.parallel = sys_gpu_count
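
Putting it together: when --parallel is omitted the worker count falls back to the detected GPU count, and when --continue_on_fail is set LAST_CODE is never overwritten by a failing module (see run_test above), so failures surface in the HTML reports rather than the runner's exit status. A hypothetical invocation from CI (the runner's file name is a placeholder, not taken from this commit):

import subprocess
# -c: keep running after failures; -p: explicit parallelism, otherwise one worker per detected GPU
subprocess.run(["python3", "run_gpu_tests.py", "-p", "4", "-c"], check=True)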