mirror of
https://github.com/ROCm/jax.git
synced 2025-04-15 19:36:06 +00:00
[ROCm]: Add support to continue on fail, fix script paths and update Dockerfile to add necessary packages
This commit is contained in:
parent
d54bf529cc
commit
7d6fa3c05b
@ -32,6 +32,6 @@ RUN git clone https://github.com/pyenv/pyenv.git /pyenv
|
||||
ENV PYENV_ROOT /pyenv
|
||||
ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
|
||||
RUN pyenv install $PYTHON_VERSION
|
||||
RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger pytest-rerunfailures matplotlib absl-py flatbuffers hypothesis
|
||||
RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger pytest-reportlog pytest-rerunfailures cloudpickle portpicker matplotlib absl-py flatbuffers hypothesis
|
||||
|
||||
|
||||
|
@ -20,8 +20,8 @@ run_tests() {
|
||||
local base_dir=./logs
|
||||
local gpu_devices="$1"
|
||||
export HIP_VISIBLE_DEVICES=$gpu_devices
|
||||
python3 -m pytest --html=$base_dir/multi_gpu_pmap_test_log.html --reruns 3 -x tests/pmap_test.py
|
||||
python3 -m pytest --html=$base_dir/multi_gpu_multi_device_test_log.html --reruns 3 -x tests/multi_device_test.py
|
||||
python3 -m pytest --html=$base_dir/multi_gpu_pmap_test_log.html --reruns 3 tests/pmap_test.py
|
||||
python3 -m pytest --html=$base_dir/multi_gpu_multi_device_test_log.html --reruns 3 tests/multi_device_test.py
|
||||
python3 -m pytest_html_merger -i $base_dir/ -o $base_dir/final_compiled_report.html
|
||||
}
|
||||
|
||||
|
@ -13,11 +13,11 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import json
|
||||
import argparse
|
||||
import threading
|
||||
import subprocess
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
GPU_LOCK = threading.Lock()
|
||||
@ -40,7 +40,7 @@ def generate_final_report(shell=False, env_vars={}):
|
||||
if result.returncode != 0:
|
||||
print("FAILED - {}".format(" ".join(cmd)))
|
||||
print(result.stderr.decode())
|
||||
# sys.exit(result.returncode)
|
||||
|
||||
return result.returncode, result.stderr.decode(), result.stdout.decode()
|
||||
|
||||
|
||||
@ -54,32 +54,42 @@ def run_shell_command(cmd, shell=False, env_vars={}):
|
||||
if result.returncode != 0:
|
||||
print("FAILED - {}".format(" ".join(cmd)))
|
||||
print(result.stderr.decode())
|
||||
# sys.exit(result.returncode)
|
||||
|
||||
return result.returncode, result.stderr.decode(), result.stdout.decode()
|
||||
|
||||
|
||||
def parse_test_log(log_file):
|
||||
"""Parses the test module log file to extract test modules and functions."""
|
||||
test_files = set()
|
||||
with open(log_file, "r") as f:
|
||||
for line in f:
|
||||
report = json.loads(line)
|
||||
if "nodeid" in report:
|
||||
module = report["nodeid"].split("::")[0]
|
||||
if module:
|
||||
test_files.add(os.path.abspath(module))
|
||||
return test_files
|
||||
|
||||
|
||||
def collect_testmodules():
|
||||
all_test_files = []
|
||||
log_file = f"{base_dir}/collect_module_log.jsonl"
|
||||
return_code, stderr, stdout = run_shell_command(
|
||||
["python3", "-m", "pytest", "--collect-only", "tests"])
|
||||
["python3", "-m", "pytest", "--collect-only", "tests", f"--report-log={log_file}"])
|
||||
if return_code != 0:
|
||||
print(stdout)
|
||||
print(stderr)
|
||||
print("Test module discovery failed.")
|
||||
print("STDOUT:", stdout)
|
||||
print("STDERR:", stderr)
|
||||
exit(return_code)
|
||||
for line in stdout.split("\n"):
|
||||
match = re.match("<Module (.*)>", line)
|
||||
if match:
|
||||
test_file = match.group(1)
|
||||
all_test_files.append(test_file)
|
||||
print("---------- collected test modules ----------")
|
||||
print("Found %d test modules." % (len(all_test_files)))
|
||||
print("\n".join(all_test_files))
|
||||
test_files = parse_test_log(log_file)
|
||||
print("Found %d test modules." % (len(test_files)))
|
||||
print("--------------------------------------------")
|
||||
return all_test_files
|
||||
print("\n".join(test_files))
|
||||
|
||||
return test_files
|
||||
|
||||
|
||||
def run_test(testmodule, gpu_tokens):
|
||||
def run_test(testmodule, gpu_tokens, continue_on_fail):
|
||||
global LAST_CODE
|
||||
with GPU_LOCK:
|
||||
if LAST_CODE != 0:
|
||||
@ -90,7 +100,10 @@ def run_test(testmodule, gpu_tokens):
|
||||
"XLA_PYTHON_CLIENT_ALLOCATOR": "default",
|
||||
}
|
||||
testfile = extract_filename(testmodule)
|
||||
cmd = ["python3", "-m", "pytest", f'--html={base_dir}/{testfile}_log.html', "--reruns", "3", "-x", testmodule]
|
||||
if continue_on_fail:
|
||||
cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-v", testmodule]
|
||||
else:
|
||||
cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-x", "-v", testmodule]
|
||||
return_code, stderr, stdout = run_shell_command(cmd, env_vars=env_vars)
|
||||
with GPU_LOCK:
|
||||
gpu_tokens.append(target_gpu)
|
||||
@ -98,31 +111,30 @@ def run_test(testmodule, gpu_tokens):
|
||||
print("Running tests in module %s on GPU %d:" % (testmodule, target_gpu))
|
||||
print(stdout)
|
||||
print(stderr)
|
||||
LAST_CODE = return_code
|
||||
return
|
||||
if continue_on_fail == False:
|
||||
LAST_CODE = return_code
|
||||
|
||||
|
||||
def run_parallel(all_testmodules, p):
|
||||
print("Running tests with parallelism=", p)
|
||||
def run_parallel(all_testmodules, p, c):
|
||||
print(f"Running tests with parallelism=", p)
|
||||
available_gpu_tokens = list(range(p))
|
||||
executor = ThreadPoolExecutor(max_workers=p)
|
||||
# walking through test modules
|
||||
# walking through test modules.
|
||||
for testmodule in all_testmodules:
|
||||
executor.submit(run_test, testmodule, available_gpu_tokens)
|
||||
# waiting for all modules to finish
|
||||
executor.shutdown(wait=True) # wait for all jobs to finish
|
||||
return
|
||||
executor.submit(run_test, testmodule, available_gpu_tokens, c)
|
||||
# waiting for all modules to finish.
|
||||
executor.shutdown(wait=True)
|
||||
|
||||
|
||||
def find_num_gpus():
|
||||
cmd = ["lspci|grep 'controller'|grep 'AMD/ATI'|wc -l"]
|
||||
cmd = ["lspci|grep 'controller\|accel'|grep 'AMD/ATI'|wc -l"]
|
||||
_, _, stdout = run_shell_command(cmd, shell=True)
|
||||
return int(stdout)
|
||||
|
||||
|
||||
def main(args):
|
||||
all_testmodules = collect_testmodules()
|
||||
run_parallel(all_testmodules, args.parallel)
|
||||
run_parallel(all_testmodules, args.parallel, args.continue_on_fail)
|
||||
generate_final_report()
|
||||
exit(LAST_CODE)
|
||||
|
||||
@ -134,7 +146,13 @@ if __name__ == '__main__':
|
||||
"--parallel",
|
||||
type=int,
|
||||
help="number of tests to run in parallel")
|
||||
parser.add_argument("-c",
|
||||
"--continue_on_fail",
|
||||
action='store_true',
|
||||
help="continue on failure")
|
||||
args = parser.parse_args()
|
||||
if args.continue_on_fail:
|
||||
print("continue on fail is set")
|
||||
if args.parallel is None:
|
||||
sys_gpu_count = find_num_gpus()
|
||||
args.parallel = sys_gpu_count
|
||||
|
Loading…
x
Reference in New Issue
Block a user