[ROCm] improve gpu script

This commit is contained in:
Ruturaj4 2024-07-31 15:04:58 -05:00
parent f1974b6471
commit e06be544d4
2 changed files with 50 additions and 7 deletions

View File

@ -49,8 +49,7 @@ RUN eval "$(pyenv init -)" && \
numpy setuptools build wheel six auditwheel scipy \
pytest pytest-html pytest_html_merger pytest-reportlog \
pytest-rerunfailures cloudpickle portpicker matplotlib absl-py \
flatbuffers hypothesis
flatbuffers hypothesis pytest-json-report pytest-csv
################################################################################
FROM rocm_base AS rt_build
@ -68,3 +67,4 @@ LABEL com.amdgpu.rocm_version="$ROCM_VERSION" \
RUN --mount=type=bind,source=wheelhouse,target=/wheelhouse \
pip install --find-links /wheelhouse jax jaxlib jax_rocm60_plugin jax_rocm60_pjrt

View File

@ -14,6 +14,7 @@
# limitations under the License.
import os
import csv
import json
import argparse
import threading
@ -29,6 +30,34 @@ def extract_filename(path):
file_name, _ = os.path.splitext(base_name)
return file_name
def combine_json_reports():
all_json_files = [f for f in os.listdir(base_dir) if f.endswith('_log.json')]
combined_data = []
for json_file in all_json_files:
with open(os.path.join(base_dir, json_file), 'r') as infile:
data = json.load(infile)
combined_data.append(data)
combined_json_file = f"{base_dir}/final_compiled_report.json"
with open(combined_json_file, 'w') as outfile:
json.dump(combined_data, outfile, indent=4)
def combine_csv_reports():
all_csv_files = [f for f in os.listdir(base_dir) if f.endswith('_log.csv')]
combined_csv_file = f"{base_dir}/final_compiled_report.csv"
with open(combined_csv_file, mode='w', newline='') as outfile:
csv_writer = csv.writer(outfile)
for i, csv_file in enumerate(all_csv_files):
with open(os.path.join(base_dir, csv_file), mode='r') as infile:
csv_reader = csv.reader(infile)
if i == 0:
# write headers only once
csv_writer.writerow(next(csv_reader))
for row in csv_reader:
csv_writer.writerow(row)
def generate_final_report(shell=False, env_vars={}):
env = os.environ
env = {**env, **env_vars}
@ -41,7 +70,10 @@ def generate_final_report(shell=False, env_vars={}):
print("FAILED - {}".format(" ".join(cmd)))
print(result.stderr.decode())
return result.returncode, result.stderr.decode(), result.stdout.decode()
# Generate json reports.
combine_json_reports()
# Generate csv reports.
combine_csv_reports()
def run_shell_command(cmd, shell=False, env_vars={}):
@ -66,7 +98,7 @@ def parse_test_log(log_file):
report = json.loads(line)
if "nodeid" in report:
module = report["nodeid"].split("::")[0]
if module:
if module and ".py" in module:
test_files.add(os.path.abspath(module))
return test_files
@ -100,9 +132,20 @@ def run_test(testmodule, gpu_tokens, continue_on_fail):
}
testfile = extract_filename(testmodule)
if continue_on_fail:
cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-v", testmodule]
cmd = ["python3", "-m", "pytest",
"--json-report", f"--json-report-file={base_dir}/{testfile}_log.json",
f"--csv={base_dir}/{testfile}_log.csv",
"--csv-columns", "id,module,name,file,status,duration",
f"--html={base_dir}/{testfile}_log.html",
"--reruns", "3", "-v", testmodule]
else:
cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-x", "-v", testmodule]
cmd = ["python3", "-m", "pytest",
"--json-report", f"--json-report-file={base_dir}/{testfile}_log.json",
f"--csv={base_dir}/{testfile}_log.csv",
"--csv-columns", "id,module,name,file,status,duration",
f"--html={base_dir}/{testfile}_log.html",
"--reruns", "3", "-x", "-v", testmodule]
return_code, stderr, stdout = run_shell_command(cmd, env_vars=env_vars)
with GPU_LOCK:
gpu_tokens.append(target_gpu)
@ -115,7 +158,7 @@ def run_test(testmodule, gpu_tokens, continue_on_fail):
def run_parallel(all_testmodules, p, c):
print(f"Running tests with parallelism=", p)
print(f"Running tests with parallelism = {p}")
available_gpu_tokens = list(range(p))
executor = ThreadPoolExecutor(max_workers=p)
# walking through test modules.