[ROCM]: Generating pytest html logs from unit-tests.

This commit is contained in:
zahiqbal 2024-01-17 22:48:09 +00:00
parent a63197fed8
commit ef7694f26a
3 changed files with 57 additions and 16 deletions

View File

@ -32,6 +32,6 @@ RUN git clone https://github.com/pyenv/pyenv.git /pyenv
ENV PYENV_ROOT /pyenv
ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
RUN pyenv install $PYTHON_VERSION
RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-rerunfailures matplotlib absl-py flatbuffers hypothesis
RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger pytest-rerunfailures matplotlib absl-py flatbuffers hypothesis

View File

@ -13,20 +13,39 @@
# See the License for the specific language governing permissions and
# limitations under the License.
set -eux
# run test module with multi-gpu requirements. We currently do not have a way to filter tests.
# this issue is also tracked in https://github.com/google/jax/issues/7323
cmd=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
echo $cmd
set -eu
if [[ $cmd -gt 8 ]]; then
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && python3 -m pytest --reruns 3 -x tests/pmap_test.py
elif [[ $cmd -gt 4 ]]; then
export HIP_VISIBLE_DEVICES=0,1,2,3 && python3 -m pytest --reruns 3 -x tests/pmap_test.py
elif [[ $cmd -gt 2 ]]; then
export HIP_VISIBLE_DEVICES=0,1 && python3 -m pytest --reruns 3 -x tests/pmap_test.py
else
export HIP_VISIBLE_DEVICES=0 && python3 -m pytest --reruns 3 -x tests/pmap_test.py
#######################################
# Run the multi-GPU test modules on the given devices and merge their reports.
# Globals:
#   HIP_VISIBLE_DEVICES (exported with the value of $1)
# Arguments:
#   $1 - comma-separated GPU indices to expose, e.g. "0,1,2,3"
# Outputs:
#   One pytest HTML log per test module plus final_compiled_report.html,
#   all written under ./logs.
# Returns:
#   0 when every step succeeds, otherwise the exit status of the first
#   step that failed.
#######################################
run_tests() {
  local base_dir=./logs
  local gpu_devices="$1"
  local status=0
  local rc=0

  export HIP_VISIBLE_DEVICES="$gpu_devices"

  # This script runs under `set -e`. Each status is captured with `||` so a
  # failing test module does not abort the script before the other module has
  # run and the merged HTML report has been written.
  python3 -m pytest --html="$base_dir/multi_gpu_pmap_test_log.html" \
    --reruns 3 -x tests/pmap_test.py || status=$?

  python3 -m pytest --html="$base_dir/multi_gpu_multi_device_test_log.html" \
    --reruns 3 -x tests/multi_device_test.py || rc=$?
  if [[ "$status" -eq 0 ]]; then
    status=$rc
  fi

  # Always merge whatever logs exist, even after a test failure.
  rc=0
  python3 -m pytest_html_merger -i "$base_dir/" \
    -o "$base_dir/final_compiled_report.html" || rc=$?
  if [[ "$status" -eq 0 ]]; then
    status=$rc
  fi

  # Propagate the first failure so the caller (and `set -e`) still sees it.
  return "$status"
}
# Check for required commands
# lspci is used further down to count the AMD/ATI GPUs; stop early with a
# clear message (on stderr, like any diagnostic) when it is not installed.
if ! command -v lspci > /dev/null 2>&1; then
  echo "lspci command not found, aborting." >&2
  exit 1
fi
python3 -m pytest --reruns 3 -x tests/multi_device_test.py
# python3 runs pytest and the HTML report merger; abort with a message on
# stderr when it is not on PATH.
if ! command -v python3 > /dev/null 2>&1; then
  echo "Python3 is not available, aborting." >&2
  exit 1
fi
# GPU detection and test execution
# Count the lspci lines that mention an AMD/ATI controller.
# `grep -c` prints 0 but exits with status 1 when nothing matches; without
# `|| true` the `set -e` above would abort the script right here and the
# single-device fallback below could never be reached.
gpu_count=$(lspci | grep -c 'controller.*AMD/ATI' || true)
echo "Number of AMD/ATI GPUs detected: $gpu_count"

# Pick the set of devices to expose from the number of matching lines.
# NOTE(review): the comparisons are strict, so exactly 8 matches selects the
# 4-device branch — confirm this is intended for how lspci lists these GPUs.
if [[ "$gpu_count" -gt 8 ]]; then
  run_tests "0,1,2,3,4,5,6,7"
elif [[ "$gpu_count" -gt 4 ]]; then
  run_tests "0,1,2,3"
elif [[ "$gpu_count" -gt 2 ]]; then
  run_tests "0,1"
else
  run_tests "0"
fi

View File

@ -22,6 +22,26 @@ from concurrent.futures import ThreadPoolExecutor
GPU_LOCK = threading.Lock()
LAST_CODE = 0
base_dir="./logs"
def extract_filename(path):
  """Return the last component of `path` with its final extension removed.

  Only the last extension is stripped, so "a/b/archive.tar.gz" yields
  "archive.tar"; a path ending in a separator yields an empty string.
  """
  return os.path.splitext(os.path.basename(path))[0]
def generate_final_report(shell=False, env_vars=None):
  """Merge the per-module pytest HTML logs in `base_dir` into one report.

  Runs `pytest_html_merger -i <base_dir> -o <base_dir>/final_compiled_report.html`.
  A failure is printed but never raised or turned into a process exit, so the
  caller decides whether a missing report is fatal.

  Args:
    shell: forwarded to `subprocess.run`. When true, the command is handed to
      the shell as a single quoted string.
    env_vars: optional mapping of extra environment variables layered on top
      of `os.environ` for the child process.

  Returns:
    A tuple `(returncode, stderr, stdout)` with both streams decoded to `str`.
    If the command cannot be started at all (for example the merger is not
    installed), the tuple is `(127, <error text>, "")`.
  """
  import shlex  # local import: only this function needs it

  env = {**os.environ, **(env_vars or {})}
  cmd = ["pytest_html_merger", "-i", str(base_dir),
         "-o", '{}/final_compiled_report.html'.format(base_dir)]
  # With shell=True a list would pass only its first element to the shell as
  # the command line, silently dropping every argument; join it instead.
  run_args = " ".join(shlex.quote(part) for part in cmd) if shell else cmd
  try:
    result = subprocess.run(run_args,
                            shell=shell,
                            capture_output=True,
                            env=env)
  except OSError as exc:
    # The executable is missing or not runnable: report it like any other
    # failed merge instead of letting the exception escape.
    print("FAILED - {}".format(" ".join(cmd)))
    print(exc)
    return 127, str(exc), ""

  stderr = result.stderr.decode()
  stdout = result.stdout.decode()
  if result.returncode != 0:
    print("FAILED - {}".format(" ".join(cmd)))
    print(stderr)
  return result.returncode, stderr, stdout
def run_shell_command(cmd, shell=False, env_vars={}):
@ -69,7 +89,8 @@ def run_test(testmodule, gpu_tokens):
"HIP_VISIBLE_DEVICES": str(target_gpu),
"XLA_PYTHON_CLIENT_ALLOCATOR": "default",
}
cmd = ["python3", "-m", "pytest", "--reruns", "3", "-x", testmodule]
testfile = extract_filename(testmodule)
cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-x", testmodule]
return_code, stderr, stdout = run_shell_command(cmd, env_vars=env_vars)
with GPU_LOCK:
gpu_tokens.append(target_gpu)
@ -102,6 +123,7 @@ def find_num_gpus():
def main(args):
  """Run every collected test module, merge the HTML logs, then exit.

  Args:
    args: object whose `parallel` attribute is forwarded to `run_parallel`
      (presumably the parsed command-line namespace — confirm at the caller).

  Never returns: the process exits with the module-level `LAST_CODE`.
  """
  import sys  # local import so this block does not rely on a file-level one

  all_testmodules = collect_testmodules()
  run_parallel(all_testmodules, args.parallel)
  generate_final_report()
  # Use sys.exit rather than the bare `exit` builtin: `exit` is injected by
  # the `site` module and is missing under `python -S` or when embedded.
  sys.exit(LAST_CODE)