From 2d878ccf543c8a949f60a51e3e2be5238e142bf4 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 14 Feb 2025 19:08:45 -0800 Subject: [PATCH] [CI] Track Queue/In Progress Metrics By Job Rather Than Workflow This patch makes it so that the metrics container counts the number of in-progress and queued jobs at the job level rather than at the workflow level. This helps us distinguish Windows versus Linux load and also lets us filter out the macOS jobs that only run in the release branch. Reviewers: Keenuts, lnihlen Reviewed By: lnihlen Pull Request: https://github.com/llvm/llvm-project/pull/127274 --- .ci/metrics/metrics.py | 72 +++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py index 70b787665a8b..5347ce679669 100644 --- a/.ci/metrics/metrics.py +++ b/.ci/metrics/metrics.py @@ -43,40 +43,60 @@ def get_sampled_workflow_metrics(github_repo: github.Repository): Returns a list of GaugeMetric objects, containing the relevant metrics about the workflow """ + queued_job_counts = {} + running_job_counts = {} # Other states are available (pending, waiting, etc), but the meaning # is not documented (See #70540). # "queued" seems to be the info we want. - queued_workflow_count = len( - [ - x - for x in github_repo.get_workflow_runs(status="queued") - if x.name in WORKFLOWS_TO_TRACK - ] - ) - running_workflow_count = len( - [ - x - for x in github_repo.get_workflow_runs(status="in_progress") - if x.name in WORKFLOWS_TO_TRACK - ] - ) + for queued_workflow in github_repo.get_workflow_runs(status="queued"): + if queued_workflow.name not in WORKFLOWS_TO_TRACK: + continue + for queued_workflow_job in queued_workflow.jobs(): + job_name = queued_workflow_job.name + # Workflows marked as queued can potentially only have some jobs + # queued, so make sure to also count jobs currently in progress. 
+ if queued_workflow_job.status == "queued": + if job_name not in queued_job_counts: + queued_job_counts[job_name] = 1 + else: + queued_job_counts[job_name] += 1 + elif queued_workflow_job.status == "in_progress": + if job_name not in running_job_counts: + running_job_counts[job_name] = 1 + else: + running_job_counts[job_name] += 1 + + for running_workflow in github_repo.get_workflow_runs(status="in_progress"): + if running_workflow.name not in WORKFLOWS_TO_TRACK: + continue + for running_workflow_job in running_workflow.jobs(): + job_name = running_workflow_job.name + if running_workflow_job.status != "in_progress": + continue + + if job_name not in running_job_counts: + running_job_counts[job_name] = 1 + else: + running_job_counts[job_name] += 1 workflow_metrics = [] - workflow_metrics.append( - GaugeMetric( - "workflow_queue_size", - queued_workflow_count, - time.time_ns(), + for queued_job in queued_job_counts: + workflow_metrics.append( + GaugeMetric( + f"workflow_queue_size_{queued_job}", + queued_job_counts[queued_job], + time.time_ns(), + ) ) - ) - workflow_metrics.append( - GaugeMetric( - "running_workflow_count", - running_workflow_count, - time.time_ns(), + for running_job in running_job_counts: + workflow_metrics.append( + GaugeMetric( + f"running_workflow_count_{running_job}", + running_job_counts[running_job], + time.time_ns(), + ) ) - ) # Always send a hearbeat metric so we can monitor is this container is still able to log to Grafana. workflow_metrics.append( GaugeMetric("metrics_container_heartbeat", 1, time.time_ns())