Mirror of https://github.com/anchore/grype (synced 2024-11-10 06:34:13 +00:00)
Add in-depth quality gate checks (#949)
* add in-depth quality gate checks

  Signed-off-by: Alex Goodman <alex.goodman@anchore.com>

* add quality tests to PR checks

  Signed-off-by: Alex Goodman <alex.goodman@anchore.com>

Signed-off-by: Alex Goodman <alex.goodman@anchore.com>
parent 7ad60ce410
commit d4587ddeec
11 changed files with 653 additions and 0 deletions
.github/workflows/validations.yaml (vendored): 53 additions

@@ -9,6 +9,7 @@ on:
 env:
   GO_VERSION: "1.18.x"
   GO_STABLE_VERSION: true
+  PYTHON_VERSION: "3.10"
 
 jobs:
   Static-Analysis:
@@ -100,6 +101,58 @@ jobs:
           name: unit-test-results
           path: test/results/**/*
 
+  Quality-Test:
+    # Note: changing this job name requires making the same update in the .github/workflows/release.yaml pipeline
+    name: "Quality tests"
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/setup-go@v2
+        with:
+          go-version: ${{ env.GO_VERSION }}
+          stable: ${{ env.GO_STABLE_VERSION }}
+
+      - uses: actions/setup-python@v2
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - uses: actions/checkout@v2
+        with:
+          submodules: true
+
+      - name: Restore tool cache
+        id: tool-cache
+        uses: actions/cache@v2.1.3
+        with:
+          path: ${{ github.workspace }}/.tmp
+          key: ${{ runner.os }}-tool-${{ hashFiles('Makefile') }}
+
+      - name: Restore go cache
+        id: go-cache
+        uses: actions/cache@v2.1.3
+        with:
+          path: ~/go/pkg/mod
+          key: ${{ runner.os }}-go-${{ env.GO_VERSION }}-${{ hashFiles('**/go.sum') }}
+          restore-keys: |
+            ${{ runner.os }}-go-${{ env.GO_VERSION }}-
+
+      - name: Restore python cache
+        id: python-cache
+        uses: actions/cache@v2.1.3
+        with:
+          path: |
+            test/quality/venv
+            test/quality/vulnerability-match-labels/venv
+          key: ${{ runner.os }}-go-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/test/quality/**/requirements.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-go-${{ env.PYTHON_VERSION }}-
+
+      - name: (cache-miss) Bootstrap all project dependencies
+        if: steps.tool-cache.outputs.cache-hit != 'true' || steps.go-cache.outputs.cache-hit != 'true'
+        run: make bootstrap
+
+      - name: Run quality tests
+        run: make quality
+
   Integration-Test:
     # Note: changing this job name requires making the same update in the .github/workflows/release.yaml pipeline
     name: "Integration tests"
.gitmodules (vendored, new file): 4 additions

[submodule "test/quality/vulnerability-match-labels"]
	path = test/quality/vulnerability-match-labels
	url = git@github.com:anchore/vulnerability-match-labels.git
	branch = main
Makefile: 5 additions

@@ -168,6 +168,11 @@ unit: ## Run unit tests (with coverage)
 	@echo "Coverage: $$(cat $(COVER_TOTAL))"
 	@if [ $$(echo "$$(cat $(COVER_TOTAL)) >= $(COVERAGE_THRESHOLD)" | bc -l) -ne 1 ]; then echo "$(RED)$(BOLD)Failed coverage quality gate (> $(COVERAGE_THRESHOLD)%)$(RESET)" && false; fi
 
+.PHONY: quality
+quality: ## Run quality tests
+	$(call title,Running quality tests)
+	cd test/quality && make
+
 # note: this is used by CI to determine if the install test fixture cache (docker image tars) should be busted
 install-fingerprint:
 	cd test/install && \
test/quality/.gitignore (vendored, new file): 7 additions

venv
.yardstick/tools
.yardstick/result
stage
pull
migrate.py
.oras-cache
test/quality/.yardstick.yaml (new file): 45 additions

x-ref:
  # note: always reference images with BOTH a tag and a digest
  images: &images
    - docker.io/cloudbees/cloudbees-core-mm:2.277.3.1@sha256:4c564f473d38f23da1caa48c4ef53b958ef03d279232007ad3319b1f38584bdb
    - docker.io/anchore/test_images:grype-quality-node-d89207b@sha256:f56164678054e5eb59ab838367373a49df723b324617b1ba6de775749d7f91d4
    - docker.io/anchore/test_images:grype-quality-python-d89207b@sha256:b2b58a55c0b03c1626d2aaae2add9832208b02124dda7b7b41811e14f0fb272c
    - docker.io/anchore/test_images:grype-quality-java-d89207b@sha256:b3534fc2e37943136d5b54e3a58b55d4ccd4363d926cf7aa5bf55a524cf8275b
    - docker.io/anchore/test_images:grype-quality-golang-d89207b@sha256:7536ee345532f674ec9e448e3768db4e546c48220ba2b6ec9bc9cfbfb3b7b74a
    - docker.io/anchore/test_images:grype-quality-ruby-d89207b@sha256:1a5a5f870924e88a6f0f2b8089cf276ef0a79b5244a052cdfe4a47bb9e5a2c10
    - docker.io/alpine:3.2@sha256:ddac200f3ebc9902fb8cfcd599f41feb2151f1118929da21bcef57dc276975f9
    - docker.io/centos:6@sha256:3688aa867eb84332460e172b9250c9c198fdfd8d987605fd53f246f498c60bcf
    - docker.io/ubuntu:16.10@sha256:8dc9652808dc091400d7d5983949043a9f9c7132b15c14814275d25f94bca18a
    - docker.io/almalinux:8@sha256:cd49d7250ed7bb194d502d8a3e50bd775055ca275d1d9c2785aea72b890afe6a
    - docker.io/rockylinux:8@sha256:72afc2e1a20c9ddf56a81c51148ebcbe927c0a879849efe813bee77d69df1dd8
    - docker.io/oraclelinux:6@sha256:a06327c0f1d18d753f2a60bb17864c84a850bb6dcbcf5946dd1a8123f6e75495
    - docker.io/debian:7@sha256:81e88820a7759038ffa61cff59dfcc12d3772c3a2e75b7cfe963c952da2ad264
    - docker.io/busybox:1.28.1@sha256:2107a35b58593c58ec5f4e8f2c4a70d195321078aebfadfbfb223a2ff4a4ed21
    - docker.io/amazonlinux:2@sha256:1301cc9f889f21dc45733df9e58034ac1c318202b4b0f0a08d88b3fdc03004de
    - registry.access.redhat.com/ubi8@sha256:68fecea0d255ee253acbf0c860eaebb7017ef5ef007c25bee9eeffd29ce85b29

# new vulnerabilities are added all of the time, instead of keeping up it's easier to ignore newer entries.
# This approach helps tremendously with keeping the analysis relatively stable.
default_max_year: 2020

result-sets:
  pr_vs_latest_via_sbom:
    description: "latest released grype vs grype from the current build (via SBOM ingestion)"
    matrix:
      images: *images

      tools:

        - name: syft
          # note: we want to use a fixed version of syft for capturing all results (NOT "latest")
          version: v0.54.0
          produces: SBOM
          refresh: false

        - name: grype
          version: git:current-commit
          takes: SBOM

        - name: grype
          version: latest
          takes: SBOM
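The `pr_vs_latest_via_sbom` result set defined above is referenced by name from `test/quality/Makefile` and `gate.py` later in this commit. As a rough sketch of how that configuration gets consumed, reusing the yardstick calls that `gate.py` itself makes (this snippet is illustrative only and is not part of the changeset):

```python
# illustrative sketch: mirrors the yardstick calls used by gate.py in this
# commit (store.result_set.load and result_state_by_image); not an addition
# to the changeset itself.
from yardstick import store

# load the captured results for the result set named in .yardstick.yaml
result_set = store.result_set.load(name="pr_vs_latest_via_sbom")

# each image listed under x-ref.images maps to the tool runs captured for it:
# one syft run producing the SBOM, plus the two grype runs that take it as input
for image, states in result_set.result_state_by_image.items():
    for state in states:
        print(image, state.request.tool)
```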
test/quality/.yardstick/labels (new symbolic link): 1 addition

../vulnerability-match-labels/labels
test/quality/Makefile (new file): 68 additions

SBOM_STORE_TAG = md5-$(shell md5sum .yardstick.yaml | cut -d' ' -f1)
SBOM_STORE_IMAGE = ghcr.io/anchore/grype/quality-test-sbom-store:$(SBOM_STORE_TAG)
ACTIVATE_VENV = . venv/bin/activate &&
YARDSTICK = $(ACTIVATE_VENV) yardstick -v
YARDSTICK_RESULT_DIR = .yardstick/result
YARDSTICK_LABELS_DIR = .yardstick/labels
VULNERABILITY_LABELS = ./vulnerability-labels
RESULT_SET = pr_vs_latest_via_sbom

# formatting variables
BOLD := $(shell tput -T linux bold)
PURPLE := $(shell tput -T linux setaf 5)
GREEN := $(shell tput -T linux setaf 2)
CYAN := $(shell tput -T linux setaf 6)
RED := $(shell tput -T linux setaf 1)
RESET := $(shell tput -T linux sgr0)
TITLE := $(BOLD)$(PURPLE)
SUCCESS := $(BOLD)$(GREEN)

.PHONY: all
all: capture validate ## Fetch or capture all data and run all quality checks

.PHONY: validate
validate: venv $(VULNERABILITY_LABELS) ## Run all quality checks against already collected data
	$(ACTIVATE_VENV) ./gate.py

.PHONY: capture
capture: sboms vulns ## Collect and store all syft and grype results

.PHONY: capture
vulns: venv ## Collect and store all grype results
	$(YARDSTICK) -v result capture -r $(RESULT_SET)

.PHONY: sboms
sboms: $(YARDSTICK_RESULT_DIR) venv clear-results ## Collect and store all syft results (deletes all existing results)
	bash -c "make download-sboms || ($(YARDSTICK) -v result capture -r $(RESULT_SET) --only-producers)"

.PHONY: download-sboms
download-sboms:
	cd vulnerability-match-labels && make venv
	bash -c "export ORAS_CACHE=$(shell pwd)/.oras-cache && make venv && . vulnerability-match-labels/venv/bin/activate && ./vulnerability-match-labels/sboms.py download -r $(RESULT_SET)"

venv: venv/touchfile

venv/touchfile: requirements.txt
	test -d venv || python3 -m venv venv
	$(ACTIVATE_VENV) pip install -Ur requirements.txt
	touch venv/touchfile


$(YARDSTICK_RESULT_DIR):
	mkdir -p $(YARDSTICK_RESULT_DIR)

$(VULNERABILITY_LABELS):
	git submodule update vulnerability-match-labels

.PHONY: clear-results
clear-results: venv ## Clear all existing yardstick results
	$(YARDSTICK) result clear

.PHONY: clean
clean: clear-results ## Clear all existing yardstick results and delete python environment
	rm -rf venv
	find -iname "*.pyc" -delete

help:
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "$(BOLD)$(CYAN)%-25s$(RESET)%s\n", $$1, $$2}'
test/quality/README.md (new file): 140 additions

# Match quality testing

This form of testing compares the results from various releases of grype using a
static set of reference container images. The kinds of comparisons made are:

1) "relative": find the vulnerability matching differences between both tools
   for a given image. This helps identify when a change has occurred in matching
   behavior and where the changes are.

2) "against labels": pair each tool's results for an image with ground truth. This
   helps identify how well the matching behavior is performing (did it get
   better or worse).


## Getting started

To capture raw tool output and store it into the local `.yardstick` directory for
further analysis:
```
make capture
```

To analyze the tool output and evaluate a pass/fail result:
```
make validate
```

A pass/fail result is shown in the output, with reasons for any failure
listed explicitly.


## What is the quality gate criteria

The label comparison results are used to determine a pass/fail result,
specifically with the following criteria:

- fail when the current grype F1 score drops below the last grype release F1 score (or
  the F1 score is indeterminate)
- fail when the indeterminate matches % > 10% in the current grype results
- fail when there is a rise in FNs relative to the results from the last grype
  release
- otherwise, pass

F1 score is the primary way that tool matching performance is characterized. F1
score combines the TP, FP, and FN counts into a single metric between 0 and 1.
Ideally the F1 score for an image-tool pair should be 1. F1 score is a good way
to summarize the matching performance but does not explain why the matching
performance is what it is.

Indeterminate matches are matches from results that could not be paired with a
label (TP or FP). This could also mean that multiple conflicting labels were
found for a single match. The more indeterminate matches there are, the less
confident you can be about the F1 score. Ideally there should be 0 indeterminate
matches, but this is difficult to achieve since vulnerability data is constantly
changing.

False negatives represent matches that should have been made by the tool but
were missed. We should always make certain that this value does not increase
between releases of grype.

## Assumptions

1. **Comparing vulnerability results taken at different times is invalid**.
   We leverage the yardstick result-set feature to capture all vulnerability
   results at one time for a specific image and tool set. Why? If we use grype
   at version `a` on Monday and grype at version `b` on Tuesday and attempt to
   compare the results, any differences found will not be immediately
   explainable. That is, it is entirely possible that
   the vulnerability databases from the run of `b` simply had more up-to-date
   information, and if `grype@a` were run at the same time (on Tuesday) this
   explanation can be almost entirely eliminated.

2. **Comparing vulnerability results across images with different digests is invalid**.
   It may be very tempting to compare vulnerability results for
   `alpine:3.2` from Monday and `alpine:3.2` from Tuesday to see if there are
   any changes. However, this is potentially inaccurate: the image references
   are for the same tag, but the publisher may have pushed a new image with
   differing content. Any change could lead to different vulnerability matching
   results, but we are only interested in vulnerability match differences that
   are due to actionable reasons (grype matcher logic problems or [SBOM] input
   data into matchers).

## Approach

Vulnerability matching has essentially two inputs:

- the packages that were found in the scanned artifact

- the vulnerability data from upstream providers (e.g. NVD, GHSA, etc.)


These are both moving targets!

We may implement more catalogers in syft that raise up more packages discovered
over time (for the same artifact scanned). Also, the world is continually finding
and reporting new vulnerabilities. The more moving parts there are in this form
of testing, the harder it is to come to a conclusion about the actual quality of
the output over time.

To reduce the eroding value over time we've decided to change as many moving
targets into fixed targets as possible:

- Vulnerability results beyond a particular year are ignored (the current config
  allows for <= 2020). Though there are still retroactive CVEs created, this
  helps a lot in terms of keeping vulnerability results relatively stable.

- SBOMs are used as input into grype instead of the raw container images. This
  allows the artifacts under test to remain truly fixed and saves a lot of time
  when capturing grype results (as the container image is no longer needed
  during analysis).

- For the captured SBOMs, container images must be referenced with a digest, not
  just a tag. In case we update a tool version (say syft), we want to make
  certain that we are scanning the exact same artifact later when we re-run the
  analysis.

- Versions of tools used are fixed to a specific `major.minor.patch` release.
  This allows us to account for capability differences between tool runs.

To reduce the maintenance effort of this comparison over time, there are a few things
to keep in mind:

- Once an image is labeled (at a specific digest) the image digest should be
  considered immutable (never updated). Why? It takes a lot of effort to label
  images and there are no "clearly safe" assumptions that can be made when it
  comes to migrating labels from one image to another, no matter how "similar"
  the images may be. There is also no value in updating the image; these images
  are not being executed and their only purpose is to survey the matching
  performance of grype. In the philosophy of "maximizing fixed points" it
  doesn't make sense to change these assets. Over time it may be that we remove
  assets that are no longer useful for comparison, but this should rarely be
  done.

- Consider not changing the CVE year max-ceiling (currently set to 2020).
  Pushing this ceiling will likely raise the number of unlabeled matches
  significantly for all images. Only bump this ceiling if all possible matches
  are labeled.
test/quality/gate.py (new executable file): 326 additions

#!/usr/bin/env python3
import logging
import os
import re
import subprocess
import sys
from typing import Optional

import click
from tabulate import tabulate
from dataclasses import dataclass, InitVar, field

import yardstick
from yardstick import store, comparison, artifact, arrange
from yardstick.cli import display, config


# see the .yardstick.yaml configuration for details
default_result_set = "pr_vs_latest_via_sbom"
yardstick.utils.grype_db.raise_on_failure(False)


@dataclass
class Gate:
    label_comparisons: InitVar[Optional[list[comparison.AgainstLabels]]]
    label_comparison_stats: InitVar[Optional[comparison.ImageToolLabelStats]]

    reasons: list[str] = field(default_factory=list)

    def __post_init__(self, label_comparisons: Optional[list[comparison.AgainstLabels]], label_comparison_stats: Optional[comparison.ImageToolLabelStats]):
        if not label_comparisons and not label_comparison_stats:
            return

        reasons = []

        # - fail when current F1 score drops below last release F1 score (or F1 score is indeterminate)
        # - fail when indeterminate % > 10%
        # - fail when there is a rise in FNs
        latest_release_tool, current_tool = guess_tool_orientation(label_comparison_stats.tools)

        latest_release_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == latest_release_tool}
        current_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == current_tool}

        for image, comp in current_comparisons_by_image.items():
            latest_f1_score = latest_release_comparisons_by_image[image].summary.f1_score
            current_f1_score = comp.summary.f1_score
            if current_f1_score < latest_f1_score:
                reasons.append(f"current F1 score is lower than the latest release F1 score: {bcolors.BOLD+bcolors.UNDERLINE}current={current_f1_score:0.2f} latest={latest_f1_score:0.2f}{bcolors.RESET} image={image}")

            if comp.summary.indeterminate_percent > 10:
                reasons.append(f"current indeterminate matches % is greater than 10%: {bcolors.BOLD+bcolors.UNDERLINE}current={comp.summary.indeterminate_percent:0.2f}%{bcolors.RESET} image={image}")

            latest_fns = latest_release_comparisons_by_image[image].summary.false_negatives
            current_fns = comp.summary.false_negatives
            if current_fns > latest_fns:
                reasons.append(f"current false negatives is greater than the latest release false negatives: {bcolors.BOLD+bcolors.UNDERLINE}current={current_fns} latest={latest_fns}{bcolors.RESET} image={image}")

        self.reasons = reasons

    def passed(self):
        return len(self.reasons) == 0


def guess_tool_orientation(tools: list[str]):
    if len(tools) != 2:
        raise RuntimeError("expected 2 tools, got %s" % tools)

    current_tool = None
    latest_release_tool = None
    for tool in tools:
        if tool.endswith("latest"):
            latest_release_tool = tool
            continue
        current_tool = tool

    if latest_release_tool is None:
        # "latest" value isn't accessible, so we do a best guess at which version is latest
        current_tool, latest_release_tool = sorted(tools)

    if current_tool is None:
        raise ValueError("current tool not found")
    return latest_release_tool, current_tool


class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    RESET = '\033[0m'


def show_results_used(results: list[artifact.ScanResult]):
    print(f"   Results used:")
    for idx, result in enumerate(results):
        branch = "├──"
        if idx == len(results) - 1:
            branch = "└──"
        print(f"    {branch} {result.ID} : {result.config.tool} against {result.config.image}")
    print()


def validate(cfg: config.Application, result_set: str, images: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None):
    print(f"{bcolors.HEADER}{bcolors.BOLD}Validating with {result_set!r}", bcolors.RESET)
    result_set_obj = store.result_set.load(name=result_set)

    ret = []
    for image, result_states in result_set_obj.result_state_by_image.items():
        if images and image not in images:
            print("Skipping image:", image)
            continue
        print()
        print("Testing image:", image)
        for state in result_states:
            print("   ", f"with {state.request.tool}")
        print()

        gate = validate_image(cfg, [s.config.path for s in result_states], always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries)
        ret.append(gate)

        failure = not gate.passed()
        if failure:
            print(f"{bcolors.FAIL}{bcolors.BOLD}Failed quality gate{bcolors.RESET}")
            for reason in gate.reasons:
                print(f"   - {reason}")

        print()
        size = 120
        print("▁"*size)
        print("░"*size)
        print("▔"*size)
    return ret


def validate_image(cfg: config.Application, descriptions: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None):
    # do a relative comparison
    # - show comparison summary (no gating action)
    # - list out all individual match differences

    print(f"{bcolors.HEADER}Running relative comparison...", bcolors.RESET)
    relative_comparison = yardstick.compare_results(descriptions=descriptions, year_max_limit=cfg.default_max_year)
    show_results_used(relative_comparison.results)

    # show the relative comparison results
    if verbosity > 0:
        details = verbosity > 1
        display.preserved_matches(relative_comparison, details=details, summary=True, common=False)
        print()

    # bail if there are no differences found
    if not always_run_label_comparison and not sum([len(relative_comparison.unique[result.ID]) for result in relative_comparison.results]):
        print("no differences found between tool results")
        return Gate(None, None)

    # do a label comparison
    print(f"{bcolors.HEADER}Running comparison against labels...", bcolors.RESET)
    results, label_entries, comparisons_by_result_id, stats_by_image_tool_pair = yardstick.compare_results_against_labels(descriptions=descriptions, year_max_limit=cfg.default_max_year, label_entries=label_entries)
    show_results_used(results)

    if verbosity > 0:
        show_fns = verbosity > 1
        display.label_comparison(
            results,
            comparisons_by_result_id,
            stats_by_image_tool_pair,
            show_fns=show_fns,
            show_summaries=True,
        )

    latest_release_tool, current_tool = guess_tool_orientation([r.config.tool for r in results])

    # show the relative comparison unique differences paired up with label conclusions (TP/FP/FN/TN/Unknown)
    all_rows: list[list[Any]] = []
    for result in relative_comparison.results:
        label_comparison = comparisons_by_result_id[result.ID]
        for unique_match in relative_comparison.unique[result.ID]:
            labels = label_comparison.labels_by_match[unique_match.ID]
            if not labels:
                label = "(unknown)"
            elif len(set(labels)) > 1:
                label = ", ".join([l.name for l in labels])
            else:
                label = labels[0].name

            color = ""
            commentary = ""
            if result.config.tool == latest_release_tool:
                # the tool which found the unique result is the latest release tool...
                if label == artifact.Label.TruePositive.name:
                    # drats! we missed a case (this is a new FN)
                    color = bcolors.FAIL
                    commentary = "(this is a new FN 😱)"
                elif artifact.Label.FalsePositive.name in label:
                    # we got rid of a FP! ["hip!", "hip!"]
                    color = bcolors.OKBLUE
                    commentary = "(got rid of a former FP 🙌)"
            else:
                # the tool which found the unique result is the current tool...
                if label == artifact.Label.TruePositive.name:
                    # highest of fives! we found a new TP that the previous tool release missed!
                    color = bcolors.OKBLUE
                    commentary = "(this is a new TP 🙌)"
                elif artifact.Label.FalsePositive.name in label:
                    # welp, our changes resulted in a new FP... not great, maybe not terrible?
                    color = bcolors.FAIL
                    commentary = "(this is a new FP 😱)"

            all_rows.append(
                [
                    f"{color}{result.config.tool} ONLY{bcolors.RESET}",
                    f"{color}{unique_match.package.name}@{unique_match.package.version}{bcolors.RESET}",
                    f"{color}{unique_match.vulnerability.id}{bcolors.RESET}",
                    f"{color}{label}{bcolors.RESET}",
                    f"{commentary}",
                ]
            )

    def escape_ansi(line):
        ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]')
        return ansi_escape.sub('', line)

    # sort but don't consider ansi escape codes
    all_rows = sorted(all_rows, key=lambda x: escape_ansi(str(x[0]+x[1]+x[2]+x[3])))
    if len(all_rows) == 0:
        print("No differences found between tooling (with labels)")
    else:
        print("Match differences between tooling (with labels):")
        indent = "   "
        print(indent + tabulate([["TOOL PARTITION", "PACKAGE", "VULNERABILITY", "LABEL", "COMMENTARY"]]+all_rows, tablefmt="plain").replace("\n", "\n" + indent) + "\n")

    # populate the quality gate with data that can evaluate pass/fail conditions
    return Gate(label_comparisons=comparisons_by_result_id.values(), label_comparison_stats=stats_by_image_tool_pair)


@click.command()
@click.option("--image", "-i", "images", multiple=True, help="filter down to one or more images to validate with (don't use the full result set)")
@click.option("--label-comparison", "-l", "always_run_label_comparison", is_flag=True, help="run label comparison irregardless of relative comparison results")
@click.option("--breakdown-by-ecosystem", "-e", is_flag=True, help="show label comparison results broken down by ecosystem")
@click.option("--verbose", "-v", "verbosity", count=True, help="show details of all comparisons")
@click.option("--result-set", "-r", default=default_result_set, help="the result set to use for the quality gate")
def main(images: list[str], always_run_label_comparison: bool, breakdown_by_ecosystem: bool, verbosity: int, result_set: str):
    cfg = config.load()
    setup_logging(verbosity)

    # let's not load any more labels than we need to, base this off of the images we're validating
    if not images:
        images = set()
        result_set_obj = store.result_set.load(name=result_set)
        for state in result_set_obj.state:
            images.add(state.config.image)
        images = sorted(list(images))

    print("Loading label entries...", end=" ")
    label_entries = store.labels.load_for_image(images, year_max_limit=cfg.default_max_year)
    print(f"done! {len(label_entries)} entries loaded")

    result_sets = [result_set]  # today only one result set is supported, but more can be added
    gates = []
    for result_set in result_sets:
        gates.extend(validate(cfg, result_set, images=images, always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries))
        print()

        if breakdown_by_ecosystem:
            print(f"{bcolors.HEADER}Breaking down label comparison by ecosystem performance...", bcolors.RESET)
            results_by_image, label_entries, stats = yardstick.compare_results_against_labels_by_ecosystem(result_set=result_set, year_max_limit=cfg.default_max_year, label_entries=label_entries)
            display.labels_by_ecosystem_comparison(
                results_by_image,
                stats,
                show_images_used=False,
            )
            print()

    failure = not all([gate.passed() for gate in gates])
    if failure:
        print("Reasons for quality gate failure:")
        for gate in gates:
            for reason in gate.reasons:
                print(f"   - {reason}")

    if failure:
        print()
        print(f"{bcolors.FAIL}{bcolors.BOLD}Quality gate FAILED{bcolors.RESET}")
        sys.exit(1)
    else:
        print(f"{bcolors.OKGREEN}{bcolors.BOLD}Quality gate passed!{bcolors.RESET}")


def setup_logging(verbosity: int):
    # pylint: disable=redefined-outer-name, import-outside-toplevel
    import logging.config

    if verbosity in [0, 1, 2]:
        log_level = "WARN"
    elif verbosity == 3:
        log_level = "INFO"
    else:
        log_level = "DEBUG"

    logging.config.dictConfig(
        {
            "version": 1,
            "formatters": {
                "standard": {
                    # [%(module)s.%(funcName)s]
                    "format": "%(asctime)s [%(levelname)s] %(message)s",
                    "datefmt": "",
                },
            },
            "handlers": {
                "default": {
                    "level": log_level,
                    "formatter": "standard",
                    "class": "logging.StreamHandler",
                    "stream": "ext://sys.stderr",
                },
            },
            "loggers": {
                "": {  # root logger
                    "handlers": ["default"],
                    "level": log_level,
                },
            },
        }
    )


if __name__ == '__main__':
    main()
test/quality/requirements.txt (new file): 3 additions

git+https://github.com/anchore/yardstick@4526ad2ff6d33d34e900ed692c3a90adc80eab73
# ../../../yardstick
tabulate==0.8.10
test/quality/vulnerability-match-labels (new submodule): 1 addition

Subproject commit 3a2ecc336411ddc3f37b7d5c123b80f6848a2cf3