From d4587ddeecad8b134bf3d104ab58a13f62bae836 Mon Sep 17 00:00:00 2001 From: Alex Goodman Date: Wed, 5 Oct 2022 16:26:26 -0400 Subject: [PATCH] Add in-depth quality gate checks (#949) * add in-depth quality gate checks Signed-off-by: Alex Goodman * add quality tests to PR checks Signed-off-by: Alex Goodman Signed-off-by: Alex Goodman --- .github/workflows/validations.yaml | 53 ++++ .gitmodules | 4 + Makefile | 5 + test/quality/.gitignore | 7 + test/quality/.yardstick.yaml | 45 ++++ test/quality/.yardstick/labels | 1 + test/quality/Makefile | 68 +++++ test/quality/README.md | 140 ++++++++++ test/quality/gate.py | 326 ++++++++++++++++++++++++ test/quality/requirements.txt | 3 + test/quality/vulnerability-match-labels | 1 + 11 files changed, 653 insertions(+) create mode 100644 .gitmodules create mode 100644 test/quality/.gitignore create mode 100644 test/quality/.yardstick.yaml create mode 120000 test/quality/.yardstick/labels create mode 100644 test/quality/Makefile create mode 100644 test/quality/README.md create mode 100755 test/quality/gate.py create mode 100644 test/quality/requirements.txt create mode 160000 test/quality/vulnerability-match-labels diff --git a/.github/workflows/validations.yaml b/.github/workflows/validations.yaml index f44e016f..594b8cbe 100644 --- a/.github/workflows/validations.yaml +++ b/.github/workflows/validations.yaml @@ -9,6 +9,7 @@ on: env: GO_VERSION: "1.18.x" GO_STABLE_VERSION: true + PYTHON_VERSION: "3.10" jobs: Static-Analysis: @@ -100,6 +101,58 @@ jobs: name: unit-test-results path: test/results/**/* + Quality-Test: + # Note: changing this job name requires making the same update in the .github/workflows/release.yaml pipeline + name: "Quality tests" + runs-on: ubuntu-20.04 + steps: + - uses: actions/setup-go@v2 + with: + go-version: ${{ env.GO_VERSION }} + stable: ${{ env.GO_STABLE_VERSION }} + + - uses: actions/setup-python@v2 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - uses: actions/checkout@v2 + with: + submodules: true + + - name: Restore tool cache + id: tool-cache + uses: actions/cache@v2.1.3 + with: + path: ${{ github.workspace }}/.tmp + key: ${{ runner.os }}-tool-${{ hashFiles('Makefile') }} + + - name: Restore go cache + id: go-cache + uses: actions/cache@v2.1.3 + with: + path: ~/go/pkg/mod + key: ${{ runner.os }}-go-${{ env.GO_VERSION }}-${{ hashFiles('**/go.sum') }} + restore-keys: | + ${{ runner.os }}-go-${{ env.GO_VERSION }}- + + - name: Restore python cache + id: python-cache + uses: actions/cache@v2.1.3 + with: + path: | + test/quality/venv + test/quality/vulnerability-match-labels/venv + key: ${{ runner.os }}-go-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/test/quality/**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-go-${{ env.PYTHON_VERSION }}- + + - name: (cache-miss) Bootstrap all project dependencies + if: steps.tool-cache.outputs.cache-hit != 'true' || steps.go-cache.outputs.cache-hit != 'true' + run: make bootstrap + + - name: Run quality tests + run: make quality + Integration-Test: # Note: changing this job name requires making the same update in the .github/workflows/release.yaml pipeline name: "Integration tests" diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..281e4b33 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "test/quality/vulnerability-match-labels"] + path = test/quality/vulnerability-match-labels + url = git@github.com:anchore/vulnerability-match-labels.git + branch = main diff --git a/Makefile b/Makefile index a70370f9..251e6cec 100644 --- a/Makefile 
+++ b/Makefile @@ -168,6 +168,11 @@ unit: ## Run unit tests (with coverage) @echo "Coverage: $$(cat $(COVER_TOTAL))" @if [ $$(echo "$$(cat $(COVER_TOTAL)) >= $(COVERAGE_THRESHOLD)" | bc -l) -ne 1 ]; then echo "$(RED)$(BOLD)Failed coverage quality gate (> $(COVERAGE_THRESHOLD)%)$(RESET)" && false; fi +.PHONY: quality +quality: ## Run quality tests + $(call title,Running quality tests) + cd test/quality && make + # note: this is used by CI to determine if the install test fixture cache (docker image tars) should be busted install-fingerprint: cd test/install && \ diff --git a/test/quality/.gitignore b/test/quality/.gitignore new file mode 100644 index 00000000..92d889aa --- /dev/null +++ b/test/quality/.gitignore @@ -0,0 +1,7 @@ +venv +.yardstick/tools +.yardstick/result +stage +pull +migrate.py +.oras-cache \ No newline at end of file diff --git a/test/quality/.yardstick.yaml b/test/quality/.yardstick.yaml new file mode 100644 index 00000000..6c072e70 --- /dev/null +++ b/test/quality/.yardstick.yaml @@ -0,0 +1,45 @@ +x-ref: + # note: always reference images with BOTH a tag and a digest + images: &images + - docker.io/cloudbees/cloudbees-core-mm:2.277.3.1@sha256:4c564f473d38f23da1caa48c4ef53b958ef03d279232007ad3319b1f38584bdb + - docker.io/anchore/test_images:grype-quality-node-d89207b@sha256:f56164678054e5eb59ab838367373a49df723b324617b1ba6de775749d7f91d4 + - docker.io/anchore/test_images:grype-quality-python-d89207b@sha256:b2b58a55c0b03c1626d2aaae2add9832208b02124dda7b7b41811e14f0fb272c + - docker.io/anchore/test_images:grype-quality-java-d89207b@sha256:b3534fc2e37943136d5b54e3a58b55d4ccd4363d926cf7aa5bf55a524cf8275b + - docker.io/anchore/test_images:grype-quality-golang-d89207b@sha256:7536ee345532f674ec9e448e3768db4e546c48220ba2b6ec9bc9cfbfb3b7b74a + - docker.io/anchore/test_images:grype-quality-ruby-d89207b@sha256:1a5a5f870924e88a6f0f2b8089cf276ef0a79b5244a052cdfe4a47bb9e5a2c10 + - docker.io/alpine:3.2@sha256:ddac200f3ebc9902fb8cfcd599f41feb2151f1118929da21bcef57dc276975f9 + - docker.io/centos:6@sha256:3688aa867eb84332460e172b9250c9c198fdfd8d987605fd53f246f498c60bcf + - docker.io/ubuntu:16.10@sha256:8dc9652808dc091400d7d5983949043a9f9c7132b15c14814275d25f94bca18a + - docker.io/almalinux:8@sha256:cd49d7250ed7bb194d502d8a3e50bd775055ca275d1d9c2785aea72b890afe6a + - docker.io/rockylinux:8@sha256:72afc2e1a20c9ddf56a81c51148ebcbe927c0a879849efe813bee77d69df1dd8 + - docker.io/oraclelinux:6@sha256:a06327c0f1d18d753f2a60bb17864c84a850bb6dcbcf5946dd1a8123f6e75495 + - docker.io/debian:7@sha256:81e88820a7759038ffa61cff59dfcc12d3772c3a2e75b7cfe963c952da2ad264 + - docker.io/busybox:1.28.1@sha256:2107a35b58593c58ec5f4e8f2c4a70d195321078aebfadfbfb223a2ff4a4ed21 + - docker.io/amazonlinux:2@sha256:1301cc9f889f21dc45733df9e58034ac1c318202b4b0f0a08d88b3fdc03004de + - registry.access.redhat.com/ubi8@sha256:68fecea0d255ee253acbf0c860eaebb7017ef5ef007c25bee9eeffd29ce85b29 + +# new vulnerabilities are added all of the time, instead of keeping up it's easier to ignore newer entries. +# This approach helps tremendously with keeping the analysis relatively stable. 
+default_max_year: 2020
+
+result-sets:
+  pr_vs_latest_via_sbom:
+    description: "latest released grype vs grype from the current build (via SBOM ingestion)"
+    matrix:
+      images: *images
+
+      tools:
+
+        - name: syft
+          # note: we want to use a fixed version of syft for capturing all results (NOT "latest")
+          version: v0.54.0
+          produces: SBOM
+          refresh: false
+
+        - name: grype
+          version: git:current-commit
+          takes: SBOM
+
+        - name: grype
+          version: latest
+          takes: SBOM
diff --git a/test/quality/.yardstick/labels b/test/quality/.yardstick/labels
new file mode 120000
index 00000000..4a9cf095
--- /dev/null
+++ b/test/quality/.yardstick/labels
@@ -0,0 +1 @@
+../vulnerability-match-labels/labels
\ No newline at end of file
diff --git a/test/quality/Makefile b/test/quality/Makefile
new file mode 100644
index 00000000..65e1c96c
--- /dev/null
+++ b/test/quality/Makefile
@@ -0,0 +1,68 @@
+SBOM_STORE_TAG = md5-$(shell md5sum .yardstick.yaml | cut -d' ' -f1)
+SBOM_STORE_IMAGE = ghcr.io/anchore/grype/quality-test-sbom-store:$(SBOM_STORE_TAG)
+ACTIVATE_VENV = . venv/bin/activate &&
+YARDSTICK = $(ACTIVATE_VENV) yardstick -v
+YARDSTICK_RESULT_DIR = .yardstick/result
+YARDSTICK_LABELS_DIR = .yardstick/labels
+VULNERABILITY_LABELS = ./vulnerability-labels
+RESULT_SET = pr_vs_latest_via_sbom
+
+# formatting variables
+BOLD := $(shell tput -T linux bold)
+PURPLE := $(shell tput -T linux setaf 5)
+GREEN := $(shell tput -T linux setaf 2)
+CYAN := $(shell tput -T linux setaf 6)
+RED := $(shell tput -T linux setaf 1)
+RESET := $(shell tput -T linux sgr0)
+TITLE := $(BOLD)$(PURPLE)
+SUCCESS := $(BOLD)$(GREEN)
+
+.PHONY: all
+all: capture validate ## Fetch or capture all data and run all quality checks
+
+.PHONY: validate
+validate: venv $(VULNERABILITY_LABELS) ## Run all quality checks against already collected data
+	$(ACTIVATE_VENV) ./gate.py
+
+.PHONY: capture
+capture: sboms vulns ## Collect and store all syft and grype results
+
+.PHONY: vulns
+vulns: venv ## Collect and store all grype results
+	$(YARDSTICK) -v result capture -r $(RESULT_SET)
+
+.PHONY: sboms
+sboms: $(YARDSTICK_RESULT_DIR) venv clear-results ## Collect and store all syft results (deletes all existing results)
+	bash -c "make download-sboms || ($(YARDSTICK) -v result capture -r $(RESULT_SET) --only-producers)"
+
+.PHONY: download-sboms
+download-sboms:
+	cd vulnerability-match-labels && make venv
+	bash -c "export ORAS_CACHE=$(shell pwd)/.oras-cache && make venv && . 
vulnerability-match-labels/venv/bin/activate && ./vulnerability-match-labels/sboms.py download -r $(RESULT_SET)"
+
+venv: venv/touchfile
+
+venv/touchfile: requirements.txt
+	test -d venv || python3 -m venv venv
+	$(ACTIVATE_VENV) pip install -Ur requirements.txt
+	touch venv/touchfile
+
+
+$(YARDSTICK_RESULT_DIR):
+	mkdir -p $(YARDSTICK_RESULT_DIR)
+
+$(VULNERABILITY_LABELS):
+	git submodule update vulnerability-match-labels
+
+.PHONY: clear-results
+clear-results: venv ## Clear all existing yardstick results
+	$(YARDSTICK) result clear
+
+.PHONY: clean
+clean: clear-results ## Clear all existing yardstick results and delete python environment
+	rm -rf venv
+	find -iname "*.pyc" -delete
+
+help:
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "$(BOLD)$(CYAN)%-25s$(RESET)%s\n", $$1, $$2}'
+
diff --git a/test/quality/README.md b/test/quality/README.md
new file mode 100644
index 00000000..5acb0dac
--- /dev/null
+++ b/test/quality/README.md
@@ -0,0 +1,140 @@
+# Match quality testing
+
+This form of testing compares the results from various releases of grype using a
+static set of reference container images. The kinds of comparisons made are:
+
+1) "relative": find the vulnerability matching differences between both tools
+   for a given image. This helps identify when a change has occurred in matching
+   behavior and where the changes are.
+
+2) "against labels": pair each tool's results for an image with ground truth. This
+   helps identify how well the matching behavior is performing (did it get
+   better or worse?).
+
+
+## Getting started
+
+To capture raw tool output and store it in the local `.yardstick` directory for
+further analysis:
+```
+make capture
+```
+
+To analyze the tool output and evaluate a pass/fail result:
+```
+make validate
+```
+
+A pass/fail result is shown in the output, with the reasons for any failure
+listed explicitly.
+
+
+## What are the quality gate criteria?
+
+The label comparison results are used to determine a pass/fail result,
+specifically with the following criteria:
+
+ - fail when the current grype F1 score drops below the last grype release F1 score (or
+   the F1 score is indeterminate)
+ - fail when the indeterminate matches % > 10% in the current grype results
+ - fail when there is a rise in FNs relative to the results from the last grype
+   release
+ - otherwise, pass
+
+F1 score is the primary way that tool matching performance is characterized. F1
+score combines the TP, FP, and FN counts into a single metric between 0 and 1.
+Ideally the F1 score for an image-tool pair should be 1. F1 score is a good way
+to summarize the matching performance but does not explain why the matching
+performance is what it is.
+
+Indeterminate matches are matches from results that could not be paired with a
+label (TP or FP). This could also mean that multiple conflicting labels were
+found for a single match. The more indeterminate matches there are, the less
+confident you can be about the F1 score. Ideally there should be 0 indeterminate
+matches, but this is difficult to achieve since vulnerability data is constantly
+changing.
+
+False negatives represent matches that should have been made by the tool but
+were missed. We should always make certain that this value does not increase
+between releases of grype.
+
+## Assumptions
+
+1. **Comparing vulnerability results taken at different times is invalid**.
+   We leverage the yardstick result-set feature to capture all vulnerability
+   results at one time for a specific image and tool set. Why? If we run grype
+   at version `a` on Monday and grype at version `b` on Tuesday and then compare
+   the results, it will not be immediately clear why any differences exist. It
+   is entirely possible that the vulnerability database from the `b` run simply
+   had more up-to-date information; running `grype@a` at the same time (on
+   Tuesday) almost entirely eliminates that explanation.
+
+2. **Comparing vulnerability results across images with different digests is invalid**.
+   It may be very tempting to compare vulnerability results for
+   `alpine:3.2` from Monday and `alpine:3.2` from Tuesday to see if there are
+   any changes. However, this is potentially inaccurate: the image references
+   use the same tag, but the publisher may have pushed a new image with
+   differing content. Any change could lead to different vulnerability matching
+   results, but we are only interested in vulnerability match differences that
+   are due to actionable reasons (grype matcher logic problems or [SBOM] input
+   data into matchers).
+
+## Approach
+
+Vulnerability matching has essentially two inputs:
+
+- the packages that were found in the scanned artifact
+
+- the vulnerability data from upstream providers (e.g. NVD, GHSA, etc.)
+
+
+These are both moving targets!
+
+
+We may implement more catalogers in syft that surface more packages over time
+(for the same artifact scanned). Also, the world is continually finding and
+reporting new vulnerabilities. The more moving parts there are in this form of
+testing, the harder it is to come to a conclusion about the actual quality of
+the output over time.
+
+
+To reduce this erosion of value over time, we've decided to turn as many moving
+targets into fixed targets as possible:
+
+- Vulnerability results beyond a particular year are ignored (the current config
+  allows for <= 2020). Though there are still retroactive CVEs created, this
+  helps a lot in terms of keeping vulnerability results relatively stable.
+
+- SBOMs are used as input into grype instead of the raw container images. This
+  allows the artifacts under test to remain truly fixed and saves a lot of time
+  when capturing grype results (as the container image is no longer needed
+  during analysis).
+
+- For the captured SBOMs, container images must be referenced with a digest, not
+  just a tag. If we update a tool version (say syft), we want to be certain that
+  we are scanning the exact same artifact when we re-run the analysis later.
+
+- Versions of the tools used are fixed to a specific `major.minor.patch` release.
+  This allows us to account for capability differences between tool runs.
+
+
+To reduce the maintenance effort of this comparison over time, there are a few
+things to keep in mind:
+
+- Once an image is labeled (at a specific digest) the image digest should be
+  considered immutable (never updated). Why? It takes a lot of effort to label
+  images and there are no "clearly safe" assumptions that can be made when it
+  comes to migrating labels from one image to another, no matter how "similar"
+  the images may be. There is also no value in updating the image; these images
+  are not being executed and their only purpose is to survey the matching
+  performance of grype. In the philosophy of "maximizing fixed points" it
+  doesn't make sense to change these assets.
+  Over time it may be that we remove assets that are no longer useful for
+  comparison, but this should rarely be done.
+
+- Consider not changing the CVE year max-ceiling (currently set to 2020).
+  Pushing this ceiling will likely raise the number of unlabeled matches
+  significantly for all images. Only bump this ceiling if all possible matches
+  are labeled.
diff --git a/test/quality/gate.py b/test/quality/gate.py
new file mode 100755
index 00000000..000c0fc2
--- /dev/null
+++ b/test/quality/gate.py
@@ -0,0 +1,326 @@
+#!/usr/bin/env python3
+import logging
+import os
+import re
+import subprocess
+import sys
+from typing import Any, Optional
+
+import click
+from tabulate import tabulate
+from dataclasses import dataclass, InitVar, field
+
+import yardstick
+from yardstick import store, comparison, artifact, arrange
+from yardstick.cli import display, config
+
+
+# see the .yardstick.yaml configuration for details
+default_result_set = "pr_vs_latest_via_sbom"
+yardstick.utils.grype_db.raise_on_failure(False)
+
+@dataclass
+class Gate:
+    label_comparisons: InitVar[Optional[list[comparison.AgainstLabels]]]
+    label_comparison_stats: InitVar[Optional[comparison.ImageToolLabelStats]]
+
+    reasons: list[str] = field(default_factory=list)
+
+    def __post_init__(self, label_comparisons: Optional[list[comparison.AgainstLabels]], label_comparison_stats: Optional[comparison.ImageToolLabelStats]):
+        if not label_comparisons and not label_comparison_stats:
+            return
+
+        reasons = []
+
+        # - fail when current F1 score drops below last release F1 score (or F1 score is indeterminate)
+        # - fail when indeterminate % > 10%
+        # - fail when there is a rise in FNs
+        latest_release_tool, current_tool = guess_tool_orientation(label_comparison_stats.tools)
+
+        latest_release_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == latest_release_tool }
+        current_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == current_tool }
+
+        for image, comp in current_comparisons_by_image.items():
+            latest_f1_score = latest_release_comparisons_by_image[image].summary.f1_score
+            current_f1_score = comp.summary.f1_score
+            if current_f1_score < latest_f1_score:
+                reasons.append(f"current F1 score is lower than the latest release F1 score: {bcolors.BOLD+bcolors.UNDERLINE}current={current_f1_score:0.2f} latest={latest_f1_score:0.2f}{bcolors.RESET} image={image}")
+
+            if comp.summary.indeterminate_percent > 10:
+                reasons.append(f"current indeterminate matches % is greater than 10%: {bcolors.BOLD+bcolors.UNDERLINE}current={comp.summary.indeterminate_percent:0.2f}%{bcolors.RESET} image={image}")
+
+            latest_fns = latest_release_comparisons_by_image[image].summary.false_negatives
+            current_fns = comp.summary.false_negatives
+            if current_fns > latest_fns:
+                reasons.append(f"current false negatives are greater than the latest release false negatives: {bcolors.BOLD+bcolors.UNDERLINE}current={current_fns} latest={latest_fns}{bcolors.RESET} image={image}")
+
+        self.reasons = reasons
+
+    def passed(self):
+        return len(self.reasons) == 0
+
+def guess_tool_orientation(tools: list[str]):
+    if len(tools) != 2:
+        raise RuntimeError("expected 2 tools, got %s" % tools)
+
+    current_tool = None
+    latest_release_tool = None
+    for tool in tools:
+        if tool.endswith("latest"):
+            latest_release_tool = tool
+            continue
+        current_tool = tool
+
+    if latest_release_tool is None:
+        # "latest" value isn't accessible, so we do a best guess at which version is latest
+        
current_tool, latest_release_tool = sorted(tools) + + if current_tool is None: + raise ValueError("current tool not found") + return latest_release_tool, current_tool + +class bcolors: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKCYAN = '\033[96m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + RESET = '\033[0m' + +def show_results_used(results: list[artifact.ScanResult]): + print(f" Results used:") + for idx, result in enumerate(results): + branch = "├──" + if idx == len(results) - 1: + branch = "└──" + print(f" {branch} {result.ID} : {result.config.tool} against {result.config.image}") + print() + +def validate(cfg: config.Application, result_set: str, images: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None): + print(f"{bcolors.HEADER}{bcolors.BOLD}Validating with {result_set!r}", bcolors.RESET) + result_set_obj = store.result_set.load(name=result_set) + + ret = [] + for image, result_states in result_set_obj.result_state_by_image.items(): + if images and image not in images: + print("Skipping image:", image) + continue + print() + print("Testing image:", image) + for state in result_states: + print(" ", f"with {state.request.tool}") + print() + + gate = validate_image(cfg, [s.config.path for s in result_states], always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries) + ret.append(gate) + + failure = not gate.passed() + if failure: + print(f"{bcolors.FAIL}{bcolors.BOLD}Failed quality gate{bcolors.RESET}") + for reason in gate.reasons: + print(f" - {reason}") + + print() + size = 120 + print("▁"*size) + print("░"*size) + print("▔"*size) + return ret + +def validate_image(cfg: config.Application, descriptions: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None): + # do a relative comparison + # - show comparison summary (no gating action) + # - list out all individual match differences + + print(f"{bcolors.HEADER}Running relative comparison...", bcolors.RESET) + relative_comparison = yardstick.compare_results(descriptions=descriptions, year_max_limit=cfg.default_max_year) + show_results_used(relative_comparison.results) + + # show the relative comparison results + if verbosity > 0: + details = verbosity > 1 + display.preserved_matches(relative_comparison, details=details, summary=True, common=False) + print() + + # bail if there are no differences found + if not always_run_label_comparison and not sum([len(relative_comparison.unique[result.ID]) for result in relative_comparison.results]): + print("no differences found between tool results") + return Gate(None, None) + + # do a label comparison + print(f"{bcolors.HEADER}Running comparison against labels...", bcolors.RESET) + results, label_entries, comparisons_by_result_id, stats_by_image_tool_pair = yardstick.compare_results_against_labels(descriptions=descriptions, year_max_limit=cfg.default_max_year, label_entries=label_entries) + show_results_used(results) + + if verbosity > 0: + show_fns = verbosity > 1 + display.label_comparison( + results, + comparisons_by_result_id, + stats_by_image_tool_pair, + show_fns=show_fns, + show_summaries=True, + ) + + latest_release_tool, current_tool = guess_tool_orientation([r.config.tool for r in results]) + + # show the relative comparison unique differences paired up with label conclusions (TP/FP/FN/TN/Unknown) + all_rows: list[list[Any]] = [] + 
for result in relative_comparison.results: + label_comparison = comparisons_by_result_id[result.ID] + for unique_match in relative_comparison.unique[result.ID]: + labels = label_comparison.labels_by_match[unique_match.ID] + if not labels: + label = "(unknown)" + elif len(set(labels)) > 1: + label = ", ".join([l.name for l in labels]) + else: + label = labels[0].name + + + color = "" + commentary = "" + if result.config.tool == latest_release_tool: + # the tool which found the unique result is the latest release tool... + if label == artifact.Label.TruePositive.name: + # drats! we missed a case (this is a new FN) + color = bcolors.FAIL + commentary = "(this is a new FN 😱)" + elif artifact.Label.FalsePositive.name in label: + # we got rid of a FP! ["hip!", "hip!"] + color = bcolors.OKBLUE + commentary = "(got rid of a former FP 🙌)" + else: + # the tool which found the unique result is the current tool... + if label == artifact.Label.TruePositive.name: + # highest of fives! we found a new TP that the previous tool release missed! + color = bcolors.OKBLUE + commentary = "(this is a new TP 🙌)" + elif artifact.Label.FalsePositive.name in label: + # welp, our changes resulted in a new FP... not great, maybe not terrible? + color = bcolors.FAIL + commentary = "(this is a new FP 😱)" + + all_rows.append( + [ + f"{color}{result.config.tool} ONLY{bcolors.RESET}", + f"{color}{unique_match.package.name}@{unique_match.package.version}{bcolors.RESET}", + f"{color}{unique_match.vulnerability.id}{bcolors.RESET}", + f"{color}{label}{bcolors.RESET}", + f"{commentary}", + ] + ) + + def escape_ansi(line): + ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]') + return ansi_escape.sub('', line) + + # sort but don't consider ansi escape codes + all_rows = sorted(all_rows, key=lambda x: escape_ansi(str(x[0]+x[1]+x[2]+x[3]))) + if len(all_rows) == 0: + print("No differences found between tooling (with labels)") + else: + print("Match differences between tooling (with labels):") + indent = " " + print(indent + tabulate([["TOOL PARTITION", "PACKAGE", "VULNERABILITY", "LABEL", "COMMENTARY"]]+all_rows, tablefmt="plain").replace("\n", "\n" + indent) + "\n") + + + # populate the quality gate with data that can evaluate pass/fail conditions + return Gate(label_comparisons=comparisons_by_result_id.values(), label_comparison_stats=stats_by_image_tool_pair) + +@click.command() +@click.option("--image", "-i", "images", multiple=True, help="filter down to one or more images to validate with (don't use the full result set)") +@click.option("--label-comparison", "-l", "always_run_label_comparison", is_flag=True, help="run label comparison irregardless of relative comparison results") +@click.option("--breakdown-by-ecosystem", "-e", is_flag=True, help="show label comparison results broken down by ecosystem") +@click.option("--verbose", "-v", "verbosity", count=True, help="show details of all comparisons") +@click.option("--result-set", "-r", default=default_result_set, help="the result set to use for the quality gate") +def main(images: list[str], always_run_label_comparison: bool, breakdown_by_ecosystem: bool, verbosity: int, result_set: str): + cfg = config.load() + setup_logging(verbosity) + + # let's not load any more labels than we need to, base this off of the images we're validating + if not images: + images = set() + result_set_obj = store.result_set.load(name=result_set) + for state in result_set_obj.state: + images.add(state.config.image) + images = sorted(list(images)) + + print("Loading label 
entries...", end=" ") + label_entries = store.labels.load_for_image(images, year_max_limit=cfg.default_max_year) + print(f"done! {len(label_entries)} entries loaded") + + result_sets = [result_set] # today only one result set is supported, but more can be added + gates = [] + for result_set in result_sets: + gates.extend(validate(cfg, result_set, images=images, always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries)) + print() + + if breakdown_by_ecosystem: + print(f"{bcolors.HEADER}Breaking down label comparison by ecosystem performance...", bcolors.RESET) + results_by_image, label_entries, stats = yardstick.compare_results_against_labels_by_ecosystem(result_set=result_set, year_max_limit=cfg.default_max_year, label_entries=label_entries) + display.labels_by_ecosystem_comparison( + results_by_image, + stats, + show_images_used=False, + ) + print() + + failure = not all([gate.passed() for gate in gates]) + if failure: + print("Reasons for quality gate failure:") + for gate in gates: + for reason in gate.reasons: + print(f" - {reason}") + + if failure: + print() + print(f"{bcolors.FAIL}{bcolors.BOLD}Quality gate FAILED{bcolors.RESET}") + sys.exit(1) + else: + print(f"{bcolors.OKGREEN}{bcolors.BOLD}Quality gate passed!{bcolors.RESET}") + + +def setup_logging(verbosity: int): + # pylint: disable=redefined-outer-name, import-outside-toplevel + import logging.config + + if verbosity in [0, 1, 2]: + log_level = "WARN" + elif verbosity == 3: + log_level = "INFO" + else: + log_level = "DEBUG" + + logging.config.dictConfig( + { + "version": 1, + "formatters": { + "standard": { + # [%(module)s.%(funcName)s] + "format": "%(asctime)s [%(levelname)s] %(message)s", + "datefmt": "", + }, + }, + "handlers": { + "default": { + "level": log_level, + "formatter": "standard", + "class": "logging.StreamHandler", + "stream": "ext://sys.stderr", + }, + }, + "loggers": { + "": { # root logger + "handlers": ["default"], + "level": log_level, + }, + }, + } + ) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/test/quality/requirements.txt b/test/quality/requirements.txt new file mode 100644 index 00000000..e33ab6dd --- /dev/null +++ b/test/quality/requirements.txt @@ -0,0 +1,3 @@ +git+https://github.com/anchore/yardstick@4526ad2ff6d33d34e900ed692c3a90adc80eab73 +# ../../../yardstick +tabulate==0.8.10 \ No newline at end of file diff --git a/test/quality/vulnerability-match-labels b/test/quality/vulnerability-match-labels new file mode 160000 index 00000000..3a2ecc33 --- /dev/null +++ b/test/quality/vulnerability-match-labels @@ -0,0 +1 @@ +Subproject commit 3a2ecc336411ddc3f37b7d5c123b80f6848a2cf3
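Supplementary sketch (not part of the patch): the pass/fail criteria described in `test/quality/README.md` above boil down to three per-image checks in `gate.py`. The snippet below illustrates that logic with hypothetical counts; in the real gate these values come from yardstick's label-comparison summaries, and the helper shown here is only an illustration.

```python
# Illustrative only -- the counts below are made up; gate.py derives the real
# values from yardstick's AgainstLabels summaries per image.

def f1_score(tp: int, fp: int, fn: int) -> float:
    """F1 = 2*TP / (2*TP + FP + FN); 1.0 means every label was matched with no FPs or FNs."""
    denominator = 2 * tp + fp + fn
    return 2 * tp / denominator if denominator else 0.0

latest_release = {"tp": 90, "fp": 10, "fn": 5, "indeterminate_percent": 4.0}
current_build = {"tp": 88, "fp": 9, "fn": 8, "indeterminate_percent": 12.0}

reasons = []
if f1_score(current_build["tp"], current_build["fp"], current_build["fn"]) < \
        f1_score(latest_release["tp"], latest_release["fp"], latest_release["fn"]):
    reasons.append("current F1 score dropped below the latest release")
if current_build["indeterminate_percent"] > 10:
    reasons.append("indeterminate matches exceed 10%")
if current_build["fn"] > latest_release["fn"]:
    reasons.append("false negatives increased relative to the latest release")

print("PASS" if not reasons else f"FAIL: {reasons}")
```

With these hypothetical numbers the script prints `FAIL` with all three reasons, since the current build has a lower F1 score, more than 10% indeterminate matches, and more false negatives than the latest release.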
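A second sketch, also not part of the patch: assuming `make capture` has already populated `.yardstick/result` and the pinned yardstick version from `requirements.txt` is installed, the `pr_vs_latest_via_sbom` result set defined in `.yardstick.yaml` can be enumerated with the same API calls `gate.py` uses.

```python
from yardstick import store

# Load the stored result set and list which tool ran against which image.
# The result-set name matches RESULT_SET in test/quality/Makefile and
# default_result_set in test/quality/gate.py.
result_set_obj = store.result_set.load(name="pr_vs_latest_via_sbom")
for image, states in result_set_obj.result_state_by_image.items():
    for state in states:
        print(f"{image} -> {state.request.tool}")
```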