diff --git a/.github/workflows/validations.yaml b/.github/workflows/validations.yaml index f44e016f..594b8cbe 100644 --- a/.github/workflows/validations.yaml +++ b/.github/workflows/validations.yaml @@ -9,6 +9,7 @@ on: env: GO_VERSION: "1.18.x" GO_STABLE_VERSION: true + PYTHON_VERSION: "3.10" jobs: Static-Analysis: @@ -100,6 +101,58 @@ jobs: name: unit-test-results path: test/results/**/* + Quality-Test: + # Note: changing this job name requires making the same update in the .github/workflows/release.yaml pipeline + name: "Quality tests" + runs-on: ubuntu-20.04 + steps: + - uses: actions/setup-go@v2 + with: + go-version: ${{ env.GO_VERSION }} + stable: ${{ env.GO_STABLE_VERSION }} + + - uses: actions/setup-python@v2 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - uses: actions/checkout@v2 + with: + submodules: true + + - name: Restore tool cache + id: tool-cache + uses: actions/cache@v2.1.3 + with: + path: ${{ github.workspace }}/.tmp + key: ${{ runner.os }}-tool-${{ hashFiles('Makefile') }} + + - name: Restore go cache + id: go-cache + uses: actions/cache@v2.1.3 + with: + path: ~/go/pkg/mod + key: ${{ runner.os }}-go-${{ env.GO_VERSION }}-${{ hashFiles('**/go.sum') }} + restore-keys: | + ${{ runner.os }}-go-${{ env.GO_VERSION }}- + + - name: Restore python cache + id: python-cache + uses: actions/cache@v2.1.3 + with: + path: | + test/quality/venv + test/quality/vulnerability-match-labels/venv + key: ${{ runner.os }}-go-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/test/quality/**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-go-${{ env.PYTHON_VERSION }}- + + - name: (cache-miss) Bootstrap all project dependencies + if: steps.tool-cache.outputs.cache-hit != 'true' || steps.go-cache.outputs.cache-hit != 'true' + run: make bootstrap + + - name: Run quality tests + run: make quality + Integration-Test: # Note: changing this job name requires making the same update in the .github/workflows/release.yaml pipeline name: "Integration tests" diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..281e4b33 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "test/quality/vulnerability-match-labels"] + path = test/quality/vulnerability-match-labels + url = git@github.com:anchore/vulnerability-match-labels.git + branch = main diff --git a/Makefile b/Makefile index a70370f9..251e6cec 100644 --- a/Makefile +++ b/Makefile @@ -168,6 +168,11 @@ unit: ## Run unit tests (with coverage) @echo "Coverage: $$(cat $(COVER_TOTAL))" @if [ $$(echo "$$(cat $(COVER_TOTAL)) >= $(COVERAGE_THRESHOLD)" | bc -l) -ne 1 ]; then echo "$(RED)$(BOLD)Failed coverage quality gate (> $(COVERAGE_THRESHOLD)%)$(RESET)" && false; fi +.PHONY: quality +quality: ## Run quality tests + $(call title,Running quality tests) + cd test/quality && make + # note: this is used by CI to determine if the install test fixture cache (docker image tars) should be busted install-fingerprint: cd test/install && \ diff --git a/test/quality/.gitignore b/test/quality/.gitignore new file mode 100644 index 00000000..92d889aa --- /dev/null +++ b/test/quality/.gitignore @@ -0,0 +1,7 @@ +venv +.yardstick/tools +.yardstick/result +stage +pull +migrate.py +.oras-cache \ No newline at end of file diff --git a/test/quality/.yardstick.yaml b/test/quality/.yardstick.yaml new file mode 100644 index 00000000..6c072e70 --- /dev/null +++ b/test/quality/.yardstick.yaml @@ -0,0 +1,45 @@ +x-ref: + # note: always reference images with BOTH a tag and a digest + images: &images + - 
docker.io/cloudbees/cloudbees-core-mm:2.277.3.1@sha256:4c564f473d38f23da1caa48c4ef53b958ef03d279232007ad3319b1f38584bdb + - docker.io/anchore/test_images:grype-quality-node-d89207b@sha256:f56164678054e5eb59ab838367373a49df723b324617b1ba6de775749d7f91d4 + - docker.io/anchore/test_images:grype-quality-python-d89207b@sha256:b2b58a55c0b03c1626d2aaae2add9832208b02124dda7b7b41811e14f0fb272c + - docker.io/anchore/test_images:grype-quality-java-d89207b@sha256:b3534fc2e37943136d5b54e3a58b55d4ccd4363d926cf7aa5bf55a524cf8275b + - docker.io/anchore/test_images:grype-quality-golang-d89207b@sha256:7536ee345532f674ec9e448e3768db4e546c48220ba2b6ec9bc9cfbfb3b7b74a + - docker.io/anchore/test_images:grype-quality-ruby-d89207b@sha256:1a5a5f870924e88a6f0f2b8089cf276ef0a79b5244a052cdfe4a47bb9e5a2c10 + - docker.io/alpine:3.2@sha256:ddac200f3ebc9902fb8cfcd599f41feb2151f1118929da21bcef57dc276975f9 + - docker.io/centos:6@sha256:3688aa867eb84332460e172b9250c9c198fdfd8d987605fd53f246f498c60bcf + - docker.io/ubuntu:16.10@sha256:8dc9652808dc091400d7d5983949043a9f9c7132b15c14814275d25f94bca18a + - docker.io/almalinux:8@sha256:cd49d7250ed7bb194d502d8a3e50bd775055ca275d1d9c2785aea72b890afe6a + - docker.io/rockylinux:8@sha256:72afc2e1a20c9ddf56a81c51148ebcbe927c0a879849efe813bee77d69df1dd8 + - docker.io/oraclelinux:6@sha256:a06327c0f1d18d753f2a60bb17864c84a850bb6dcbcf5946dd1a8123f6e75495 + - docker.io/debian:7@sha256:81e88820a7759038ffa61cff59dfcc12d3772c3a2e75b7cfe963c952da2ad264 + - docker.io/busybox:1.28.1@sha256:2107a35b58593c58ec5f4e8f2c4a70d195321078aebfadfbfb223a2ff4a4ed21 + - docker.io/amazonlinux:2@sha256:1301cc9f889f21dc45733df9e58034ac1c318202b4b0f0a08d88b3fdc03004de + - registry.access.redhat.com/ubi8@sha256:68fecea0d255ee253acbf0c860eaebb7017ef5ef007c25bee9eeffd29ce85b29 + +# new vulnerabilities are added all of the time, instead of keeping up it's easier to ignore newer entries. +# This approach helps tremendously with keeping the analysis relatively stable. +default_max_year: 2020 + +result-sets: + pr_vs_latest_via_sbom: + description: "latest released grype vs grype from the current build (via SBOM ingestion)" + matrix: + images: *images + + tools: + + - name: syft + # note: we want to use a fixed version of syft for capturing all results (NOT "latest") + version: v0.54.0 + produces: SBOM + refresh: false + + - name: grype + version: git:current-commit + takes: SBOM + + - name: grype + version: latest + takes: SBOM diff --git a/test/quality/.yardstick/labels b/test/quality/.yardstick/labels new file mode 120000 index 00000000..4a9cf095 --- /dev/null +++ b/test/quality/.yardstick/labels @@ -0,0 +1 @@ +../vulnerability-match-labels/labels \ No newline at end of file diff --git a/test/quality/Makefile b/test/quality/Makefile new file mode 100644 index 00000000..65e1c96c --- /dev/null +++ b/test/quality/Makefile @@ -0,0 +1,68 @@ +SBOM_STORE_TAG = md5-$(shell md5sum .yardstick.yaml | cut -d' ' -f1) +SBOM_STORE_IMAGE = ghcr.io/anchore/grype/quality-test-sbom-store:$(SBOM_STORE_TAG) +ACTIVATE_VENV = . 
venv/bin/activate &&
+YARDSTICK = $(ACTIVATE_VENV) yardstick -v
+YARDSTICK_RESULT_DIR = .yardstick/result
+YARDSTICK_LABELS_DIR = .yardstick/labels
+VULNERABILITY_LABELS = ./vulnerability-labels
+RESULT_SET = pr_vs_latest_via_sbom
+
+# formatting variables
+BOLD := $(shell tput -T linux bold)
+PURPLE := $(shell tput -T linux setaf 5)
+GREEN := $(shell tput -T linux setaf 2)
+CYAN := $(shell tput -T linux setaf 6)
+RED := $(shell tput -T linux setaf 1)
+RESET := $(shell tput -T linux sgr0)
+TITLE := $(BOLD)$(PURPLE)
+SUCCESS := $(BOLD)$(GREEN)
+
+.PHONY: all
+all: capture validate ## Fetch or capture all data and run all quality checks
+
+.PHONY: validate
+validate: venv $(VULNERABILITY_LABELS) ## Run all quality checks against already collected data
+	$(ACTIVATE_VENV) ./gate.py
+
+.PHONY: capture
+capture: sboms vulns ## Collect and store all syft and grype results
+
+.PHONY: vulns
+vulns: venv ## Collect and store all grype results
+	$(YARDSTICK) -v result capture -r $(RESULT_SET)
+
+.PHONY: sboms
+sboms: $(YARDSTICK_RESULT_DIR) venv clear-results ## Collect and store all syft results (deletes all existing results)
+	bash -c "make download-sboms || ($(YARDSTICK) -v result capture -r $(RESULT_SET) --only-producers)"
+
+.PHONY: download-sboms
+download-sboms:
+	cd vulnerability-match-labels && make venv
+	bash -c "export ORAS_CACHE=$(shell pwd)/.oras-cache && make venv && . vulnerability-match-labels/venv/bin/activate && ./vulnerability-match-labels/sboms.py download -r $(RESULT_SET)"
+
+venv: venv/touchfile
+
+venv/touchfile: requirements.txt
+	test -d venv || python3 -m venv venv
+	$(ACTIVATE_VENV) pip install -Ur requirements.txt
+	touch venv/touchfile
+
+
+$(YARDSTICK_RESULT_DIR):
+	mkdir -p $(YARDSTICK_RESULT_DIR)
+
+$(VULNERABILITY_LABELS):
+	git submodule update vulnerability-match-labels
+
+.PHONY: clear-results
+clear-results: venv ## Clear all existing yardstick results
+	$(YARDSTICK) result clear
+
+.PHONY: clean
+clean: clear-results ## Clear all existing yardstick results and delete python environment
+	rm -rf venv
+	find -iname "*.pyc" -delete
+
+help:
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "$(BOLD)$(CYAN)%-25s$(RESET)%s\n", $$1, $$2}'
+
diff --git a/test/quality/README.md b/test/quality/README.md
new file mode 100644
index 00000000..5acb0dac
--- /dev/null
+++ b/test/quality/README.md
@@ -0,0 +1,140 @@
+# Match quality testing
+
+This form of testing compares the results from various releases of grype using a
+static set of reference container images. The kinds of comparisons made are:
+
+1) "relative": find the vulnerability matching differences between the two tools
+   for a given image. This helps identify when a change has occurred in matching
+   behavior and where the changes are.
+
+2) "against labels": pair each tool's results for an image with ground truth.
+   This helps identify how well the matching behavior is performing (did it get
+   better or worse). A conceptual sketch of these two comparison modes is shown
+   after the getting-started steps below.
+
+
+## Getting started
+
+To capture raw tool output and store it into the local `.yardstick` directory for
+further analysis:
+```
+make capture
+```
+
+To analyze the tool output and evaluate a pass/fail result:
+```
+make validate
+```
+
+A pass/fail result is shown in the output, with the reasons for any failure
+listed explicitly.
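To make the two comparison modes above concrete, here is a minimal, self-contained sketch. It is plain Python rather than the yardstick API, and the package names and CVE identifiers are hypothetical: a "relative" comparison only looks at the matches unique to each tool, while an "against labels" comparison classifies one tool's matches against ground-truth labels.

```python
# Conceptual sketch only; yardstick implements the real comparison logic.
# A match is identified here by (package name, package version, vulnerability ID).
Match = tuple[str, str, str]


def relative_comparison(a: set[Match], b: set[Match]) -> tuple[set[Match], set[Match]]:
    """Return the matches unique to each tool; shared matches are not interesting."""
    return a - b, b - a


def against_labels(matches: set[Match], labeled_tp: set[Match]) -> dict[str, int]:
    """Classify one tool's matches against ground-truth true-positive labels.

    Real labels also record explicit false positives; a match with no label at
    all is counted as "indeterminate" rather than FP (simplified away here).
    """
    return {
        "TP": len(matches & labeled_tp),   # found and labeled as a real vulnerability
        "FP": len(matches - labeled_tp),   # found but not labeled as real (simplified)
        "FN": len(labeled_tp - matches),   # labeled as real but missed by the tool
    }


# hypothetical data for a single image
current = {("openssl", "1.1.1k", "CVE-2021-3712"), ("bash", "4.4", "CVE-2019-18276")}
latest = {("openssl", "1.1.1k", "CVE-2021-3712")}
labels = {("openssl", "1.1.1k", "CVE-2021-3712"), ("zlib", "1.2.11", "CVE-2018-25032")}

print(relative_comparison(current, latest))  # what changed between the two tools
print(against_labels(current, labels))       # how well the current tool performed
```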
+
+
+## What is the quality gate criteria
+
+The label comparison results are used to determine a pass/fail result,
+specifically with the following criteria:
+
+ - fail when the current grype F1 score drops below the last grype release F1
+   score (or the F1 score is indeterminate)
+ - fail when the indeterminate matches % is > 10% in the current grype results
+ - fail when there is a rise in FNs relative to the results from the last grype
+   release
+ - otherwise, pass
+
+F1 score is the primary way that tool matching performance is characterized. F1
+score combines the TP, FP, and FN counts into a single metric between 0 and 1.
+Ideally the F1 score for an image-tool pair should be 1. F1 score is a good way
+to summarize the matching performance, but it does not explain why the matching
+performance is what it is. (A simplified sketch of these gate checks appears
+below.)
+
+Indeterminate matches are matches from results that could not be paired with a
+label (TP or FP). This could also mean that multiple conflicting labels were
+found for a single match. The more indeterminate matches there are, the less
+confident you can be about the F1 score. Ideally there should be 0 indeterminate
+matches, but this is difficult to achieve since vulnerability data is constantly
+changing.
+
+False negatives represent matches that should have been made by the tool but
+were missed. We should always make certain that this value does not increase
+between releases of grype.
+
+## Assumptions
+
+1. **Comparing vulnerability results taken at different times is invalid**.
+   We leverage the yardstick result-set feature to capture all vulnerability
+   results at one time for a specific image and tool set. Why? If we ran grype
+   at version `a` on Monday and grype at version `b` on Tuesday and attempted to
+   compare the results, any differences found would not be immediately
+   explainable: it is entirely possible that the vulnerability databases from
+   the run of `b` simply had more up-to-date information. If `grype@a` were run
+   at the same time (on Tuesday), this reason could be almost entirely eliminated.
+
+2. **Comparing vulnerability results across images with different digests is invalid**.
+   It may be very tempting to compare vulnerability results for
+   `alpine:3.2` from Monday and `alpine:3.2` from Tuesday to see if there are
+   any changes. However, this is potentially inaccurate as the image references
+   are for the same tag, but the publisher may have pushed a new image with
+   differing content. Any change could lead to different vulnerability matching
+   results, but we are only interested in vulnerability match differences that
+   are due to actionable reasons (grype matcher logic problems or [SBOM] input
+   data into matchers).
+
+## Approach
+
+Vulnerability matching has essentially two inputs:
+
+- the packages that were found in the scanned artifact
+
+- the vulnerability data from upstream providers (e.g. NVD, GHSA, etc.)
+
+
+These are both moving targets!
+
+
+We may implement more catalogers in syft that surface more packages over time
+(for the same artifact scanned). Also, the world is continually finding and
+reporting new vulnerabilities. The more moving parts there are in this form of
+testing, the harder it is to come to a conclusion about the actual quality of
+the output over time.
+
+
+To reduce this erosion of value over time, we've decided to turn as many moving
+targets into fixed targets as possible:
+
+- Vulnerability results beyond a particular year are ignored (the current config
+  allows for <= 2020).
+  Though CVEs are still created retroactively, this helps a lot in keeping
+  vulnerability results relatively stable.
+
+- SBOMs are used as input into grype instead of the raw container images. This
+  allows the artifacts under test to remain truly fixed and saves a lot of time
+  when capturing grype results (as the container image is no longer needed
+  during analysis).
+
+- For the captured SBOMs, container images must be referenced by digest, not
+  just by tag. If we update a tool version (say, syft), we want to be certain
+  that we are scanning the exact same artifact later when we re-run the
+  analysis.
+
+- Versions of tools used are fixed to a specific `major.minor.patch` release.
+  This allows us to account for capability differences between tool runs.
+
+
+To reduce the maintenance effort of this comparison over time, there are a few
+things to keep in mind:
+
+- Once an image is labeled (at a specific digest), the image digest should be
+  considered immutable (never updated). Why? It takes a lot of effort to label
+  images and there are no "clearly safe" assumptions that can be made when it
+  comes to migrating labels from one image to another, no matter how "similar"
+  the images may be. There is also no value in updating the image; these images
+  are not being executed and their only purpose is to survey the matching
+  performance of grype. In the philosophy of "maximizing fixed points" it
+  doesn't make sense to change these assets. Over time it may be that we remove
+  assets that are no longer useful for comparison, but this should rarely be
+  done.
+
+- Consider not changing the CVE year max-ceiling (currently set to 2020).
+  Raising this ceiling will likely increase the number of unlabeled matches
+  significantly for all images. Only bump this ceiling if all possible matches
+  are labeled.
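The pass/fail criteria described in the quality gate section of the README boil down to a small amount of arithmetic. The sketch below is a simplified stand-in for the real gate (implemented in `gate.py` on top of yardstick comparison objects): the counts are made up, and the F1 formula used is the standard `2*TP / (2*TP + FP + FN)`.

```python
# Simplified sketch of the quality gate criteria; gate.py implements the real
# checks against yardstick label-comparison summaries. All numbers are made up.
from dataclasses import dataclass


@dataclass
class Summary:
    tp: int
    fp: int
    fn: int
    indeterminate_percent: float

    @property
    def f1_score(self) -> float:
        # F1 folds TP/FP/FN into a single 0..1 metric (1.0 is a perfect match set)
        denom = 2 * self.tp + self.fp + self.fn
        return 2 * self.tp / denom if denom else 0.0


def gate_reasons(current: Summary, latest_release: Summary) -> list[str]:
    """Return the reasons the gate fails; an empty list means the gate passes."""
    reasons = []
    if current.f1_score < latest_release.f1_score:
        reasons.append(f"F1 dropped: {current.f1_score:0.2f} < {latest_release.f1_score:0.2f}")
    if current.indeterminate_percent > 10:
        reasons.append(f"indeterminate matches above 10%: {current.indeterminate_percent:0.1f}%")
    if current.fn > latest_release.fn:
        reasons.append(f"false negatives rose: {current.fn} > {latest_release.fn}")
    return reasons


# the current build missed one true positive that the latest release caught
print(gate_reasons(Summary(tp=90, fp=5, fn=6, indeterminate_percent=4.0),
                   Summary(tp=91, fp=5, fn=5, indeterminate_percent=4.0)))
```

In `gate.py` these checks run once per image in the result set, and a single failing image fails the overall run.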
diff --git a/test/quality/gate.py b/test/quality/gate.py new file mode 100755 index 00000000..000c0fc2 --- /dev/null +++ b/test/quality/gate.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python3 +import logging +import os +import re +import subprocess +import sys +from typing import Optional + +import click +from tabulate import tabulate +from dataclasses import dataclass, InitVar, field + +import yardstick +from yardstick import store, comparison, artifact, arrange +from yardstick.cli import display, config + + +# see the .yardstick.yaml configuration for details +default_result_set = "pr_vs_latest_via_sbom" +yardstick.utils.grype_db.raise_on_failure(False) + +@dataclass +class Gate: + label_comparisons: InitVar[Optional[list[comparison.AgainstLabels]]] + label_comparison_stats: InitVar[Optional[comparison.ImageToolLabelStats]] + + reasons: list[str] = field(default_factory=list) + + def __post_init__(self, label_comparisons: Optional[list[comparison.AgainstLabels]], label_comparison_stats: Optional[comparison.ImageToolLabelStats]): + if not label_comparisons and not label_comparison_stats: + return + + reasons = [] + + # - fail when current F1 score drops below last release F1 score (or F1 score is indeterminate) + # - fail when indeterminate % > 10% + # - fail when there is a rise in FNs + latest_release_tool, current_tool = guess_tool_orientation(label_comparison_stats.tools) + + latest_release_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == latest_release_tool } + current_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == current_tool } + + for image, comp in current_comparisons_by_image.items(): + latest_f1_score = latest_release_comparisons_by_image[image].summary.f1_score + current_f1_score = comp.summary.f1_score + if current_f1_score < latest_f1_score: + reasons.append(f"current F1 score is lower than the latest release F1 score: {bcolors.BOLD+bcolors.UNDERLINE}current={current_f1_score:0.2f} latest={latest_f1_score:0.2f}{bcolors.RESET} image={image}") + + if comp.summary.indeterminate_percent > 10: + reasons.append(f"current indeterminate matches % is greater than 10%: {bcolors.BOLD+bcolors.UNDERLINE}current={comp.summary.indeterminate_percent:0.2f}%{bcolors.RESET} image={image}") + + latest_fns = latest_release_comparisons_by_image[image].summary.false_negatives + current_fns = comp.summary.false_negatives + if current_fns > latest_fns: + reasons.append(f"current false negatives is greater than the latest release false negatives: {bcolors.BOLD+bcolors.UNDERLINE}current={current_fns} latest={latest_fns}{bcolors.RESET} image={image}") + + self.reasons = reasons + + def passed(self): + return len(self.reasons) == 0 + +def guess_tool_orientation(tools: list[str]): + if len(tools) != 2: + raise RuntimeError("expected 2 tools, got %s" % tools) + + current_tool = None + latest_release_tool = None + for tool in tools: + if tool.endswith("latest"): + latest_release_tool = tool + continue + current_tool = tool + + if latest_release_tool is None: + # "latest" value isn't accessible, so we do a best guess at which version is latest + current_tool, latest_release_tool = sorted(tools) + + if current_tool is None: + raise ValueError("current tool not found") + return latest_release_tool, current_tool + +class bcolors: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKCYAN = '\033[96m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + RESET = 
'\033[0m' + +def show_results_used(results: list[artifact.ScanResult]): + print(f" Results used:") + for idx, result in enumerate(results): + branch = "├──" + if idx == len(results) - 1: + branch = "└──" + print(f" {branch} {result.ID} : {result.config.tool} against {result.config.image}") + print() + +def validate(cfg: config.Application, result_set: str, images: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None): + print(f"{bcolors.HEADER}{bcolors.BOLD}Validating with {result_set!r}", bcolors.RESET) + result_set_obj = store.result_set.load(name=result_set) + + ret = [] + for image, result_states in result_set_obj.result_state_by_image.items(): + if images and image not in images: + print("Skipping image:", image) + continue + print() + print("Testing image:", image) + for state in result_states: + print(" ", f"with {state.request.tool}") + print() + + gate = validate_image(cfg, [s.config.path for s in result_states], always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries) + ret.append(gate) + + failure = not gate.passed() + if failure: + print(f"{bcolors.FAIL}{bcolors.BOLD}Failed quality gate{bcolors.RESET}") + for reason in gate.reasons: + print(f" - {reason}") + + print() + size = 120 + print("▁"*size) + print("░"*size) + print("▔"*size) + return ret + +def validate_image(cfg: config.Application, descriptions: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None): + # do a relative comparison + # - show comparison summary (no gating action) + # - list out all individual match differences + + print(f"{bcolors.HEADER}Running relative comparison...", bcolors.RESET) + relative_comparison = yardstick.compare_results(descriptions=descriptions, year_max_limit=cfg.default_max_year) + show_results_used(relative_comparison.results) + + # show the relative comparison results + if verbosity > 0: + details = verbosity > 1 + display.preserved_matches(relative_comparison, details=details, summary=True, common=False) + print() + + # bail if there are no differences found + if not always_run_label_comparison and not sum([len(relative_comparison.unique[result.ID]) for result in relative_comparison.results]): + print("no differences found between tool results") + return Gate(None, None) + + # do a label comparison + print(f"{bcolors.HEADER}Running comparison against labels...", bcolors.RESET) + results, label_entries, comparisons_by_result_id, stats_by_image_tool_pair = yardstick.compare_results_against_labels(descriptions=descriptions, year_max_limit=cfg.default_max_year, label_entries=label_entries) + show_results_used(results) + + if verbosity > 0: + show_fns = verbosity > 1 + display.label_comparison( + results, + comparisons_by_result_id, + stats_by_image_tool_pair, + show_fns=show_fns, + show_summaries=True, + ) + + latest_release_tool, current_tool = guess_tool_orientation([r.config.tool for r in results]) + + # show the relative comparison unique differences paired up with label conclusions (TP/FP/FN/TN/Unknown) + all_rows: list[list[Any]] = [] + for result in relative_comparison.results: + label_comparison = comparisons_by_result_id[result.ID] + for unique_match in relative_comparison.unique[result.ID]: + labels = label_comparison.labels_by_match[unique_match.ID] + if not labels: + label = "(unknown)" + elif len(set(labels)) > 1: + label = ", ".join([l.name for l in labels]) + else: + label = labels[0].name + + 
+ color = "" + commentary = "" + if result.config.tool == latest_release_tool: + # the tool which found the unique result is the latest release tool... + if label == artifact.Label.TruePositive.name: + # drats! we missed a case (this is a new FN) + color = bcolors.FAIL + commentary = "(this is a new FN 😱)" + elif artifact.Label.FalsePositive.name in label: + # we got rid of a FP! ["hip!", "hip!"] + color = bcolors.OKBLUE + commentary = "(got rid of a former FP 🙌)" + else: + # the tool which found the unique result is the current tool... + if label == artifact.Label.TruePositive.name: + # highest of fives! we found a new TP that the previous tool release missed! + color = bcolors.OKBLUE + commentary = "(this is a new TP 🙌)" + elif artifact.Label.FalsePositive.name in label: + # welp, our changes resulted in a new FP... not great, maybe not terrible? + color = bcolors.FAIL + commentary = "(this is a new FP 😱)" + + all_rows.append( + [ + f"{color}{result.config.tool} ONLY{bcolors.RESET}", + f"{color}{unique_match.package.name}@{unique_match.package.version}{bcolors.RESET}", + f"{color}{unique_match.vulnerability.id}{bcolors.RESET}", + f"{color}{label}{bcolors.RESET}", + f"{commentary}", + ] + ) + + def escape_ansi(line): + ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]') + return ansi_escape.sub('', line) + + # sort but don't consider ansi escape codes + all_rows = sorted(all_rows, key=lambda x: escape_ansi(str(x[0]+x[1]+x[2]+x[3]))) + if len(all_rows) == 0: + print("No differences found between tooling (with labels)") + else: + print("Match differences between tooling (with labels):") + indent = " " + print(indent + tabulate([["TOOL PARTITION", "PACKAGE", "VULNERABILITY", "LABEL", "COMMENTARY"]]+all_rows, tablefmt="plain").replace("\n", "\n" + indent) + "\n") + + + # populate the quality gate with data that can evaluate pass/fail conditions + return Gate(label_comparisons=comparisons_by_result_id.values(), label_comparison_stats=stats_by_image_tool_pair) + +@click.command() +@click.option("--image", "-i", "images", multiple=True, help="filter down to one or more images to validate with (don't use the full result set)") +@click.option("--label-comparison", "-l", "always_run_label_comparison", is_flag=True, help="run label comparison irregardless of relative comparison results") +@click.option("--breakdown-by-ecosystem", "-e", is_flag=True, help="show label comparison results broken down by ecosystem") +@click.option("--verbose", "-v", "verbosity", count=True, help="show details of all comparisons") +@click.option("--result-set", "-r", default=default_result_set, help="the result set to use for the quality gate") +def main(images: list[str], always_run_label_comparison: bool, breakdown_by_ecosystem: bool, verbosity: int, result_set: str): + cfg = config.load() + setup_logging(verbosity) + + # let's not load any more labels than we need to, base this off of the images we're validating + if not images: + images = set() + result_set_obj = store.result_set.load(name=result_set) + for state in result_set_obj.state: + images.add(state.config.image) + images = sorted(list(images)) + + print("Loading label entries...", end=" ") + label_entries = store.labels.load_for_image(images, year_max_limit=cfg.default_max_year) + print(f"done! 
{len(label_entries)} entries loaded") + + result_sets = [result_set] # today only one result set is supported, but more can be added + gates = [] + for result_set in result_sets: + gates.extend(validate(cfg, result_set, images=images, always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries)) + print() + + if breakdown_by_ecosystem: + print(f"{bcolors.HEADER}Breaking down label comparison by ecosystem performance...", bcolors.RESET) + results_by_image, label_entries, stats = yardstick.compare_results_against_labels_by_ecosystem(result_set=result_set, year_max_limit=cfg.default_max_year, label_entries=label_entries) + display.labels_by_ecosystem_comparison( + results_by_image, + stats, + show_images_used=False, + ) + print() + + failure = not all([gate.passed() for gate in gates]) + if failure: + print("Reasons for quality gate failure:") + for gate in gates: + for reason in gate.reasons: + print(f" - {reason}") + + if failure: + print() + print(f"{bcolors.FAIL}{bcolors.BOLD}Quality gate FAILED{bcolors.RESET}") + sys.exit(1) + else: + print(f"{bcolors.OKGREEN}{bcolors.BOLD}Quality gate passed!{bcolors.RESET}") + + +def setup_logging(verbosity: int): + # pylint: disable=redefined-outer-name, import-outside-toplevel + import logging.config + + if verbosity in [0, 1, 2]: + log_level = "WARN" + elif verbosity == 3: + log_level = "INFO" + else: + log_level = "DEBUG" + + logging.config.dictConfig( + { + "version": 1, + "formatters": { + "standard": { + # [%(module)s.%(funcName)s] + "format": "%(asctime)s [%(levelname)s] %(message)s", + "datefmt": "", + }, + }, + "handlers": { + "default": { + "level": log_level, + "formatter": "standard", + "class": "logging.StreamHandler", + "stream": "ext://sys.stderr", + }, + }, + "loggers": { + "": { # root logger + "handlers": ["default"], + "level": log_level, + }, + }, + } + ) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/test/quality/requirements.txt b/test/quality/requirements.txt new file mode 100644 index 00000000..e33ab6dd --- /dev/null +++ b/test/quality/requirements.txt @@ -0,0 +1,3 @@ +git+https://github.com/anchore/yardstick@4526ad2ff6d33d34e900ed692c3a90adc80eab73 +# ../../../yardstick +tabulate==0.8.10 \ No newline at end of file diff --git a/test/quality/vulnerability-match-labels b/test/quality/vulnerability-match-labels new file mode 160000 index 00000000..3a2ecc33 --- /dev/null +++ b/test/quality/vulnerability-match-labels @@ -0,0 +1 @@ +Subproject commit 3a2ecc336411ddc3f37b7d5c123b80f6848a2cf3