Mirror of https://github.com/anchore/grype (synced 2024-11-10 06:34:13 +00:00)
Add in-depth quality gate checks (#949)
* add in-depth quality gate checks
* add quality tests to PR checks

Signed-off-by: Alex Goodman <alex.goodman@anchore.com>
parent: 7ad60ce410
commit: d4587ddeec
11 changed files with 653 additions and 0 deletions
.github/workflows/validations.yaml | +53 (vendored)
@@ -9,6 +9,7 @@ on:
env:
  GO_VERSION: "1.18.x"
  GO_STABLE_VERSION: true
  PYTHON_VERSION: "3.10"

jobs:
  Static-Analysis:
@@ -100,6 +101,58 @@ jobs:
          name: unit-test-results
          path: test/results/**/*

  Quality-Test:
    # Note: changing this job name requires making the same update in the .github/workflows/release.yaml pipeline
    name: "Quality tests"
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/setup-go@v2
        with:
          go-version: ${{ env.GO_VERSION }}
          stable: ${{ env.GO_STABLE_VERSION }}

      - uses: actions/setup-python@v2
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - uses: actions/checkout@v2
        with:
          submodules: true

      - name: Restore tool cache
        id: tool-cache
        uses: actions/cache@v2.1.3
        with:
          path: ${{ github.workspace }}/.tmp
          key: ${{ runner.os }}-tool-${{ hashFiles('Makefile') }}

      - name: Restore go cache
        id: go-cache
        uses: actions/cache@v2.1.3
        with:
          path: ~/go/pkg/mod
          key: ${{ runner.os }}-go-${{ env.GO_VERSION }}-${{ hashFiles('**/go.sum') }}
          restore-keys: |
            ${{ runner.os }}-go-${{ env.GO_VERSION }}-

      - name: Restore python cache
        id: python-cache
        uses: actions/cache@v2.1.3
        with:
          path: |
            test/quality/venv
            test/quality/vulnerability-match-labels/venv
          key: ${{ runner.os }}-go-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/test/quality/**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-go-${{ env.PYTHON_VERSION }}-

      - name: (cache-miss) Bootstrap all project dependencies
        if: steps.tool-cache.outputs.cache-hit != 'true' || steps.go-cache.outputs.cache-hit != 'true'
        run: make bootstrap

      - name: Run quality tests
        run: make quality

  Integration-Test:
    # Note: changing this job name requires making the same update in the .github/workflows/release.yaml pipeline
    name: "Integration tests"
.gitmodules | +4 (vendored, new file)
@@ -0,0 +1,4 @@
[submodule "test/quality/vulnerability-match-labels"]
	path = test/quality/vulnerability-match-labels
	url = git@github.com:anchore/vulnerability-match-labels.git
	branch = main
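Since the match labels live in a git submodule (the CI job above passes `submodules: true` to checkout), a fresh local clone needs the submodule fetched before the quality tests can run; for example:

    git clone --recurse-submodules https://github.com/anchore/grype.git
    # or, in an existing clone:
    git submodule update --init test/quality/vulnerability-match-labels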
Makefile | +5
@@ -168,6 +168,11 @@ unit: ## Run unit tests (with coverage)
	@echo "Coverage: $$(cat $(COVER_TOTAL))"
	@if [ $$(echo "$$(cat $(COVER_TOTAL)) >= $(COVERAGE_THRESHOLD)" | bc -l) -ne 1 ]; then echo "$(RED)$(BOLD)Failed coverage quality gate (> $(COVERAGE_THRESHOLD)%)$(RESET)" && false; fi

.PHONY: quality
quality: ## Run quality tests
	$(call title,Running quality tests)
	cd test/quality && make

# note: this is used by CI to determine if the install test fixture cache (docker image tars) should be busted
install-fingerprint:
	cd test/install && \
test/quality/.gitignore | +7 (vendored, new file)
@@ -0,0 +1,7 @@
venv
.yardstick/tools
.yardstick/result
stage
pull
migrate.py
.oras-cache
test/quality/.yardstick.yaml | +45 (new file)
@@ -0,0 +1,45 @@
x-ref:
  # note: always reference images with BOTH a tag and a digest
  images: &images
    - docker.io/cloudbees/cloudbees-core-mm:2.277.3.1@sha256:4c564f473d38f23da1caa48c4ef53b958ef03d279232007ad3319b1f38584bdb
    - docker.io/anchore/test_images:grype-quality-node-d89207b@sha256:f56164678054e5eb59ab838367373a49df723b324617b1ba6de775749d7f91d4
    - docker.io/anchore/test_images:grype-quality-python-d89207b@sha256:b2b58a55c0b03c1626d2aaae2add9832208b02124dda7b7b41811e14f0fb272c
    - docker.io/anchore/test_images:grype-quality-java-d89207b@sha256:b3534fc2e37943136d5b54e3a58b55d4ccd4363d926cf7aa5bf55a524cf8275b
    - docker.io/anchore/test_images:grype-quality-golang-d89207b@sha256:7536ee345532f674ec9e448e3768db4e546c48220ba2b6ec9bc9cfbfb3b7b74a
    - docker.io/anchore/test_images:grype-quality-ruby-d89207b@sha256:1a5a5f870924e88a6f0f2b8089cf276ef0a79b5244a052cdfe4a47bb9e5a2c10
    - docker.io/alpine:3.2@sha256:ddac200f3ebc9902fb8cfcd599f41feb2151f1118929da21bcef57dc276975f9
    - docker.io/centos:6@sha256:3688aa867eb84332460e172b9250c9c198fdfd8d987605fd53f246f498c60bcf
    - docker.io/ubuntu:16.10@sha256:8dc9652808dc091400d7d5983949043a9f9c7132b15c14814275d25f94bca18a
    - docker.io/almalinux:8@sha256:cd49d7250ed7bb194d502d8a3e50bd775055ca275d1d9c2785aea72b890afe6a
    - docker.io/rockylinux:8@sha256:72afc2e1a20c9ddf56a81c51148ebcbe927c0a879849efe813bee77d69df1dd8
    - docker.io/oraclelinux:6@sha256:a06327c0f1d18d753f2a60bb17864c84a850bb6dcbcf5946dd1a8123f6e75495
    - docker.io/debian:7@sha256:81e88820a7759038ffa61cff59dfcc12d3772c3a2e75b7cfe963c952da2ad264
    - docker.io/busybox:1.28.1@sha256:2107a35b58593c58ec5f4e8f2c4a70d195321078aebfadfbfb223a2ff4a4ed21
    - docker.io/amazonlinux:2@sha256:1301cc9f889f21dc45733df9e58034ac1c318202b4b0f0a08d88b3fdc03004de
    - registry.access.redhat.com/ubi8@sha256:68fecea0d255ee253acbf0c860eaebb7017ef5ef007c25bee9eeffd29ce85b29

# new vulnerabilities are added all of the time, instead of keeping up it's easier to ignore newer entries.
# This approach helps tremendously with keeping the analysis relatively stable.
default_max_year: 2020

result-sets:
  pr_vs_latest_via_sbom:
    description: "latest released grype vs grype from the current build (via SBOM ingestion)"
    matrix:
      images: *images

      tools:

        - name: syft
          # note: we want to use a fixed version of syft for capturing all results (NOT "latest")
          version: v0.54.0
          produces: SBOM
          refresh: false

        - name: grype
          version: git:current-commit
          takes: SBOM

        - name: grype
          version: latest
          takes: SBOM
test/quality/.yardstick/labels | +1 (new symbolic link)
@@ -0,0 +1 @@
../vulnerability-match-labels/labels
test/quality/Makefile | +68 (new file)
@@ -0,0 +1,68 @@
SBOM_STORE_TAG = md5-$(shell md5sum .yardstick.yaml | cut -d' ' -f1)
SBOM_STORE_IMAGE = ghcr.io/anchore/grype/quality-test-sbom-store:$(SBOM_STORE_TAG)
ACTIVATE_VENV = . venv/bin/activate &&
YARDSTICK = $(ACTIVATE_VENV) yardstick -v
YARDSTICK_RESULT_DIR = .yardstick/result
YARDSTICK_LABELS_DIR = .yardstick/labels
VULNERABILITY_LABELS = ./vulnerability-labels
RESULT_SET = pr_vs_latest_via_sbom

# formatting variables
BOLD := $(shell tput -T linux bold)
PURPLE := $(shell tput -T linux setaf 5)
GREEN := $(shell tput -T linux setaf 2)
CYAN := $(shell tput -T linux setaf 6)
RED := $(shell tput -T linux setaf 1)
RESET := $(shell tput -T linux sgr0)
TITLE := $(BOLD)$(PURPLE)
SUCCESS := $(BOLD)$(GREEN)

.PHONY: all
all: capture validate ## Fetch or capture all data and run all quality checks

.PHONY: validate
validate: venv $(VULNERABILITY_LABELS) ## Run all quality checks against already collected data
	$(ACTIVATE_VENV) ./gate.py

.PHONY: capture
capture: sboms vulns ## Collect and store all syft and grype results

.PHONY: vulns
vulns: venv ## Collect and store all grype results
	$(YARDSTICK) -v result capture -r $(RESULT_SET)

.PHONY: sboms
sboms: $(YARDSTICK_RESULT_DIR) venv clear-results ## Collect and store all syft results (deletes all existing results)
	bash -c "make download-sboms || ($(YARDSTICK) -v result capture -r $(RESULT_SET) --only-producers)"

.PHONY: download-sboms
download-sboms:
	cd vulnerability-match-labels && make venv
	bash -c "export ORAS_CACHE=$(shell pwd)/.oras-cache && make venv && . vulnerability-match-labels/venv/bin/activate && ./vulnerability-match-labels/sboms.py download -r $(RESULT_SET)"

venv: venv/touchfile

venv/touchfile: requirements.txt
	test -d venv || python3 -m venv venv
	$(ACTIVATE_VENV) pip install -Ur requirements.txt
	touch venv/touchfile


$(YARDSTICK_RESULT_DIR):
	mkdir -p $(YARDSTICK_RESULT_DIR)

$(VULNERABILITY_LABELS):
	git submodule update vulnerability-match-labels

.PHONY: clear-results
clear-results: venv ## Clear all existing yardstick results
	$(YARDSTICK) result clear

.PHONY: clean
clean: clear-results ## Clear all existing yardstick results and delete python environment
	rm -rf venv
	find -iname "*.pyc" -delete

help:
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "$(BOLD)$(CYAN)%-25s$(RESET)%s\n", $$1, $$2}'
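The `help` target is the usual self-documenting-Makefile idiom: it greps for targets annotated with `##` comments and prints them as usage text, e.g. (abridged; column formatting approximated):

    $ make help
    all                      Fetch or capture all data and run all quality checks
    capture                  Collect and store all syft and grype results
    validate                 Run all quality checks against already collected data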
test/quality/README.md | +140 (new file)
@@ -0,0 +1,140 @@
# Match quality testing

This form of testing compares the results from various releases of grype using a
static set of reference container images. The kinds of comparisons made are:

1) "relative": find the vulnerability matching differences between both tools
   for a given image. This helps identify when a change has occurred in matching
   behavior and where the changes are.

2) "against labels": pair each tool's results for an image with ground truth. This
   helps identify how well the matching behavior is performing (did it get
   better or worse?).


## Getting started

To capture raw tool output and store it into the local `.yardstick` directory for
further analysis:
```
make capture
```

To analyze the tool output and evaluate a pass/fail result:
```
make validate
```

A pass/fail result is shown in the output, with the reasons for any failure
listed explicitly.


## What is the quality gate criteria

The label comparison results are used to determine a pass/fail result,
specifically with the following criteria:

- fail when the current grype F1 score drops below the last grype release F1
  score (or the F1 score is indeterminate)
- fail when the indeterminate matches % is > 10% in the current grype results
- fail when there is a rise in FNs relative to the results from the last grype
  release
- otherwise, pass

F1 score is the primary way that tool matching performance is characterized. F1
score combines the TP, FP, and FN counts into a single metric between 0 and 1.
Ideally the F1 score for an image-tool pair should be 1. F1 score is a good way
to summarize the matching performance but does not explain why the matching
performance is what it is.
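For reference, the F1 score reduces to a direct function of those counts. A minimal sketch (illustrative only; yardstick computes this internally):
```
def f1_score(tp: int, fp: int, fn: int) -> float:
    # the harmonic mean of precision and recall, expressed directly
    # in terms of the TP/FP/FN counts
    denominator = 2 * tp + fp + fn
    if denominator == 0:
        raise ValueError("indeterminate: no TP/FP/FN counts to score")
    return 2 * tp / denominator
```
For example, 45 TPs, 3 FPs, and 2 FNs give `f1_score(45, 3, 2)` ≈ 0.95.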
Indeterminate matches are matches from results that could not be paired with a
label (TP or FP). This could also mean that multiple conflicting labels were
found for a single match. The more indeterminate matches there are, the less
confident you can be about the F1 score. Ideally there should be 0 indeterminate
matches, but this is difficult to achieve since vulnerability data is constantly
changing.

False negatives represent matches that should have been made by the tool but
were missed. We should always make certain that this value does not increase
between releases of grype.
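Taken together, the gate's per-image evaluation boils down to something like the following (a condensed sketch; the real logic lives in `gate.py`, which reads the `f1_score`, `indeterminate_percent`, and `false_negatives` fields of each label-comparison summary):
```
def gate_reasons(current, latest) -> list[str]:
    # current / latest: per-image label-comparison summaries for the
    # current build and the latest released grype, respectively
    reasons = []
    if current.f1_score < latest.f1_score:
        reasons.append("F1 score dropped relative to the latest release")
    if current.indeterminate_percent > 10:
        reasons.append("indeterminate matches exceed 10%")
    if current.false_negatives > latest.false_negatives:
        reasons.append("false negatives rose relative to the latest release")
    return reasons  # an empty list means the image passes
```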
## Assumptions

1. **Comparing vulnerability results taken at different times is invalid**.
   We leverage the yardstick result-set feature to capture all vulnerability
   results at one time for a specific image and tool set. Why? If we run grype
   at version `a` on Monday and grype at version `b` on Tuesday and attempt to
   compare the results, any differences found will not be immediately
   explainable: it is entirely possible that the vulnerability databases from
   the run of `b` simply had more up-to-date information. If `grype@a` were run
   at the same time (on Tuesday), this reason could be almost entirely
   eliminated.

2. **Comparing vulnerability results across images with different digests is invalid**.
   It may be very tempting to compare vulnerability results for
   `alpine:3.2` from Monday and `alpine:3.2` from Tuesday to see if there are
   any changes. However, this is potentially inaccurate: the image references
   are for the same tag, but the publisher may have pushed a new image with
   differing content. Any change could lead to different vulnerability matching
   results, but we are only interested in vulnerability match differences that
   are due to actionable reasons (grype matcher logic problems or [SBOM] input
   data into matchers).

## Approach

Vulnerability matching has essentially two inputs:

- the packages that were found in the scanned artifact

- the vulnerability data from upstream providers (e.g. NVD, GHSA, etc.)


These are both moving targets!


We may implement more catalogers in syft that raise up more packages discovered
over time (for the same artifact scanned). Also, the world is continually finding
and reporting new vulnerabilities. The more moving parts there are in this form
of testing, the harder it is to come to a conclusion about the actual quality of
the output over time.


To reduce the eroding value over time, we've decided to change as many moving
targets into fixed targets as possible:
- Vulnerability results beyond a particular year are ignored (the current config
  allows for <= 2020). Though there are still retroactive CVEs created, this
  helps a lot in terms of keeping vulnerability results relatively stable (see
  the sketch after this list).

- SBOMs are used as input into grype instead of the raw container images. This
  allows the artifacts under test to remain truly fixed and saves a lot of time
  when capturing grype results (as the container image is no longer needed
  during analysis).

- For the captured SBOMs, container images must be referenced with a digest, not
  just a tag. In case we update a tool version (say syft) we want to make
  certain that we are scanning the exact same artifact later when we re-run the
  analysis.

- Versions of tools used are fixed to a specific `major.minor.patch` release.
  This allows us to account for capability differences between tool runs.
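To illustrate the year ceiling (applied in the harness via yardstick's `year_max_limit` option; the helper below is hypothetical):
```
import re

def within_year_ceiling(vulnerability_id: str, max_year: int = 2020) -> bool:
    # keep a match only if its CVE identifier was assigned in or before
    # the configured max year; non-CVE identifiers are not filtered
    # in this sketch
    match = re.match(r"CVE-(\d{4})-\d+", vulnerability_id)
    if not match:
        return True
    return int(match.group(1)) <= max_year
```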
To reduce the maintenance effort of this comparison over time, there are a few
things to keep in mind:

- Once an image is labeled (at a specific digest) the image digest should be
  considered immutable (never updated). Why? It takes a lot of effort to label
  images and there are no "clearly safe" assumptions that can be made when it
  comes to migrating labels from one image to another, no matter how "similar"
  the images may be. There is also no value in updating the image; these images
  are not being executed and their only purpose is to survey the matching
  performance of grype. In the philosophy of "maximizing fixed points" it
  doesn't make sense to change these assets. Over time it may be that we remove
  assets that are no longer useful for comparison, but this should rarely be
  done.

- Consider not changing the CVE year max-ceiling (currently set to 2020).
  Raising this ceiling will likely increase the number of unlabeled matches
  significantly for all images. Only bump this ceiling if all possible matches
  are labeled.
test/quality/gate.py | +326 (new executable file)
@@ -0,0 +1,326 @@
#!/usr/bin/env python3
import logging
import os
import re
import subprocess
import sys
from typing import Any, Optional

import click
from tabulate import tabulate
from dataclasses import dataclass, InitVar, field

import yardstick
from yardstick import store, comparison, artifact, arrange
from yardstick.cli import display, config


# see the .yardstick.yaml configuration for details
default_result_set = "pr_vs_latest_via_sbom"
yardstick.utils.grype_db.raise_on_failure(False)


@dataclass
class Gate:
    label_comparisons: InitVar[Optional[list[comparison.AgainstLabels]]]
    label_comparison_stats: InitVar[Optional[comparison.ImageToolLabelStats]]

    reasons: list[str] = field(default_factory=list)

    def __post_init__(self, label_comparisons: Optional[list[comparison.AgainstLabels]], label_comparison_stats: Optional[comparison.ImageToolLabelStats]):
        if not label_comparisons and not label_comparison_stats:
            return

        reasons = []

        # - fail when current F1 score drops below last release F1 score (or F1 score is indeterminate)
        # - fail when indeterminate % > 10%
        # - fail when there is a rise in FNs
        latest_release_tool, current_tool = guess_tool_orientation(label_comparison_stats.tools)

        latest_release_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == latest_release_tool}
        current_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == current_tool}

        for image, comp in current_comparisons_by_image.items():
            latest_f1_score = latest_release_comparisons_by_image[image].summary.f1_score
            current_f1_score = comp.summary.f1_score
            if current_f1_score < latest_f1_score:
                reasons.append(f"current F1 score is lower than the latest release F1 score: {bcolors.BOLD+bcolors.UNDERLINE}current={current_f1_score:0.2f} latest={latest_f1_score:0.2f}{bcolors.RESET} image={image}")

            if comp.summary.indeterminate_percent > 10:
                reasons.append(f"current indeterminate matches % is greater than 10%: {bcolors.BOLD+bcolors.UNDERLINE}current={comp.summary.indeterminate_percent:0.2f}%{bcolors.RESET} image={image}")

            latest_fns = latest_release_comparisons_by_image[image].summary.false_negatives
            current_fns = comp.summary.false_negatives
            if current_fns > latest_fns:
                reasons.append(f"current false negatives is greater than the latest release false negatives: {bcolors.BOLD+bcolors.UNDERLINE}current={current_fns} latest={latest_fns}{bcolors.RESET} image={image}")

        self.reasons = reasons

    def passed(self):
        return len(self.reasons) == 0


def guess_tool_orientation(tools: list[str]):
    if len(tools) != 2:
        raise RuntimeError("expected 2 tools, got %s" % tools)

    current_tool = None
    latest_release_tool = None
    for tool in tools:
        if tool.endswith("latest"):
            latest_release_tool = tool
            continue
        current_tool = tool

    if latest_release_tool is None:
        # "latest" value isn't accessible, so we do a best guess at which version is latest
        current_tool, latest_release_tool = sorted(tools)

    if current_tool is None:
        raise ValueError("current tool not found")
    return latest_release_tool, current_tool


class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    RESET = '\033[0m'


def show_results_used(results: list[artifact.ScanResult]):
    print("   Results used:")
    for idx, result in enumerate(results):
        branch = "├──"
        if idx == len(results) - 1:
            branch = "└──"
        print(f"    {branch} {result.ID} : {result.config.tool} against {result.config.image}")
    print()


def validate(cfg: config.Application, result_set: str, images: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None):
    print(f"{bcolors.HEADER}{bcolors.BOLD}Validating with {result_set!r}", bcolors.RESET)
    result_set_obj = store.result_set.load(name=result_set)

    ret = []
    for image, result_states in result_set_obj.result_state_by_image.items():
        if images and image not in images:
            print("Skipping image:", image)
            continue
        print()
        print("Testing image:", image)
        for state in result_states:
            print("   ", f"with {state.request.tool}")
        print()

        gate = validate_image(cfg, [s.config.path for s in result_states], always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries)
        ret.append(gate)

        failure = not gate.passed()
        if failure:
            print(f"{bcolors.FAIL}{bcolors.BOLD}Failed quality gate{bcolors.RESET}")
            for reason in gate.reasons:
                print(f"   - {reason}")

        print()
        size = 120
        print("▁"*size)
        print("░"*size)
        print("▔"*size)
    return ret


def validate_image(cfg: config.Application, descriptions: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None):
    # do a relative comparison
    # - show comparison summary (no gating action)
    # - list out all individual match differences

    print(f"{bcolors.HEADER}Running relative comparison...", bcolors.RESET)
    relative_comparison = yardstick.compare_results(descriptions=descriptions, year_max_limit=cfg.default_max_year)
    show_results_used(relative_comparison.results)

    # show the relative comparison results
    if verbosity > 0:
        details = verbosity > 1
        display.preserved_matches(relative_comparison, details=details, summary=True, common=False)
        print()

    # bail if there are no differences found
    if not always_run_label_comparison and not sum([len(relative_comparison.unique[result.ID]) for result in relative_comparison.results]):
        print("no differences found between tool results")
        return Gate(None, None)

    # do a label comparison
    print(f"{bcolors.HEADER}Running comparison against labels...", bcolors.RESET)
    results, label_entries, comparisons_by_result_id, stats_by_image_tool_pair = yardstick.compare_results_against_labels(descriptions=descriptions, year_max_limit=cfg.default_max_year, label_entries=label_entries)
    show_results_used(results)

    if verbosity > 0:
        show_fns = verbosity > 1
        display.label_comparison(
            results,
            comparisons_by_result_id,
            stats_by_image_tool_pair,
            show_fns=show_fns,
            show_summaries=True,
        )

    latest_release_tool, current_tool = guess_tool_orientation([r.config.tool for r in results])

    # show the relative comparison unique differences paired up with label conclusions (TP/FP/FN/TN/Unknown)
    all_rows: list[list[Any]] = []
    for result in relative_comparison.results:
        label_comparison = comparisons_by_result_id[result.ID]
        for unique_match in relative_comparison.unique[result.ID]:
            labels = label_comparison.labels_by_match[unique_match.ID]
            if not labels:
                label = "(unknown)"
            elif len(set(labels)) > 1:
                label = ", ".join([l.name for l in labels])
            else:
                label = labels[0].name

            color = ""
            commentary = ""
            if result.config.tool == latest_release_tool:
                # the tool which found the unique result is the latest release tool...
                if label == artifact.Label.TruePositive.name:
                    # drats! we missed a case (this is a new FN)
                    color = bcolors.FAIL
                    commentary = "(this is a new FN 😱)"
                elif artifact.Label.FalsePositive.name in label:
                    # we got rid of a FP! ["hip!", "hip!"]
                    color = bcolors.OKBLUE
                    commentary = "(got rid of a former FP 🙌)"
            else:
                # the tool which found the unique result is the current tool...
                if label == artifact.Label.TruePositive.name:
                    # highest of fives! we found a new TP that the previous tool release missed!
                    color = bcolors.OKBLUE
                    commentary = "(this is a new TP 🙌)"
                elif artifact.Label.FalsePositive.name in label:
                    # welp, our changes resulted in a new FP... not great, maybe not terrible?
                    color = bcolors.FAIL
                    commentary = "(this is a new FP 😱)"

            all_rows.append(
                [
                    f"{color}{result.config.tool} ONLY{bcolors.RESET}",
                    f"{color}{unique_match.package.name}@{unique_match.package.version}{bcolors.RESET}",
                    f"{color}{unique_match.vulnerability.id}{bcolors.RESET}",
                    f"{color}{label}{bcolors.RESET}",
                    f"{commentary}",
                ]
            )

    def escape_ansi(line):
        ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]')
        return ansi_escape.sub('', line)

    # sort but don't consider ansi escape codes
    all_rows = sorted(all_rows, key=lambda x: escape_ansi(str(x[0]+x[1]+x[2]+x[3])))
    if len(all_rows) == 0:
        print("No differences found between tooling (with labels)")
    else:
        print("Match differences between tooling (with labels):")
        indent = "   "
        print(indent + tabulate([["TOOL PARTITION", "PACKAGE", "VULNERABILITY", "LABEL", "COMMENTARY"]]+all_rows, tablefmt="plain").replace("\n", "\n" + indent) + "\n")

    # populate the quality gate with data that can evaluate pass/fail conditions
    return Gate(label_comparisons=comparisons_by_result_id.values(), label_comparison_stats=stats_by_image_tool_pair)


@click.command()
@click.option("--image", "-i", "images", multiple=True, help="filter down to one or more images to validate with (don't use the full result set)")
@click.option("--label-comparison", "-l", "always_run_label_comparison", is_flag=True, help="run label comparison regardless of relative comparison results")
@click.option("--breakdown-by-ecosystem", "-e", is_flag=True, help="show label comparison results broken down by ecosystem")
@click.option("--verbose", "-v", "verbosity", count=True, help="show details of all comparisons")
@click.option("--result-set", "-r", default=default_result_set, help="the result set to use for the quality gate")
def main(images: list[str], always_run_label_comparison: bool, breakdown_by_ecosystem: bool, verbosity: int, result_set: str):
    cfg = config.load()
    setup_logging(verbosity)

    # let's not load any more labels than we need to, base this off of the images we're validating
    if not images:
        images = set()
        result_set_obj = store.result_set.load(name=result_set)
        for state in result_set_obj.state:
            images.add(state.config.image)
        images = sorted(list(images))

    print("Loading label entries...", end=" ")
    label_entries = store.labels.load_for_image(images, year_max_limit=cfg.default_max_year)
    print(f"done! {len(label_entries)} entries loaded")

    result_sets = [result_set]  # today only one result set is supported, but more can be added
    gates = []
    for result_set in result_sets:
        gates.extend(validate(cfg, result_set, images=images, always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries))
        print()

        if breakdown_by_ecosystem:
            print(f"{bcolors.HEADER}Breaking down label comparison by ecosystem performance...", bcolors.RESET)
            results_by_image, label_entries, stats = yardstick.compare_results_against_labels_by_ecosystem(result_set=result_set, year_max_limit=cfg.default_max_year, label_entries=label_entries)
            display.labels_by_ecosystem_comparison(
                results_by_image,
                stats,
                show_images_used=False,
            )
            print()

    failure = not all([gate.passed() for gate in gates])
    if failure:
        print("Reasons for quality gate failure:")
        for gate in gates:
            for reason in gate.reasons:
                print(f"   - {reason}")

    if failure:
        print()
        print(f"{bcolors.FAIL}{bcolors.BOLD}Quality gate FAILED{bcolors.RESET}")
        sys.exit(1)
    else:
        print(f"{bcolors.OKGREEN}{bcolors.BOLD}Quality gate passed!{bcolors.RESET}")


def setup_logging(verbosity: int):
    # pylint: disable=redefined-outer-name, import-outside-toplevel
    import logging.config

    if verbosity in [0, 1, 2]:
        log_level = "WARN"
    elif verbosity == 3:
        log_level = "INFO"
    else:
        log_level = "DEBUG"

    logging.config.dictConfig(
        {
            "version": 1,
            "formatters": {
                "standard": {
                    # [%(module)s.%(funcName)s]
                    "format": "%(asctime)s [%(levelname)s] %(message)s",
                    "datefmt": "",
                },
            },
            "handlers": {
                "default": {
                    "level": log_level,
                    "formatter": "standard",
                    "class": "logging.StreamHandler",
                    "stream": "ext://sys.stderr",
                },
            },
            "loggers": {
                "": {  # root logger
                    "handlers": ["default"],
                    "level": log_level,
                },
            },
        }
    )


if __name__ == '__main__':
    main()
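Given the click options above, a typical invocation of the gate against a single image from the result set might look like (illustrative):

    ./gate.py -r pr_vs_latest_via_sbom -vv -i docker.io/alpine:3.2@sha256:ddac200f3ebc9902fb8cfcd599f41feb2151f1118929da21bcef57dc276975f9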
test/quality/requirements.txt | +3 (new file)
@@ -0,0 +1,3 @@
git+https://github.com/anchore/yardstick@4526ad2ff6d33d34e900ed692c3a90adc80eab73
# ../../../yardstick
tabulate==0.8.10
test/quality/vulnerability-match-labels | +1 (new submodule)
@@ -0,0 +1 @@
Subproject commit 3a2ecc336411ddc3f37b7d5c123b80f6848a2cf3