Mirror of https://github.com/anchore/grype (synced 2024-11-10 06:34:13 +00:00)
Add in-depth quality gate checks (#949)
* add in-depth quality gate checks

  Signed-off-by: Alex Goodman <alex.goodman@anchore.com>

* add quality tests to PR checks

  Signed-off-by: Alex Goodman <alex.goodman@anchore.com>

Signed-off-by: Alex Goodman <alex.goodman@anchore.com>
parent 7ad60ce410
commit d4587ddeec
11 changed files with 653 additions and 0 deletions
.github/workflows/validations.yaml (vendored): 53 additions

@@ -9,6 +9,7 @@ on:
 env:
   GO_VERSION: "1.18.x"
   GO_STABLE_VERSION: true
+  PYTHON_VERSION: "3.10"
 
 jobs:
   Static-Analysis:
@@ -100,6 +101,58 @@ jobs:
           name: unit-test-results
           path: test/results/**/*
 
+  Quality-Test:
+    # Note: changing this job name requires making the same update in the .github/workflows/release.yaml pipeline
+    name: "Quality tests"
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/setup-go@v2
+        with:
+          go-version: ${{ env.GO_VERSION }}
+          stable: ${{ env.GO_STABLE_VERSION }}
+
+      - uses: actions/setup-python@v2
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - uses: actions/checkout@v2
+        with:
+          submodules: true
+
+      - name: Restore tool cache
+        id: tool-cache
+        uses: actions/cache@v2.1.3
+        with:
+          path: ${{ github.workspace }}/.tmp
+          key: ${{ runner.os }}-tool-${{ hashFiles('Makefile') }}
+
+      - name: Restore go cache
+        id: go-cache
+        uses: actions/cache@v2.1.3
+        with:
+          path: ~/go/pkg/mod
+          key: ${{ runner.os }}-go-${{ env.GO_VERSION }}-${{ hashFiles('**/go.sum') }}
+          restore-keys: |
+            ${{ runner.os }}-go-${{ env.GO_VERSION }}-
+
+      - name: Restore python cache
+        id: python-cache
+        uses: actions/cache@v2.1.3
+        with:
+          path: |
+            test/quality/venv
+            test/quality/vulnerability-match-labels/venv
+          key: ${{ runner.os }}-go-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/test/quality/**/requirements.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-go-${{ env.PYTHON_VERSION }}-
+
+      - name: (cache-miss) Bootstrap all project dependencies
+        if: steps.tool-cache.outputs.cache-hit != 'true' || steps.go-cache.outputs.cache-hit != 'true'
+        run: make bootstrap
+
+      - name: Run quality tests
+        run: make quality
+
   Integration-Test:
     # Note: changing this job name requires making the same update in the .github/workflows/release.yaml pipeline
     name: "Integration tests"
.gitmodules (vendored, new file): 4 additions

[submodule "test/quality/vulnerability-match-labels"]
	path = test/quality/vulnerability-match-labels
	url = git@github.com:anchore/vulnerability-match-labels.git
	branch = main
Makefile: 5 additions

@@ -168,6 +168,11 @@ unit: ## Run unit tests (with coverage)
 	@echo "Coverage: $$(cat $(COVER_TOTAL))"
 	@if [ $$(echo "$$(cat $(COVER_TOTAL)) >= $(COVERAGE_THRESHOLD)" | bc -l) -ne 1 ]; then echo "$(RED)$(BOLD)Failed coverage quality gate (> $(COVERAGE_THRESHOLD)%)$(RESET)" && false; fi
 
+.PHONY: quality
+quality: ## Run quality tests
+	$(call title,Running quality tests)
+	cd test/quality && make
+
 # note: this is used by CI to determine if the install test fixture cache (docker image tars) should be busted
 install-fingerprint:
 	cd test/install && \
test/quality/.gitignore (vendored, new file): 7 additions

venv
.yardstick/tools
.yardstick/result
stage
pull
migrate.py
.oras-cache
test/quality/.yardstick.yaml (new file): 45 additions

x-ref:
  # note: always reference images with BOTH a tag and a digest
  images: &images
    - docker.io/cloudbees/cloudbees-core-mm:2.277.3.1@sha256:4c564f473d38f23da1caa48c4ef53b958ef03d279232007ad3319b1f38584bdb
    - docker.io/anchore/test_images:grype-quality-node-d89207b@sha256:f56164678054e5eb59ab838367373a49df723b324617b1ba6de775749d7f91d4
    - docker.io/anchore/test_images:grype-quality-python-d89207b@sha256:b2b58a55c0b03c1626d2aaae2add9832208b02124dda7b7b41811e14f0fb272c
    - docker.io/anchore/test_images:grype-quality-java-d89207b@sha256:b3534fc2e37943136d5b54e3a58b55d4ccd4363d926cf7aa5bf55a524cf8275b
    - docker.io/anchore/test_images:grype-quality-golang-d89207b@sha256:7536ee345532f674ec9e448e3768db4e546c48220ba2b6ec9bc9cfbfb3b7b74a
    - docker.io/anchore/test_images:grype-quality-ruby-d89207b@sha256:1a5a5f870924e88a6f0f2b8089cf276ef0a79b5244a052cdfe4a47bb9e5a2c10
    - docker.io/alpine:3.2@sha256:ddac200f3ebc9902fb8cfcd599f41feb2151f1118929da21bcef57dc276975f9
    - docker.io/centos:6@sha256:3688aa867eb84332460e172b9250c9c198fdfd8d987605fd53f246f498c60bcf
    - docker.io/ubuntu:16.10@sha256:8dc9652808dc091400d7d5983949043a9f9c7132b15c14814275d25f94bca18a
    - docker.io/almalinux:8@sha256:cd49d7250ed7bb194d502d8a3e50bd775055ca275d1d9c2785aea72b890afe6a
    - docker.io/rockylinux:8@sha256:72afc2e1a20c9ddf56a81c51148ebcbe927c0a879849efe813bee77d69df1dd8
    - docker.io/oraclelinux:6@sha256:a06327c0f1d18d753f2a60bb17864c84a850bb6dcbcf5946dd1a8123f6e75495
    - docker.io/debian:7@sha256:81e88820a7759038ffa61cff59dfcc12d3772c3a2e75b7cfe963c952da2ad264
    - docker.io/busybox:1.28.1@sha256:2107a35b58593c58ec5f4e8f2c4a70d195321078aebfadfbfb223a2ff4a4ed21
    - docker.io/amazonlinux:2@sha256:1301cc9f889f21dc45733df9e58034ac1c318202b4b0f0a08d88b3fdc03004de
    - registry.access.redhat.com/ubi8@sha256:68fecea0d255ee253acbf0c860eaebb7017ef5ef007c25bee9eeffd29ce85b29

# new vulnerabilities are added all of the time, instead of keeping up it's easier to ignore newer entries.
# This approach helps tremendously with keeping the analysis relatively stable.
default_max_year: 2020

result-sets:
  pr_vs_latest_via_sbom:
    description: "latest released grype vs grype from the current build (via SBOM ingestion)"
    matrix:
      images: *images

      tools:

        - name: syft
          # note: we want to use a fixed version of syft for capturing all results (NOT "latest")
          version: v0.54.0
          produces: SBOM
          refresh: false

        - name: grype
          version: git:current-commit
          takes: SBOM

        - name: grype
          version: latest
          takes: SBOM
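The `pr_vs_latest_via_sbom` result set defined above is referenced by name from `test/quality/Makefile` and `gate.py` later in this commit. As a rough sketch of how that configuration gets consumed, reusing the yardstick calls that `gate.py` itself makes (this snippet is illustrative only and is not part of the changeset):

```python
# illustrative sketch: mirrors the yardstick calls used by gate.py in this
# commit (store.result_set.load and result_state_by_image); not an addition
# to the changeset itself.
from yardstick import store

# load the captured results for the result set named in .yardstick.yaml
result_set = store.result_set.load(name="pr_vs_latest_via_sbom")

# each image listed under x-ref.images maps to the tool runs captured for it:
# one syft run producing the SBOM, plus the two grype runs that take it as input
for image, states in result_set.result_state_by_image.items():
    for state in states:
        print(image, state.request.tool)
```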
test/quality/.yardstick/labels (new symbolic link): 1 addition

../vulnerability-match-labels/labels
test/quality/Makefile (new file): 68 additions

SBOM_STORE_TAG = md5-$(shell md5sum .yardstick.yaml | cut -d' ' -f1)
SBOM_STORE_IMAGE = ghcr.io/anchore/grype/quality-test-sbom-store:$(SBOM_STORE_TAG)
ACTIVATE_VENV = . venv/bin/activate &&
YARDSTICK = $(ACTIVATE_VENV) yardstick -v
YARDSTICK_RESULT_DIR = .yardstick/result
YARDSTICK_LABELS_DIR = .yardstick/labels
VULNERABILITY_LABELS = ./vulnerability-labels
RESULT_SET = pr_vs_latest_via_sbom

# formatting variables
BOLD := $(shell tput -T linux bold)
PURPLE := $(shell tput -T linux setaf 5)
GREEN := $(shell tput -T linux setaf 2)
CYAN := $(shell tput -T linux setaf 6)
RED := $(shell tput -T linux setaf 1)
RESET := $(shell tput -T linux sgr0)
TITLE := $(BOLD)$(PURPLE)
SUCCESS := $(BOLD)$(GREEN)

.PHONY: all
all: capture validate ## Fetch or capture all data and run all quality checks

.PHONY: validate
validate: venv $(VULNERABILITY_LABELS) ## Run all quality checks against already collected data
	$(ACTIVATE_VENV) ./gate.py

.PHONY: capture
capture: sboms vulns ## Collect and store all syft and grype results

.PHONY: capture
vulns: venv ## Collect and store all grype results
	$(YARDSTICK) -v result capture -r $(RESULT_SET)

.PHONY: sboms
sboms: $(YARDSTICK_RESULT_DIR) venv clear-results ## Collect and store all syft results (deletes all existing results)
	bash -c "make download-sboms || ($(YARDSTICK) -v result capture -r $(RESULT_SET) --only-producers)"

.PHONY: download-sboms
download-sboms:
	cd vulnerability-match-labels && make venv
	bash -c "export ORAS_CACHE=$(shell pwd)/.oras-cache && make venv && . vulnerability-match-labels/venv/bin/activate && ./vulnerability-match-labels/sboms.py download -r $(RESULT_SET)"

venv: venv/touchfile

venv/touchfile: requirements.txt
	test -d venv || python3 -m venv venv
	$(ACTIVATE_VENV) pip install -Ur requirements.txt
	touch venv/touchfile


$(YARDSTICK_RESULT_DIR):
	mkdir -p $(YARDSTICK_RESULT_DIR)

$(VULNERABILITY_LABELS):
	git submodule update vulnerability-match-labels

.PHONY: clear-results
clear-results: venv ## Clear all existing yardstick results
	$(YARDSTICK) result clear

.PHONY: clean
clean: clear-results ## Clear all existing yardstick results and delete python environment
	rm -rf venv
	find -iname "*.pyc" -delete

help:
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "$(BOLD)$(CYAN)%-25s$(RESET)%s\n", $$1, $$2}'
test/quality/README.md (new file): 140 additions

# Match quality testing

This form of testing compares the results from various releases of grype using a
static set of reference container images. The kinds of comparisons made are:

1) "relative": find the vulnerability matching differences between both tools
   for a given image. This helps identify when a change has occurred in matching
   behavior and where the changes are.

2) "against labels": pair each tool's results for an image with ground truth. This
   helps identify how well the matching behavior is performing (did it get
   better or worse).


## Getting started

To capture raw tool output and store it into the local `.yardstick` directory for
further analysis:
```
make capture
```

To analyze the tool output and evaluate a pass/fail result:
```
make validate
```

A pass/fail result is shown in the output, with reasons for any failure
listed explicitly.


## What is the quality gate criteria

The label comparison results are used to determine a pass/fail result,
specifically with the following criteria:

- fail when the current grype F1 score drops below the last grype release F1 score (or
  the F1 score is indeterminate)
- fail when the indeterminate matches % > 10% in the current grype results
- fail when there is a rise in FNs relative to the results from the last grype
  release
- otherwise, pass

F1 score is the primary way that tool matching performance is characterized. F1
score combines the TP, FP, and FN counts into a single metric between 0 and 1.
Ideally the F1 score for an image-tool pair should be 1. F1 score is a good way
to summarize the matching performance but does not explain why the matching
performance is what it is.

Indeterminate matches are matches from results that could not be paired with a
label (TP or FP). This could also mean that multiple conflicting labels were
found for a single match. The more indeterminate matches there are, the less
confident you can be about the F1 score. Ideally there should be 0 indeterminate
matches, but this is difficult to achieve since vulnerability data is constantly
changing.

False negatives represent matches that should have been made by the tool but
were missed. We should always make certain that this value does not increase
between releases of grype.

## Assumptions

1. **Comparing vulnerability results taken at different times is invalid**.
   We leverage the yardstick result-set feature to capture all vulnerability
   results at one time for a specific image and tool set. Why? If we use grype
   at version `a` on Monday and grype at version `b` on Tuesday and attempt to
   compare the results, any differences found will not be immediately
   explainable. That is, it is entirely possible that
   the vulnerability databases from the run of `b` simply had more up-to-date
   information, and if `grype@a` were run at the same time (on Tuesday) this
   explanation can be almost entirely eliminated.

2. **Comparing vulnerability results across images with different digests is invalid**.
   It may be very tempting to compare vulnerability results for
   `alpine:3.2` from Monday and `alpine:3.2` from Tuesday to see if there are
   any changes. However, this is potentially inaccurate: the image references
   are for the same tag, but the publisher may have pushed a new image with
   differing content. Any change could lead to different vulnerability matching
   results, but we are only interested in vulnerability match differences that
   are due to actionable reasons (grype matcher logic problems or [SBOM] input
   data into matchers).

## Approach

Vulnerability matching has essentially two inputs:

- the packages that were found in the scanned artifact

- the vulnerability data from upstream providers (e.g. NVD, GHSA, etc.)


These are both moving targets!

We may implement more catalogers in syft that raise up more packages discovered
over time (for the same artifact scanned). Also, the world is continually finding
and reporting new vulnerabilities. The more moving parts there are in this form
of testing, the harder it is to come to a conclusion about the actual quality of
the output over time.

To reduce the eroding value over time we've decided to change as many moving
targets into fixed targets as possible:

- Vulnerability results beyond a particular year are ignored (the current config
  allows for <= 2020). Though there are still retroactive CVEs created, this
  helps a lot in terms of keeping vulnerability results relatively stable.

- SBOMs are used as input into grype instead of the raw container images. This
  allows the artifacts under test to remain truly fixed and saves a lot of time
  when capturing grype results (as the container image is no longer needed
  during analysis).

- For the captured SBOMs, container images must be referenced with a digest, not
  just a tag. In case we update a tool version (say syft), we want to make
  certain that we are scanning the exact same artifact later when we re-run the
  analysis.

- Versions of tools used are fixed to a specific `major.minor.patch` release.
  This allows us to account for capability differences between tool runs.

To reduce the maintenance effort of this comparison over time, there are a few things
to keep in mind:

- Once an image is labeled (at a specific digest) the image digest should be
  considered immutable (never updated). Why? It takes a lot of effort to label
  images and there are no "clearly safe" assumptions that can be made when it
  comes to migrating labels from one image to another, no matter how "similar"
  the images may be. There is also no value in updating the image; these images
  are not being executed and their only purpose is to survey the matching
  performance of grype. In the philosophy of "maximizing fixed points" it
  doesn't make sense to change these assets. Over time it may be that we remove
  assets that are no longer useful for comparison, but this should rarely be
  done.

- Consider not changing the CVE year max-ceiling (currently set to 2020).
  Pushing this ceiling will likely raise the number of unlabeled matches
  significantly for all images. Only bump this ceiling if all possible matches
  are labeled.
test/quality/gate.py (new executable file): 326 additions

#!/usr/bin/env python3
import logging
import os
import re
import subprocess
import sys
from typing import Optional

import click
from tabulate import tabulate
from dataclasses import dataclass, InitVar, field

import yardstick
from yardstick import store, comparison, artifact, arrange
from yardstick.cli import display, config


# see the .yardstick.yaml configuration for details
default_result_set = "pr_vs_latest_via_sbom"
yardstick.utils.grype_db.raise_on_failure(False)


@dataclass
class Gate:
    label_comparisons: InitVar[Optional[list[comparison.AgainstLabels]]]
    label_comparison_stats: InitVar[Optional[comparison.ImageToolLabelStats]]

    reasons: list[str] = field(default_factory=list)

    def __post_init__(self, label_comparisons: Optional[list[comparison.AgainstLabels]], label_comparison_stats: Optional[comparison.ImageToolLabelStats]):
        if not label_comparisons and not label_comparison_stats:
            return

        reasons = []

        # - fail when current F1 score drops below last release F1 score (or F1 score is indeterminate)
        # - fail when indeterminate % > 10%
        # - fail when there is a rise in FNs
        latest_release_tool, current_tool = guess_tool_orientation(label_comparison_stats.tools)

        latest_release_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == latest_release_tool}
        current_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == current_tool}

        for image, comp in current_comparisons_by_image.items():
            latest_f1_score = latest_release_comparisons_by_image[image].summary.f1_score
            current_f1_score = comp.summary.f1_score
            if current_f1_score < latest_f1_score:
                reasons.append(f"current F1 score is lower than the latest release F1 score: {bcolors.BOLD+bcolors.UNDERLINE}current={current_f1_score:0.2f} latest={latest_f1_score:0.2f}{bcolors.RESET} image={image}")

            if comp.summary.indeterminate_percent > 10:
                reasons.append(f"current indeterminate matches % is greater than 10%: {bcolors.BOLD+bcolors.UNDERLINE}current={comp.summary.indeterminate_percent:0.2f}%{bcolors.RESET} image={image}")

            latest_fns = latest_release_comparisons_by_image[image].summary.false_negatives
            current_fns = comp.summary.false_negatives
            if current_fns > latest_fns:
                reasons.append(f"current false negatives is greater than the latest release false negatives: {bcolors.BOLD+bcolors.UNDERLINE}current={current_fns} latest={latest_fns}{bcolors.RESET} image={image}")

        self.reasons = reasons

    def passed(self):
        return len(self.reasons) == 0


def guess_tool_orientation(tools: list[str]):
    if len(tools) != 2:
        raise RuntimeError("expected 2 tools, got %s" % tools)

    current_tool = None
    latest_release_tool = None
    for tool in tools:
        if tool.endswith("latest"):
            latest_release_tool = tool
            continue
        current_tool = tool

    if latest_release_tool is None:
        # "latest" value isn't accessible, so we do a best guess at which version is latest
        current_tool, latest_release_tool = sorted(tools)

    if current_tool is None:
        raise ValueError("current tool not found")
    return latest_release_tool, current_tool


class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    RESET = '\033[0m'


def show_results_used(results: list[artifact.ScanResult]):
    print(f"   Results used:")
    for idx, result in enumerate(results):
        branch = "├──"
        if idx == len(results) - 1:
            branch = "└──"
        print(f"    {branch} {result.ID} : {result.config.tool} against {result.config.image}")
    print()


def validate(cfg: config.Application, result_set: str, images: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None):
    print(f"{bcolors.HEADER}{bcolors.BOLD}Validating with {result_set!r}", bcolors.RESET)
    result_set_obj = store.result_set.load(name=result_set)

    ret = []
    for image, result_states in result_set_obj.result_state_by_image.items():
        if images and image not in images:
            print("Skipping image:", image)
            continue
        print()
        print("Testing image:", image)
        for state in result_states:
            print("   ", f"with {state.request.tool}")
        print()

        gate = validate_image(cfg, [s.config.path for s in result_states], always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries)
        ret.append(gate)

        failure = not gate.passed()
        if failure:
            print(f"{bcolors.FAIL}{bcolors.BOLD}Failed quality gate{bcolors.RESET}")
            for reason in gate.reasons:
                print(f"   - {reason}")

        print()
        size = 120
        print("▁"*size)
        print("░"*size)
        print("▔"*size)
    return ret


def validate_image(cfg: config.Application, descriptions: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None):
    # do a relative comparison
    # - show comparison summary (no gating action)
    # - list out all individual match differences

    print(f"{bcolors.HEADER}Running relative comparison...", bcolors.RESET)
    relative_comparison = yardstick.compare_results(descriptions=descriptions, year_max_limit=cfg.default_max_year)
    show_results_used(relative_comparison.results)

    # show the relative comparison results
    if verbosity > 0:
        details = verbosity > 1
        display.preserved_matches(relative_comparison, details=details, summary=True, common=False)
        print()

    # bail if there are no differences found
    if not always_run_label_comparison and not sum([len(relative_comparison.unique[result.ID]) for result in relative_comparison.results]):
        print("no differences found between tool results")
        return Gate(None, None)

    # do a label comparison
    print(f"{bcolors.HEADER}Running comparison against labels...", bcolors.RESET)
    results, label_entries, comparisons_by_result_id, stats_by_image_tool_pair = yardstick.compare_results_against_labels(descriptions=descriptions, year_max_limit=cfg.default_max_year, label_entries=label_entries)
    show_results_used(results)

    if verbosity > 0:
        show_fns = verbosity > 1
        display.label_comparison(
            results,
            comparisons_by_result_id,
            stats_by_image_tool_pair,
            show_fns=show_fns,
            show_summaries=True,
        )

    latest_release_tool, current_tool = guess_tool_orientation([r.config.tool for r in results])

    # show the relative comparison unique differences paired up with label conclusions (TP/FP/FN/TN/Unknown)
    all_rows: list[list[Any]] = []
    for result in relative_comparison.results:
        label_comparison = comparisons_by_result_id[result.ID]
        for unique_match in relative_comparison.unique[result.ID]:
            labels = label_comparison.labels_by_match[unique_match.ID]
            if not labels:
                label = "(unknown)"
            elif len(set(labels)) > 1:
                label = ", ".join([l.name for l in labels])
            else:
                label = labels[0].name

            color = ""
            commentary = ""
            if result.config.tool == latest_release_tool:
                # the tool which found the unique result is the latest release tool...
                if label == artifact.Label.TruePositive.name:
                    # drats! we missed a case (this is a new FN)
                    color = bcolors.FAIL
                    commentary = "(this is a new FN 😱)"
                elif artifact.Label.FalsePositive.name in label:
                    # we got rid of a FP! ["hip!", "hip!"]
                    color = bcolors.OKBLUE
                    commentary = "(got rid of a former FP 🙌)"
            else:
                # the tool which found the unique result is the current tool...
                if label == artifact.Label.TruePositive.name:
                    # highest of fives! we found a new TP that the previous tool release missed!
                    color = bcolors.OKBLUE
                    commentary = "(this is a new TP 🙌)"
                elif artifact.Label.FalsePositive.name in label:
                    # welp, our changes resulted in a new FP... not great, maybe not terrible?
                    color = bcolors.FAIL
                    commentary = "(this is a new FP 😱)"

            all_rows.append(
                [
                    f"{color}{result.config.tool} ONLY{bcolors.RESET}",
                    f"{color}{unique_match.package.name}@{unique_match.package.version}{bcolors.RESET}",
                    f"{color}{unique_match.vulnerability.id}{bcolors.RESET}",
                    f"{color}{label}{bcolors.RESET}",
                    f"{commentary}",
                ]
            )

    def escape_ansi(line):
        ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]')
        return ansi_escape.sub('', line)

    # sort but don't consider ansi escape codes
    all_rows = sorted(all_rows, key=lambda x: escape_ansi(str(x[0]+x[1]+x[2]+x[3])))
    if len(all_rows) == 0:
        print("No differences found between tooling (with labels)")
    else:
        print("Match differences between tooling (with labels):")
        indent = "   "
        print(indent + tabulate([["TOOL PARTITION", "PACKAGE", "VULNERABILITY", "LABEL", "COMMENTARY"]]+all_rows, tablefmt="plain").replace("\n", "\n" + indent) + "\n")

    # populate the quality gate with data that can evaluate pass/fail conditions
    return Gate(label_comparisons=comparisons_by_result_id.values(), label_comparison_stats=stats_by_image_tool_pair)


@click.command()
@click.option("--image", "-i", "images", multiple=True, help="filter down to one or more images to validate with (don't use the full result set)")
@click.option("--label-comparison", "-l", "always_run_label_comparison", is_flag=True, help="run label comparison irregardless of relative comparison results")
@click.option("--breakdown-by-ecosystem", "-e", is_flag=True, help="show label comparison results broken down by ecosystem")
@click.option("--verbose", "-v", "verbosity", count=True, help="show details of all comparisons")
@click.option("--result-set", "-r", default=default_result_set, help="the result set to use for the quality gate")
def main(images: list[str], always_run_label_comparison: bool, breakdown_by_ecosystem: bool, verbosity: int, result_set: str):
    cfg = config.load()
    setup_logging(verbosity)

    # let's not load any more labels than we need to, base this off of the images we're validating
    if not images:
        images = set()
        result_set_obj = store.result_set.load(name=result_set)
        for state in result_set_obj.state:
            images.add(state.config.image)
        images = sorted(list(images))

    print("Loading label entries...", end=" ")
    label_entries = store.labels.load_for_image(images, year_max_limit=cfg.default_max_year)
    print(f"done! {len(label_entries)} entries loaded")

    result_sets = [result_set]  # today only one result set is supported, but more can be added
    gates = []
    for result_set in result_sets:
        gates.extend(validate(cfg, result_set, images=images, always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries))
        print()

        if breakdown_by_ecosystem:
            print(f"{bcolors.HEADER}Breaking down label comparison by ecosystem performance...", bcolors.RESET)
            results_by_image, label_entries, stats = yardstick.compare_results_against_labels_by_ecosystem(result_set=result_set, year_max_limit=cfg.default_max_year, label_entries=label_entries)
            display.labels_by_ecosystem_comparison(
                results_by_image,
                stats,
                show_images_used=False,
            )
            print()

    failure = not all([gate.passed() for gate in gates])
    if failure:
        print("Reasons for quality gate failure:")
        for gate in gates:
            for reason in gate.reasons:
                print(f"   - {reason}")

    if failure:
        print()
        print(f"{bcolors.FAIL}{bcolors.BOLD}Quality gate FAILED{bcolors.RESET}")
        sys.exit(1)
    else:
        print(f"{bcolors.OKGREEN}{bcolors.BOLD}Quality gate passed!{bcolors.RESET}")


def setup_logging(verbosity: int):
    # pylint: disable=redefined-outer-name, import-outside-toplevel
    import logging.config

    if verbosity in [0, 1, 2]:
        log_level = "WARN"
    elif verbosity == 3:
        log_level = "INFO"
    else:
        log_level = "DEBUG"

    logging.config.dictConfig(
        {
            "version": 1,
            "formatters": {
                "standard": {
                    # [%(module)s.%(funcName)s]
                    "format": "%(asctime)s [%(levelname)s] %(message)s",
                    "datefmt": "",
                },
            },
            "handlers": {
                "default": {
                    "level": log_level,
                    "formatter": "standard",
                    "class": "logging.StreamHandler",
                    "stream": "ext://sys.stderr",
                },
            },
            "loggers": {
                "": {  # root logger
                    "handlers": ["default"],
                    "level": log_level,
                },
            },
        }
    )


if __name__ == '__main__':
    main()
test/quality/requirements.txt (new file): 3 additions

git+https://github.com/anchore/yardstick@4526ad2ff6d33d34e900ed692c3a90adc80eab73
# ../../../yardstick
tabulate==0.8.10
test/quality/vulnerability-match-labels (new submodule): 1 addition

Subproject commit 3a2ecc336411ddc3f37b7d5c123b80f6848a2cf3