Mirror of https://github.com/anchore/grype, synced 2024-11-10 06:34:13 +00:00
chore: switch to yardstick validate from custom gate.py
Signed-off-by: Will Murphy <willmurphyscode@users.noreply.github.com>
parent 7901a57c1e
commit 832391cbe6

4 changed files with 9 additions and 347 deletions
@@ -91,6 +91,11 @@ default_max_year: 2021

result-sets:
  pr_vs_latest_via_sbom:
    description: "latest released grype vs grype from the current build (via SBOM ingestion)"
    validations:
      - max-f1-regression: 0.0
        max-new-false-negatives: 0
        max-unlabeled-percent: 10
        max_year: 2021
    matrix:
      images: *images
@@ -112,6 +117,7 @@ result-sets:
        # for local build of grype, use for example:
        version: path:../../+import-db=db.tar.gz
        takes: SBOM
        label: candidate

      - name: grype
        # note: we import a static (pinned) DB as to prevent changes in the DB from affecting the results. The
@@ -121,3 +127,4 @@ result-sets:
        # are testing with is not too stale.
        version: latest+import-db=db.tar.gz
        takes: SBOM
        label: reference
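For orientation: the validations block added above carries the same pass/fail thresholds that the custom gate.py enforced in code, with the two grype tool entries now labeled candidate and reference. Below is a minimal illustrative sketch of that kind of threshold check, assuming hypothetical summary fields (f1_score, false_negatives, unlabeled_percent) that mirror the keys in the validations block; it is not yardstick's actual implementation.

# Illustrative sketch only; field and parameter names are assumptions that
# mirror the keys in the validations block above.
from dataclasses import dataclass, field


@dataclass
class Summary:
    f1_score: float
    false_negatives: int
    unlabeled_percent: float


@dataclass
class GateResult:
    reasons: list[str] = field(default_factory=list)

    def passed(self) -> bool:
        return not self.reasons


def check_thresholds(candidate: Summary, reference: Summary,
                     max_f1_regression: float = 0.0,
                     max_new_false_negatives: int = 0,
                     max_unlabeled_percent: float = 10.0) -> GateResult:
    # compare a candidate run against a reference run using the three thresholds
    result = GateResult()
    if reference.f1_score - candidate.f1_score > max_f1_regression:
        result.reasons.append(f"F1 regressed: candidate={candidate.f1_score:0.2f} reference={reference.f1_score:0.2f}")
    if candidate.false_negatives - reference.false_negatives > max_new_false_negatives:
        result.reasons.append(f"new false negatives: candidate={candidate.false_negatives} reference={reference.false_negatives}")
    if candidate.unlabeled_percent > max_unlabeled_percent:
        result.reasons.append(f"too many unlabeled matches: {candidate.unlabeled_percent:0.1f}%")
    return result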
@@ -27,7 +27,7 @@ all: capture validate ## Fetch or capture all data and run all quality checks

.PHONY: validate
validate: venv $(VULNERABILITY_LABELS)/Makefile ## Run all quality checks against already collected data
	$(ACTIVATE_VENV) ./gate.py
	$(ACTIVATE_VENV) yardstick validate -r $(RESULT_SET)

.PHONY: capture
capture: sboms vulns ## Collect and store all syft and grype results
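In the Makefile hunk above, the validate target's recipe swaps the ./gate.py invocation for yardstick validate -r $(RESULT_SET), so the quality gate now comes from the yardstick CLI and fails the target via its exit code. A rough sketch of an equivalent invocation from Python, assuming yardstick is installed in the active virtualenv and that the result-set name matches the one defined in .yardstick.yaml (only the -r flag shown in the diff is used):

# Rough sketch: run the same command the Makefile target runs and propagate
# its exit status; assumes the yardstick CLI is on PATH in the active venv.
import subprocess
import sys

RESULT_SET = "pr_vs_latest_via_sbom"  # the result set defined in .yardstick.yaml

completed = subprocess.run(["yardstick", "validate", "-r", RESULT_SET])
sys.exit(completed.returncode)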
@@ -1,345 +0,0 @@
#!/usr/bin/env python3
import logging
import os
import re
import subprocess
import sys
from typing import Optional

import click
from tabulate import tabulate
from dataclasses import dataclass, InitVar, field

import yardstick
from yardstick import store, comparison, artifact, arrange
from yardstick.cli import display, config


# see the .yardstick.yaml configuration for details
default_result_set = "pr_vs_latest_via_sbom"
yardstick.utils.grype_db.raise_on_failure(False)


@dataclass
class Gate:
    label_comparisons: InitVar[Optional[list[comparison.AgainstLabels]]]
    label_comparison_stats: InitVar[Optional[comparison.ImageToolLabelStats]]

    reasons: list[str] = field(default_factory=list)

    def __post_init__(self, label_comparisons: Optional[list[comparison.AgainstLabels]], label_comparison_stats: Optional[comparison.ImageToolLabelStats]):
        if not label_comparisons and not label_comparison_stats:
            return

        reasons = []

        # - fail when current F1 score drops below last release F1 score (or F1 score is indeterminate)
        # - fail when indeterminate % > 10%
        # - fail when there is a rise in FNs
        latest_release_tool, current_tool = guess_tool_orientation(label_comparison_stats.tools)

        latest_release_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == latest_release_tool}
        current_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == current_tool}

        for image, comp in current_comparisons_by_image.items():
            latest_f1_score = latest_release_comparisons_by_image[image].summary.f1_score
            current_f1_score = comp.summary.f1_score
            if current_f1_score < latest_f1_score:
                reasons.append(f"current F1 score is lower than the latest release F1 score: {bcolors.BOLD+bcolors.UNDERLINE}current={current_f1_score:0.2f} latest={latest_f1_score:0.2f}{bcolors.RESET} image={image}")

            if comp.summary.indeterminate_percent > 10:
                reasons.append(f"current indeterminate matches % is greater than 10%: {bcolors.BOLD+bcolors.UNDERLINE}current={comp.summary.indeterminate_percent:0.2f}%{bcolors.RESET} image={image}")

            latest_fns = latest_release_comparisons_by_image[image].summary.false_negatives
            current_fns = comp.summary.false_negatives
            if current_fns > latest_fns:
                reasons.append(f"current false negatives is greater than the latest release false negatives: {bcolors.BOLD+bcolors.UNDERLINE}current={current_fns} latest={latest_fns}{bcolors.RESET} image={image}")

        self.reasons = reasons

    def passed(self):
        return len(self.reasons) == 0


def guess_tool_orientation(tools: list[str]):
    """
    Given a pair of tools, guess which is latest version, and which is the one
    being compared to the latest version.
    Returns (latest_tool, current_tool)
    """
    if len(tools) != 2:
        raise RuntimeError("expected 2 tools, got %s" % tools)
    tool_a, tool_b = sorted(tools)
    if tool_a == tool_b:
        raise ValueError("latest release tool and current tool are the same")
    if tool_a.endswith("latest"):
        return tool_a, tool_b
    elif tool_b.endswith("latest"):
        return tool_b, tool_a

    if "@path:" in tool_a and "@path:" not in tool_b:
        # tool_a is a local build, so compare it against tool_b
        return tool_b, tool_a

    if "@path:" in tool_b and "@path:" not in tool_a:
        # tool_b is a local build, so compare it against tool_a
        return tool_a, tool_b

    return tool_a, tool_b
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    RESET = '\033[0m'


if not sys.stdout.isatty():
    bcolors.HEADER = ""
    bcolors.OKBLUE = ""
    bcolors.OKCYAN = ""
    bcolors.OKGREEN = ""
    bcolors.WARNING = ""
    bcolors.FAIL = ""
    bcolors.BOLD = ""
    bcolors.UNDERLINE = ""
    bcolors.RESET = ""


def show_results_used(results: list[artifact.ScanResult]):
    print(f" Results used:")
    for idx, result in enumerate(results):
        branch = "├──"
        if idx == len(results) - 1:
            branch = "└──"
        print(f" {branch} {result.ID} : {result.config.tool} against {result.config.image}")
    print()


def validate(cfg: config.Application, result_set: str, images: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None):
    print(f"{bcolors.HEADER}{bcolors.BOLD}Validating with {result_set!r}", bcolors.RESET)
    result_set_obj = store.result_set.load(name=result_set)

    ret = []
    for image, result_states in result_set_obj.result_state_by_image.items():
        if images and image not in images:
            print("Skipping image:", image)
            continue
        print()
        print("Testing image:", image)
        for state in result_states:
            print(" ", f"with {state.request.tool}")
        print()

        gate = validate_image(cfg, [s.config.path for s in result_states], always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries)
        ret.append(gate)

        failure = not gate.passed()
        if failure:
            print(f"{bcolors.FAIL}{bcolors.BOLD}Failed quality gate{bcolors.RESET}")
            for reason in gate.reasons:
                print(f" - {reason}")

        print()
        size = 120
        print("▁"*size)
        print("░"*size)
        print("▔"*size)

    return ret


def validate_image(cfg: config.Application, descriptions: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None):
    # do a relative comparison
    # - show comparison summary (no gating action)
    # - list out all individual match differences

    print(f"{bcolors.HEADER}Running relative comparison...", bcolors.RESET)
    relative_comparison = yardstick.compare_results(descriptions=descriptions, year_max_limit=cfg.default_max_year)
    show_results_used(relative_comparison.results)

    # show the relative comparison results
    if verbosity > 0:
        details = verbosity > 1
        display.preserved_matches(relative_comparison, details=details, summary=True, common=False)
        print()

    # bail if there are no differences found
    if not always_run_label_comparison and not sum([len(relative_comparison.unique[result.ID]) for result in relative_comparison.results]):
        print("no differences found between tool results")
        return Gate(None, None)

    # do a label comparison
    print(f"{bcolors.HEADER}Running comparison against labels...", bcolors.RESET)
    results, label_entries, comparisons_by_result_id, stats_by_image_tool_pair = yardstick.compare_results_against_labels(descriptions=descriptions, year_max_limit=cfg.default_max_year, label_entries=label_entries)
    show_results_used(results)

    if verbosity > 0:
        show_fns = verbosity > 1
        display.label_comparison(
            results,
            comparisons_by_result_id,
            stats_by_image_tool_pair,
            show_fns=show_fns,
            show_summaries=True,
        )

    latest_release_tool, current_tool = guess_tool_orientation([r.config.tool for r in results])

    # show the relative comparison unique differences paired up with label conclusions (TP/FP/FN/TN/Unknown)
    all_rows: list[list[Any]] = []
    for result in relative_comparison.results:
        label_comparison = comparisons_by_result_id[result.ID]
        for unique_match in relative_comparison.unique[result.ID]:
            labels = label_comparison.labels_by_match[unique_match.ID]
            if not labels:
                label = "(unknown)"
            elif len(set(labels)) > 1:
                label = ", ".join([l.name for l in labels])
            else:
                label = labels[0].name

            color = ""
            commentary = ""
            if result.config.tool == latest_release_tool:
                # the tool which found the unique result is the latest release tool...
                if label == artifact.Label.TruePositive.name:
                    # drats! we missed a case (this is a new FN)
                    color = bcolors.FAIL
                    commentary = "(this is a new FN 😱)"
                elif artifact.Label.FalsePositive.name in label:
                    # we got rid of a FP! ["hip!", "hip!"]
                    color = bcolors.OKBLUE
                    commentary = "(got rid of a former FP 🙌)"
            else:
                # the tool which found the unique result is the current tool...
                if label == artifact.Label.TruePositive.name:
                    # highest of fives! we found a new TP that the previous tool release missed!
                    color = bcolors.OKBLUE
                    commentary = "(this is a new TP 🙌)"
                elif artifact.Label.FalsePositive.name in label:
                    # welp, our changes resulted in a new FP... not great, maybe not terrible?
                    color = bcolors.FAIL
                    commentary = "(this is a new FP 😱)"

            all_rows.append(
                [
                    f"{color}{result.config.tool} ONLY{bcolors.RESET}",
                    f"{color}{unique_match.package.name}@{unique_match.package.version}{bcolors.RESET}",
                    f"{color}{unique_match.vulnerability.id}{bcolors.RESET}",
                    f"{color}{label}{bcolors.RESET}",
                    f"{commentary}",
                ]
            )

    def escape_ansi(line):
        ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]')
        return ansi_escape.sub('', line)

    # sort but don't consider ansi escape codes
    all_rows = sorted(all_rows, key=lambda x: escape_ansi(str(x[0]+x[1]+x[2]+x[3])))
    if len(all_rows) == 0:
        print("No differences found between tooling (with labels)")
    else:
        print("Match differences between tooling (with labels):")
        indent = " "
        print(indent + tabulate([["TOOL PARTITION", "PACKAGE", "VULNERABILITY", "LABEL", "COMMENTARY"]]+all_rows, tablefmt="plain").replace("\n", "\n" + indent) + "\n")

    # populate the quality gate with data that can evaluate pass/fail conditions
    return Gate(label_comparisons=comparisons_by_result_id.values(), label_comparison_stats=stats_by_image_tool_pair)


@click.command()
@click.option("--image", "-i", "images", multiple=True, help="filter down to one or more images to validate with (don't use the full result set)")
@click.option("--label-comparison", "-l", "always_run_label_comparison", is_flag=True, help="run label comparison irregardless of relative comparison results")
@click.option("--breakdown-by-ecosystem", "-e", is_flag=True, help="show label comparison results broken down by ecosystem")
@click.option("--verbose", "-v", "verbosity", count=True, help="show details of all comparisons")
@click.option("--result-set", "-r", default=default_result_set, help="the result set to use for the quality gate")
def main(images: list[str], always_run_label_comparison: bool, breakdown_by_ecosystem: bool, verbosity: int, result_set: str):
    cfg = config.load()
    setup_logging(verbosity)

    # let's not load any more labels than we need to, base this off of the images we're validating
    if not images:
        images = set()
        result_set_obj = store.result_set.load(name=result_set)
        for state in result_set_obj.state:
            images.add(state.config.image)
        images = sorted(list(images))

    print("Loading label entries...", end=" ")
    label_entries = store.labels.load_for_image(images, year_max_limit=cfg.default_max_year)
    print(f"done! {len(label_entries)} entries loaded")

    result_sets = [result_set]  # today only one result set is supported, but more can be added
    gates = []
    for result_set in result_sets:
        gates.extend(validate(cfg, result_set, images=images, always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries))
        print()

        if breakdown_by_ecosystem:
            print(f"{bcolors.HEADER}Breaking down label comparison by ecosystem performance...", bcolors.RESET)
            results_by_image, label_entries, stats = yardstick.compare_results_against_labels_by_ecosystem(result_set=result_set, year_max_limit=cfg.default_max_year, label_entries=label_entries)
            display.labels_by_ecosystem_comparison(
                results_by_image,
                stats,
                show_images_used=False,
            )
            print()

    failure = not all([gate.passed() for gate in gates])
    if failure:
        print("Reasons for quality gate failure:")
        for gate in gates:
            for reason in gate.reasons:
                print(f" - {reason}")

    if failure:
        print()
        print(f"{bcolors.FAIL}{bcolors.BOLD}Quality gate FAILED{bcolors.RESET}")
        sys.exit(1)
    else:
        print(f"{bcolors.OKGREEN}{bcolors.BOLD}Quality gate passed!{bcolors.RESET}")


def setup_logging(verbosity: int):
    # pylint: disable=redefined-outer-name, import-outside-toplevel
    import logging.config

    if verbosity in [0, 1, 2]:
        log_level = "WARN"
    elif verbosity == 3:
        log_level = "INFO"
    else:
        log_level = "DEBUG"

    logging.config.dictConfig(
        {
            "version": 1,
            "formatters": {
                "standard": {
                    # [%(module)s.%(funcName)s]
                    "format": "%(asctime)s [%(levelname)s] %(message)s",
                    "datefmt": "",
                },
            },
            "handlers": {
                "default": {
                    "level": log_level,
                    "formatter": "standard",
                    "class": "logging.StreamHandler",
                    "stream": "ext://sys.stderr",
                },
            },
            "loggers": {
                "": {  # root logger
                    "handlers": ["default"],
                    "level": log_level,
                },
            },
        }
    )


if __name__ == '__main__':
    main()
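One detail worth noting from the deleted script: guess_tool_orientation decides which of the two configured tools is treated as the latest release and which as the current build, first by an endswith("latest") check and then by looking for "@path:" (a local build). A standalone illustration with hypothetical tool identifiers (the real identifiers come from the yardstick result set):

# Hypothetical identifiers, shaped like "<name>@<version>" entries from a result set.
tool_a, tool_b = sorted(["grype@latest+import-db=db.tar.gz", "grype@path:../../+import-db=db.tar.gz"])
# Neither string ends with "latest" here, so the "@path:" check decides the orientation:
# tool_b is the local build, so it is the current tool and tool_a the latest release.
assert "@path:" in tool_b and "@path:" not in tool_a
latest_release_tool, current_tool = tool_a, tool_b
print(latest_release_tool)  # grype@latest+import-db=db.tar.gz
print(current_tool)         # grype@path:../../+import-db=db.tar.gz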
@@ -1,3 +1,3 @@
git+https://github.com/anchore/yardstick@v0.9.1
git+https://github.com/anchore/yardstick@feat-validate-subcommand
# ../../../yardstick
tabulate==0.9.0