chore: switch to yardstick validate from custom gate.py

Signed-off-by: Will Murphy <willmurphyscode@users.noreply.github.com>
Will Murphy 2024-08-29 12:33:29 -04:00
parent 7901a57c1e
commit 832391cbe6
4 changed files with 9 additions and 347 deletions

View file

@@ -91,6 +91,11 @@ default_max_year: 2021
result-sets:
  pr_vs_latest_via_sbom:
    description: "latest released grype vs grype from the current build (via SBOM ingestion)"
    validations:
      - max-f1-regression: 0.0
        max-new-false-negatives: 0
        max-unlabeled-percent: 10
        max_year: 2021

    matrix:
      images: *images
@@ -112,6 +117,7 @@ result-sets:
          # for local build of grype, use for example:
          version: path:../../+import-db=db.tar.gz
          takes: SBOM
          label: candidate

        - name: grype
          # note: we import a static (pinned) DB as to prevent changes in the DB from affecting the results. The
@@ -121,3 +127,4 @@ result-sets:
          # are testing with is not too stale.
          version: latest+import-db=db.tar.gz
          takes: SBOM
          label: reference
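
For context, the validations block above declares the same pass/fail rules that the hand-written gate.py (deleted below) used to enforce: no drop in F1 score versus the latest release, no new false negatives, and no more than 10% unlabeled (indeterminate) matches. A minimal sketch of how those thresholds constrain a result, with hypothetical names (this is not yardstick's internal API):

# Illustrative sketch only; Summary and passes_gate are made-up names.
from dataclasses import dataclass


@dataclass
class Summary:
    f1_score: float
    false_negatives: int
    indeterminate_percent: float


def passes_gate(reference: Summary, candidate: Summary,
                max_f1_regression: float = 0.0,
                max_new_false_negatives: int = 0,
                max_unlabeled_percent: float = 10.0) -> bool:
    # fail when the candidate F1 score regresses more than allowed vs. the reference
    if reference.f1_score - candidate.f1_score > max_f1_regression:
        return False
    # fail when the candidate introduces more new false negatives than the reference had
    if candidate.false_negatives - reference.false_negatives > max_new_false_negatives:
        return False
    # fail when too large a share of matches is unlabeled (indeterminate)
    if candidate.indeterminate_percent > max_unlabeled_percent:
        return False
    return True

The actual enforcement now lives in yardstick's validate subcommand; the sketch only shows what each threshold constrains.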

View file

@@ -27,7 +27,7 @@ all: capture validate ## Fetch or capture all data and run all quality checks

.PHONY: validate
validate: venv $(VULNERABILITY_LABELS)/Makefile ## Run all quality checks against already collected data
	$(ACTIVATE_VENV) ./gate.py
	$(ACTIVATE_VENV) yardstick validate -r $(RESULT_SET)

.PHONY: capture
capture: sboms vulns ## Collect and store all syft and grype results
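
The validate target now defers to yardstick's validate subcommand instead of the local gate.py. A rough sketch of the equivalent invocation, assuming $(RESULT_SET) expands to the pr_vs_latest_via_sbom result set defined in .yardstick.yaml (venv activation omitted):

# Sketch of what `make validate` now runs; the result set name is an assumption.
import subprocess

subprocess.run(["yardstick", "validate", "-r", "pr_vs_latest_via_sbom"], check=True)
# check=True raises CalledProcessError when the command exits non-zero (i.e. the gate fails)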

View file

@@ -1,345 +0,0 @@
#!/usr/bin/env python3

import logging
import os
import re
import subprocess
import sys
from typing import Optional

import click
from tabulate import tabulate
from dataclasses import dataclass, InitVar, field

import yardstick
from yardstick import store, comparison, artifact, arrange
from yardstick.cli import display, config

# see the .yardstick.yaml configuration for details
default_result_set = "pr_vs_latest_via_sbom"

yardstick.utils.grype_db.raise_on_failure(False)


@dataclass
class Gate:
    label_comparisons: InitVar[Optional[list[comparison.AgainstLabels]]]
    label_comparison_stats: InitVar[Optional[comparison.ImageToolLabelStats]]

    reasons: list[str] = field(default_factory=list)

    def __post_init__(self, label_comparisons: Optional[list[comparison.AgainstLabels]], label_comparison_stats: Optional[comparison.ImageToolLabelStats]):
        if not label_comparisons and not label_comparison_stats:
            return

        reasons = []

        # - fail when current F1 score drops below last release F1 score (or F1 score is indeterminate)
        # - fail when indeterminate % > 10%
        # - fail when there is a rise in FNs
        latest_release_tool, current_tool = guess_tool_orientation(label_comparison_stats.tools)

        latest_release_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == latest_release_tool}
        current_comparisons_by_image = {comp.config.image: comp for comp in label_comparisons if comp.config.tool == current_tool}

        for image, comp in current_comparisons_by_image.items():
            latest_f1_score = latest_release_comparisons_by_image[image].summary.f1_score
            current_f1_score = comp.summary.f1_score
            if current_f1_score < latest_f1_score:
                reasons.append(f"current F1 score is lower than the latest release F1 score: {bcolors.BOLD+bcolors.UNDERLINE}current={current_f1_score:0.2f} latest={latest_f1_score:0.2f}{bcolors.RESET} image={image}")

            if comp.summary.indeterminate_percent > 10:
                reasons.append(f"current indeterminate matches % is greater than 10%: {bcolors.BOLD+bcolors.UNDERLINE}current={comp.summary.indeterminate_percent:0.2f}%{bcolors.RESET} image={image}")

            latest_fns = latest_release_comparisons_by_image[image].summary.false_negatives
            current_fns = comp.summary.false_negatives
            if current_fns > latest_fns:
                reasons.append(f"current false negatives is greater than the latest release false negatives: {bcolors.BOLD+bcolors.UNDERLINE}current={current_fns} latest={latest_fns}{bcolors.RESET} image={image}")

        self.reasons = reasons

    def passed(self):
        return len(self.reasons) == 0


def guess_tool_orientation(tools: list[str]):
    """
    Given a pair of tools, guess which is latest version, and which is the one
    being compared to the latest version.

    Returns (latest_tool, current_tool)
    """
    if len(tools) != 2:
        raise RuntimeError("expected 2 tools, got %s" % tools)

    tool_a, tool_b = sorted(tools)
    if tool_a == tool_b:
        raise ValueError("latest release tool and current tool are the same")

    if tool_a.endswith("latest"):
        return tool_a, tool_b
    elif tool_b.endswith("latest"):
        return tool_b, tool_a

    if "@path:" in tool_a and "@path:" not in tool_b:
        # tool_a is a local build, so compare it against tool_b
        return tool_b, tool_a

    if "@path:" in tool_b and "@path:" not in tool_a:
        # tool_b is a local build, so compare it against tool_a
        return tool_a, tool_b

    return tool_a, tool_b


class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    RESET = '\033[0m'


if not sys.stdout.isatty():
    bcolors.HEADER = ""
    bcolors.OKBLUE = ""
    bcolors.OKCYAN = ""
    bcolors.OKGREEN = ""
    bcolors.WARNING = ""
    bcolors.FAIL = ""
    bcolors.BOLD = ""
    bcolors.UNDERLINE = ""
    bcolors.RESET = ""


def show_results_used(results: list[artifact.ScanResult]):
    print(f" Results used:")
    for idx, result in enumerate(results):
        branch = "├──"
        if idx == len(results) - 1:
            branch = "└──"
        print(f" {branch} {result.ID} : {result.config.tool} against {result.config.image}")
    print()


def validate(cfg: config.Application, result_set: str, images: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None):
    print(f"{bcolors.HEADER}{bcolors.BOLD}Validating with {result_set!r}", bcolors.RESET)
    result_set_obj = store.result_set.load(name=result_set)

    ret = []
    for image, result_states in result_set_obj.result_state_by_image.items():
        if images and image not in images:
            print("Skipping image:", image)
            continue
        print()
        print("Testing image:", image)
        for state in result_states:
            print(" ", f"with {state.request.tool}")
        print()

        gate = validate_image(cfg, [s.config.path for s in result_states], always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries)
        ret.append(gate)

        failure = not gate.passed()
        if failure:
            print(f"{bcolors.FAIL}{bcolors.BOLD}Failed quality gate{bcolors.RESET}")
            for reason in gate.reasons:
                print(f" - {reason}")

        print()
        size = 120
        print("▁"*size)
        print("░"*size)
        print("▔"*size)

    return ret


def validate_image(cfg: config.Application, descriptions: list[str], always_run_label_comparison: bool, verbosity: int, label_entries: Optional[list[artifact.LabelEntry]] = None):
    # do a relative comparison
    # - show comparison summary (no gating action)
    # - list out all individual match differences
    print(f"{bcolors.HEADER}Running relative comparison...", bcolors.RESET)
    relative_comparison = yardstick.compare_results(descriptions=descriptions, year_max_limit=cfg.default_max_year)
    show_results_used(relative_comparison.results)

    # show the relative comparison results
    if verbosity > 0:
        details = verbosity > 1
        display.preserved_matches(relative_comparison, details=details, summary=True, common=False)
        print()

    # bail if there are no differences found
    if not always_run_label_comparison and not sum([len(relative_comparison.unique[result.ID]) for result in relative_comparison.results]):
        print("no differences found between tool results")
        return Gate(None, None)

    # do a label comparison
    print(f"{bcolors.HEADER}Running comparison against labels...", bcolors.RESET)
    results, label_entries, comparisons_by_result_id, stats_by_image_tool_pair = yardstick.compare_results_against_labels(descriptions=descriptions, year_max_limit=cfg.default_max_year, label_entries=label_entries)
    show_results_used(results)

    if verbosity > 0:
        show_fns = verbosity > 1
        display.label_comparison(
            results,
            comparisons_by_result_id,
            stats_by_image_tool_pair,
            show_fns=show_fns,
            show_summaries=True,
        )

    latest_release_tool, current_tool = guess_tool_orientation([r.config.tool for r in results])

    # show the relative comparison unique differences paired up with label conclusions (TP/FP/FN/TN/Unknown)
    all_rows: list[list[Any]] = []
    for result in relative_comparison.results:
        label_comparison = comparisons_by_result_id[result.ID]
        for unique_match in relative_comparison.unique[result.ID]:
            labels = label_comparison.labels_by_match[unique_match.ID]
            if not labels:
                label = "(unknown)"
            elif len(set(labels)) > 1:
                label = ", ".join([l.name for l in labels])
            else:
                label = labels[0].name

            color = ""
            commentary = ""
            if result.config.tool == latest_release_tool:
                # the tool which found the unique result is the latest release tool...
                if label == artifact.Label.TruePositive.name:
                    # drats! we missed a case (this is a new FN)
                    color = bcolors.FAIL
                    commentary = "(this is a new FN 😱)"
                elif artifact.Label.FalsePositive.name in label:
                    # we got rid of a FP! ["hip!", "hip!"]
                    color = bcolors.OKBLUE
                    commentary = "(got rid of a former FP 🙌)"
            else:
                # the tool which found the unique result is the current tool...
                if label == artifact.Label.TruePositive.name:
                    # highest of fives! we found a new TP that the previous tool release missed!
                    color = bcolors.OKBLUE
                    commentary = "(this is a new TP 🙌)"
                elif artifact.Label.FalsePositive.name in label:
                    # welp, our changes resulted in a new FP... not great, maybe not terrible?
                    color = bcolors.FAIL
                    commentary = "(this is a new FP 😱)"

            all_rows.append(
                [
                    f"{color}{result.config.tool} ONLY{bcolors.RESET}",
                    f"{color}{unique_match.package.name}@{unique_match.package.version}{bcolors.RESET}",
                    f"{color}{unique_match.vulnerability.id}{bcolors.RESET}",
                    f"{color}{label}{bcolors.RESET}",
                    f"{commentary}",
                ]
            )

    def escape_ansi(line):
        ansi_escape = re.compile(r'(?:\x1B[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]')
        return ansi_escape.sub('', line)

    # sort but don't consider ansi escape codes
    all_rows = sorted(all_rows, key=lambda x: escape_ansi(str(x[0]+x[1]+x[2]+x[3])))

    if len(all_rows) == 0:
        print("No differences found between tooling (with labels)")
    else:
        print("Match differences between tooling (with labels):")
        indent = " "
        print(indent + tabulate([["TOOL PARTITION", "PACKAGE", "VULNERABILITY", "LABEL", "COMMENTARY"]]+all_rows, tablefmt="plain").replace("\n", "\n" + indent) + "\n")

    # populate the quality gate with data that can evaluate pass/fail conditions
    return Gate(label_comparisons=comparisons_by_result_id.values(), label_comparison_stats=stats_by_image_tool_pair)


@click.command()
@click.option("--image", "-i", "images", multiple=True, help="filter down to one or more images to validate with (don't use the full result set)")
@click.option("--label-comparison", "-l", "always_run_label_comparison", is_flag=True, help="run label comparison irregardless of relative comparison results")
@click.option("--breakdown-by-ecosystem", "-e", is_flag=True, help="show label comparison results broken down by ecosystem")
@click.option("--verbose", "-v", "verbosity", count=True, help="show details of all comparisons")
@click.option("--result-set", "-r", default=default_result_set, help="the result set to use for the quality gate")
def main(images: list[str], always_run_label_comparison: bool, breakdown_by_ecosystem: bool, verbosity: int, result_set: str):
    cfg = config.load()
    setup_logging(verbosity)

    # let's not load any more labels than we need to, base this off of the images we're validating
    if not images:
        images = set()
        result_set_obj = store.result_set.load(name=result_set)
        for state in result_set_obj.state:
            images.add(state.config.image)
        images = sorted(list(images))

    print("Loading label entries...", end=" ")
    label_entries = store.labels.load_for_image(images, year_max_limit=cfg.default_max_year)
    print(f"done! {len(label_entries)} entries loaded")

    result_sets = [result_set] # today only one result set is supported, but more can be added
    gates = []
    for result_set in result_sets:
        gates.extend(validate(cfg, result_set, images=images, always_run_label_comparison=always_run_label_comparison, verbosity=verbosity, label_entries=label_entries))
        print()

        if breakdown_by_ecosystem:
            print(f"{bcolors.HEADER}Breaking down label comparison by ecosystem performance...", bcolors.RESET)
            results_by_image, label_entries, stats = yardstick.compare_results_against_labels_by_ecosystem(result_set=result_set, year_max_limit=cfg.default_max_year, label_entries=label_entries)
            display.labels_by_ecosystem_comparison(
                results_by_image,
                stats,
                show_images_used=False,
            )
            print()

    failure = not all([gate.passed() for gate in gates])
    if failure:
        print("Reasons for quality gate failure:")
    for gate in gates:
        for reason in gate.reasons:
            print(f" - {reason}")

    if failure:
        print()
        print(f"{bcolors.FAIL}{bcolors.BOLD}Quality gate FAILED{bcolors.RESET}")
        sys.exit(1)
    else:
        print(f"{bcolors.OKGREEN}{bcolors.BOLD}Quality gate passed!{bcolors.RESET}")


def setup_logging(verbosity: int):
    # pylint: disable=redefined-outer-name, import-outside-toplevel
    import logging.config

    if verbosity in [0, 1, 2]:
        log_level = "WARN"
    elif verbosity == 3:
        log_level = "INFO"
    else:
        log_level = "DEBUG"

    logging.config.dictConfig(
        {
            "version": 1,
            "formatters": {
                "standard": {
                    # [%(module)s.%(funcName)s]
                    "format": "%(asctime)s [%(levelname)s] %(message)s",
                    "datefmt": "",
                },
            },
            "handlers": {
                "default": {
                    "level": log_level,
                    "formatter": "standard",
                    "class": "logging.StreamHandler",
                    "stream": "ext://sys.stderr",
                },
            },
            "loggers": {
                "": { # root logger
                    "handlers": ["default"],
                    "level": log_level,
                },
            },
        }
    )


if __name__ == '__main__':
    main()
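
The whole script above is removed in favor of yardstick validate. For anyone comparing against old gate runs, note that guess_tool_orientation picked the reference and candidate purely from the two tool names in the result set; a quick illustration (the tool names here are made up, the return value follows from the code above):

# a name ending in "latest" is taken as the reference; an "@path:" (local build) name is the candidate
guess_tool_orientation(["grype@latest", "grype@path:../../"])
# -> ("grype@latest", "grype@path:../../")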

View file

@@ -1,3 +1,3 @@
git+https://github.com/anchore/yardstick@v0.9.1
git+https://github.com/anchore/yardstick@feat-validate-subcommand
# ../../../yardstick
tabulate==0.9.0