remove inline-compare testing

Signed-off-by: Alex Goodman <alex.goodman@anchore.com>
Alex Goodman 2021-03-18 09:00:05 -04:00
parent efcd8a8b9a
commit 68d698e9f2
10 changed files with 0 additions and 709 deletions

.gitignore

@@ -1,3 +0,0 @@
*.json
*.pyc
inline-reports

Makefile

@@ -1,49 +0,0 @@
ifndef SYFT_CMD
	SYFT_CMD = go run ../../main.go
endif

IMAGE_CLEAN = $(shell basename $(COMPARE_IMAGE) | tr ":" "_" )
SYFT_DIR = syft-reports
SYFT_REPORT = $(SYFT_DIR)/$(IMAGE_CLEAN).json
INLINE_DIR = inline-reports
INLINE_REPORT = $(INLINE_DIR)/$(IMAGE_CLEAN)-content-os.json

ifndef SYFT_DIR
	$(error SYFT_DIR is not set)
endif

ifndef INLINE_DIR
	$(error INLINE_DIR is not set)
endif

.PHONY: all
.DEFAULT_GOAL :=
all: clean-syft
	./compare-all.sh

.PHONY: compare-image
compare-image: $(SYFT_REPORT) $(INLINE_REPORT)
	./compare.py $(COMPARE_IMAGE)

.PHONY: gather-image
gather-image: $(SYFT_REPORT) $(INLINE_REPORT)

$(INLINE_REPORT):
	echo "Creating $(INLINE_REPORT)..."
	mkdir -p $(INLINE_DIR)
	curl -s https://ci-tools.anchore.io/inline_scan-v0.7.0 | bash -s -- -p -r $(COMPARE_IMAGE)
	mv anchore-reports/* $(INLINE_DIR)/
	rmdir anchore-reports

$(SYFT_REPORT):
	echo "Creating $(SYFT_REPORT)..."
	mkdir -p $(SYFT_DIR)
	$(SYFT_CMD) $(COMPARE_IMAGE) -o json > $(SYFT_REPORT)

.PHONY: clean
clean: clean-syft
	rm -f $(INLINE_DIR)/*

.PHONY: clean-syft
clean-syft:
	rm -f $(SYFT_DIR)/*

compare-all.sh

@@ -1,16 +0,0 @@
#!/usr/bin/env bash
set -eu

images=("debian:10.5" "centos:8.2.2004" "rails:5.0.1" "alpine:3.12.0" "anchore/test_images:java" "anchore/test_images:py38" "anchore/anchore-engine:v0.8.2" "jenkins/jenkins:2.249.2-lts-jdk11" )

# gather all image analyses
for img in "${images[@]}"; do
  echo "Gathering facts for $img"
  COMPARE_IMAGE=${img} make gather-image
done

# compare all results
for img in "${images[@]}"; do
  echo "Comparing results for $img"
  COMPARE_IMAGE=${img} make compare-image
done

compare.py

@@ -1,234 +0,0 @@
#!/usr/bin/env python3
import os
import sys
import difflib
import collections

import utils.package
from utils.format import Colors, print_rows
from utils.inline import InlineScan
from utils.syft import Syft

DEFAULT_QUALITY_GATE_THRESHOLD = 0.95
INDENT = "    "

PACKAGE_QUALITY_GATE = collections.defaultdict(lambda: DEFAULT_QUALITY_GATE_THRESHOLD, **{})
METADATA_QUALITY_GATE = collections.defaultdict(lambda: DEFAULT_QUALITY_GATE_THRESHOLD, **{
    # syft is better at detecting package versions in specific cases, leading to a drop in matching metadata
    "anchore/test_images:java": 0.61,
    "jenkins/jenkins:2.249.2-lts-jdk11": 0.85,
})

# We additionally fail if an image is above a particular threshold. Why? We expect the lower threshold to be 90%,
# however additional functionality in grype is still being implemented, so this threshold may not be able to be met.
# In these cases the IMAGE_QUALITY_GATE is set to a lower value to allow the test to pass for known issues. Once these
# issues/enhancements are done we want to ensure that the lower threshold is bumped up to catch regressions. The only
# way to do this is to select an upper threshold for images with known threshold values, so we have a failure that
# loudly indicates the lower threshold should be bumped.
PACKAGE_UPPER_THRESHOLD = collections.defaultdict(lambda: 1, **{})
METADATA_UPPER_THRESHOLD = collections.defaultdict(lambda: 1, **{
    # syft is better at detecting package versions in specific cases, leading to a drop in matching metadata
    "anchore/test_images:java": 0.65,
    "jenkins/jenkins:2.249.2-lts-jdk11": 0.9,
})


def report(image, analysis):
    if analysis.extra_packages:
        rows = []
        print(
            Colors.bold + "Syft found extra packages:",
            Colors.reset,
            "Syft discovered packages that Inline did not",
        )
        for package in sorted(list(analysis.extra_packages)):
            rows.append([INDENT, repr(package)])
        print_rows(rows)
        print()

    if analysis.missing_packages:
        rows = []
        print(
            Colors.bold + "Syft missed packages:",
            Colors.reset,
            "Inline discovered packages that Syft did not",
        )
        for package in sorted(list(analysis.missing_packages)):
            rows.append([INDENT, repr(package)])
        print_rows(rows)
        print()

    if analysis.missing_metadata:
        print(
            Colors.bold + "Syft mismatched metadata:",
            Colors.reset,
            "the packages between Syft and Inline are the same, the metadata is not",
        )
        displayed_mismatch = False
        for inline_metadata_pair in sorted(list(analysis.missing_metadata)):
            pkg, metadata = inline_metadata_pair
            if pkg not in analysis.syft_data.metadata[pkg.type]:
                continue
            syft_metadata_item = analysis.syft_data.metadata[pkg.type][pkg]
            diffs = difflib.ndiff([repr(syft_metadata_item)], [repr(metadata)])
            print(INDENT + "for: " + repr(pkg), "(top is syft, bottom is inline)")
            print(INDENT + INDENT + ("\n" + INDENT + INDENT).join(list(diffs)))
            displayed_mismatch = True
        if not displayed_mismatch:
            # every mismatched pair was skipped above, so the mismatches stem only from missing packages
            print(
                INDENT,
                "There are mismatches, but only due to packages Syft did not find (but inline did).\n",
            )

    if analysis.similar_missing_packages:
        rows = []
        print(
            Colors.bold + "Probable pairings of missing/extra packages:",
            Colors.reset,
            "to aid in troubleshooting missed/extra packages",
        )
        for similar_packages in analysis.similar_missing_packages:
            rows.append(
                [
                    INDENT,
                    repr(similar_packages.pkg),
                    "--->",
                    repr(similar_packages.missed),
                ]
            )
        print_rows(rows)
        print()

    show_probable_mismatches = (
        analysis.unmatched_missing_packages
        and analysis.extra_packages
        and len(analysis.unmatched_missing_packages) != len(analysis.missing_packages)
    )
    if show_probable_mismatches:
        rows = []
        print(
            Colors.bold + "Probably missed packages:",
            Colors.reset,
            "a probable pair was not found",
        )
        for p in analysis.unmatched_missing_packages:
            rows.append([INDENT, repr(p)])
        print_rows(rows)
        print()

    print(Colors.bold + "Summary:", Colors.reset, image)
    print(" Inline Packages : %d" % len(analysis.inline_data.packages))
    print(" Syft Packages : %d" % len(analysis.syft_data.packages))
    print(
        " (extra) : %d (note: this is ignored by the quality gate!)"
        % len(analysis.extra_packages)
    )
    print(" (missing) : %d" % len(analysis.missing_packages))
    print()

    if show_probable_mismatches:
        print(
            " Probable Package Matches : %d (matches not made, but were probably found by both Inline and Syft)"
            % len(analysis.similar_missing_packages)
        )
        print(
            " Probable Packages Matched : %2.3f %% (%d/%d packages)"
            % (
                analysis.percent_probable_overlapping_packages,
                len(analysis.overlapping_packages)
                + len(analysis.similar_missing_packages),
                len(analysis.inline_data.packages),
            )
        )
        print(
            " Probable Packages Missing : %d "
            % len(analysis.unmatched_missing_packages)
        )
        print()

    print(
        " Baseline Packages Matched : %2.3f %% (%d/%d packages)"
        % (
            analysis.percent_overlapping_packages,
            len(analysis.overlapping_packages),
            len(analysis.inline_data.packages),
        )
    )
    print(
        " Baseline Metadata Matched : %2.3f %% (%d/%d metadata)"
        % (
            analysis.percent_overlapping_metadata,
            len(analysis.overlapping_metadata),
            len(analysis.inline_metadata),
        )
    )


def enforce_quality_gate(title, actual_value, lower_gate_value, upper_gate_value):
    if actual_value < lower_gate_value:
        print(
            Colors.bold
            + " %s Quality Gate:\t" % title
            + Colors.FG.red
            + "FAIL (is not >= %d %%)" % lower_gate_value,
            Colors.reset,
        )
        return False
    elif actual_value > upper_gate_value:
        print(
            Colors.bold
            + " %s Quality Gate:\t" % title
            + Colors.FG.orange
            + "FAIL (lower threshold is artificially low and should be updated)",
            Colors.reset,
        )
        return False

    print(
        Colors.bold
        + " %s Quality Gate:\t" % title
        + Colors.FG.green
        + "Pass (>= %d %%)" % lower_gate_value,
        Colors.reset,
    )
    return True


def main(image):
    cwd = os.path.dirname(os.path.abspath(__file__))

    # parse the inline-scan and syft reports on disk
    inline = InlineScan(image=image, report_dir=os.path.join(cwd, "inline-reports"))
    syft = Syft(image=image, report_dir=os.path.join(cwd, "syft-reports"))

    # analyze the raw data to generate all derivative data for the report and quality gate
    analysis = utils.package.Analysis(
        syft_data=syft.packages(), inline_data=inline.packages()
    )

    # show some useful report data for debugging / warm fuzzies
    report(image, analysis)

    # enforce a quality gate based on the comparison of package values and metadata values
    success = True
    success &= enforce_quality_gate(
        title="Package",
        actual_value=analysis.percent_overlapping_packages,
        lower_gate_value=PACKAGE_QUALITY_GATE[image] * 100,
        upper_gate_value=PACKAGE_UPPER_THRESHOLD[image] * 100,
    )
    success &= enforce_quality_gate(
        title="Metadata",
        actual_value=analysis.percent_overlapping_metadata,
        lower_gate_value=METADATA_QUALITY_GATE[image] * 100,
        upper_gate_value=METADATA_UPPER_THRESHOLD[image] * 100,
    )

    if not success:
        return 1
    return 0


if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit("provide an image")

    rc = main(sys.argv[1])
    sys.exit(rc)
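The two thresholds in compare.py work as a pair: the lower gate catches regressions, while the upper gate catches a stale, artificially low floor. A minimal standalone sketch of that logic (the percentages below are made up for illustration):

def gate(actual: float, lower: float, upper: float) -> bool:
    if actual < lower:
        return False  # regression: fell below the agreed floor
    if actual > upper:
        return False  # known-issue floor is now artificially low and should be raised
    return True

assert gate(96.0, 95.0, 100.0)      # typical image passing the default 95% gate
assert not gate(90.0, 95.0, 100.0)  # regression caught by the lower threshold
assert not gate(70.0, 61.0, 65.0)   # a known-issue image improved; time to bump the floor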

utils/format.py

@@ -1,46 +0,0 @@
class Colors:
    reset = "\033[0m"
    bold = "\033[01m"
    disable = "\033[02m"
    underline = "\033[04m"
    reverse = "\033[07m"
    strikethrough = "\033[09m"
    invisible = "\033[08m"

    class FG:
        black = "\033[30m"
        red = "\033[31m"
        green = "\033[32m"
        orange = "\033[33m"
        blue = "\033[34m"
        purple = "\033[35m"
        cyan = "\033[36m"
        lightgrey = "\033[37m"
        darkgrey = "\033[90m"
        lightred = "\033[91m"
        lightgreen = "\033[92m"
        yellow = "\033[93m"
        lightblue = "\033[94m"
        pink = "\033[95m"
        lightcyan = "\033[96m"

    class BG:
        black = "\033[40m"
        red = "\033[41m"
        green = "\033[42m"
        orange = "\033[43m"
        blue = "\033[44m"
        purple = "\033[45m"
        cyan = "\033[46m"
        lightgrey = "\033[47m"


def print_rows(rows):
    if not rows:
        return
    widths = []
    for col, _ in enumerate(rows[0]):
        width = max(len(row[col]) for row in rows) + 2  # padding
        widths.append(width)
    for row in rows:
        print("".join(word.ljust(widths[col_idx]) for col_idx, word in enumerate(row)))

utils/image.py

@@ -1,5 +0,0 @@
import os


def clean(image: str) -> str:
    return os.path.basename(image.replace(":", "_"))
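A couple of quick checks of this normalization (clean is duplicated here so the snippet stands alone); note that it matches the Makefile's basename-plus-tr recipe:

import os

def clean(image: str) -> str:  # same as above, repeated for a standalone check
    return os.path.basename(image.replace(":", "_"))

assert clean("debian:10.5") == "debian_10.5"
assert clean("anchore/test_images:java") == "test_images_java"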

utils/inline.py

@@ -1,142 +0,0 @@
import os
import re
import json
import collections

import utils.package
import utils.image


class InlineScan:
    """
    Class for parsing inline-scan output files into a set of packages and package metadata.
    """

    report_tmpl = "{image}-{report}.json"

    def __init__(self, image, report_dir):
        self.report_dir = report_dir
        self.image = image

    def packages(self):
        python_packages, python_metadata = self._python_packages()
        gem_packages, gem_metadata = self._gem_packages()
        java_packages, java_metadata = self._java_packages()
        npm_packages, npm_metadata = self._npm_packages()
        os_packages, os_metadata = self._os_packages()

        packages = (
            python_packages | os_packages | gem_packages | java_packages | npm_packages
        )
        metadata = {
            **python_metadata,
            **os_metadata,
            **gem_metadata,
            **java_metadata,
            **npm_metadata,
        }
        return utils.package.Info(packages=frozenset(packages), metadata=metadata)

    def _report_path(self, report):
        return os.path.join(
            self.report_dir,
            self.report_tmpl.format(image=utils.image.clean(self.image), report=report),
        )

    def _enumerate_section(self, report, section):
        report_path = self._report_path(report=report)
        os_report_path = self._report_path(report="content-os")
        if os.path.exists(os_report_path) and not os.path.exists(report_path):
            # if the OS report is there but the target report is not, that is engine's way of saying "no findings"
            return
        with open(report_path) as json_file:
            data = json.load(json_file)
            for entry in data[section]:
                yield entry

    def _java_packages(self):
        packages = set()
        metadata = collections.defaultdict(dict)
        for entry in self._enumerate_section(report="content-java", section="content"):
            # normalize to pseudo-inline
            pkg_type = entry["type"].lower()
            if pkg_type in ("java-jar", "java-war", "java-ear"):
                pkg_type = "java-?ar"
            elif pkg_type in ("java-jpi", "java-hpi"):
                pkg_type = "java-?pi"

            # this would usually be "package" but that would not be able to account for duplicate dependencies in
            # nested jars of the same name. Fall back to the package name if there is no given location.
            name = entry["location"]

            # replace fields holding "N/A" with None
            for k, v in dict(entry).items():
                if v in ("", "N/A"):
                    entry[k] = None

            pkg = utils.package.Package(
                name=name,
                type=pkg_type,
            )
            packages.add(pkg)
            metadata[pkg.type][pkg] = utils.package.Metadata(
                version=entry["maven-version"],
            )
        return packages, metadata

    def _npm_packages(self):
        packages = set()
        metadata = collections.defaultdict(dict)
        for entry in self._enumerate_section(report="content-npm", section="content"):
            pkg = utils.package.Package(
                name=entry["package"],
                type=entry["type"].lower(),
            )
            packages.add(pkg)
            metadata[pkg.type][pkg] = utils.package.Metadata(version=entry["version"])
        return packages, metadata

    def _python_packages(self):
        packages = set()
        metadata = collections.defaultdict(dict)
        for entry in self._enumerate_section(
            report="content-python", section="content"
        ):
            pkg = utils.package.Package(
                name=entry["package"],
                type=entry["type"].lower(),
            )
            packages.add(pkg)
            metadata[pkg.type][pkg] = utils.package.Metadata(version=entry["version"])
        return packages, metadata

    def _gem_packages(self):
        packages = set()
        metadata = collections.defaultdict(dict)
        for entry in self._enumerate_section(report="content-gem", section="content"):
            pkg = utils.package.Package(
                name=entry["package"],
                type=entry["type"].lower(),
            )
            packages.add(pkg)
            metadata[pkg.type][pkg] = utils.package.Metadata(version=entry["version"])
        return packages, metadata

    def _os_packages(self):
        packages = set()
        metadata = collections.defaultdict(dict)
        for entry in self._enumerate_section(report="content-os", section="content"):
            pkg = utils.package.Package(
                name=entry["package"], type=entry["type"].lower()
            )
            packages.add(pkg)
            metadata[pkg.type][pkg] = utils.package.Metadata(version=entry["version"])
        return packages, metadata
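Combining report_tmpl with utils.image.clean, a sketch of the filenames InlineScan resolves (the report names are the ones used by the methods above; the image is illustrative):

report_tmpl = "{image}-{report}.json"

for report in ("content-os", "content-python", "content-java"):
    print(report_tmpl.format(image="debian_10.5", report=report))
# inline-reports/ would therefore contain files like debian_10.5-content-os.json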

utils/package.py

@@ -1,146 +0,0 @@
import difflib
import collections
import dataclasses
from typing import Set, FrozenSet, Tuple, Any, List

Metadata = collections.namedtuple("Metadata", "version")
Package = collections.namedtuple("Package", "name type")
Info = collections.namedtuple("Info", "packages metadata")
SimilarPackages = collections.namedtuple("SimilarPackages", "pkg missed")
ProbableMatch = collections.namedtuple("ProbableMatch", "pkg ratio")


@dataclasses.dataclass()
class Analysis:
    """
    A package metadata analysis class. When given the raw syft and inline data, all derivative information needed to
    compare packages and metadata is computed, allowing callers to interpret the results.
    """

    # all raw data from the inline scan and syft reports
    syft_data: Info
    inline_data: Info

    # all derivative information (derived from the raw data above)
    overlapping_packages: FrozenSet[Package] = dataclasses.field(init=False)
    extra_packages: FrozenSet[Package] = dataclasses.field(init=False)
    missing_packages: FrozenSet[Package] = dataclasses.field(init=False)
    inline_metadata: Set[Tuple[Any, Any]] = dataclasses.field(init=False)
    missing_metadata: Set[Tuple[Any, Any]] = dataclasses.field(init=False)
    overlapping_metadata: Set[Tuple[Any, Any]] = dataclasses.field(init=False)
    similar_missing_packages: List[Package] = dataclasses.field(init=False)
    unmatched_missing_packages: List[Package] = dataclasses.field(init=False)

    def __post_init__(self):
        if not self.valid():
            raise RuntimeError("invalid data given")

        # basic sets derived from package information
        self.overlapping_packages = self.syft_data.packages & self.inline_data.packages
        self.extra_packages = self.syft_data.packages - self.inline_data.packages
        self.missing_packages = self.inline_data.packages - self.syft_data.packages

        # basic sets derived from metadata information
        self.inline_metadata = self._inline_metadata()
        self.overlapping_metadata = self._overlapping_metadata()
        self.missing_metadata = self.inline_metadata - self.overlapping_metadata

        # try to account for potential false negatives by pairing extra packages discovered only by syft with missing
        # packages discovered only by inline scan.
        (
            similar_missing_packages,
            unmatched_missing_packages,
        ) = self._pair_similar_packages(self.extra_packages, self.missing_packages)
        self.similar_missing_packages = similar_missing_packages
        self.unmatched_missing_packages = unmatched_missing_packages

    def valid(self) -> bool:
        # we are purposefully selecting test images that are guaranteed to have packages, so this should never
        # happen... if it does, then this analysis is not valid!
        return bool(self.inline_data.packages)

    def _inline_metadata(self):
        """
        Returns the set of inline scan metadata paired with the corresponding package info.
        """
        inline_metadata_set = set()
        for package in self.inline_data.packages:
            metadata = self.inline_data.metadata[package.type][package]
            inline_metadata_set.add((package, metadata))
        return inline_metadata_set

    def _overlapping_metadata(self):
        """
        Returns the metadata found to be the same by both syft and inline scan.
        """
        syft_overlap_metadata_set = set()
        for package in self.syft_data.packages:
            metadata = self.syft_data.metadata[package.type][package]
            # we only want to count mismatched metadata for packages that inline scan also found
            if package in self.inline_data.metadata.get(package.type, []):
                syft_overlap_metadata_set.add((package, metadata))
        return syft_overlap_metadata_set & self.inline_metadata

    @staticmethod
    def _pair_similar_packages(extra_packages, missing_packages, similar_threshold=0.7):
        """
        Try to account for potential false negatives by pairing extra packages discovered only by syft with missing
        packages discovered only by inline scan.
        """
        matches = collections.defaultdict(set)
        found = {}  # inline package -> (best ratio, the extra package it paired with)
        for s in extra_packages:
            for i in missing_packages:
                ratio = difflib.SequenceMatcher(None, s.name, i.name).ratio()
                if ratio >= similar_threshold:
                    if i in found:
                        best_ratio, previous_match = found[i]
                        # only allow an inline package to be paired once (keep the best pairing)
                        if ratio < best_ratio:
                            continue
                        matches[previous_match].discard(i)
                    # persist the result
                    found[i] = (ratio, s)
                    matches[s].add(i)

        results = []
        for s, i_set in matches.items():
            if not i_set:
                continue
            missed = tuple(ProbableMatch(pkg=i, ratio=found[i][0]) for i in i_set)
            results.append(SimilarPackages(pkg=s, missed=missed))

        not_found = [i for i in missing_packages if i not in found]

        return sorted(results, key=lambda x: x.pkg), sorted(
            not_found, key=lambda x: x.name
        )

    @property
    def percent_overlapping_packages(self):
        """Returns the percentage of packages found relative to the number expected."""
        return (
            float(len(self.overlapping_packages))
            / float(len(self.inline_data.packages))
        ) * 100.0

    @property
    def percent_overlapping_metadata(self):
        """Returns the percentage of matching metadata found relative to the amount expected."""
        return (
            float(len(self.overlapping_metadata)) / float(len(self.inline_metadata))
        ) * 100.0

    @property
    def percent_probable_overlapping_packages(self):
        """
        Returns the percentage of packages found relative to the number expected after considering
        pairings of missing packages with extra packages via fuzzy matching.
        """
        return (
            float(len(self.overlapping_packages) + len(self.similar_missing_packages))
            / float(len(self.inline_data.packages))
        ) * 100.0
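To make the 0.7 similarity threshold concrete, here is how difflib scores two hypothetical name pairs (not taken from a real report):

import difflib

# ratio() is 2*M/T: M = matched characters, T = total characters in both strings
print(difflib.SequenceMatcher(None, "openssl-libs", "openssl").ratio())  # ~0.737 -> paired
print(difflib.SequenceMatcher(None, "bash", "zlib").ratio())             # 0.25 -> not paired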

utils/syft.py

@@ -1,68 +0,0 @@
import os
import json
import collections

import utils.package
import utils.image


class Syft:
    """
    Class for parsing syft output into a set of packages and package metadata.
    """

    report_tmpl = "{image}.json"

    def __init__(self, image, report_dir):
        self.report_path = os.path.join(
            report_dir, self.report_tmpl.format(image=utils.image.clean(image))
        )

    def _enumerate_section(self, section):
        with open(self.report_path) as json_file:
            data = json.load(json_file)
            for entry in data[section]:
                yield entry

    def packages(self):
        packages = set()
        metadata = collections.defaultdict(dict)
        for entry in self._enumerate_section(section="artifacts"):

            # normalize to inline
            pkg_type = entry["type"].lower()
            if pkg_type in ("wheel", "egg", "python"):
                pkg_type = "python"
            elif pkg_type in ("deb",):
                pkg_type = "dpkg"
            elif pkg_type in ("java-archive",):
                # normalize to pseudo-inline
                pkg_type = "java-?ar"
            elif pkg_type in ("jenkins-plugin",):
                # normalize to pseudo-inline
                pkg_type = "java-?pi"
            elif pkg_type in ("apk",):
                pkg_type = "apkg"

            name = entry["name"]
            version = entry["version"]

            if "java" in pkg_type:
                # we need to use the virtual path instead of the name to account for nested dependencies with the same
                # package name (but potentially different metadata)
                name = entry.get("metadata", {}).get("virtualPath")
            elif pkg_type == "apkg":
                # inline scan strips off the release from the version, which should be normalized here
                fields = entry["version"].split("-")
                version = "-".join(fields[:-1])

            pkg = utils.package.Package(
                name=name,
                type=pkg_type,
            )
            packages.add(pkg)
            metadata[pkg.type][pkg] = utils.package.Metadata(version=version)

        return utils.package.Info(packages=frozenset(packages), metadata=metadata)
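A quick check of the apk release-stripping above (the version string is illustrative): syft reports apk versions with a release suffix, while inline scan drops it, so the normalization keeps the two comparable.

version = "1.1.24-r9"
print("-".join(version.split("-")[:-1]))  # -> 1.1.24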