Skip to content

Commit 2754704

Browse files
fmoessbauer authored and Urist-McGit committed
feat(download): persistently cache PURL to upstream data
Resolving the upstream URL and fileinfo data requires calls to the snapshot mirror, which are costly. To speed up the repeated evaluation of SBOMs (e.g. with minor differences), we persistently cache this data. Signed-off-by: Felix Moessbauer <[email protected]>
1 parent 4a32c6b commit 2754704

File tree

3 files changed

+81
-9
lines changed

3 files changed

+81
-9
lines changed

src/debsbom/cli.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from pathlib import Path
1515

1616
from .generate import Debsbom, SBOMType
17-
from .download import PackageDownloader, PackageResolver
17+
from .download import PackageDownloader, PackageResolver, PersistentResolverCache
1818
from .snapshot import client as sdlclient
1919

2020

@@ -141,6 +141,9 @@ def human_readable_bytes(size):
141141

142142
@staticmethod
143143
def run(args):
144+
outdir = Path(args.outdir)
145+
outdir.mkdir(exist_ok=True)
146+
cache = PersistentResolverCache(outdir / ".cache")
144147
resolver = PackageResolver.create(Path(args.bomfile))
145148
sdl = sdlclient.SnapshotDataLake()
146149
downloader = PackageDownloader(args.outdir)
@@ -158,7 +161,7 @@ def run(args):
158161
if args.progress:
159162
progress_cb(idx, len(pkgs), pkg.name)
160163
try:
161-
files = list(resolver.resolve(sdl, pkg))
164+
files = list(resolver.resolve(sdl, pkg, cache))
162165
except sdlclient.NotFoundOnSnapshotError:
163166
local_pkgs.append(pkg)
164167
downloader.register(files)

src/debsbom/download/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22
#
33
# SPDX-License-Identifier: MIT
44

5-
from .download import PackageResolver
5+
from .download import PackageResolver, PackageResolverCache, PersistentResolverCache
66
from .download import PackageDownloader

src/debsbom/download/download.py

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,76 @@
33
# SPDX-License-Identifier: MIT
44

55
from abc import abstractmethod
6+
import dataclasses
67
from functools import reduce
78
import hashlib
8-
import os
9+
import json
910
import re
11+
import sys
1012
from typing import Generator, Tuple, Type
1113
from pathlib import Path
1214
from urllib.request import urlretrieve
1315
from ..dpkg import package
1416
from ..snapshot import client as sdlclient
1517

1618

19+
class PackageResolverCache:
    """
    Cache interface that maps packages to RemoteFile instances, so that
    expensive calls to the upstream mirror can be skipped.

    This base class is a no-op: nothing is ever stored and every lookup is a
    miss. Pass it wherever a cache is expected to effectively disable caching.
    """

    def insert(
        self,
        p: "package.SourcePackage | package.BinaryPackage",
        files: "list[sdlclient.RemoteFile]",
    ) -> None:
        """Accept ``files`` for package ``p`` and discard them."""
        return None

    def lookup(
        self, p: "package.SourcePackage | package.BinaryPackage"
    ) -> "list[sdlclient.RemoteFile] | None":
        """Report a cache miss (``None``) for every package ``p``."""
        return None
34+
35+
36+
class PersistentResolverCache(PackageResolverCache):
    """
    Trivial implementation of a file-backed cache. Each cache entry is stored
    as an individual JSON file in the cachedir, named after a SHA-256 digest
    of the package identity.
    """

    def __init__(self, cachedir: Path):
        """Remember ``cachedir`` and create it (including parents) if missing."""
        self.cachedir = cachedir
        # parents=True so the cache does not depend on the caller having
        # created the enclosing directory first.
        cachedir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _package_hash(p: "package.SourcePackage | package.BinaryPackage") -> str:
        """
        Return a stable hex digest identifying ``p``.

        The key covers everything the resolver uses to pick upstream files:
        the package kind (source vs. binary), name, version and, for binary
        packages, the architecture. Hashing only name and version would let a
        source package and a same-named binary package (or binaries of
        different architectures) overwrite each other's entries.
        Entries written under the former name/version-only key are simply
        never hit again.
        """
        is_source = isinstance(p, package.SourcePackage)
        key = {
            "type": "source" if is_source else "binary",
            "name": p.name,
            "version": p.version,
            "arch": None if is_source else getattr(p, "architecture", None),
        }
        return hashlib.sha256(json.dumps(key, sort_keys=True).encode("utf-8")).hexdigest()

    def _entry_path(self, hash: str) -> Path:
        """Return the path of the cache file belonging to digest ``hash``."""
        return self.cachedir / f"{hash}.json"

    def lookup(
        self, p: "package.SourcePackage | package.BinaryPackage"
    ) -> "list[sdlclient.RemoteFile] | None":
        """
        Return the cached files for ``p``, or ``None`` on a cache miss.

        A missing, unparsable or structurally incompatible entry is reported
        as a miss so that the caller falls back to the upstream mirror instead
        of failing on a damaged cache.
        """
        entry = self._entry_path(self._package_hash(p))
        if not entry.is_file():
            return None
        try:
            with open(entry, "r", encoding="utf-8") as f:
                data = json.load(f)
            return [sdlclient.RemoteFile(**d) for d in data]
        except FileNotFoundError:
            # Entry vanished between the is_file() check and open().
            return None
        except (ValueError, TypeError):
            # ValueError: invalid JSON / encoding. TypeError: the stored
            # fields no longer match RemoteFile's constructor.
            return None

    def insert(
        self,
        p: "package.SourcePackage | package.BinaryPackage",
        files: "list[sdlclient.RemoteFile]",
    ) -> None:
        """
        Store ``files`` as the cache entry for ``p``.

        The data is written to a temporary sibling file first and then moved
        over the final name, so a reader never observes a half-written entry.
        """
        entry = self._entry_path(self._package_hash(p))
        tmp = entry.with_suffix(".tmp")
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump([dataclasses.asdict(rf) for rf in files], f)
        # replace() overwrites an existing entry on every platform, whereas
        # rename() raises on Windows if the target already exists.
        tmp.replace(entry)
74+
75+
1776
class PackageResolver:
1877
def __init__(self):
1978
self.purl_regex = re.compile(r"pkg:deb\/debian\/(.*)@(.*)[?]arch=(.*)$")
@@ -44,17 +103,27 @@ def package_from_purl(self, purl: str) -> Tuple[str, str, str]:
44103

45104
@staticmethod
46105
def resolve(
47-
sdl: sdlclient.SnapshotDataLake, p: package.SourcePackage | package.BinaryPackage
48-
) -> Generator["sdlclient.RemoteFile", None, None]:
106+
sdl: sdlclient.SnapshotDataLake,
107+
p: package.SourcePackage | package.BinaryPackage,
108+
cache: PackageResolverCache = PackageResolverCache(),
109+
) -> list["sdlclient.RemoteFile"]:
49110
"""
50111
Resolve a local package to references on the upstream snapshot mirror
51112
"""
113+
cached_files = cache.lookup(p)
114+
if cached_files:
115+
return cached_files
116+
117+
# Determine which type of package and fetch files
52118
if isinstance(p, package.SourcePackage):
53-
return sdlclient.SourcePackage(sdl, p.name, p.version).srcfiles()
54-
elif isinstance(p, package.BinaryPackage):
55-
return sdlclient.BinaryPackage(sdl, p.name, p.version, None, None).files(
119+
files = sdlclient.SourcePackage(sdl, p.name, p.version).srcfiles()
120+
else:
121+
files = sdlclient.BinaryPackage(sdl, p.name, p.version, None, None).files(
56122
arch=p.architecture
57123
)
124+
files_list = list(files)
125+
cache.insert(p, files_list)
126+
return files_list
58127

59128
@staticmethod
60129
def create(filename: Path) -> Type["PackageResolver"]:

0 commit comments

Comments
 (0)