Skip to content

Commit 270aaed

Browse files
fmoessbauer authored and Urist-McGit committed
perf(download): use single http session
To resolve packages, we issue a lot of tiny requests to the snapshot mirror. To speed this up, we now use a session to pool the HTTP connections and only resolve DNS once. We further re-use the same session when downloading, as this will help us to track rate limits in the future.

Signed-off-by: Felix Moessbauer <[email protected]>
1 parent bc59b0a commit 270aaed

File tree

3 files changed

+25
-11
lines changed

3 files changed

+25
-11
lines changed

src/debsbom/cli.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from uuid import UUID
1313
from urllib.parse import urlparse
1414
from pathlib import Path
15+
import requests
1516

1617
from .generate import Debsbom, SBOMType
1718
from .download import PackageDownloader, PackageResolver, PersistentResolverCache
@@ -145,8 +146,9 @@ def run(args):
145146
outdir.mkdir(exist_ok=True)
146147
cache = PersistentResolverCache(outdir / ".cache")
147148
resolver = PackageResolver.create(Path(args.bomfile))
148-
sdl = sdlclient.SnapshotDataLake()
149-
downloader = PackageDownloader(args.outdir)
149+
rs = requests.Session()
150+
sdl = sdlclient.SnapshotDataLake(session=rs)
151+
downloader = PackageDownloader(args.outdir, session=rs)
150152

151153
pkgs = []
152154
local_pkgs = []

src/debsbom/download/download.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,13 @@
77
from functools import reduce
88
import hashlib
99
import json
10+
import shutil
1011
import sys
1112
from typing import Generator, Tuple, Type
1213
from pathlib import Path
1314
from urllib.request import urlretrieve
1415
from packageurl import PackageURL
16+
import requests
1517

1618
from ..dpkg import package
1719
from ..snapshot import client as sdlclient
@@ -152,10 +154,13 @@ def create(filename: Path) -> Type["PackageResolver"]:
152154

153155

154156
class PackageDownloader:
155-
def __init__(self, outdir: Path | str = "downloads"):
157+
def __init__(
158+
self, outdir: Path | str = "downloads", session: requests.Session = requests.Session()
159+
):
156160
self.dldir = Path(outdir)
157161
self.dldir.mkdir(exist_ok=True)
158162
self.to_download: list["sdlclient.RemoteFile"] = []
163+
self.rs = session
159164

160165
def register(self, files: list["sdlclient.RemoteFile"]):
161166
self.to_download.extend(list(files))
@@ -180,5 +185,8 @@ def download(self, progress_cb):
180185
else:
181186
print(f"Checksum mismatch on {f.filename}. Download again.", file=sys.stderr)
182187
fdst = target.with_suffix(target.suffix + ".tmp")
183-
urlretrieve(f.downloadurl, fdst)
188+
with self.rs.get(f.downloadurl, stream=True) as r:
189+
r.raise_for_status()
190+
with open(fdst, "wb") as f:
191+
shutil.copyfileobj(r.raw, f)
184192
fdst.rename(target)

src/debsbom/snapshot/client.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def __init__(self, sdl, name: str):
3333

3434
def versions(self):
3535
try:
36-
r = requests.get(self.sdl.url + f"/mr/package/{self.name}/")
36+
r = self.sdl.rs.get(self.sdl.url + f"/mr/package/{self.name}/")
3737
except RequestException as e:
3838
raise SnapshotDataLakeError(e)
3939
for v in r.json().get("result", []):
@@ -55,7 +55,7 @@ def srcfiles(self) -> Generator["RemoteFile", None, None]:
5555
All files associated with the source package
5656
"""
5757
try:
58-
r = requests.get(
58+
r = self.sdl.rs.get(
5959
self.sdl.url + f"/mr/package/{self.name}/{self.version}" "/srcfiles?fileinfo=1"
6060
)
6161
if r.status_code == 404:
@@ -78,7 +78,7 @@ def binpackages(self) -> Generator["BinaryPackage", None, None]:
7878
All binary packages created from this source package
7979
"""
8080
try:
81-
r = requests.get(
81+
r = self.sdl.rs.get(
8282
self.sdl.url + f"/mr/package/{self.name}/{self.version}" "/binpackages"
8383
)
8484
data = r.json()
@@ -121,7 +121,7 @@ def files(self, arch: str = None) -> Generator["RemoteFile", None, None]:
121121
# resolve via binary only
122122
api = self.sdl.url + f"/mr/binary/{self.binname}/{self.binversion}/binfiles?fileinfo=1"
123123
try:
124-
r = requests.get(api)
124+
r = self.sdl.rs.get(api)
125125
if r.status_code == 404:
126126
raise NotFoundOnSnapshotError()
127127
data = r.json()
@@ -170,12 +170,16 @@ class SnapshotDataLake:
170170
Snapshot instance to query against
171171
"""
172172

173-
def __init__(self, url="https://snapshot.debian.org"):
173+
def __init__(
174+
self, url="https://snapshot.debian.org", session: requests.Session = requests.Session()
175+
):
174176
self.url = url
177+
# reuse the same connection for all requests
178+
self.rs = session
175179

176180
def packages(self) -> Generator[Package, None, None]:
177181
try:
178-
r = requests.get(self.url + "/mr/package/")
182+
r = self.rs.get(self.url + "/mr/package/")
179183
data = r.json()
180184
except RequestException as e:
181185
raise SnapshotDataLakeError(e)
@@ -184,7 +188,7 @@ def packages(self) -> Generator[Package, None, None]:
184188

185189
def fileinfo(self, hash):
186190
try:
187-
r = requests.get(self.url + f"/mr/file/{hash}/info")
191+
r = self.rs.get(self.url + f"/mr/file/{hash}/info")
188192
data = r.json()
189193
except RequestException as e:
190194
raise SnapshotDataLakeError(e)

0 commit comments

Comments
 (0)