Skip to content

Commit 31b1f44

Browse files
committed
perf(download): use single http session
To resolve packages, we issue a lot of tiny requests to the snapshot mirror. To speed this up, we now use a session to pool the HTTP connections and only resolve DNS once. We further reuse the same session when downloading, as this will help us track rate limits in the future. Signed-off-by: Felix Moessbauer <[email protected]>
1 parent b866621 commit 31b1f44

File tree

3 files changed

+26
-11
lines changed

3 files changed

+26
-11
lines changed

src/debsbom/cli.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
from urllib.parse import urlparse
1414
from pathlib import Path
1515

16+
import requests
17+
1618
from debsbom.download import Compression, SourceArchiveMerger
1719

1820
from .dpkg import package
@@ -158,8 +160,9 @@ def run(args):
158160
outdir.mkdir(exist_ok=True)
159161
cache = PersistentResolverCache(outdir / ".cache")
160162
resolver = PackageResolver.create(Path(args.bomfile))
161-
sdl = sdlclient.SnapshotDataLake()
162-
downloader = PackageDownloader(args.outdir)
163+
rs = requests.Session()
164+
sdl = sdlclient.SnapshotDataLake(session=rs)
165+
downloader = PackageDownloader(args.outdir, session=rs)
163166

164167
pkgs = []
165168
local_pkgs = []

src/debsbom/download/download.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,13 @@
77
from functools import reduce
88
import hashlib
99
import json
10+
import shutil
1011
import sys
1112
from typing import Generator, Tuple, Type
1213
from pathlib import Path
1314
from urllib.request import urlretrieve
1415
from packageurl import PackageURL
16+
import requests
1517

1618
from ..dpkg import package
1719
from ..snapshot import client as sdlclient
@@ -152,10 +154,13 @@ def create(filename: Path) -> Type["PackageResolver"]:
152154

153155

154156
class PackageDownloader:
155-
def __init__(self, outdir: Path | str = "downloads"):
157+
def __init__(
158+
self, outdir: Path | str = "downloads", session: requests.Session = requests.Session()
159+
):
156160
self.dldir = Path(outdir)
157161
self.dldir.mkdir(exist_ok=True)
158162
self.to_download: list["sdlclient.RemoteFile"] = []
163+
self.rs = session
159164

160165
def register(self, files: list["sdlclient.RemoteFile"]):
161166
self.to_download.extend(list(files))
@@ -182,5 +187,8 @@ def download(self, progress_cb=None):
182187
else:
183188
print(f"Checksum mismatch on {f.filename}. Download again.", file=sys.stderr)
184189
fdst = target.with_suffix(target.suffix + ".tmp")
185-
urlretrieve(f.downloadurl, fdst)
190+
with self.rs.get(f.downloadurl, stream=True) as r:
191+
r.raise_for_status()
192+
with open(fdst, "wb") as f:
193+
shutil.copyfileobj(r.raw, f)
186194
fdst.rename(target)

src/debsbom/snapshot/client.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def __init__(self, sdl, name: str):
3333

3434
def versions(self):
3535
try:
36-
r = requests.get(self.sdl.url + f"/mr/package/{self.name}/")
36+
r = self.sdl.rs.get(self.sdl.url + f"/mr/package/{self.name}/")
3737
except RequestException as e:
3838
raise SnapshotDataLakeError(e)
3939
for v in r.json().get("result", []):
@@ -55,7 +55,7 @@ def srcfiles(self) -> Generator["RemoteFile", None, None]:
5555
All files associated with the source package
5656
"""
5757
try:
58-
r = requests.get(
58+
r = self.sdl.rs.get(
5959
self.sdl.url + f"/mr/package/{self.name}/{self.version}" "/srcfiles?fileinfo=1"
6060
)
6161
if r.status_code == 404:
@@ -78,7 +78,7 @@ def binpackages(self) -> Generator["BinaryPackage", None, None]:
7878
All binary packages created from this source package
7979
"""
8080
try:
81-
r = requests.get(
81+
r = self.sdl.rs.get(
8282
self.sdl.url + f"/mr/package/{self.name}/{self.version}" "/binpackages"
8383
)
8484
data = r.json()
@@ -121,7 +121,7 @@ def files(self, arch: str = None) -> Generator["RemoteFile", None, None]:
121121
# resolve via binary only
122122
api = self.sdl.url + f"/mr/binary/{self.binname}/{self.binversion}/binfiles?fileinfo=1"
123123
try:
124-
r = requests.get(api)
124+
r = self.sdl.rs.get(api)
125125
if r.status_code == 404:
126126
raise NotFoundOnSnapshotError()
127127
data = r.json()
@@ -170,12 +170,16 @@ class SnapshotDataLake:
170170
Snapshot instance to query against
171171
"""
172172

173-
def __init__(self, url="https://snapshot.debian.org"):
173+
def __init__(
174+
self, url="https://snapshot.debian.org", session: requests.Session = requests.Session()
175+
):
174176
self.url = url
177+
# reuse the same connection for all requests
178+
self.rs = session
175179

176180
def packages(self) -> Generator[Package, None, None]:
177181
try:
178-
r = requests.get(self.url + "/mr/package/")
182+
r = self.rs.get(self.url + "/mr/package/")
179183
data = r.json()
180184
except RequestException as e:
181185
raise SnapshotDataLakeError(e)
@@ -184,7 +188,7 @@ def packages(self) -> Generator[Package, None, None]:
184188

185189
def fileinfo(self, hash):
186190
try:
187-
r = requests.get(self.url + f"/mr/file/{hash}/info")
191+
r = self.rs.get(self.url + f"/mr/file/{hash}/info")
188192
data = r.json()
189193
except RequestException as e:
190194
raise SnapshotDataLakeError(e)

0 commit comments

Comments
 (0)