Skip to content

Commit 88886bf

Browse files
committed
perf(download): symlink files we already have
Because the snapshot client now returns every instance of a file, identical files end up being downloaded multiple times. To avoid this, we check whether we already have a file with the same hash and, if so, just symlink to it. Signed-off-by: Felix Moessbauer <[email protected]>
1 parent c3470d6 commit 88886bf

File tree

1 file changed

+17
-5
lines changed

1 file changed

+17
-5
lines changed

src/debsbom/download/download.py

Lines changed: 17 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -161,6 +161,7 @@ def __init__(
161161
self.dldir.mkdir(exist_ok=True)
162162
self.to_download: list["sdlclient.RemoteFile"] = []
163163
self.rs = session
164+
self.known_hashes = {}
164165

165166
def register(self, files: list["sdlclient.RemoteFile"]):
166167
self.to_download.extend(list(files))
@@ -169,26 +170,37 @@ def stat(self):
169170
"""
170171
Returns a tuple (files to download, total size, cached files, cached bytes)
171172
"""
172-
nbytes = reduce(lambda acc, x: acc + x.size, self.to_download, 0)
173-
cfiles = list(filter(lambda f: Path(self.dldir / f.filename).is_file(), self.to_download))
173+
unique_dl = list({v.hash: v for v in self.to_download}.values())
174+
nbytes = reduce(lambda acc, x: acc + x.size, unique_dl, 0)
175+
cfiles = list(filter(lambda f: Path(self.dldir / f.filename).is_file(), unique_dl))
174176
cbytes = reduce(lambda acc, x: acc + x.size, cfiles, 0)
175-
return (len(self.to_download), nbytes, len(cfiles), cbytes)
177+
return (len(unique_dl), nbytes, len(cfiles), cbytes)
176178

177179
def download(self, progress_cb=None):
178180
for idx, f in enumerate(self.to_download):
179181
if progress_cb:
180182
progress_cb(idx, len(self.to_download), f.filename)
181183
target = Path(self.dldir / f.filename)
184+
# check if we have the file under the exact filename
182185
if target.is_file():
183186
with open(target, "rb") as fd:
184187
digest = hashlib.file_digest(fd, "sha1")
185188
if digest.hexdigest() == f.hash:
189+
self.known_hashes[f.hash] = f.filename
186190
continue
187191
else:
188192
print(f"Checksum mismatch on {f.filename}. Download again.", file=sys.stderr)
193+
# check if we have a file with the same hash and link to it
194+
o_filename = self.known_hashes.get(f.hash)
195+
if o_filename:
196+
o_path = Path(self.dldir / o_filename).resolve()
197+
target.symlink_to(o_path.relative_to(self.dldir.resolve()))
198+
continue
199+
189200
fdst = target.with_suffix(target.suffix + ".tmp")
190201
with self.rs.get(f.downloadurl, stream=True) as r:
191202
r.raise_for_status()
192-
with open(fdst, "wb") as f:
193-
shutil.copyfileobj(r.raw, f)
203+
with open(fdst, "wb") as fp:
204+
shutil.copyfileobj(r.raw, fp)
194205
fdst.rename(target)
206+
self.known_hashes[f.hash] = f.filename

0 commit comments

Comments
 (0)