From 5e1a5cda10a1cc1d3a648fbac164be21c65820d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Thu, 13 Aug 2020 21:13:51 +0200 Subject: [PATCH] Pipe out the archives directly from git-archive(1) Using a crude for i in $(seq 1 10); do curl -w '%{time_connect}:%{time_starttransfer}:%{time_total}\n' http://127.0.0.1:5001/~nabijaczleweli/linux/archive/HEAD.tar.gz --output /dev/null >> timing; done where the HEAD was at the v3.0 tag of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git yielded 0.001489:45.133830:46.793669 0.001673:27.780585:29.399757 0.001416:27.351536:29.024689 0.001388:27.486558:29.180511 0.002239:27.299490:28.925065 0.001342:27.041805:28.740544 0.001558:27.465697:29.030950 0.001546:28.010680:29.604439 0.001819:27.551466:29.148465 0.001145:27.407098:29.040098 0.001493:27.597439:29.094110 0.001772:27.429221:29.095903 before this and 0.001991:0.285543:28.874766 0.001626:0.186180:28.290034 0.001195:0.196463:28.570427 0.001632:0.182806:29.050415 0.001598:0.184604:29.399892 0.001398:0.192858:29.184659 0.001458:0.186850:29.141446 0.002366:0.194297:28.997083 0.001390:0.184061:29.152253 0.001932:0.219032:29.557687 0.001435:0.182397:28.982165 after; stripping the obvious outlier at the top, this averages out to 0.001581 :27.4928704545455 :29.1167755454545 0.00163827272727273:0.199553727272727:29.018257 Note the *27.5 seconds* to first byte, as gzip was writing to tmpfs first (qemu-system-x86_64 -enable-kvm -smp 6 -m 4g -drive format=raw -device virtio-blk-pci,drive=rootfs -net nic,model=virtio-net-pci on a two-Xeon E5645 @ 2.4GHz + six-16GB HMT42GR7AFR4A-PB @ 1600MT/s host, gzip was at 100% CPU, git hovered around 22%), but low overall impact on total transfer time. 
However, the cURL output also reveals another set of data: % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 92.5M 0 92.5M 0 0 3223k 0 --:--:-- 0:00:29 --:--:-- 22.2M 100 92.5M 0 92.5M 0 0 3265k 0 --:--:-- 0:00:29 --:--:-- 24.4M 100 92.5M 0 92.5M 0 0 3247k 0 --:--:-- 0:00:29 --:--:-- 23.4M 100 92.5M 0 92.5M 0 0 3276k 0 --:--:-- 0:00:28 --:--:-- 25.1M 100 92.5M 0 92.5M 0 0 3297k 0 --:--:-- 0:00:28 --:--:-- 20.5M 100 92.5M 0 92.5M 0 0 3264k 0 --:--:-- 0:00:29 --:--:-- 19.3M 100 92.5M 0 92.5M 0 0 3201k 0 --:--:-- 0:00:29 --:--:-- 21.2M 100 92.5M 0 92.5M 0 0 3251k 0 --:--:-- 0:00:29 --:--:-- 23.6M 100 92.5M 0 92.5M 0 0 3263k 0 --:--:-- 0:00:29 --:--:-- 24.3M 100 92.5M 0 92.5M 0 0 3257k 0 --:--:-- 0:00:29 --:--:-- 19.0M before and % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 92.5M 0 92.5M 0 0 3349k 0 --:--:-- 0:00:28 --:--:-- 3134k 100 92.5M 0 92.5M 0 0 3317k 0 --:--:-- 0:00:28 --:--:-- 3478k 100 92.5M 0 92.5M 0 0 3262k 0 --:--:-- 0:00:29 --:--:-- 3322k 100 92.5M 0 92.5M 0 0 3223k 0 --:--:-- 0:00:29 --:--:-- 3174k 100 92.5M 0 92.5M 0 0 3247k 0 --:--:-- 0:00:29 --:--:-- 3369k 100 92.5M 0 92.5M 0 0 3252k 0 --:--:-- 0:00:29 --:--:-- 3132k 100 92.5M 0 92.5M 0 0 3268k 0 --:--:-- 0:00:28 --:--:-- 3305k 100 92.5M 0 92.5M 0 0 3250k 0 --:--:-- 0:00:29 --:--:-- 3216k 100 92.5M 0 92.5M 0 0 3206k 0 --:--:-- 0:00:29 --:--:-- 3084k 100 92.5M 0 92.5M 0 0 3269k 0 --:--:-- 0:00:28 --:--:-- 3164k after, and the speed in the after run was relatively constant, but, rather predictably, the before speed just filled the SSH tunnel, so I'm expecting huge differences for users across slow links, with changes to the time equation from 92.5MB/3MBs^-1 + 92.5MB/link_speed to 92.5MB/min(3MBs^-1, link_speed). The one potential downside of this approach is that we can no longer return a 500 if git returns non-0, but I doubt that's a common occurrence. --- gitsrht/blueprints/repo.py | 40 
+++++++++++--------------------------- 1 file changed, 11 insertions(+), 29 deletions(-) diff --git a/gitsrht/blueprints/repo.py b/gitsrht/blueprints/repo.py index 248e1a1..0b4b40a 100644 --- a/gitsrht/blueprints/repo.py +++ b/gitsrht/blueprints/repo.py @@ -288,36 +288,18 @@ def archive(owner, repo, ref): if not isinstance(commit, pygit2.Commit): abort(404) - path = f"/tmp/{commit.id.hex}{binascii.hexlify(os.urandom(8))}.tar.gz" - try: - args = [ - "git", - "--git-dir", repo.path, - "archive", - "--format=tar.gz", - "--prefix", f"{repo.name}-{ref}/", - "-o", path, ref - ] - subp = subprocess.run(args, timeout=30, - stdout=sys.stdout, stderr=sys.stderr) - except: - try: - os.unlink(path) - except: - pass - raise - - if subp.returncode != 0: - try: - os.unlink(path) - except: - pass - return "Error preparing archive", 500 + args = [ + "git", + "--git-dir", repo.path, + "archive", + "--format=tar.gz", + "--prefix", f"{repo.name}-{ref}/", + ref + ] + subp = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=sys.stderr) - f = open(path, "rb") - os.unlink(path) - return send_file(f, mimetype="application/tar+gzip", as_attachment=True, - attachment_filename=f"{repo.name}-{ref}.tar.gz") + return send_file(subp.stdout, mimetype="application/tar+gzip", + as_attachment=True, attachment_filename=f"{repo.name}-{ref}.tar.gz") class _AnnotatedRef: def __init__(self, repo, ref): -- 2.38.4