#!/usr/bin/env python3
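"""Periodic maintenance jobs for git.sr.ht.

Garbage-collects a rotating slice of git repositories and prunes
dangling artifact objects from S3-compatible storage. Each job's
runtime is recorded in a Prometheus gauge and optionally pushed to a
pushgateway at the end of the run.
"""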
import os
import math
import random
import subprocess
import gitsrht.repos as gr
import prometheus_client
from prometheus_client import CollectorRegistry, Gauge
from srht.config import cfg
from srht.database import DbSession
from gitsrht.types import Artifact, User, Repository
from minio import Minio

db = DbSession(cfg("git.sr.ht", "connection-string"))
db.init()

# A dedicated registry so that only this script's own metrics are
# pushed, not the default process-wide collectors.
registry = CollectorRegistry()
tg = Gauge("gitsrht_periodic_time",
        "Time to run gitsrht-periodic jobs",
        ["section"],
        registry=registry)

# Gauge.time() can be used as a decorator; it sets the gauge to the
# runtime of each call.
gc_git_t = tg.labels("gc_git")
@gc_git_t.time()
def gc_git():
    repo_count = Repository.query.count()

    # *srht-periodic scripts run every twenty minutes, which gives us
    # 504 runs over the course of a week; hence, if we GC 1/504th of
    # the repository count each time, every repo will on average have
    # been GCd about once a week.
    limit = int(math.ceil(repo_count / (7 * 24 * 60 / 20)))
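    # For illustration: with 100,800 repositories, each run collects
    # ceil(100800 / 504) = 200 of them.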

    rc = Gauge("gitsrht_periodic_gc_git_count",
            "Number of repos GCd by the gc_git job",
            registry=registry)
    ps = Gauge("gitsrht_periodic_gc_git_packsize",
            "Packfile size in the gc_git job (B)",
            ["stage"],
            registry=registry)

    # Select a random contiguous window of `limit` repositories; the
    # offset is bounded so the window never runs past the end of the
    # table.
    repos = (Repository.query
            .offset(random.randrange(0, repo_count + 1 - limit))
            .limit(limit)).all()
    for r in repos:
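        # Measure total packfile size before and after the GC so the
        # space reclaimed shows up in the pre/post gauge labels.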
        ps.labels("pre").inc(sum(map(lambda p: p.stat().st_size,
            os.scandir(os.path.join(r.path, "objects", "pack")))))

        subprocess.run(["git", "-C", r.path, "gc", "--quiet"],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        ps.labels("post").inc(sum(map(lambda p: p.stat().st_size,
            os.scandir(os.path.join(r.path, "objects", "pack")))))
        rc.inc()

gc_s3_t = tg.labels("gc_s3")
@gc_s3_t.time()
def gc_s3():
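    # Prune objects in S3-compatible storage that no longer correspond
    # to any Artifact row in the database.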
    if not gr.object_storage_enabled:
        return
    # Run roughly once a week: proceed on only ~1 in 504 invocations,
    # since the script itself runs every twenty minutes.
    if random.randrange(0, int(math.ceil(7 * 24 * 60 / 20))) != 0:
        return
    prefix = os.path.join(gr.s3_prefix, "artifacts")
    minio = Minio(gr.s3_upstream, access_key=gr.s3_access_key,
            secret_key=gr.s3_secret_key, secure=True)

    # List everything currently stored under the artifacts prefix.
    objs = set(obj.object_name for obj
        in minio.list_objects(gr.s3_bucket, prefix, recursive=True))
    artifacts = Artifact.query.all()

    # Batch-load the owners and repositories referenced by the
    # artifacts, rather than issuing one query per artifact below.
    users = {u.id: u for u in (User.query.filter(User.id.in_(
        set(a.user_id for a in artifacts)))).all()}

    repos = {r.id: r for r in (Repository.query.filter(Repository.id.in_(
        set(a.repo_id for a in artifacts)))).all()}

    # Discard every object still referenced by an Artifact row; whatever
    # remains in objs afterwards is dangling.
    for art in artifacts:
        artifact_path = os.path.join(prefix, users[art.user_id].canonical_name,
                repos[art.repo_id].name, art.filename)
        objs.discard(artifact_path)

    if not objs:
        return
    # remove_objects returns a lazy iterator of errors; draining it with
    # list() is what actually performs the deletions.
    errs = list(minio.remove_objects(gr.s3_bucket, objs))
    if errs:
        raise Exception(f"While removing dangling artifacts {objs}, got errors: {errs}")

    Gauge("gitsrht_periodic_gc_s3_count",
            "Amount of objects pruned by the gc_s3 job",
            registry=registry).set(len(objs))

total_t = tg.labels("total")
@total_t.time()
def run_all():
    # Run both jobs under one combined timer; run_all avoids shadowing
    # the builtin all().
    gc_git()
    gc_s3()
run_all()


# Metrics from short-lived processes like this one are pushed to a
# pushgateway, if one is configured, so Prometheus can scrape them.
pg_endpoint = cfg("sr.ht", "pushgateway", default=None)
if pg_endpoint:
    prometheus_client.push_to_gateway(pg_endpoint,
            job="git.sr.ht", registry=registry)