#!/usr/bin/env python3
import math
import os
import random
import subprocess
import gitsrht.repos as gr
import prometheus_client
from prometheus_client import CollectorRegistry, Gauge
from srht.config import cfg
from srht.database import DbSession
from gitsrht.types import Artifact, User, Repository
from minio import Minio

db = DbSession(cfg("git.sr.ht", "connection-string"))
db.init()

registry = CollectorRegistry()
tg = Gauge("gitsrht_periodic_time",
        "Time to run gitsrht-periodic jobs",
        ["section"],
        registry=registry)

gc_git_t = tg.labels("gc_git")
@gc_git_t.time()
def gc_git():
    repo_count = Repository.query.count()
    # *srht-periodic scripts are run every twenty minutes, which gives us
    # 504 runs over the course of a week; hence, if we GC a 504th of the
    # repository count each time, on average we will have GCd every repo
    # around once a week.
    limit = int(math.ceil(repo_count / (7 * 24 * 60 / 20)))
    rc = Gauge("gitsrht_periodic_gc_git_count",
            "Number of repos GCd by the gc_git job",
            registry=registry)
    ps = Gauge("gitsrht_periodic_gc_git_packsize",
            "Packfile size in the gc_git job (B)",
            ["stage"],
            registry=registry)

    def pack_size(repo):
        # Total size of everything in the repository's pack directory
        return sum(entry.stat().st_size for entry in
                os.scandir(os.path.join(repo.path, "objects", "pack")))

    # Pick a random contiguous window of `limit` repositories to GC this run
    repos = (Repository.query
            .offset(random.randrange(0, repo_count + 1 - limit))
            .limit(limit)).all()
    for r in repos:
        ps.labels("pre").inc(pack_size(r))
        subprocess.run(["git", "-C", r.path, "gc", "--quiet"],
                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        ps.labels("post").inc(pack_size(r))
        rc.inc()

gc_s3_t = tg.labels("gc_s3")
@gc_s3_t.time()
def gc_s3():
    if not gr.object_storage_enabled:
        return

    # Once a weekish: a 1-in-504 chance per twenty-minute run
    if random.randrange(0, int(math.ceil(7 * 24 * 60 / 20))) != 0:
        return

    prefix = os.path.join(gr.s3_prefix, "artifacts")
    minio = Minio(gr.s3_upstream,
            access_key=gr.s3_access_key,
            secret_key=gr.s3_secret_key,
            secure=True)

    # Start from every object under the artifacts prefix, then discard the
    # ones still referenced by an Artifact row; whatever remains is dangling
    objs = set(obj.object_name for obj in
            minio.list_objects(gr.s3_bucket, prefix, recursive=True))
    artifacts = Artifact.query.all()
    users = {u.id: u for u in (User.query
            .filter(User.id.in_(
                set(a.user_id for a in artifacts)))).all()}
    repos = {r.id: r for r in (Repository.query
            .filter(Repository.id.in_(
                set(a.repo_id for a in artifacts)))).all()}
    for art in artifacts:
        artifact_path = os.path.join(prefix,
                users[art.user_id].canonical_name,
                repos[art.repo_id].name,
                art.filename)
        objs.discard(artifact_path)

    if not objs:
        return
    errs = list(minio.remove_objects(gr.s3_bucket, objs))
    if errs:
        raise Exception(
                f"While removing dangling artifacts {objs}, got errors: {errs}")

    Gauge("gitsrht_periodic_gc_s3_count",
            "Number of objects pruned by the gc_s3 job",
            registry=registry).set(len(objs))

all_t = tg.labels("total")
@all_t.time()
def run_all():
    gc_git()
    gc_s3()

run_all()

pg_endpoint = cfg("sr.ht", "pushgateway", default=None)
if pg_endpoint:
    prometheus_client.push_to_gateway(
            pg_endpoint, job="git.sr.ht", registry=registry)
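
# A minimal scheduling sketch, assuming this script is run from cron (the
# installed path below is hypothetical). The GC sizing math above depends on
# the twenty-minute cadence noted in gc_git:
#
#   */20 * * * * /usr/local/bin/gitsrht-periodic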