#!/usr/bin/env bash
# Idempotent bootstrap for a fresh Sando host (the machine running sandod).
#
# Captures the three PG footguns + system user + systemd unit + scratch DB +
# .ssh setup + known_hosts seeding that fw13 accumulated by hand over the
# 2026-06-02 hardening session. Re-run any time the sando host is rebuilt.
#
# Run as root on the new host. The script is safe to run repeatedly — every
# step checks current state and skips if already satisfied.
#
# What it does:
#   1. base packages + postgresql
#   2. `sando` system user (login shell, /srv/sando home)
#   3. /srv/sando dirs (state/, work/, releases/, logs/, backups/)
#   4. postgres role `sando` with CREATEDB
#   5. `sando_scratch` database owned by `sando`
#   6. ALTER SCHEMA public OWNER TO sando inside sando_scratch
#      (must be set explicitly — PG15+ no longer grants public to db owner)
#   7. sando's ed25519 SSH key (generated if missing)
#   8. /srv/sando/.ssh/config — declares port 2200 for alpha-west-1
#   9. known_hosts seeded for tailnet targets (testnot, alpha-west-1, etc.)
#  10. /etc/sando/{sando-daemon.toml,sando.toml,sando.env}
#  11. /etc/systemd/system/sandod.service + sandod-backup-fetch.{service,timer}
#  12. /usr/local/bin/sandod (rebuilt from the local checkout on every run;
#      set BUILD_SANDOD=0 to skip)
#  13. /srv/sando/mnw.git bare repo (initial; operator pushes the working tree)
#
# What this does NOT do (operator's job):
#   - tailscale up (auth)
#   - Authorize sando's pubkey on each deploy target's `deploy` user
#     (bootstrap-node.sh on the target consumes $SANDO_PUBKEY)
#   - Populate /etc/sando/sando.env with anything beyond SANDO_DAEMON if
#     additional secrets are needed
#   - Push the MNW working tree to /srv/sando/mnw.git
#   - Fix `mnw_test_template` ownership — that template gets re-created by
#     each `cargo test` run; ownership resets when a *different* user runs
#     tests on the host. Out of scope for one-shot bootstrap; manage by
#     keeping the host single-user or by re-`ALTER TABLE ... OWNER TO sando`
#     before sandod's cargo_test gate.

# Strict mode: abort on any unhandled command failure, on use of an unset
# variable, and when any stage of a pipeline fails.
set -euo pipefail

# Every later step modifies system state (packages, users, /etc, systemd),
# so refuse to continue unless the effective UID is root.
(( EUID == 0 )) || {
    echo "must run as root" >&2
    exit 1
}

# Operator-overridable settings. A value already present in the environment
# wins; otherwise the default (matching the live fw13 install) is assigned.
: "${SANDO_USER:=sando}"
: "${SANDO_HOME:=/srv/sando}"
: "${SANDO_DAEMON_URL:=http://127.0.0.1:7766}"
: "${INSTALL_POSTGRES:=1}"
: "${BUILD_SANDOD:=1}"

# Whitespace-separated tailnet targets whose host keys get pre-seeded into
# sando's known_hosts. Each entry is "name[:port]"; port defaults to 22.
: "${SEED_HOSTS:=testnot alpha-west-1:2200}"

# Absolute directory containing this script, so sibling unit/config files can
# be copied regardless of the caller's cwd. The expected layout is
# `<SANDO_REPO>/deploy/this-script.sh`, which makes SANDO_REPO the parent.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SANDO_REPO="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Exported so that child processes (apt-get) see it.
DEBIAN_FRONTEND=noninteractive
export DEBIAN_FRONTEND

# Print a progress line to stdout, prefixed with the script tag.
# Arguments: the message words; they are joined with the first character of
#            IFS (a space by default) into a single line.
log() {
    printf '[bootstrap-sandod] %s\n' "$*"
}

log "1/13 base packages"
# Base tooling installed unconditionally; apt output is discarded to keep the
# bootstrap log readable (errors still abort via `set -e`).
base_packages=(
    curl ca-certificates rsync openssh-client git
    build-essential pkg-config libssl-dev
)
apt-get update -qq
apt-get install -y -qq "${base_packages[@]}" > /dev/null

# PostgreSQL is installed only when INSTALL_POSTGRES is exactly "1".
case "$INSTALL_POSTGRES" in
    1)
        log "2/13 postgresql"
        apt-get install -y -qq postgresql > /dev/null
        ;;
    *)
        log "2/13 skipping postgresql"
        ;;
esac

log "3/13 sando system user (home: $SANDO_HOME)"
# Create the account (bash login shell, home at $SANDO_HOME) only when it does
# not exist yet.
id "$SANDO_USER" &>/dev/null \
    || useradd -m -d "$SANDO_HOME" -s /bin/bash "$SANDO_USER"
# Always re-assert owner and mode of the home directory: a prior partial run
# may have left it root-owned.
install -d -o "$SANDO_USER" -g "$SANDO_USER" -m 0750 "$SANDO_HOME"

log "4/13 /srv/sando subdirs"
# Working directories under the home, each owned by the sando user, mode 0750.
subdirs=(state work releases logs backups)
for name in "${subdirs[@]}"; do
    install -d -o "$SANDO_USER" -g "$SANDO_USER" -m 0750 "$SANDO_HOME/$name"
done

log "5/13 postgres role + scratch db"
# Every postgres operation runs as the local `postgres` superuser via sudo.
# CREATE ROLE has no IF NOT EXISTS in older PG, so the role is created inside
# a DO block that first looks it up in pg_roles; an existing role merely gets
# CREATEDB re-asserted.
sudo -u postgres psql -v ON_ERROR_STOP=1 <<SQL
DO \$\$
BEGIN
    IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = '$SANDO_USER') THEN
        EXECUTE format('CREATE ROLE %I LOGIN CREATEDB', '$SANDO_USER');
    ELSE
        EXECUTE format('ALTER ROLE %I CREATEDB', '$SANDO_USER');
    END IF;
END
\$\$;
SQL

# Succeeds only when pg_database already lists sando_scratch. Any psql failure
# also yields non-zero (pipefail), which the caller treats as "not present".
scratch_db_exists() {
    sudo -u postgres psql -tAc \
        "SELECT 1 FROM pg_database WHERE datname = 'sando_scratch'" \
        | grep -q '^1$'
}

# CREATE DATABASE cannot run inside a DO block, hence the shell-level guard.
scratch_db_exists || sudo -u postgres createdb -O "$SANDO_USER" sando_scratch

log "6/13 sando_scratch public schema owner = $SANDO_USER"
# Original rationale: PG15+ no longer grants public to the DB owner
# automatically. Without this, reset_scratch
# (sando/daemon/src/gates.rs::reset_scratch) silently fails every rebuild
# because the DROP SCHEMA public CASCADE happens but the CREATE SCHEMA public
# lands as postgres, not sando, owning it.
#
# The role name is passed as a psql variable and expanded with :"owner", which
# quotes it as an SQL identifier. That matches how the role was created
# (format('%I', ...)), so names that need quoting (mixed case, hyphens) resolve
# to the same role instead of being case-folded or breaking the statement.
# The statement is fed on stdin because psql does not interpolate variables in
# `-c` command strings.
sudo -u postgres psql -v ON_ERROR_STOP=1 -v owner="$SANDO_USER" sando_scratch \
    >/dev/null <<'SQL'
ALTER SCHEMA public OWNER TO :"owner";
SQL

log "7/13 sando ssh key (ed25519)"
ssh_dir="$SANDO_HOME/.ssh"
key_path="$ssh_dir/id_ed25519"
# ~/.ssh must exist, owned by the sando user, mode 0700.
install -d -o "$SANDO_USER" -g "$SANDO_USER" -m 0700 "$ssh_dir"
# Generate a passphrase-less ed25519 key as the sando user, but never replace
# an existing private key.
[[ -f "$key_path" ]] \
    || sudo -u "$SANDO_USER" ssh-keygen -t ed25519 -N "" \
        -f "$key_path" \
        -C "sando@$(hostname -s)"

log "8/13 /srv/sando/.ssh/config"
# alpha-west-1 listens on port 2200 (prod sshd convention), so declare it in
# sando's ssh config. Remote authorized_keys is not managed here:
# bootstrap-node.sh on each deploy target accepts SANDO_PUBKEY instead.
ssh_config="$SANDO_HOME/.ssh/config"
if grep -q "^Host alpha-west-1" "$ssh_config" 2>/dev/null; then
    : # stanza already present (a missing file counts as "not present")
else
    # Append the stanza followed by one blank separator line.
    printf '%s\n' 'Host alpha-west-1' '    Port 2200' '' >> "$ssh_config"
fi
# The file may have just been created by root; hand it to the sando user.
chown "$SANDO_USER:$SANDO_USER" "$ssh_config"
chmod 0600 "$ssh_config"

log "9/13 known_hosts seeding ($SEED_HOSTS)"
# Strict-host-key-check failures on first contact would block sandod's deploy
# step, so pre-seed each declared tier-node host. Re-running appends duplicate
# lines; they are removed by the `sort -u` pass after the loop.
known="$SANDO_HOME/.ssh/known_hosts"
touch "$known"
chown "$SANDO_USER:$SANDO_USER" "$known"
chmod 0600 "$known"
# SEED_HOSTS is a whitespace-separated list; word-splitting is intentional.
for entry in $SEED_HOSTS; do
    host="${entry%%:*}"
    # Port is whatever follows the first ':'; a missing or empty port means 22.
    port=22
    if [[ "$entry" == *:* && -n "${entry#*:}" ]]; then
        port="${entry#*:}"
    fi
    # ssh-keyscan may exit 0 even when the host is unreachable, so its exit
    # status is not a reliable signal. Capture the output and warn whenever no
    # key lines came back; the operator can re-run once the target is up.
    scanned="$(sudo -u "$SANDO_USER" ssh-keyscan -p "$port" -T 5 "$host" \
        2>/dev/null || true)"
    if [[ -n "$scanned" ]]; then
        printf '%s\n' "$scanned" >> "$known"
    else
        log "  warn: ssh-keyscan $host:$port returned nothing"
    fi
done
# De-dup in place. `sort -o` rewrites the same file, run as the sando user who
# owns it.
sudo -u "$SANDO_USER" sort -u "$known" -o "$known"

log "10/13 /etc/sando configs"
install -d -m 0755 /etc/sando
# sando-daemon.toml.example is the canonical production config (per the
# header comment) and is re-installed on every run. Operators may have edited
# the installed copy (e.g. the listen address for a non-fw13 tailnet IP), so
# when it differs from the example, keep the previous content in a .bak file
# rather than silently discarding it.
daemon_cfg=/etc/sando/sando-daemon.toml
daemon_cfg_src="$SCRIPT_DIR/sando-daemon.toml.example"
if [[ -f "$daemon_cfg" ]] && ! cmp -s "$daemon_cfg_src" "$daemon_cfg"; then
    cp -p "$daemon_cfg" "$daemon_cfg.bak"
    log "  note: $daemon_cfg differed from the example; previous copy saved as $daemon_cfg.bak"
fi
install -m 0644 -o root -g root "$daemon_cfg_src" "$daemon_cfg"
# sando.toml is taken from the repo root and refreshed on every run.
install -m 0644 -o root -g root \
    "$SANDO_REPO/sando.toml" \
    /etc/sando/sando.toml
# sando.env carries non-secret operator settings consumed by sandod and the
# backup-fetch timer. Only write if missing — operator may have customized.
# Readable by root and the sando group only (0640).
if [[ ! -f /etc/sando/sando.env ]]; then
    echo "SANDO_DAEMON=$SANDO_DAEMON_URL" > /etc/sando/sando.env
    chown root:"$SANDO_USER" /etc/sando/sando.env
    chmod 0640 /etc/sando/sando.env
fi

log "11/13 systemd units"
# Copy each unit file that sits next to this script into the system unit
# directory (root-owned, 0644), then have systemd re-read its unit files.
for unit in sandod.service sandod-backup-fetch.service sandod-backup-fetch.timer; do
    install -m 0644 -o root -g root \
        "$SCRIPT_DIR/$unit" \
        "/etc/systemd/system/$unit"
done
systemctl daemon-reload

# Build and install the daemon binary unless the operator opted out with
# BUILD_SANDOD=0 (any value other than "1" skips the build).
if [[ "$BUILD_SANDOD" == "1" ]]; then
    log "12/13 sandod binary (cargo build --release → /usr/local/bin/sandod)"
    daemon_dir="$SANDO_REPO/daemon"
    if [[ ! -d "$daemon_dir" ]]; then
        log "  warn: cannot locate sando/daemon source at $daemon_dir; skipping build"
    elif ! command -v cargo >/dev/null 2>&1; then
        # This script does not install a Rust toolchain. Without this check the
        # run would still abort here, but with a bare "command not found".
        echo "cargo not found in PATH; install a Rust toolchain for root or re-run with BUILD_SANDOD=0" >&2
        exit 1
    else
        # cargo only needs network + tmp; resulting binary owned root, mode 755.
        # The subshell keeps the `cd` from leaking into the rest of the script.
        (cd "$daemon_dir" && cargo build --release --quiet)
        install -m 0755 "$daemon_dir/target/release/sandod" /usr/local/bin/sandod
    fi
else
    log "12/13 skipping sandod build (BUILD_SANDOD=0)"
fi

log "13/13 bare mnw.git + post-receive hook"
bare_repo="$SANDO_HOME/mnw.git"
# Initialise the bare repository (default branch `main`) as the sando user,
# but only on the first run.
[[ -d "$bare_repo" ]] \
    || sudo -u "$SANDO_USER" git init --bare --initial-branch=main "$bare_repo" >/dev/null
# The post-receive hook (which POSTs to sandod on push) is copied from the
# repo on every run, so changes to it propagate with the next bootstrap.
install -m 0755 -o "$SANDO_USER" -g "$SANDO_USER" \
    "$SCRIPT_DIR/post-receive" \
    "$bare_repo/hooks/post-receive"

# Enable services last so a partial bootstrap doesn't leave a service trying
# to start against an incomplete environment. Enabling stays best-effort (a
# failure does not abort the bootstrap), but it is reported rather than
# swallowed so the operator knows to enable the unit by hand.
for unit in sandod.service sandod-backup-fetch.timer; do
    systemctl enable "$unit" >/dev/null 2>&1 \
        || log "  warn: systemctl enable $unit failed; enable it manually"
done

# Final summary: a blank line, the tagged "Done" line, then the manual steps
# this script leaves to the operator. `\$(` keeps the command substitution
# literal in the printed text, while the SANDO_* variables are expanded.
echo
log "Done. Next steps for the operator:"
cat <<EOF
   - tailscale up (auth this node onto the tailnet)
   - on each deploy target, run bootstrap-node.sh with:
       SANDO_PUBKEY="\$(cat $SANDO_HOME/.ssh/id_ed25519.pub)"
   - push the MNW working tree to $SANDO_HOME/mnw.git:
       git remote add sando $SANDO_USER@<host>:$SANDO_HOME/mnw.git
       git push sando main
   - sudo systemctl start sandod
   - sudo systemctl start sandod-backup-fetch.timer
EOF
