diff --git a/.gitea/workflows/build-iso-linux.yaml b/.gitea/workflows/build-iso-linux.yaml index 4b1e477..bd61404 100644 --- a/.gitea/workflows/build-iso-linux.yaml +++ b/.gitea/workflows/build-iso-linux.yaml @@ -191,25 +191,30 @@ jobs: # Tail key signal directly into the workflow log so we see it # without needing artifact download (Gitea 1.25's API doesn't # surface upload-artifact@v3 payloads through any v1 endpoint - # we've found). Print: file size diff, the checklist, and the - # first few hundred lines of the diffoscope text report. - echo "" - echo "=== Divergence: ISO sizes ===" - cat "${{ github.workspace }}/divergence/sizes.txt" 2>/dev/null || true - echo "" - echo "=== Divergence: SHA256 ===" - cat "${{ github.workspace }}/divergence/sha256.txt" 2>/dev/null || true - echo "" - echo "=== Divergence: checklist ===" - cat "${{ github.workspace }}/divergence/checklist.md" 2>/dev/null || true - echo "" - echo "=== Divergence: diffoscope (first 400 lines of diff.txt) ===" - head -n 400 "${{ github.workspace }}/divergence/diff.txt" 2>/dev/null || true - if [ ! -f "${{ github.workspace }}/divergence/diff.txt" ] \ - && [ -f "${{ github.workspace }}/divergence/cmp.txt" ]; then - echo "=== Divergence: cmp -l (first 50 differing bytes) ===" - head -n 50 "${{ github.workspace }}/divergence/cmp.txt" 2>/dev/null || true - fi + # we've found). Print sizes, sha, checklist, and the new + # staged outputs from diagnose-divergence.sh: ISO TOC diff + # and squashfs file listing diff first (small, high signal), + # then the targeted diffoscope output on the squashfs payload. 
+          DIVDIR="${{ github.workspace }}/divergence"
+          # print_section TITLE PATH [HEAD_LINES]
+          #   Echo a "=== TITLE ===" banner followed by the contents of
+          #   PATH. With HEAD_LINES > 0 only the first HEAD_LINES lines
+          #   are printed, and a marker is appended when that cap cut the
+          #   file short, so a capped section is never mistaken for a
+          #   complete one. A missing PATH is skipped silently. Always
+          #   returns 0, so a bad report file cannot fail the step.
+          print_section() {
+            local title="$1" path="$2" head_lines="${3:-0}" total
+            [ -e "${path}" ] || return 0
+            echo ""
+            echo "=== ${title} ==="
+            if [ "${head_lines}" -gt 0 ]; then
+              head -n "${head_lines}" "${path}" 2>/dev/null || true
+              total=$(wc -l < "${path}" 2>/dev/null || echo 0)
+              if [ "${total}" -gt "${head_lines}" ]; then
+                echo "[... truncated: first ${head_lines} of ${total} lines shown ...]"
+              fi
+            else
+              cat "${path}" 2>/dev/null || true
+            fi
+          }
+          print_section "ISO sizes" "${DIVDIR}/sizes.txt"
+          print_section "SHA256 (ISO)" "${DIVDIR}/sha256.txt"
+          print_section "SHA256 (squashfs payload)" "${DIVDIR}/squashfs-sha256.txt"
+          print_section "checklist" "${DIVDIR}/checklist.md"
+          print_section "ISO TOC diff (xorriso lsdl)" "${DIVDIR}/toc-diff.txt" 400
+          print_section "squashfs file listing diff" "${DIVDIR}/sqfs-ls-diff.txt" 600
+          print_section "diffoscope (squashfs)" "${DIVDIR}/sqfs-diff.txt" 600
+          print_section "ISO header cmp -l (first 8KB)" "${DIVDIR}/iso-header-cmp.txt" 100
           echo ""
           echo "(Full report uploaded as divergence-report-${{ github.run_id }})"
 
diff --git a/linux/build/scripts/diagnose-divergence.sh b/linux/build/scripts/diagnose-divergence.sh
index d228d5e..3b2fe6b 100755
--- a/linux/build/scripts/diagnose-divergence.sh
+++ b/linux/build/scripts/diagnose-divergence.sh
@@ -1,15 +1,33 @@
 #!/usr/bin/env bash
 # SilverMetal Linux — reproducibility-failure diagnostic.
 #
-# Invoked by verify-reproducibility.sh when two builds disagree, but also
-# safe to run by hand against any two ISOs:
+# Invoked by build-iso-linux.yaml's Compare SHA256 step when two builds
+# disagree, but also safe to run by hand against any two ISOs:
 #
 #   ISO_A=/path/a.iso ISO_B=/path/b.iso linux/build/scripts/diagnose-divergence.sh
 #
-# Produces a diffoscope report. The output is intentionally verbose — when
-# this script runs in anger we want everything we can get.
+# Designed to run inside silvermetal-builder where xorriso, squashfs-tools,
+# and diffoscope-minimal are present.
+#
+# Strategy: staged analysis, cheap-to-expensive.
+#   1. sha256 + sizes (always)
+#   2. 
ISO TOC diff (xorriso): tells us which top-level files differ.
+#      Cheap: lists files + sizes, no payload extraction.
+#   3. squashfs file listing diff (unsquashfs -ll): tells us which
+#      *inner* files differ. The outer ISO is mostly squashfs payload,
+#      so this is usually the layer with all the signal.
+#   4. Targeted diffoscope: run on the two extracted squashfs payloads
+#      only (text report, size-capped, depth-limited, 600 s timeout),
+#      never on the ISOs themselves. Avoids the OOM that's predictable
+#      when diffoscope recurses into the whole 1 GB ISO at once (run
+#      #4273 hit this).
+#
+# Output goes to REPORT_DIR; build-iso-linux.yaml tails the salient
+# bits into the workflow log directly because Gitea 1.25.2 doesn't
+# expose upload-artifact@v3 payloads via its API.
 
-set -euo pipefail
+set -uo pipefail
+# NOT set -e — we want every diagnostic to attempt, even if earlier
+# ones fail. Each step `|| true`s itself.
 
 SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)"
@@ -25,28 +43,116 @@ fi
 
 REPORT_DIR="${REPORT_DIR:-${REPO_ROOT}/linux/build/output/_divergence-$(date -u +%Y%m%dT%H%M%SZ)}"
 mkdir -p "${REPORT_DIR}"
+# Scratch space for the extracted squashfs payloads. Without `set -e` a
+# failed mktemp would leave WORK_DIR empty and the payloads would land in
+# "/a.squashfs" and "/b.squashfs", so fall back to a directory under
+# REPORT_DIR and let the cheap stages run regardless.
+WORK_DIR="$(mktemp -d -t silvermetal-divergence.XXXXXX)" || WORK_DIR=""
+if [[ -z "${WORK_DIR}" ]]; then
+  echo "diagnose: mktemp failed, using scratch dir under REPORT_DIR" >&2
+  WORK_DIR="${REPORT_DIR}/.scratch"
+  mkdir -p "${WORK_DIR}"
+fi
+# Remove the scratch dir on every exit path; ${WORK_DIR:?} refuses to run
+# rm with an empty path.
+trap 'rm -rf -- "${WORK_DIR:?}"' EXIT
+
 echo "diagnose: writing report to ${REPORT_DIR}"
+echo "diagnose: scratch dir ${WORK_DIR}"
 
-# Quick wins first — these usually point straight at the culprit.
-sha256sum "${ISO_A}" "${ISO_B}" > "${REPORT_DIR}/sha256.txt"
-ls -la "${ISO_A}" "${ISO_B}" > "${REPORT_DIR}/sizes.txt" 2>&1 || true
+# --- 1. sha256 + sizes ------------------------------------------------------
+sha256sum "${ISO_A}" "${ISO_B}" > "${REPORT_DIR}/sha256.txt" 2>&1 || true
+ls -la "${ISO_A}" "${ISO_B}" > "${REPORT_DIR}/sizes.txt" 2>&1 || true
 
-# diffoscope — html if available (richer), text always.
-if command -v diffoscope >/dev/null 2>&1; then
-  diffoscope --max-report-size 100000000 \
-             --html "${REPORT_DIR}/diff.html" \
-             --text "${REPORT_DIR}/diff.txt" \
-             "${ISO_A}" "${ISO_B}" \
-    || true  # non-zero exit just means "they differ"; that's why we're here
-elif command -v cmp >/dev/null 2>&1; then
-  echo "diagnose: diffoscope not found, falling back to cmp" >&2
-  cmp -l "${ISO_A}" "${ISO_B}" > "${REPORT_DIR}/cmp.txt" || true
+SIZE_A=$(stat -c%s "${ISO_A}" 2>/dev/null || echo 0)
+SIZE_B=$(stat -c%s "${ISO_B}" 2>/dev/null || echo 0)
+SIZE_DELTA=$(( SIZE_B - SIZE_A ))
+echo "diagnose: sizes A=${SIZE_A} B=${SIZE_B} delta=${SIZE_DELTA}"
+
+# --- 2. ISO TOC diff --------------------------------------------------------
+# toc_for ISO OUT — write a long listing of every node in ISO to OUT.
+# Prefers xorriso, falls back to isoinfo. Returns 1 only when neither
+# tool is installed; a failing tool still yields status 0 (best effort).
+toc_for() {
+  local iso="$1" out="$2"
+  if command -v xorriso >/dev/null 2>&1; then
+    # `-find / -exec lsdl --` gives a long listing of every node:
+    #   mode | links | uid/gid | size | mtime | path
+    # That covers timestamp, ownership and size diffs in one pass.
+    xorriso -indev "${iso}" -find / -exec lsdl -- 2>/dev/null > "${out}" || true
+  elif command -v isoinfo >/dev/null 2>&1; then
+    isoinfo -R -l -i "${iso}" > "${out}" 2>/dev/null || true
+  else
+    echo "diagnose: no xorriso/isoinfo available, skipping TOC" >&2
+    return 1
+  fi
+}
+
+toc_for "${ISO_A}" "${REPORT_DIR}/toc-a.txt"
+toc_for "${ISO_B}" "${REPORT_DIR}/toc-b.txt"
+diff -u "${REPORT_DIR}/toc-a.txt" "${REPORT_DIR}/toc-b.txt" \
+  > "${REPORT_DIR}/toc-diff.txt" 2>/dev/null || true
+
+# --- 3. Extract & compare the squashfs filesystem listings ------------------
+# The outer ISO is mostly a thin wrapper around live/filesystem.squashfs;
+# size/content drift almost always lives there. Pull just that file out
+# of each ISO and list its contents.
+#
+# extract_squashfs ISO OUT — copy the squashfs payload of ISO to OUT.
+# Returns 0 only when OUT ends up non-empty; 1 when xorriso is missing
+# or no payload could be extracted.
+extract_squashfs() {
+  local iso="$1" out="$2" path biggest
+  if ! command -v xorriso >/dev/null 2>&1; then return 1; fi
+  # `-osirrox on` enables extraction to disk explicitly.
+  # NOTE(review): xorriso(1) describes osirrox as off unless the program
+  # is started as "osirrox" — confirm against the builder's version; the
+  # option is harmless where extraction is already enabled.
+  # Try the canonical Debian/Kicksecure layout first.
+  for path in /live/filesystem.squashfs /casper/filesystem.squashfs /filesystem.squashfs; do
+    if xorriso -osirrox on -indev "${iso}" -extract "${path}" "${out}" 2>/dev/null; then
+      [[ -s "${out}" ]] && return 0
+    fi
+  done
+  # Fallback: take the largest .squashfs we can find. lsdl puts the byte
+  # size in column 5, so a numeric sort leaves the biggest entry last.
+  # The sed keeps only the text from the first "/" through ".squashfs",
+  # which drops the lsdl columns and the quotes xorriso puts around the
+  # paths it prints; lines that are not .squashfs entries are discarded.
+  biggest=$(xorriso -indev "${iso}" -find / -name '*.squashfs' -exec lsdl -- 2>/dev/null \
+    | LC_ALL=C sort -k5,5n \
+    | sed -n -e "s|^[^/]*\(/.*\.squashfs\)'\{0,1\}\$|\1|p" \
+    | tail -n1)
+  [[ -n "${biggest}" ]] || return 1
+  xorriso -osirrox on -indev "${iso}" -extract "${biggest}" "${out}" 2>/dev/null || return 1
+  [[ -s "${out}" ]]
+}
+
+SQFS_A="${WORK_DIR}/a.squashfs"
+SQFS_B="${WORK_DIR}/b.squashfs"
+extract_squashfs "${ISO_A}" "${SQFS_A}" || echo "diagnose: could not extract squashfs from A" >&2
+extract_squashfs "${ISO_B}" "${SQFS_B}" || echo "diagnose: could not extract squashfs from B" >&2
+
+if [[ -s "${SQFS_A}" && -s "${SQFS_B}" ]]; then
+  SQFS_SIZE_A=$(stat -c%s "${SQFS_A}")
+  SQFS_SIZE_B=$(stat -c%s "${SQFS_B}")
+  echo "diagnose: squashfs sizes A=${SQFS_SIZE_A} B=${SQFS_SIZE_B} delta=$(( SQFS_SIZE_B - SQFS_SIZE_A ))"
+  sha256sum "${SQFS_A}" "${SQFS_B}" > "${REPORT_DIR}/squashfs-sha256.txt"
+
+  if command -v unsquashfs >/dev/null 2>&1; then
+    # -ll = long listing with permissions, owner, size, date, target.
+    # Easiest format to diff for "which files have different sizes".
+    unsquashfs -ll "${SQFS_A}" 2>/dev/null > "${REPORT_DIR}/sqfs-ls-a.txt" || true
+    unsquashfs -ll "${SQFS_B}" 2>/dev/null > "${REPORT_DIR}/sqfs-ls-b.txt" || true
+    diff -u "${REPORT_DIR}/sqfs-ls-a.txt" "${REPORT_DIR}/sqfs-ls-b.txt" \
+      > "${REPORT_DIR}/sqfs-ls-diff.txt" 2>/dev/null || true
+  fi
+
+  # --- 4. Targeted diffoscope on the squashfs only --------------------
+  # Comparing two ~1 GB squashfs files directly is still big, but it's
+  # bounded — diffoscope won't recurse out into the boot sectors,
+  # initrd, kernel, etc. Cap the report size aggressively, no html
+  # (memory hog), and forbid recursion past one container layer.
+  if command -v diffoscope >/dev/null 2>&1; then
+    echo "diagnose: running diffoscope on squashfs payload"
+    sqfs_diff_rc=0
+    timeout 600 diffoscope \
+      --no-default-limits \
+      --max-page-size 50000000 \
+      --max-text-report-size 5000000 \
+      --max-container-depth 2 \
+      --text "${REPORT_DIR}/sqfs-diff.txt" \
+      "${SQFS_A}" "${SQFS_B}" \
+      >/dev/null 2>&1 || sqfs_diff_rc=$?
+    # Exit 1 just means "they differ" (expected here). Anything higher —
+    # a diffoscope error, or 124 from timeout(1) — means the report is
+    # missing or partial, so leave a note in sqfs-diff.txt and on stderr
+    # instead of failing silently.
+    if (( sqfs_diff_rc > 1 )); then
+      echo "diagnose: diffoscope exited ${sqfs_diff_rc} (124 = 600s timeout); report may be missing or partial" \
+        | tee -a "${REPORT_DIR}/sqfs-diff.txt" >&2
+    fi
+  fi
 fi
 
-# A first guess at the culprit, even when the diff is huge.
+# --- Header check: cmp -l on first 8 KiB of the ISOs (header-level drift) ---
+# Runs whenever cmp is present, independent of the stages above.
+if command -v cmp >/dev/null 2>&1; then
+  cmp -l -n 8192 "${ISO_A}" "${ISO_B}" > "${REPORT_DIR}/iso-header-cmp.txt" 2>&1 || true
+fi
+
+# --- Checklist --------------------------------------------------------------
 {
   echo "## Likely-culprit checklist"
   echo ""
+  echo "ISO size delta: ${SIZE_DELTA} bytes"
+  if [[ -s "${SQFS_A}" && -s "${SQFS_B}" ]]; then
+    echo "squashfs size delta: $(( SQFS_SIZE_B - SQFS_SIZE_A )) bytes"
+  fi
+  echo ""
   echo "Walk these in order — most failures fall into the first two."
   echo ""
   echo " [ ] SOURCE_DATE_EPOCH was identical in both builds (compare BUILD_INFO files)"
@@ -56,6 +162,8 @@ fi
   echo " [ ] No build-id randomisation in kernel/initrd (look for differing .note.gnu.build-id)"
   echo " [ ] No host hostname/username leakage (grep for the runner host name)"
   echo " [ ] No locale drift (LC_ALL=C.UTF-8 enforced in container)"
+  echo " [ ] dpkg trigger/postinst ordering (look at INFO: triggered ... in build log)"
 } > "${REPORT_DIR}/checklist.md"
 
-echo "diagnose: done. See ${REPORT_DIR}/checklist.md and ${REPORT_DIR}/diff.{html,txt}"
+echo "diagnose: done. Files in ${REPORT_DIR}:"
+ls -la "${REPORT_DIR}" 2>/dev/null || true