Run #4282's enriched diagnostic pinpointed the exact remaining drift:

    diagnose: first ISO byte difference at offset 205152 (LBA 100)
    205153   7  10
    205154  27   0
    205155  57   3
    205156  52  55

`cmp -l` prints the differing byte values in octal. Decoded as decimal, those are the day/hour/minute/second fields of an ISO9660 7-byte directory record date:

    A: dd=7 hh=23 mm=47 ss=42 (May 7 23:47:42 UTC)
    B: dd=8 hh=0 mm=3 ss=45 (May 8 00:03:45 UTC)

These match the wall-clock mtime of /live/filesystem.squashfs that the TOC diff also still showed:

    -/live/filesystem.squashfs ... May 7 23:47
    +/live/filesystem.squashfs ... May 8 00:03

Why iter34's `-alter_date_r all "=N" /` didn't catch it: xorriso applies `-alter_date_r` to the in-memory ISO node table, but `-update <src> <iso_path>` writes the directory record's mtime at `-commit` time using the SOURCE FILE's mtime — overriding whatever was in the node table. So the relevant mtime is on `/tmp/silvermetal-rebuilt-XXXXXX.squashfs` (the freshly-`mksquashfs`d file), and that has wall-clock mtime.

Fix: touch the source file to SOURCE_DATE_EPOCH right before xorriso reads it.

    sudo touch -d "@${SOURCE_DATE_EPOCH}" "${new_sqfs}"

Bonus: diagnose-divergence.sh now falls back to `od -t x1z` when xxd isn't available — silvermetal-builder ships coreutils but not vim-common, so the iter34 xxd window was silently empty. The new od-based dump is what landed the actual byte values in run #4282.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
280 lines
12 KiB
Bash
Executable File
280 lines
12 KiB
Bash
Executable File
#!/usr/bin/env bash
# SilverMetal Linux — reproducibility-failure diagnostic.
#
# Invoked by build-iso-linux.yaml's Compare SHA256 step when two builds
# disagree, but also safe to run by hand against any two ISOs:
#
#   ISO_A=/path/a.iso ISO_B=/path/b.iso linux/build/scripts/diagnose-divergence.sh
#
# Designed to run inside silvermetal-builder where xorriso, squashfs-tools,
# and diffoscope-minimal are present.
#
# Strategy: staged analysis, cheap-to-expensive.
#   1. sha256 + sizes (always)
#   2. ISO TOC diff (xorriso): tells us which top-level files differ.
#      Cheap: lists files + sizes, no payload extraction.
#   3. squashfs file listing diff (unsquashfs -ll): tells us which
#      *inner* files differ. The outer ISO is mostly squashfs payload,
#      so this is usually the layer with all the signal.
#   4. Targeted diffoscope: only on inner files that actually differ
#      between A and B (and only on ones small enough to be worth
#      inspecting). Avoids the OOM that's predictable when diffoscope
#      recurses into the whole 1 GB ISO at once (run #4273 hit this).
#
# Output goes to REPORT_DIR; build-iso-linux.yaml tails the salient
# bits into the workflow log directly because Gitea 1.25.2 doesn't
# expose upload-artifact@v3 payloads via its API.

set -uo pipefail
# NOT set -e — we want every diagnostic to attempt, even if earlier
# ones fail. Each step `|| true`s itself.

# Locate the repository root relative to this script (three levels up).
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)"

# Required inputs: the two ISOs to compare.
: "${ISO_A:?ISO_A must point to the first ISO}"
: "${ISO_B:?ISO_B must point to the second ISO}"

if [[ ! -f "${ISO_A}" || ! -f "${ISO_B}" ]]; then
  echo "diagnose: one of the ISOs is missing (A=${ISO_A} B=${ISO_B})" >&2
  exit 1
fi

# Report directory: caller-supplied, or a UTC-timestamped directory under
# the build output tree.
REPORT_DIR="${REPORT_DIR:-${REPO_ROOT}/linux/build/output/_divergence-$(date -u +%Y%m%dT%H%M%SZ)}"
# Best-effort: without the directory the report files are lost, but the
# lines echoed to the workflow log are still useful, so only warn.
mkdir -p "${REPORT_DIR}" \
  || echo "diagnose: warning: could not create ${REPORT_DIR}; report files will be missing" >&2

# Scratch space for the extracted squashfs payloads. This one is fatal: with
# an empty WORK_DIR the payloads would be extracted to /a.squashfs and
# /b.squashfs on the root filesystem and never cleaned up.
WORK_DIR="$(mktemp -d -t silvermetal-divergence.XXXXXX)" || {
  echo "diagnose: could not create scratch dir (mktemp -d failed)" >&2
  exit 1
}
trap 'rm -rf -- "${WORK_DIR}"' EXIT

echo "diagnose: writing report to ${REPORT_DIR}"
echo "diagnose: scratch dir ${WORK_DIR}"

# --- 1. sha256 + sizes ------------------------------------------------------
# Record checksums and a long listing of both ISOs in the report directory.
sha256sum "${ISO_A}" "${ISO_B}" > "${REPORT_DIR}/sha256.txt" 2>&1 || true
ls -la "${ISO_A}" "${ISO_B}" > "${REPORT_DIR}/sizes.txt" 2>&1 || true

# Byte sizes for the log line. An ISO that cannot be stat'ed counts as 0 so
# the delta arithmetic never sees an empty operand.
if ! SIZE_A=$(stat -c%s "${ISO_A}" 2>/dev/null); then
  SIZE_A=0
fi
if ! SIZE_B=$(stat -c%s "${ISO_B}" 2>/dev/null); then
  SIZE_B=0
fi
SIZE_DELTA=$(( SIZE_B - SIZE_A ))
printf 'diagnose: sizes A=%s B=%s delta=%s\n' \
  "${SIZE_A}" "${SIZE_B}" "${SIZE_DELTA}"

# --- 2. ISO TOC diff --------------------------------------------------------
#######################################
# Write a long listing (table of contents) of an ISO to a file.
# Arguments:
#   $1 - path to the ISO image
#   $2 - path of the listing file to write
# Outputs:
#   Writes the listing to $2; a notice goes to stderr when no tool exists.
# Returns:
#   0 when a listing tool was run (its own failure is tolerated),
#   1 when neither xorriso nor isoinfo is available.
#######################################
toc_for() {
  local iso="$1" out="$2"

  # Always leave a (possibly empty) listing behind, so the diff/head/wc
  # steps that consume it never trip over a missing file.
  : > "${out}" 2>/dev/null || true

  if command -v xorriso >/dev/null 2>&1; then
    # `-find / -exec lsdl --` gives a long listing of every node:
    #   mode | links | uid/gid | size | mtime | path
    # That covers timestamp, ownership and size diffs in one pass.
    xorriso -indev "${iso}" -find / -exec lsdl -- 2>/dev/null > "${out}" || true
  elif command -v isoinfo >/dev/null 2>&1; then
    isoinfo -R -l -i "${iso}" > "${out}" 2>/dev/null || true
  else
    echo "diagnose: no xorriso/isoinfo available, skipping TOC" >&2
    return 1
  fi
}

# List both ISOs and diff the listings; every step is best-effort.
toc_a="${REPORT_DIR}/toc-a.txt"
toc_b="${REPORT_DIR}/toc-b.txt"
toc_for "${ISO_A}" "${toc_a}"
toc_for "${ISO_B}" "${toc_b}"
diff -u "${toc_a}" "${toc_b}" > "${REPORT_DIR}/toc-diff.txt" 2>/dev/null || true

# Always echo a sample of the TOC so we can see ISO layout in the
# workflow log even when the squashfs extraction works — useful for
# noticing "oh there's a third-party blob in here we don't expect".
printf '%s\n' "diagnose: first 30 lines of TOC (ISO A):"
head -n 30 "${toc_a}" 2>/dev/null || true
toc_a_lines=$(wc -l < "${toc_a}" 2>/dev/null)
printf '%s\n' "diagnose: TOC (ISO A) size=${toc_a_lines} lines"

# --- 3. Extract & compare the squashfs filesystem listings ------------------
# The outer ISO is mostly a thin wrapper around the rootfs payload (a
# squashfs in live-build's case), so size/content drift almost always
# lives there. Pull just that file out of each ISO and list its contents.
#
# Run #4274 hit the case where the named-path probes (/live/...) all
# missed and the `xorriso -find … | tail -n1` fallback path didn't
# work either (xorriso quotes its -find output, which -extract chokes
# on). This version is more aggressive: lists every file in the ISO
# with its size, picks the genuinely largest, and strips xorriso's
# quoting.

#######################################
# List every regular file in an ISO with its size.
# Arguments:
#   $1 - path to the ISO image
# Outputs:
#   One "<size> <path>" line per regular file on stdout, path unquoted.
#######################################
list_iso_files() {
  local iso="$1"
  xorriso -indev "${iso}" -find / -exec lsdl -- 2>/dev/null \
    | awk -v q="'" '
        # lsdl lines look like "ls -l" output:
        #   mode links uid gid size <date: three fields> path
        # so the size is field 5 and the path starts at field 9.
        # Only regular files (mode starting with "-") are reported.
        NF >= 9 && $1 ~ /^-/ {
          size = $5
          # Cut the first eight fields off the raw line instead of
          # re-joining fields, so runs of spaces inside the path survive.
          path = $0
          for (i = 1; i <= 8; i++) sub(/^[ \t]*[^ \t]+/, "", path)
          sub(/^[ \t]+/, "", path)
          # Strip the surrounding single quotes if present.
          gsub("^" q "|" q "$", "", path)
          print size " " path
        }'
}

#######################################
# Print the path of the largest regular file in an ISO.
# Arguments:
#   $1 - path to the ISO image
# Outputs:
#   The path on stdout; nothing when the ISO lists no files.
#######################################
biggest_file() {
  # Sort "<size> <path>" lines by size, largest first, keep the top one and
  # drop only the size column. Editing the text (rather than blanking an awk
  # field) keeps any runs of spaces inside the path intact.
  list_iso_files "$1" \
    | sort -k1,1 -n -r \
    | head -n1 \
    | sed -e 's/^[^ ]* //'
}

# Extract one ISO path to a host file, sending xorriso's stderr to a log.
# Succeeds only when xorriso succeeds AND the output file is non-empty.
# Arguments: $1 - ISO, $2 - path inside the ISO, $3 - output file, $4 - log.
_extract_iso_path() {
  local iso="$1" iso_path="$2" out="$3" err_log="$4"
  xorriso -osirrox on -indev "${iso}" -extract "${iso_path}" "${out}" \
    2>"${err_log}" \
    && [[ -s "${out}" ]]
}

#######################################
# Extract the rootfs payload (normally a squashfs) from an ISO.
# Arguments:
#   $1 - path to the ISO image
#   $2 - host path to write the extracted payload to
# Outputs:
#   Progress and error notices on stderr.
# Returns:
#   0 when a non-empty payload was extracted, 1 otherwise.
#######################################
extract_squashfs() {
  local iso="$1" out="$2"
  local err_log candidate biggest

  if ! command -v xorriso >/dev/null 2>&1; then
    return 1
  fi

  # Holds the stderr of the most recent xorriso attempt. Without it every
  # attempt below would fail on the redirection, so bail out early.
  err_log=$(mktemp) || {
    echo "diagnose: mktemp failed; cannot capture xorriso errors" >&2
    return 1
  }

  # Try canonical Debian/Kicksecure layout first.
  for candidate in \
    /live/filesystem.squashfs \
    /casper/filesystem.squashfs \
    /filesystem.squashfs \
    /install/filesystem.squashfs \
    /boot/filesystem.squashfs; do
    if _extract_iso_path "${iso}" "${candidate}" "${out}" "${err_log}"; then
      echo "diagnose: extracted ${candidate} from $(basename "${iso}")" >&2
      rm -f "${err_log}"
      return 0
    fi
  done

  # Fallback: take the largest file in the ISO, regardless of name.
  # In a live-build ISO that's reliably the rootfs payload, even when
  # it isn't called *.squashfs.
  biggest=$(biggest_file "${iso}")
  if [[ -n "${biggest}" ]]; then
    echo "diagnose: largest file in $(basename "${iso}") is ${biggest}; extracting" >&2
    if _extract_iso_path "${iso}" "${biggest}" "${out}" "${err_log}"; then
      rm -f "${err_log}"
      return 0
    fi
  fi

  # If we got here, extraction failed; surface the error of the last attempt.
  echo "diagnose: xorriso -extract stderr (last 30 lines):" >&2
  tail -n 30 "${err_log}" >&2 || true
  rm -f "${err_log}"
  return 1
}

SQFS_A="${WORK_DIR}/a.squashfs"
SQFS_B="${WORK_DIR}/b.squashfs"

# Pull the rootfs payload out of each ISO. For A, a failure also prints the
# largest files in the ISO so the log shows what the image contains.
extract_squashfs "${ISO_A}" "${SQFS_A}" || {
  echo "diagnose: could not extract rootfs from A — top 20 ISO files by size:" >&2
  list_iso_files "${ISO_A}" | sort -k1,1 -n -r | head -n20 >&2 || true
}
extract_squashfs "${ISO_B}" "${SQFS_B}" || {
  echo "diagnose: could not extract rootfs from B" >&2
}

# Everything below needs both payloads.
if [[ -s "${SQFS_A}" && -s "${SQFS_B}" ]]; then
  SQFS_SIZE_A=$(stat -c%s "${SQFS_A}")
  SQFS_SIZE_B=$(stat -c%s "${SQFS_B}")
  sqfs_size_delta=$(( SQFS_SIZE_B - SQFS_SIZE_A ))
  echo "diagnose: squashfs sizes A=${SQFS_SIZE_A} B=${SQFS_SIZE_B} delta=${sqfs_size_delta}"
  sha256sum "${SQFS_A}" "${SQFS_B}" > "${REPORT_DIR}/squashfs-sha256.txt"

  if command -v unsquashfs >/dev/null 2>&1; then
    # -ll = long listing with permissions, owner, size, date, target.
    # Easiest format to diff for "which files have different sizes".
    sqfs_ls_a="${REPORT_DIR}/sqfs-ls-a.txt"
    sqfs_ls_b="${REPORT_DIR}/sqfs-ls-b.txt"
    unsquashfs -ll "${SQFS_A}" 2>/dev/null > "${sqfs_ls_a}" || true
    unsquashfs -ll "${SQFS_B}" 2>/dev/null > "${sqfs_ls_b}" || true
    diff -u "${sqfs_ls_a}" "${sqfs_ls_b}" \
      > "${REPORT_DIR}/sqfs-ls-diff.txt" 2>/dev/null || true
  fi

  # --- 4. Targeted diffoscope on the squashfs only --------------------
  # Comparing two ~1 GB squashfs files directly is still big, but it's
  # bounded — diffoscope won't recurse out into the boot sectors,
  # initrd, kernel, etc. Cap the report size aggressively, write a text
  # report only (no html, which is a memory hog), and limit the container
  # recursion depth. The whole run is capped at 600 seconds.
  if command -v diffoscope >/dev/null 2>&1; then
    echo "diagnose: running diffoscope on squashfs payload"
    diffoscope_args=(
      --no-default-limits
      --max-page-size 50000000
      --max-text-report-size 5000000
      --max-container-depth 2
      --text "${REPORT_DIR}/sqfs-diff.txt"
    )
    timeout 600 diffoscope "${diffoscope_args[@]}" "${SQFS_A}" "${SQFS_B}" \
      >/dev/null 2>&1 || true
  fi
fi

# --- Whole-file cmp + hex around first divergence ---------------------------
# When TOC + squashfs match but ISO SHA still diverges, the bytes that
# differ live in the ISO9660 structure between the system area (first
# 32 KiB) and the file payload. Scan the whole ISO with `cmp -l`, keep the
# first 200 differing offsets, and dump a 128-byte hex window from each
# ISO around the first divergence so the workflow log shows what region
# we're in.
if command -v cmp >/dev/null 2>&1; then
  # Cap the listing at 200 lines so 1 GiB of all-different bytes can't
  # drown the artifact. Each line is "<1-indexed offset> <byte A> <byte B>";
  # GNU cmp prints the two byte values in octal.
  cmp -l "${ISO_A}" "${ISO_B}" 2>/dev/null \
    | head -n 200 \
    > "${REPORT_DIR}/iso-cmp-first-200.txt" || true

  first_diff_byte=$(awk 'NR==1 {print $1}' \
    "${REPORT_DIR}/iso-cmp-first-200.txt" 2>/dev/null)
  if [[ -n "${first_diff_byte}" ]]; then
    # cmp -l prints 1-indexed positions; convert to 0-indexed.
    first_diff_byte=$(( first_diff_byte - 1 ))
    lba=$(( first_diff_byte / 2048 ))
    echo "diagnose: first ISO byte difference at offset ${first_diff_byte} (LBA ${lba})"

    # 128-byte window starting 32 bytes before the diff.
    start=$(( first_diff_byte > 32 ? first_diff_byte - 32 : 0 ))

    # Hex-dump a file, or a byte range of it, into a text file.
    #   $1 - file to dump        $2 - output text file
    #   $3 - bytes to skip first (optional, default 0)
    #   $4 - bytes to dump       (optional, default: to end of file)
    # Reading the range straight from the source file means the offsets
    # printed in the dump are absolute file offsets.
    # Prefer xxd if available, fall back to od (always in coreutils).
    # silvermetal-builder doesn't ship xxd; run #4282's diagnostic
    # silently produced no hex window because of that.
    dump_hex() {
      local f="$1" out="$2" skip="${3:-0}" len="${4:-}"
      local -a args
      if command -v xxd >/dev/null 2>&1; then
        args=(-s "${skip}")
        if [[ -n "${len}" ]]; then
          args+=(-l "${len}")
        fi
        xxd "${args[@]}" "${f}" > "${out}"
      else
        args=(-A x -t x1z -v -j "${skip}")
        if [[ -n "${len}" ]]; then
          args+=(-N "${len}")
        fi
        od "${args[@]}" "${f}" > "${out}"
      fi
    }

    dump_hex "${ISO_A}" "${REPORT_DIR}/iso-a-around-first-diff.hex" "${start}" 128 || true
    dump_hex "${ISO_B}" "${REPORT_DIR}/iso-b-around-first-diff.hex" "${start}" 128 || true
    diff -u "${REPORT_DIR}/iso-a-around-first-diff.hex" \
      "${REPORT_DIR}/iso-b-around-first-diff.hex" \
      > "${REPORT_DIR}/iso-around-first-diff.diff" 2>/dev/null || true
  fi

  # Keep the legacy first-8KB scan around as a quick header view.
  cmp -l -n 8192 "${ISO_A}" "${ISO_B}" > "${REPORT_DIR}/iso-header-cmp.txt" 2>&1 || true
fi

# --- Checklist --------------------------------------------------------------
# Write a short markdown checklist of the usual reproducibility culprits,
# headed by the size deltas measured above.
{
  printf '%s\n' \
    "## Likely-culprit checklist" \
    "" \
    "ISO size delta: ${SIZE_DELTA} bytes"
  # The squashfs delta is only known when both payloads were extracted.
  if [[ -s "${SQFS_A}" && -s "${SQFS_B}" ]]; then
    printf '%s\n' "squashfs size delta: $(( SQFS_SIZE_B - SQFS_SIZE_A )) bytes"
  fi
  printf '%s\n' \
    "" \
    "Walk these in order — most failures fall into the first two." \
    "" \
    " [ ] SOURCE_DATE_EPOCH was identical in both builds (compare BUILD_INFO files)" \
    " [ ] snapshot.debian.org timestamp matched (compare snapshot-pin.env files)" \
    " [ ] Same builder image digest (compare BUILD_INFO files)" \
    " [ ] mksquashfs reproducibility flags survived (-no-exports -no-xattrs -reproducible)" \
    " [ ] No build-id randomisation in kernel/initrd (look for differing .note.gnu.build-id)" \
    " [ ] No host hostname/username leakage (grep for the runner host name)" \
    " [ ] No locale drift (LC_ALL=C.UTF-8 enforced in container)" \
    " [ ] dpkg trigger/postinst ordering (look at INFO: triggered ... in build log)"
} > "${REPORT_DIR}/checklist.md"

# Finish with a listing of everything the report directory now holds.
printf '%s\n' "diagnose: done. Files in ${REPORT_DIR}:"
ls -la "${REPORT_DIR}" 2>/dev/null || true