Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions packages/core/src/parsers/hfIds.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import { describe, it, expect } from "vitest";
import { ensureHfIds, mintHfId } from "./hfIds.js";
import { parseHTML } from "linkedom";

/** Parses `html` and returns the data-hf-id value of every element that carries one. */
function ids(html: string): string[] {
  const parsed = parseHTML(html);
  const tagged = Array.from(parsed.document.querySelectorAll("[data-hf-id]"));
  const found: string[] = [];
  for (const node of tagged) {
    // The selector guarantees the attribute is present, hence the cast.
    found.push(node.getAttribute("data-hf-id") as string);
  }
  return found;
}

/**
 * Returns the data-hf-id of the first element matching `selector`, or null
 * when nothing matches or the matched element has no such attribute.
 */
function idOf(html: string, selector: string): string | null {
  const { document: parsed } = parseHTML(html);
  const match = parsed.querySelector(selector);
  if (match == null) return null;
  return match.getAttribute("data-hf-id") ?? null;
}

/** Wraps a body fragment in a minimal doctype/html/body document shell. */
const doc = (body: string): string => {
  const opening = "<!doctype html><html><body>";
  const closing = "</body></html>";
  return opening + body + closing;
};

// Core contract of ensureHfIds / mintHfId: which elements receive an id, the
// id format, idempotence, pinning of pre-existing ids, uniqueness and determinism.
describe("ensureHfIds", () => {
it("mints a hf- id on every editable element node in body", () => {
const html = `<!doctype html><html><body>
<div class="card"><h1>Hi</h1><img src="a.png"><span>x</span></div>
</body></html>`;
const out = ensureHfIds(html);
// Every minted id must have the canonical shape: "hf-" + 4 base-36 chars.
for (const id of ids(out)) expect(id).toMatch(/^hf-[a-z0-9]{4}$/);
// div, h1, img, span = 4 ids
expect(ids(out)).toHaveLength(4);
});

// NOTE(review): the title mentions <template>, but the fixture below contains
// no <template> element, so that exclusion is not actually exercised here.
it("skips script/style/template/meta and head", () => {
const html = `<!doctype html><html><head><meta charset="utf-8"></head>
<body><script>1</script><style>.a{}</style><p>keep</p></body></html>`;
const out = ensureHfIds(html);
// only the <p> gets an id
expect(ids(out)).toHaveLength(1);
// String-level checks on the serialized output for each excluded tag.
expect(out).not.toContain("<script data-hf-id");
expect(out).not.toContain("<style data-hf-id");
expect(out).not.toContain("<meta data-hf-id");
});

it("is idempotent: a second call mints nothing and is byte-stable", () => {
const html = `<!doctype html><html><body><div><p>a</p></div></body></html>`;
const once = ensureHfIds(html);
// Second pass sees the ids written by the first pass and must leave the
// serialized document unchanged.
const twice = ensureHfIds(once);
expect(twice).toBe(once);
});

it("pins existing data-hf-id and mints around it", () => {
const html = `<!doctype html><html><body>
<div data-hf-id="hf-keep"><p>a</p></div></body></html>`;
const out = ensureHfIds(html);
// The pre-existing id is preserved verbatim, both in the raw output and in the parsed DOM.
expect(out).toContain('data-hf-id="hf-keep"');
expect(ids(out)).toContain("hf-keep");
expect(ids(out)).toHaveLength(2); // div pinned + p minted
});

it("two identical sibling nodes get distinct ids", () => {
const html = `<!doctype html><html><body>
<p class="x">same</p><p class="x">same</p></body></html>`;
const got = ids(ensureHfIds(html));
// No duplicates: the set of ids is as large as the list of ids.
expect(new Set(got).size).toBe(got.length);
});

it("is deterministic: same input → same ids", () => {
const html = `<!doctype html><html><body><div><p>a</p><span>b</span></div></body></html>`;
// Two independent runs over the same source must agree id-for-id, in order.
expect(ids(ensureHfIds(html))).toEqual(ids(ensureHfIds(html)));
});

it("mintHfId rehashes on collision against the assigned set", () => {
const { document } = parseHTML(`<p class="x">same</p>`);
const el = document.querySelector("p") as Element;
// mintHfId records each id it returns in `assigned`, so the second call on
// the same element finds its base id taken and has to pick another one.
const assigned = new Set<string>();
const a = mintHfId(el, assigned);
const b = mintHfId(el, assigned); // identical element, same assigned set
expect(a).not.toBe(b);
// The rehashed id keeps the same 4-char format as the base id.
expect(a).toMatch(/^hf-[a-z0-9]{4}$/);
expect(b).toMatch(/^hf-[a-z0-9]{4}$/);
});
});

// Lock the edit-lifecycle behavior. These pin BOTH the guarantee that holds
// once ids are persisted to source (pinning) AND the limitations that hold
// while they are not (design §3 write-back is not yet wired — see
// notes/r1-stable-hf-ids-design.md "Implementation status & verified lifecycle gap").
// The "KNOWN LIMITATION" cases assert current behavior on purpose, so that a
// change in that behavior shows up as a test failure.
describe("ensureHfIds — edit lifecycle (R1 stability)", () => {
it("pinned id survives a content edit (the §3 write-back guarantee)", () => {
// Element already carries data-hf-id in source (as it would after write-back).
const edited = doc(`<p class="body" data-hf-id="hf-abcd">Hello world</p>`);
// ensureHfIds leaves elements that already have a data-hf-id untouched.
expect(idOf(ensureHfIds(edited), "p.body")).toBe("hf-abcd");
});

it("KNOWN LIMITATION: an unpinned id changes when the element's text is edited", () => {
// No data-hf-id in source → every parse re-mints from content. Editing the
// text changes the hash, so the id drifts. This is the "pure-hash" mode the
// design rejected; flip this assertion to .toBe once write-back lands.
const before = idOf(ensureHfIds(doc(`<p class="body">Hello</p>`)), "p.body");
const after = idOf(ensureHfIds(doc(`<p class="body">Hello world</p>`)), "p.body");
expect(before).not.toBe(after);
});

it("KNOWN LIMITATION: an unpinned id changes when an attribute is edited", () => {
// Attributes are part of the hashed content key, so changing the class
// value (body → lead) yields a different id for the same <p>.
const before = idOf(ensureHfIds(doc(`<p class="body">x</p>`)), "p");
const after = idOf(ensureHfIds(doc(`<p class="lead">x</p>`)), "p");
expect(before).not.toBe(after);
});

it("KNOWN LIMITATION: identical-content siblings have no content-stable id for the 2nd occurrence", () => {
// Insertion stability holds for DISTINCT content (covered elsewhere), but a
// second identical sibling collides and gets a position-derived dedup id —
// there is no content-stable handle for it. The first keeps the base id.
const single = idOf(ensureHfIds(doc(`<p class="x">same</p>`)), "p.x");
const pair = ids(ensureHfIds(doc(`<p class="x">same</p><p class="x">same</p>`)));
expect(pair[0]).toBe(single); // first identical element: stable, content-derived
expect(pair[1]).not.toBe(single); // second: dedup id, exists only by position
});
});
104 changes: 104 additions & 0 deletions packages/core/src/parsers/hfIds.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/**
* Stable hf- element id minting (R1). Node-safe (linkedom only, not browser DOM).
*
* Two surfaces share these helpers:
* - ensureHfIds(html): node-id surface — mints data-hf-id on every non-excluded <body> element that lacks one.
* - mintHfId(el, assigned): shared by htmlParser for clip ids.
*
* Hash is CONTENT ONLY (tag + sorted attrs + own text) — no sibling position,
* so inserting a non-identical sibling never shifts another element's id.
*/
import { parseHTML } from "linkedom";

// Lowercase tag names that never receive a stable id: these elements are
// non-editable / non-visual.
const EXCLUDED_TAGS = new Set<string>([
  "script",
  "style",
  "template",
  "meta",
  "link",
  "noscript",
  "base",
]);

/**
 * 32-bit FNV-1a hash of `str`, fed one UTF-16 code unit at a time.
 * Pure and deterministic: no crypto, no randomness.
 *
 * @returns the hash as an unsigned 32-bit integer.
 */
function fnv1a(str: string): number {
  const OFFSET_BASIS = 0x811c9dc5;
  const PRIME = 0x01000193;
  let hash = OFFSET_BASIS;
  for (let idx = 0; idx < str.length; idx += 1) {
    // XOR the code unit in, then multiply; Math.imul keeps the product in 32 bits.
    hash = Math.imul(hash ^ str.charCodeAt(idx), PRIME);
  }
  // Reinterpret the signed 32-bit result as unsigned.
  return hash >>> 0;
}

/**
 * Formats a 32-bit hash as an `hf-xxxx` id with exactly four base-36 characters.
 *
 * Four characters give 36^4 (about 1.68M) possible ids per document. The width
 * is deliberately small so ids stay readable; callers are expected to resolve
 * the occasional collision themselves (mintHfId rehashes on duplicates).
 */
function toHfId(hash: number): string {
  const digits = (hash >>> 0).toString(36);
  // Left-pad short values with "0", then keep only the trailing four digits.
  const tail = digits.padStart(4, "0").slice(-4);
  return "hf-" + tail;
}

/**
 * Concatenates the values of the element's direct text-node children
 * (descendant text is ignored) and trims the combined result.
 */
function ownText(el: Element): string {
  const TEXT_NODE = 3;
  const pieces: string[] = [];
  for (const child of Array.from(el.childNodes)) {
    if (child.nodeType !== TEXT_NODE) continue;
    pieces.push(child.nodeValue ?? "");
  }
  return pieces.join("").trim();
}

/**
 * Builds the string that gets hashed for an element:
 * `<lowercased tag>|<sorted attributes>|<own text>`.
 *
 * Attributes whose name starts with `data-hf-` (ids, studio state) are left
 * out so they cannot influence the hash. Each attribute is serialized as
 * `name\x00value` and the sorted entries are joined with `\x01`, which keeps
 * the serialization unambiguous.
 */
function contentKey(el: Element): string {
  const pairs: string[] = [];
  for (const attr of Array.from(el.attributes)) {
    if (attr.name.startsWith("data-hf-")) continue;
    pairs.push(attr.name + "\x00" + attr.value);
  }
  // Sorting makes the key independent of attribute order in the source.
  pairs.sort();
  const tag = el.tagName.toLowerCase();
  return [tag, pairs.join("\x01"), ownText(el)].join("|");
}

/**
 * Mints an id for `el` that is not yet in `assigned`, records it there, and
 * returns it.
 *
 * The base id is derived from the element's content key. If that id is taken,
 * the key is rehashed with a `#<n>` suffix (n = 1, 2, …) until a free 4-char id
 * is found. After 10000 failed rehashes the id is widened to
 * `hf-<full hash>-<n>` instead of throwing, so a pathological document (a very
 * large number of elements sharing one content key) still parses.
 *
 * @param el element to mint an id for; only its content key is read.
 * @param assigned ids already in use; mutated — the returned id is added.
 * @returns an id guaranteed not to have been in `assigned` before the call.
 */
export function mintHfId(el: Element, assigned: Set<string>): string {
  // Rehash budget before falling back to the widened id form.
  const MAX_REHASH_ATTEMPTS = 10000;
  const key = contentKey(el);
  const baseHash = fnv1a(key);
  let id = toHfId(baseHash);
  let dup = 0;
  while (assigned.has(id)) {
    dup += 1;
    if (dup > MAX_REHASH_ATTEMPTS) {
      // Widened fallback. The counter must keep advancing until the id is
      // actually free: every element with this key reaches this branch with the
      // same `dup`, so a fixed suffix would hand out the same id repeatedly.
      const prefix = `hf-${(baseHash >>> 0).toString(36)}`;
      let suffix = dup;
      do {
        id = `${prefix}-${suffix}`;
        suffix += 1;
      } while (assigned.has(id));
      break;
    }
    id = toHfId(fnv1a(`${key}#${dup}`));
  }
  assigned.add(id);
  return id;
}

/**
 * Returns `html` with a data-hf-id attribute on every element inside <body>
 * that is not in EXCLUDED_TAGS and does not already carry a non-empty one.
 *
 * Input without a doctype or <html> shell is treated as a fragment: it is
 * wrapped in a document for parsing (mirroring parseSourceDocument's
 * fragment-wrapping, so the content lands inside <body>) and only the body's
 * inner HTML is returned. Full documents are returned fully serialized.
 * If the parsed document has no body, the input is returned unchanged.
 */
export function ensureHfIds(html: string): string {
  const isFragment = !/<!doctype|<html[\s>]/i.test(html);
  const source = isFragment
    ? `<!DOCTYPE html><html><head></head><body>${html}</body></html>`
    : html;
  const { document } = parseHTML(source);
  const root = document.body;
  if (!root) return html;

  // Pass 1: pin ids that are already present so fresh mints never collide with
  // them. Scoped to <body>, like the mint walk below, so a stray data-hf-id in
  // <head> cannot bump a body element off its id.
  const taken = new Set<string>();
  for (const pinned of Array.from(root.querySelectorAll("[data-hf-id]"))) {
    const value = pinned.getAttribute("data-hf-id");
    if (value) taken.add(value);
  }

  // Pass 2: mint an id for every remaining eligible element.
  for (const node of Array.from(root.querySelectorAll("*"))) {
    const excluded = EXCLUDED_TAGS.has(node.tagName.toLowerCase());
    const alreadyPinned = Boolean(node.getAttribute("data-hf-id"));
    if (excluded || alreadyPinned) continue;
    node.setAttribute("data-hf-id", mintHfId(node, taken));
  }

  if (isFragment) return document.body.innerHTML || "";
  return document.toString();
}
Loading