From 30c22168cc8790e3042a540b8aeef7168e43f13a Mon Sep 17 00:00:00 2001 From: anderdc Date: Thu, 25 Jun 2026 12:56:53 -0500 Subject: [PATCH 1/2] fix(backfill): compare PR updatedAt by instant so the metadata gate skips MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nightly incremental backfill's metadata gate compared the stored PR `updatedAt` against GitHub's value with raw `===`. The stored side is read back from a `timestamptz` column, which TypeORM hydrates into a Date (and the entity mis-annotated as `string`), while the incoming side is GitHub's ISO string. `Date === string` is always false, and even pg's text form (`...+00`) differs from GitHub's ISO form (`...Z`), so the gate matched nothing and re-enqueued a PR_METADATA job for every PR in the window every night — `meta_skipped` stayed 0. Benign for the rate-limit fix (metadata jobs are cheap; the SHA/content gate is what tamed the flood), but the metadata half of the optimisation was dead code. Normalise both sides to an epoch-ms instant before comparing, fail safe on null/unparseable input, and correct the entity annotation to `Date | string | null`. Adds spec coverage for the Date-vs-ISO-string reality that the old string-only tests never exercised. --- .../das/src/entities/PullRequest.entity.ts | 5 ++- .../src/webhook/incremental-backfill.spec.ts | 39 +++++++++++++++++++ .../das/src/webhook/incremental-backfill.ts | 31 +++++++++++---- 3 files changed, 66 insertions(+), 9 deletions(-) diff --git a/packages/das/src/entities/PullRequest.entity.ts b/packages/das/src/entities/PullRequest.entity.ts index 905ba2f..f07d923 100644 --- a/packages/das/src/entities/PullRequest.entity.ts +++ b/packages/das/src/entities/PullRequest.entity.ts @@ -35,8 +35,11 @@ export class PullRequest { @Column({ name: "merged_at", type: "timestamptz", nullable: true }) mergedAt: string; + // Written as a GitHub ISO string on upsert, but TypeORM hydrates a + // `timestamptz` back into a Date on read — hence the union. The incremental + // backfill's needsMetadataRefresh normalises both before comparing. @Column({ name: "updated_at", type: "timestamptz", nullable: true }) - updatedAt: string | null; + updatedAt: Date | string | null; @Column({ name: "last_edited_at", type: "timestamptz", nullable: true }) lastEditedAt: string | null; diff --git a/packages/das/src/webhook/incremental-backfill.spec.ts b/packages/das/src/webhook/incremental-backfill.spec.ts index 1751913..30ad46b 100644 --- a/packages/das/src/webhook/incremental-backfill.spec.ts +++ b/packages/das/src/webhook/incremental-backfill.spec.ts @@ -69,4 +69,43 @@ describe("needsMetadataRefresh", () => { it("re-fetches when GitHub returned a null updatedAt", () => { expect(needsMetadataRefresh(stored(), null)).toBe(true); }); + + // Production reality: TypeORM hydrates the `timestamptz` column into a Date, + // while GitHub sends an ISO string. A raw `===` never matched, so the gate + // re-fetched every PR forever. These lock in the instant-based comparison. + it("skips when a hydrated Date equals GitHub's ISO string (same instant)", () => { + expect( + needsMetadataRefresh( + stored({ updatedAt: new Date("2026-06-01T00:00:00Z") }), + "2026-06-01T00:00:00Z", + ), + ).toBe(false); + }); + + it("skips across timezone-equivalent representations of the same instant", () => { + expect( + needsMetadataRefresh( + stored({ updatedAt: new Date("2026-06-01T00:00:00Z") }), + "2026-05-31T19:00:00-05:00", + ), + ).toBe(false); + }); + + it("re-fetches when a hydrated Date is a different instant", () => { + expect( + needsMetadataRefresh( + stored({ updatedAt: new Date("2026-06-01T00:00:00Z") }), + "2026-06-02T00:00:00Z", + ), + ).toBe(true); + }); + + it("re-fetches (fails safe) on an unparseable stored value", () => { + expect( + needsMetadataRefresh( + stored({ updatedAt: "not-a-date" }), + "2026-06-01T00:00:00Z", + ), + ).toBe(true); + }); }); diff --git a/packages/das/src/webhook/incremental-backfill.ts b/packages/das/src/webhook/incremental-backfill.ts index defdf2b..b7a457b 100644 --- a/packages/das/src/webhook/incremental-backfill.ts +++ b/packages/das/src/webhook/incremental-backfill.ts @@ -5,7 +5,10 @@ export interface StoredPrState { headSha: string | null; baseSha: string | null; - updatedAt: string | null; + // Read back from a `timestamptz` column, so at runtime TypeORM hydrates this + // into a Date even though the entity annotates it as a string; tests pass + // ISO strings. needsMetadataRefresh normalises both shapes before comparing. + updatedAt: Date | string | null; scoringDataStored: boolean; } @@ -23,16 +26,28 @@ export function needsContentRefresh( ); } +// Normalise either a Date (what TypeORM hydrates a `timestamptz` column into) +// or an ISO string (what the GitHub GraphQL API returns, and what tests pass) +// to an epoch-ms instant. Returns null for null/unparseable input so callers +// fail safe. Comparing raw values directly would never match: a Date is never +// `===` a string, and even the DB's text form (`2026-06-01 00:00:00+00`) +// differs from GitHub's ISO form (`2026-06-01T00:00:00Z`). +function toEpochMs(value: Date | string | null): number | null { + if (value == null) return null; + const ms = + value instanceof Date ? value.getTime() : new Date(value).getTime(); + return Number.isNaN(ms) ? null : ms; +} + // updatedAt bumps on edits, state changes, merges, closes and link changes, so -// skip the PR_METADATA fetch only when it matches the stored value. +// skip the PR_METADATA fetch only when it matches the stored value. Compare by +// instant, not raw value: the stored side round-trips through a timestamptz +// column (hydrated as a Date), the incoming side is GitHub's ISO string. export function needsMetadataRefresh( stored: StoredPrState | null | undefined, updatedAt: string | null, ): boolean { - return !( - stored != null && - stored.updatedAt != null && - updatedAt != null && - stored.updatedAt === updatedAt - ); + const storedMs = stored ? toEpochMs(stored.updatedAt) : null; + const incomingMs = toEpochMs(updatedAt); + return !(storedMs != null && incomingMs != null && storedMs === incomingMs); } From 300e9fd886f64b7ec3ec2dddfbd56283ac7dbe87 Mon Sep 17 00:00:00 2001 From: anderdc Date: Thu, 25 Jun 2026 14:42:11 -0500 Subject: [PATCH 2/2] style(backfill): trim comments to one line each --- packages/das/src/entities/PullRequest.entity.ts | 4 +--- .../das/src/webhook/incremental-backfill.spec.ts | 4 +--- packages/das/src/webhook/incremental-backfill.ts | 16 +++------------- 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/packages/das/src/entities/PullRequest.entity.ts b/packages/das/src/entities/PullRequest.entity.ts index f07d923..9ee2cf9 100644 --- a/packages/das/src/entities/PullRequest.entity.ts +++ b/packages/das/src/entities/PullRequest.entity.ts @@ -35,9 +35,7 @@ export class PullRequest { @Column({ name: "merged_at", type: "timestamptz", nullable: true }) mergedAt: string; - // Written as a GitHub ISO string on upsert, but TypeORM hydrates a - // `timestamptz` back into a Date on read — hence the union. The incremental - // backfill's needsMetadataRefresh normalises both before comparing. + // ISO string on write, Date on read (TypeORM hydrates timestamptz). @Column({ name: "updated_at", type: "timestamptz", nullable: true }) updatedAt: Date | string | null; diff --git a/packages/das/src/webhook/incremental-backfill.spec.ts b/packages/das/src/webhook/incremental-backfill.spec.ts index 30ad46b..6d12376 100644 --- a/packages/das/src/webhook/incremental-backfill.spec.ts +++ b/packages/das/src/webhook/incremental-backfill.spec.ts @@ -70,9 +70,7 @@ describe("needsMetadataRefresh", () => { expect(needsMetadataRefresh(stored(), null)).toBe(true); }); - // Production reality: TypeORM hydrates the `timestamptz` column into a Date, - // while GitHub sends an ISO string. A raw `===` never matched, so the gate - // re-fetched every PR forever. These lock in the instant-based comparison. + // Stored side is a hydrated Date, incoming is GitHub's ISO string. it("skips when a hydrated Date equals GitHub's ISO string (same instant)", () => { expect( needsMetadataRefresh( diff --git a/packages/das/src/webhook/incremental-backfill.ts b/packages/das/src/webhook/incremental-backfill.ts index b7a457b..54f9ca3 100644 --- a/packages/das/src/webhook/incremental-backfill.ts +++ b/packages/das/src/webhook/incremental-backfill.ts @@ -5,9 +5,7 @@ export interface StoredPrState { headSha: string | null; baseSha: string | null; - // Read back from a `timestamptz` column, so at runtime TypeORM hydrates this - // into a Date even though the entity annotates it as a string; tests pass - // ISO strings. needsMetadataRefresh normalises both shapes before comparing. + // Date from TypeORM (timestamptz), string in tests. updatedAt: Date | string | null; scoringDataStored: boolean; } @@ -26,12 +24,7 @@ export function needsContentRefresh( ); } -// Normalise either a Date (what TypeORM hydrates a `timestamptz` column into) -// or an ISO string (what the GitHub GraphQL API returns, and what tests pass) -// to an epoch-ms instant. Returns null for null/unparseable input so callers -// fail safe. Comparing raw values directly would never match: a Date is never -// `===` a string, and even the DB's text form (`2026-06-01 00:00:00+00`) -// differs from GitHub's ISO form (`2026-06-01T00:00:00Z`). +// Normalise a Date or ISO string to an epoch-ms instant; null if unparseable. function toEpochMs(value: Date | string | null): number | null { if (value == null) return null; const ms = @@ -39,10 +32,7 @@ function toEpochMs(value: Date | string | null): number | null { return Number.isNaN(ms) ? null : ms; } -// updatedAt bumps on edits, state changes, merges, closes and link changes, so -// skip the PR_METADATA fetch only when it matches the stored value. Compare by -// instant, not raw value: the stored side round-trips through a timestamptz -// column (hydrated as a Date), the incoming side is GitHub's ISO string. +// Skip the PR_METADATA fetch only when GitHub's updatedAt instant is unchanged. export function needsMetadataRefresh( stored: StoredPrState | null | undefined, updatedAt: string | null,