diff --git a/apps/api/scripts/import-laddr/translators.ts b/apps/api/scripts/import-laddr/translators.ts index 7aafa0a..7886c0a 100644 --- a/apps/api/scripts/import-laddr/translators.ts +++ b/apps/api/scripts/import-laddr/translators.ts @@ -218,13 +218,19 @@ type Namespace = (typeof VALID_NAMESPACES)[number]; * (`topictransit`); when the source row has a `Title` like `topic.Transit` * we recover the namespace from there. Both Handle and the slug component * are lowercased; slug-shape normalization happens at the call site. + * + * When neither Handle nor Title supplies a `topic.`/`tech.`/`event.` prefix + * (~12% of laddr tags — these were created via autocomplete-create without + * typing a namespace), we default to `topic` rather than skip. An audit + * warning is emitted so operators can re-namespace them later via tooling. + * Per specs/data-model.md#tag. */ export function splitTagHandle( handle: string, title: string | null, warnings: Warnings, legacyId: number, -): { namespace: Namespace; slug: string } | null { +): { namespace: Namespace; slug: string } { const tryFrom = (s: string): { namespace: Namespace; slug: string } | null => { const dotIdx = s.indexOf('.'); if (dotIdx === -1) return null; @@ -245,9 +251,9 @@ export function splitTagHandle( return fromTitle; } warnings.push( - `[tags] legacyId=${legacyId} handle "${handle}" has no resolvable namespace; skipped`, + `[tags] legacyId=${legacyId} handle "${handle}" has no resolvable namespace; defaulted to topic`, ); - return null; + return { namespace: 'topic', slug: handle.toLowerCase() }; } // --------------------------------------------------------------------------- @@ -275,6 +281,16 @@ function validHttps(s: string | null): string | null { } } +function validUrl(s: string | null): string | null { + if (s === null) return null; + try { + const u = new URL(s); + return u.protocol === 'http:' || u.protocol === 'https:' ? u.toString() : null; + } catch { + return null; + } +} + /** * Coerce a freeform chat-channel string (laddr returns things like * `Benefit-Decision-Toolkit` or `#general` or `food.access`) into the v1 @@ -449,7 +465,6 @@ export function translateTag(row: RawTag, ctx: TranslateCtx): Tag | null { return null; } const split = splitTagHandle(handle, nonEmptyStr(row.Title), ctx.warnings, legacyId); - if (!split) return null; // The slug component derived from a handle like `topic.urban_design` can // contain underscores. Tag slugs only allow `[a-z0-9-]` — coerce, but @@ -594,7 +609,7 @@ export function translateBuzz( ); return null; } - const url = validHttps(nonEmptyStr(row.URL)); + const url = validUrl(nonEmptyStr(row.URL)); if (!url) { ctx.warnings.push( `[project-buzz] legacyId=${legacyId} missing/invalid URL; skipped`, diff --git a/apps/api/src/store/store.ts b/apps/api/src/store/store.ts index bc52702..0726d91 100644 --- a/apps/api/src/store/store.ts +++ b/apps/api/src/store/store.ts @@ -55,17 +55,21 @@ export class Store { } /** - * Replace the public-store snapshot. Used by the hot-reload webhook - * (`POST /api/_internal/reload-data`): gitsheets caches the dataTree - * each Sheet was opened against, so after a fast-forward merge the - * Sheet handles are stale. Re-open the store and swap it in here so - * subsequent direct reads (`store.public..query(...)`) see the - * new tree. + * Replace the public-store snapshot. * - * The `transact` path doesn't need this — `repo.transact` builds a - * fresh workspace from the parent commit on every call. But the - * revocation sweeper and any other consumer that reads via the cached - * Sheets must pick up the new dataTree. + * gitsheets `Sheet` objects cache the `dataTree` they were opened + * against and never refresh — direct `sheet.query*()` calls after a + * commit (or any working-tree-changing op) read from the pre-commit + * tree. The `transact` path itself is unaffected because `repo.transact` + * builds a fresh workspace from HEAD on every call, and route handlers + * read from the typed in-memory `Store` (kept in lockstep by + * `StateApply.upsert*`). + * + * Use this to swap in a freshly-opened `PublicStore` after an out-of-band + * reconcile or hot-reload has advanced HEAD. See + * specs/behaviors/storage.md → "Direct gitsheets reads after a transact" + * for the limitation on sheets not loaded into the typed Store + * (`slug-history`, `revocations`). */ swapPublic(newPublic: PublicStore): void { this.#public = newPublic; diff --git a/apps/api/tests/account-claim.test.ts b/apps/api/tests/account-claim.test.ts index 43d8ce8..eee5fc3 100644 --- a/apps/api/tests/account-claim.test.ts +++ b/apps/api/tests/account-claim.test.ts @@ -700,7 +700,11 @@ describe('Post-onboarding /account-claim/legacy search + merge approval', () => const fresh = app.inMemoryState.people.get(freshId); expect(fresh).toBeUndefined(); - // slug-history entry created for old → new + // slug-history entry created for old → new. We read it via `git show` + // rather than `app.store.public['slug-history'].queryAll()` — that sheet + // isn't loaded into the typed in-memory Store, so the standing Sheet + // handle caches the pre-transact dataTree and returns []. See #47 and + // specs/behaviors/storage.md → "Direct gitsheets reads after a transact". const showRes = await exec('git', ['show', 'HEAD:slug-history/person/fresh-new.toml'], { cwd: dataRepo.path, }); diff --git a/apps/api/tests/import-laddr.test.ts b/apps/api/tests/import-laddr.test.ts index 6af273f..0b725a8 100644 --- a/apps/api/tests/import-laddr.test.ts +++ b/apps/api/tests/import-laddr.test.ts @@ -221,12 +221,14 @@ describe('translateTag', () => { expect(tag!.slug).toBe('transit'); }); - it('skips bare handles with no namespace anywhere', () => { + it('defaults bare handles with no namespace to topic', () => { const c = ctx(); const row: RawTag = { ID: 11, Class: 'Tag', Handle: 'cocoa', Title: 'cocoa' }; const tag = translateTag(row, c); - expect(tag).toBeNull(); - expect(c.warnings.items.some((w) => w.includes('no resolvable namespace'))).toBe(true); + expect(tag).not.toBeNull(); + expect(tag!.namespace).toBe('topic'); + expect(tag!.slug).toBe('cocoa'); + expect(c.warnings.items.some((w) => w.includes('defaulted to topic'))).toBe(true); }); it('coerces underscores in the slug component', () => { @@ -244,9 +246,26 @@ describe('translateTag', () => { }); describe('splitTagHandle', () => { - it('rejects unknown namespaces', () => { + it('defaults unknown namespaces to topic with a warning', () => { const warnings = { items: [] as string[], push: (w: string) => warnings.items.push(w) }; - expect(splitTagHandle('weird.foo', null, warnings, 1)).toBeNull(); + expect(splitTagHandle('weird.foo', null, warnings, 1)).toEqual({ + namespace: 'topic', + slug: 'weird.foo', + }); + expect(warnings.items).toEqual([ + '[tags] legacyId=1 handle "weird.foo" has no resolvable namespace; defaulted to topic', + ]); + }); + + it('defaults bare-word handles to topic', () => { + const warnings = { items: [] as string[], push: (w: string) => warnings.items.push(w) }; + expect(splitTagHandle('naloxone', null, warnings, 1)).toEqual({ + namespace: 'topic', + slug: 'naloxone', + }); + expect(warnings.items).toEqual([ + '[tags] legacyId=1 handle "naloxone" has no resolvable namespace; defaulted to topic', + ]); }); it('handles event namespace', () => { diff --git a/packages/shared/src/schemas/project-buzz.ts b/packages/shared/src/schemas/project-buzz.ts index ee236b8..a5fc3b9 100644 --- a/packages/shared/src/schemas/project-buzz.ts +++ b/packages/shared/src/schemas/project-buzz.ts @@ -10,7 +10,7 @@ export const ProjectBuzzSchema = z.object({ postedById: z.string().uuid().nullable().optional(), slug: z.string().min(1), headline: z.string().min(1).max(200), - url: z.string().url().startsWith('https://'), + url: z.string().url(), publishedAt: z.string().datetime({ offset: true }), summary: z.string().nullable().optional(), imageKey: z.string().nullable().optional(), diff --git a/packages/shared/tests/schemas.test.ts b/packages/shared/tests/schemas.test.ts index b446c88..c422cc4 100644 --- a/packages/shared/tests/schemas.test.ts +++ b/packages/shared/tests/schemas.test.ts @@ -203,7 +203,7 @@ describe('ProjectBuzzSchema', () => { expect(result.success).toBe(true); }); - it('rejects non-https url', () => { + it('accepts http:// urls (legacy press links)', () => { const result = ProjectBuzzSchema.safeParse({ id: uuid, projectId: uuid, @@ -214,6 +214,20 @@ describe('ProjectBuzzSchema', () => { createdAt: now, updatedAt: now, }); + expect(result.success).toBe(true); + }); + + it('rejects malformed urls', () => { + const result = ProjectBuzzSchema.safeParse({ + id: uuid, + projectId: uuid, + slug: 'great-article', + headline: 'Great Article', + url: 'not a url', + publishedAt: now, + createdAt: now, + updatedAt: now, + }); expect(result.success).toBe(false); }); }); diff --git a/plans/importer-cleanup-trio.md b/plans/importer-cleanup-trio.md new file mode 100644 index 0000000..72ac2b3 --- /dev/null +++ b/plans/importer-cleanup-trio.md @@ -0,0 +1,96 @@ +--- +status: done +depends: [] +specs: + - specs/data-model.md + - specs/behaviors/storage.md +issues: [47, 56, 58] +pr: 74 +--- + +# Plan: Importer cleanup trio + +## Scope + +Three small importer-and-schema fixes surfaced by the legacy-import dry run and the account-claim test, bundled because they're each tiny on their own but share the spec-→-importer surface. Together they reduce data loss on the published branch and document one gitsheets-internal limitation that nipped a test pattern. + +- **#56** — relax `ProjectBuzz.url` so the importer stops dropping 81 of 113 records on `http://` URLs (legitimate historical press links). +- **#58** — default tags with no resolvable namespace to `topic` instead of skipping ~120 of them. +- **#47** — document the gitsheets `Sheet#dataTree` staleness limitation that the account-claim test stumbled into; verify no live exposure; file upstream for a future refresh API. + +## Implements + +- [data-model.md#projectbuzz](../specs/data-model.md#projectbuzz) — drop the `https://` requirement on `ProjectBuzz.url`; allow any valid URL. +- [data-model.md#tag](../specs/data-model.md#tag) — note that the importer defaults unnamespaced legacy handles to `topic`, with an audit-friendly warning log. +- [behaviors/storage.md](../specs/behaviors/storage.md) — add a section on the gitsheets dataTree-caching limitation and the in-memory-state pattern that bypasses it. + +## Approach + +### A. #56 — relax `ProjectBuzz.url` to any valid URL + +The current schema (`packages/shared/src/schemas/project-buzz.ts:13`) enforces `z.string().url().startsWith('https://')`. Eighty-one of 113 laddr buzz records have `http://` URLs to mid-2010s press articles that are still served as plain HTTP on `codeforphilly.org` today. The fidelity loss outweighs the marginal security value of refusing them. + +- **Schema change**: drop the `.startsWith('https://')` on `ProjectBuzz.url`. Keep `.url()` validation. +- **Importer change**: drop the `validHttps` fallback in `apps/api/scripts/import-laddr/translators.ts` for the buzz translator; pass through any well-formed URL. +- **Spec change**: in `specs/data-model.md#projectbuzz`, update the `url` row to drop the protocol requirement, with a note that historical `http://` URLs are preserved. +- **Web behavior**: confirm the buzz feed renders `http://` URLs (no `target="_blank" rel="noopener"` issues, no Content-Security-Policy bumps). The links render as ordinary `` already. + +### B. #58 — default unnamespaced tags to `topic` + +Current importer (`translators.ts:222–251`) skips any tag whose handle has no `topic.`/`tech.`/`event.` prefix and whose `Title` doesn't supply one. ~120 tags hit this — mostly low-traffic org/event keywords (`naloxone`, `cocoa`, `organizing_team`). + +- **Importer change**: in `splitTagHandle`, when neither handle nor title yields a valid namespace, default to `{ namespace: 'topic', slug: handleNormalized }` and emit a `[tags] legacyId= handle "" had no resolvable namespace; defaulted to topic` warning. Keep the existing `tryFrom` recovery logic intact — namespaced handles still resolve to their explicit namespace. +- **Spec change**: in `specs/data-model.md#tag`, add a short "legacy import" note that handles without a namespace land in `topic`, callable out as an audit-friendly default that operators can re-namespace later via a tag-rename tool (out of scope here). +- No schema change — `Tag.namespace` remains the `topic | tech | event` enum. + +### C. #47 — document the gitsheets `Sheet#dataTree` staleness limitation + +Investigation (out-of-band agent): every gitsheets `Sheet` instance caches its `dataTree` snapshot at `openStore` time. After `repo.transact` commits a new tree, the standing `Sheet` objects in `store.public` still point at the pre-commit tree, so `sheet.query()` / `queryAll()` / `findByIndex()` calls return stale data. The hot-reload path already addresses this via `Store.swapPublic` (`apps/api/src/store/store.ts:60–81`). + +Live production exposure: **none today.** No route handler reads from `slug-history` or `revocations` via gitsheets after a write in the same request. `InMemoryRevocationStore` operates on its own Maps; the future slug-history redirect handler will need to use an in-memory map too (the same pattern as `people`). + +- **Documentation**: add a "Direct gitsheets reads after a transact" section to `specs/behaviors/storage.md` explaining the limitation and the in-memory-state pattern. Tighten the JSDoc on `Store.swapPublic` to mention this. +- **Test ergonomics**: the failing account-claim test stays on the `git show HEAD:…` fallback for now — the comment in the test gets updated to point at the new spec section instead of describing the symptom. +- **Upstream**: file a gitsheets enhancement request for an explicit per-sheet or per-store `refresh()` API. Add the issue link to the spec section. +- **No runtime code change** in this plan beyond the JSDoc. + +## Validation + +- [x] `packages/shared/src/schemas/project-buzz.ts` — `url` no longer constrained to `startsWith('https://')`; type-check passes. +- [x] `apps/api/scripts/import-laddr/translators.ts` — buzz translator passes through any well-formed URL; tag splitter defaults to `topic` with a warning. +- [x] Dry-run importer against the live laddr snapshot. Counts: + - ProjectBuzz: 32 → **112** (1 record still legitimately skipped: `legacyId=118 project=388 — unresolved FK`, not a URL issue). + - Tags: 885 → **1017** (132 newly defaulted to `topic`; matches the "~120" estimate in #58). + - No new errors in either pass. +- [x] `npm run -w apps/api test` — importer + account-claim tests pass; full sweep clean. +- [x] `npm run type-check && npm run lint` clean. +- [x] `specs/data-model.md` reflects ProjectBuzz + Tag changes; `specs/behaviors/storage.md` has the new dataTree-staleness section. +- [ ] Upstream gitsheets enhancement filed; link recorded in the storage.md section. **Deferred** — see Follow-ups. +- [ ] Sandbox: after a deploy with the new importer's output merged into `published`, the project-detail pages with buzz feeds render the previously-skipped buzz items. **Deferred** — runs at the next deploy cadence; the live `legacy-import` re-run waits until the relaxed schema is in the running pod. + +## Risks / unknowns + +- **`http://` content-security**: a few of the 81 newly-imported buzz items may link to destinations that now serve malware or have been domain-squatted. Out of scope for this plan; future moderation tooling will need to handle this regardless of the URL protocol. Acceptable risk for the import. +- **Default-to-topic taxonomy drift**: adding ~120 noise tags to the `topic` namespace pollutes browsing surfaces that filter by namespace. Mitigation: the warning log makes the legacy ones enumerable, and the future tag-rename tool can re-classify them. Acceptable in exchange for not losing tag data. +- **Upstream gitsheets fix timing**: filing an enhancement is no-commitment work for that maintainer. The doc-and-pattern approach gives us a path that doesn't depend on it. + +## Notes + +Three small fixes paired with their spec updates. Each landed as its own commit on the trio branch: + +- `fix(schemas): allow http:// urls on ProjectBuzz` — schema + data-model spec. +- `fix(importer): default unnamespaced tags to topic; pass http urls through` — `splitTagHandle` no longer returns null; the call site in `translateTag` dropped its null check; `validUrl` sibling helper accepts both schemes while `validHttps` stays for Project's `usersUrl`/`developersUrl`. +- `docs(storage): document gitsheets dataTree caching limitation` — storage.md section + tightened JSDoc on `Store.swapPublic`. + +Surprises: + +- The buzz `url` test in `packages/shared/tests/schemas.test.ts` was previously "rejects non-https url" and asserted the rejection. Inverted to "accepts http:// urls" and added a malformed-URL case so the `.url()` floor is still test-covered. +- The data-model.md edit for the Tag legacy-import policy landed inside the #56 schema-relaxation commit rather than the #58 importer commit. Cohesive enough — both spec edits are adjacent in the file — but a small commit-message-vs-diff mismatch worth being aware of in the log. + +The live `legacy-import` re-run intentionally did **not** run from this branch. The relaxed schema needs to ship to the sandbox pod before any commit with `http://` URLs gets merged into `published`, otherwise validation would fail at boot/reload. The dry-run counts proved the importer-side fix; the actual write happens at the next deploy cadence. + +## Follow-ups + +- **File upstream gitsheets enhancement** — request a per-sheet or per-store refresh API so direct reads after a transact don't need a full re-open. *Tracked as*: needs a fresh issue against `JarvusInnovations/gitsheets`; link to be added to `specs/behaviors/storage.md` once filed. +- **Re-run the importer and merge to `published`** — after this PR + a fresh sandbox deploy. *Deferred to operator cadence*; no new plan needed. +- **Re-namespace defaulted tags** — operators may eventually want to triage the ~132 tags that landed in `topic` by default. Out of scope here; *Deferred* until a tag-management surface exists or someone surfaces a clear pain. diff --git a/specs/behaviors/storage.md b/specs/behaviors/storage.md index 95ac102..f6ba1ce 100644 --- a/specs/behaviors/storage.md +++ b/specs/behaviors/storage.md @@ -176,6 +176,18 @@ type Store = { Mutations are scoped to the HTTP request — see the [Request-bound commit lifecycle](#request-bound-commit-lifecycle) below. +### Direct gitsheets reads after a transact (caching limitation) + +gitsheets' `Sheet` objects cache the `dataTree` snapshot they were opened against, for the lifetime of the `Sheet` instance. `repo.transact` correctly builds a fresh workspace from the current HEAD each call, so writes always target the right tree — but after `transact` returns and HEAD has advanced, any standing `sheet.query()` / `queryAll()` / `findByIndex()` calls on the originally-opened `Sheet` objects in `store.public` return data from the **pre-commit** tree. + +This is why the hot-reload path uses `Store.swapPublic` (`apps/api/src/store/store.ts`) to re-open the public store after a working-tree-changing reconcile. + +**Consequence for the in-memory pattern above:** sheets whose records flow into the typed in-memory `Store` (people, projects, memberships, …) are immune — `StateApply.upsert*` mutates the in-memory Maps in lockstep with each transact, and route handlers read from those Maps. The staleness only affects code that reads directly from `store.public[''].query*()` after a write in the same logical operation. + +**Today** that means `slug-history` and `revocations` — neither has live exposure (no current route handler reads them post-write in the same request), but tests that verify a transact landed on those sheets must use `git show HEAD:…` or `swapPublic` themselves. When a future redirect handler does read slug-history post-write, the correct fix is to load it into the typed in-memory `Store` like the other sheets, not to call `swapPublic` per request. + +Upstream gitsheets enhancement tracked at the issue linked from [#47](https://github.com/CodeForPhilly/codeforphilly-ng/issues/47). + ## Commits are the audit log There is no separate audit-log table or sheet. Every mutation is a git commit with author, timestamp, full diff, structured message, and trailers. Queries that a SQL audit log would serve (`who soft-deleted project X?`, `recent staff actions this month?`) are answered by `git log --grep`, `git log --author`, and `git log -- path/to/sheet/`. diff --git a/specs/data-model.md b/specs/data-model.md index 43a879e..814367b 100644 --- a/specs/data-model.md +++ b/specs/data-model.md @@ -293,7 +293,7 @@ External media / press / "buzz" about a project. Was laddr's `project_buzz`. | postedById | uuid nullable | references people.id | | slug | string | URL-safe slug derived from headline | | headline | string | required, 1–200 chars | -| url | string | required, HTTPS valid | +| url | string | required, valid URL (any scheme). Historical laddr buzz includes mid-2010s `http://` press links still served as plain HTTP today — preserved for fidelity. | | publishedAt | iso8601 | date the original article was published | | summary | string nullable | excerpt / quote | | imageKey | string nullable | gitsheets attachment key for the article image | @@ -333,6 +333,8 @@ Polymorphic taxonomy. Replaces laddr's `tags` + `tag_items`, but with a typed `n URL: `/tags//` (was `/tags/topic.foo`). +**Legacy-import policy:** laddr tags whose `Handle` is a bare word (no `topic.`/`tech.`/`event.` prefix) and whose `Title` also lacks a prefix default to `namespace: 'topic'`. These are mostly low-traffic org/event keywords created via laddr's autocomplete-create flow without typing a namespace. The importer emits an audit warning per defaulted tag so operators can re-namespace them later via tooling. See [issue #58](https://github.com/CodeForPhilly/codeforphilly-ng/issues/58). + ## TagAssignment Polymorphic link between tags and (project | person | help_wanted_role).