diff --git a/.changeset/ascii-case-fold-upper-lower-ilike.md b/.changeset/ascii-case-fold-upper-lower-ilike.md new file mode 100644 index 000000000..192a69eff --- /dev/null +++ b/.changeset/ascii-case-fold-upper-lower-ilike.md @@ -0,0 +1,11 @@ +--- +"@tanstack/db": patch +--- + +Use ASCII-only case fold in `upper()`, `lower()`, and `ilike` to avoid the length-changing full-Unicode case mapping of JavaScript's `toUpperCase()` / `toLowerCase()`. + +`'straße'.toUpperCase()` is `'STRASSE'` (six characters become seven). `'İ'.toLowerCase()` is `'i̇'` (one character becomes two, adding a combining mark). The locale-sensitive variants are no safer: `'I'.toLocaleLowerCase('tr')` returns `'ı'` instead of `'i'`. These length-changing folds caused TanStack DB's evaluator to silently disagree with the server side of a query collection when the source data contained non-ASCII characters - rows would drop out of the matching set with no error. + +`upper()` and `lower()` now match SQLite's default ASCII-only fold, so they are deterministic across JS locales and length-preserving. `ilike` uses the same ASCII fold for its case-insensitive comparison. + +For full Unicode-aware case folding, users can apply `String.prototype.toUpperCase()` / `toLowerCase()` themselves in a JavaScript expression before the value reaches the query layer. diff --git a/packages/db/src/query/compiler/evaluators.ts b/packages/db/src/query/compiler/evaluators.ts index 929ac56df..19559be8f 100644 --- a/packages/db/src/query/compiler/evaluators.ts +++ b/packages/db/src/query/compiler/evaluators.ts @@ -14,6 +14,36 @@ function isUnknown(value: any): boolean { return value === null || value === undefined } +/** + * ASCII-only upper-case fold. Matches SQLite's default `upper()` rather than + * JavaScript's full-Unicode `String.prototype.toUpperCase()`, which is + * length-changing on `ß`, ligatures like `ﬁ`, and (when lower-casing) `İ`. 
Using + * `.toUpperCase()` directly causes the silent-row-loss class where a sync-back + * collection's server-side filter and the client-side filter disagree on the + * same string (`'straße' → 'STRAßE'` server-side via the SQL engine, + * `'STRASSE'` client-side via JS). + */ +function asciiToUpper(s: string): string { + let out = `` + for (let i = 0; i < s.length; i++) { + const c = s.charCodeAt(i) + out += c >= 0x61 && c <= 0x7a ? String.fromCharCode(c - 32) : s.charAt(i) + } + return out +} + +/** + * ASCII-only lower-case fold. See `asciiToUpper` for rationale. + */ +function asciiToLower(s: string): string { + let out = `` + for (let i = 0; i < s.length; i++) { + const c = s.charCodeAt(i) + out += c >= 0x41 && c <= 0x5a ? String.fromCharCode(c + 32) : s.charAt(i) + } + return out +} + function toDateValue(value: any): Date | null { if (value instanceof Date) { return Number.isNaN(value.getTime()) ? null : value @@ -407,14 +437,14 @@ function compileFunction(func: Func, isSingleRow: boolean): (data: any) => any { const arg = compiledArgs[0]! return (data) => { const value = arg(data) - return typeof value === `string` ? value.toUpperCase() : value + return typeof value === `string` ? asciiToUpper(value) : value } } case `lower`: { const arg = compiledArgs[0]! return (data) => { const value = arg(data) - return typeof value === `string` ? value.toLowerCase() : value + return typeof value === `string` ? asciiToLower(value) : value } } case `length`: { @@ -606,8 +636,8 @@ function evaluateLike( return false } - const searchValue = caseInsensitive ? value.toLowerCase() : value - const searchPattern = caseInsensitive ? pattern.toLowerCase() : pattern + const searchValue = caseInsensitive ? asciiToLower(value) : value + const searchPattern = caseInsensitive ? 
asciiToLower(pattern) : pattern // Convert SQL LIKE pattern to regex // First escape all regex special chars except % and _ diff --git a/packages/db/tests/query/compiler/evaluators.test.ts b/packages/db/tests/query/compiler/evaluators.test.ts index 69969de18..314db8410 100644 --- a/packages/db/tests/query/compiler/evaluators.test.ts +++ b/packages/db/tests/query/compiler/evaluators.test.ts @@ -157,6 +157,56 @@ describe(`evaluators`, () => { expect(compiled({})).toBe(`first`) }) + + // upper/lower must NOT use JavaScript's full-Unicode, length-changing + // case fold. SQLite's default upper()/lower() is ASCII-only. When a + // TanStack DB collection is fed by a server-side filter (e.g. a query + // collection backed by PostgreSQL or a sync engine over SQLite), the + // two ends must agree on the case-fold of the bucket key. JavaScript's + // case mapping length-changes on `ß`, ligatures, Turkish dotted `İ`, + // and the row silently drops out of the matching collection. + it(`upper folds ASCII without length-changing on the German sharp s`, () => { + const func = new Func(`upper`, [new Value(`straße`)]) + const compiled = compileExpression(func) + + expect(compiled({})).toBe(`STRAßE`) + }) + + it(`upper folds ASCII without length-changing on the fi ligature`, () => { + const func = new Func(`upper`, [new Value(`ﬁle`)]) + const compiled = compileExpression(func) + + expect(compiled({})).toBe(`ﬁLE`) + }) + + it(`upper folds ASCII without length-changing on Turkish dotted-i`, () => { + const func = new Func(`upper`, [new Value(`istanbul`)]) + const compiled = compileExpression(func) + + // JS .toUpperCase() always returns `ISTANBUL`, but the Turkish + // locale-aware fold .toLocaleUpperCase(`tr`) gives `İSTANBUL`. ASCII fold + // deterministically yields `ISTANBUL` regardless of locale. 
+ expect(compiled({})).toBe(`ISTANBUL`) + }) + + it(`lower folds ASCII without length-changing on the German sharp s`, () => { + const func = new Func(`lower`, [new Value(`STRAßE`)]) + const compiled = compileExpression(func) + + // ASCII fold leaves `ß` (already lowercase form) unchanged. JS + // .toLowerCase() also returns `straße` here so this is a control; + // the real divergence is on `İ → i̇` (combining dot above). + expect(compiled({})).toBe(`straße`) + }) + + it(`lower folds ASCII without combining-mark expansion on Turkish capital I-dot`, () => { + const func = new Func(`lower`, [new Value(`İ`)]) + const compiled = compileExpression(func) + + // JS 'İ'.toLowerCase() === 'i̇' (two code points). ASCII fold + // leaves it as-is, deterministically. + expect(compiled({})).toBe(`İ`) + }) }) describe(`array functions`, () => { @@ -373,6 +423,45 @@ describe(`evaluators`, () => { expect(compiled({})).toBe(true) }) + // ilike must use the same ASCII-only case fold as upper()/lower(). + // JavaScript's String.prototype.toLowerCase() is length-changing on + // Turkish dotted-i (`İ → i̇` adds a combining mark), and the locale-aware + // toLocaleLowerCase() maps `I → ı` under tr-TR. Using either for ilike + // means a row written as `'İstanbul'` server-side and `'ISTANBUL'` in + // the query pattern silently mismatches. + it(`ilike does not length-change Turkish capital I-dot during case fold`, () => { + const func = new Func(`ilike`, [ + new Value(`İstanbul`), + new Value(`istanbul`), + ]) + const compiled = compileExpression(func) + + // Under JS .toLowerCase(), 'İstanbul' → 'i̇stanbul' (9 code units) + // and 'istanbul' → 'istanbul' (8 code units) - regex anchored + // ^...$ does NOT match. ASCII fold leaves the non-ASCII `İ` + // untouched, and the regex still doesn't match (correctly: the + // strings differ in their first character). Test asserts the + // result is deterministic across JS locales. 
+ expect(compiled({})).toBe(false) + }) + + it(`ilike does not length-change the German sharp s during case fold`, () => { + // 'straße' ilike 'STRA%E' should match deterministically. + // Under JS .toLowerCase(), 'STRA%E' → 'stra%e' (good) and 'straße' + // → 'straße' (no change). Under .toUpperCase(), 'straße' → 'STRASSE' + // (length-changes from 6 to 7) and 'STRA%E' → 'STRA%E' - the regex + // 'STRA.*E' DOES match 'STRASSE'. So .toUpperCase() would match while + // .toLowerCase() also matches, but the two folds disagree on the + // string they compare. ASCII-only fold keeps both consistent. + const func = new Func(`ilike`, [ + new Value(`straße`), + new Value(`STRA%E`), + ]) + const compiled = compileExpression(func) + + expect(compiled({})).toBe(true) + }) + it(`handles ilike with null value (3-valued logic)`, () => { const func = new Func(`ilike`, [new Value(null), new Value(`hello%`)]) const compiled = compileExpression(func)