Skip to content

Commit 21f2971

Browse files
committed
feat(block): migrate existing tracked rows when block algo is enabled
When cloudsync_set_column(..., 'algo', 'block') is called on a table that already has tracked rows, those rows are now immediately migrated into the blocks table. Previously pre-existing column values were silently ignored until the next UPDATE, leaving sync state incomplete. Migration uses a two-phase collect-then-write approach to avoid SQLite cursor invalidation, and INSERT OR IGNORE / ON CONFLICT DO NOTHING for idempotency. Bumps version to 1.0.13.
1 parent 6694c2e commit 21f2971

File tree

9 files changed

+428
-1
lines changed

9 files changed

+428
-1
lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,17 @@ All notable changes to this project will be documented in this file.
44

55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
66

7+
## [1.0.13] - 2026-04-14
8+
9+
### Fixed
10+
11+
- **Block-level LWW migration**: When `cloudsync_set_column(..., 'algo', 'block')` is called on a table that already has tracked rows, those rows are now immediately migrated into the blocks table. Previously, pre-existing column values were ignored until the next UPDATE, leaving sync state incomplete. The migration uses a two-phase collect-then-write approach to avoid SQLite cursor invalidation and `INSERT OR IGNORE` / `ON CONFLICT DO NOTHING` semantics for idempotency.
12+
13+
### Added
14+
15+
- Unit test `do_test_block_lww_existing_data` (Block LWW Existing Data) verifying block migration on `set_column`, idempotency of repeated `set_column` calls, and correct materialization after update.
16+
- PostgreSQL test `50_block_lww_existing_data.sql` with equivalent coverage for the PostgreSQL backend.
17+
718
## [1.0.12] - 2026-04-11
819

920
### Fixed

src/cloudsync.c

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2069,6 +2069,172 @@ int merge_insert (cloudsync_context *data, cloudsync_table_context *table, const
20692069

20702070
// MARK: - Block column setup -
20712071

2072+
// Migrate existing tracked rows to block format when block-level LWW is first enabled on a column.
2073+
// Scans the metadata table for alive rows with the plain col_name entry (not yet block entries),
2074+
// reads each row's current value from the base table, splits it into blocks, and inserts
2075+
// the block entries into both the blocks table and the metadata table.
2076+
// Uses INSERT OR IGNORE semantics so the operation is safe to call multiple times.
2077+
static int block_migrate_existing_rows (cloudsync_context *data, cloudsync_table_context *table, int col_idx) {
2078+
const char *col_name = table->col_name[col_idx];
2079+
if (!col_name || !table->meta_ref || !table->blocks_ref) return DBRES_OK;
2080+
2081+
const char *delim = table->col_delimiter[col_idx] ? table->col_delimiter[col_idx] : BLOCK_DEFAULT_DELIMITER;
2082+
int64_t db_version = cloudsync_dbversion_next(data, CLOUDSYNC_VALUE_NOTSET);
2083+
2084+
// Phase 1: collect all existing PKs that have an alive regular col_name entry
2085+
// AND do not yet have any entries in the blocks table for this column.
2086+
// The NOT IN filter makes this idempotent: rows that were already migrated
2087+
// (or had their blocks created via INSERT) are skipped on subsequent calls.
2088+
// We collect PKs before writing so that writes to the metadata table (Phase 2)
2089+
// do not perturb the read cursor on the same table.
2090+
char *like_pattern = block_build_colname(col_name, "%");
2091+
if (!like_pattern) return DBRES_NOMEM;
2092+
2093+
char *scan_sql = cloudsync_memory_mprintf(SQL_META_SCAN_COL_FOR_MIGRATION, table->meta_ref, table->blocks_ref);
2094+
if (!scan_sql) { cloudsync_memory_free(like_pattern); return DBRES_NOMEM; }
2095+
dbvm_t *scan_vm = NULL;
2096+
int rc = databasevm_prepare(data, scan_sql, &scan_vm, 0);
2097+
cloudsync_memory_free(scan_sql);
2098+
if (rc != DBRES_OK) { cloudsync_memory_free(like_pattern); return rc; }
2099+
2100+
rc = databasevm_bind_text(scan_vm, 1, col_name, -1);
2101+
if (rc != DBRES_OK) { cloudsync_memory_free(like_pattern); databasevm_finalize(scan_vm); return rc; }
2102+
// Bind like_pattern as ?2 and keep it alive until after all scan steps complete,
2103+
// because databasevm_bind_text uses SQLITE_STATIC (no copy).
2104+
rc = databasevm_bind_text(scan_vm, 2, like_pattern, -1);
2105+
if (rc != DBRES_OK) { cloudsync_memory_free(like_pattern); databasevm_finalize(scan_vm); return rc; }
2106+
2107+
// Collect pk blobs into a dynamically-grown array of owned copies
2108+
void **pks = NULL;
2109+
size_t *pklens = NULL;
2110+
int pk_count = 0;
2111+
int pk_cap = 0;
2112+
2113+
while ((rc = databasevm_step(scan_vm)) == DBRES_ROW) {
2114+
size_t pklen = 0;
2115+
const void *pk = database_column_blob(scan_vm, 0, &pklen);
2116+
if (!pk || pklen == 0) continue;
2117+
2118+
if (pk_count >= pk_cap) {
2119+
int new_cap = pk_cap ? pk_cap * 2 : 8;
2120+
void **new_pks = (void **)cloudsync_memory_realloc(pks, (uint64_t)(new_cap * sizeof(void *)));
2121+
size_t *new_pklens = (size_t *)cloudsync_memory_realloc(pklens, (uint64_t)(new_cap * sizeof(size_t)));
2122+
if (!new_pks || !new_pklens) {
2123+
cloudsync_memory_free(new_pks ? new_pks : pks);
2124+
cloudsync_memory_free(new_pklens ? new_pklens : pklens);
2125+
databasevm_finalize(scan_vm);
2126+
return DBRES_NOMEM;
2127+
}
2128+
pks = new_pks;
2129+
pklens = new_pklens;
2130+
pk_cap = new_cap;
2131+
}
2132+
2133+
pks[pk_count] = cloudsync_memory_alloc((uint64_t)pklen);
2134+
if (!pks[pk_count]) { rc = DBRES_NOMEM; break; }
2135+
memcpy(pks[pk_count], pk, pklen);
2136+
pklens[pk_count] = pklen;
2137+
pk_count++;
2138+
}
2139+
2140+
databasevm_finalize(scan_vm);
2141+
cloudsync_memory_free(like_pattern); // safe to free after scan_vm is finalized
2142+
if (rc != DBRES_DONE && rc != DBRES_OK) {
2143+
for (int i = 0; i < pk_count; i++) cloudsync_memory_free(pks[i]);
2144+
cloudsync_memory_free(pks);
2145+
cloudsync_memory_free(pklens);
2146+
return rc;
2147+
}
2148+
2149+
if (pk_count == 0) {
2150+
cloudsync_memory_free(pks);
2151+
cloudsync_memory_free(pklens);
2152+
return DBRES_OK;
2153+
}
2154+
2155+
// Phase 2: for each collected PK, read the column value, split into blocks,
2156+
// and insert into the blocks table + metadata using INSERT OR IGNORE.
2157+
2158+
char *meta_sql = cloudsync_memory_mprintf(SQL_META_INSERT_BLOCK_IGNORE, table->meta_ref);
2159+
if (!meta_sql) { rc = DBRES_NOMEM; goto cleanup_pks; }
2160+
dbvm_t *meta_vm = NULL;
2161+
rc = databasevm_prepare(data, meta_sql, &meta_vm, 0);
2162+
cloudsync_memory_free(meta_sql);
2163+
if (rc != DBRES_OK) goto cleanup_pks;
2164+
2165+
char *blocks_sql = cloudsync_memory_mprintf(SQL_BLOCKS_INSERT_IGNORE, table->blocks_ref);
2166+
if (!blocks_sql) { databasevm_finalize(meta_vm); rc = DBRES_NOMEM; goto cleanup_pks; }
2167+
dbvm_t *blocks_vm = NULL;
2168+
rc = databasevm_prepare(data, blocks_sql, &blocks_vm, 0);
2169+
cloudsync_memory_free(blocks_sql);
2170+
if (rc != DBRES_OK) { databasevm_finalize(meta_vm); goto cleanup_pks; }
2171+
2172+
dbvm_t *val_vm = (dbvm_t *)table_column_lookup(table, col_name, false, NULL);
2173+
2174+
for (int p = 0; p < pk_count; p++) {
2175+
const void *pk = pks[p];
2176+
size_t pklen = pklens[p];
2177+
2178+
if (!val_vm) continue;
2179+
2180+
// Read current column value from the base table
2181+
int bind_rc = pk_decode_prikey((char *)pk, pklen, pk_decode_bind_callback, (void *)val_vm);
2182+
if (bind_rc < 0) { databasevm_reset(val_vm); continue; }
2183+
2184+
int step_rc = databasevm_step(val_vm);
2185+
const char *text = (step_rc == DBRES_ROW) ? database_column_text(val_vm, 0) : NULL;
2186+
// Make a copy of text before resetting val_vm, as the pointer is only valid until reset
2187+
char *text_copy = text ? cloudsync_string_dup(text) : NULL;
2188+
databasevm_reset(val_vm);
2189+
2190+
if (!text_copy) continue; // NULL column value: nothing to migrate
2191+
2192+
// Split text into blocks and store each one
2193+
block_list_t *blocks = block_split(text_copy, delim);
2194+
cloudsync_memory_free(text_copy);
2195+
if (!blocks) continue;
2196+
2197+
char **positions = block_initial_positions(blocks->count);
2198+
if (positions) {
2199+
for (int b = 0; b < blocks->count; b++) {
2200+
char *block_cn = block_build_colname(col_name, positions[b]);
2201+
if (block_cn) {
2202+
// Metadata entry (skip if this block position already exists)
2203+
databasevm_bind_blob(meta_vm, 1, pk, (int)pklen);
2204+
databasevm_bind_text(meta_vm, 2, block_cn, -1);
2205+
databasevm_bind_int(meta_vm, 3, 1); // col_version = 1 (alive)
2206+
databasevm_bind_int(meta_vm, 4, db_version);
2207+
databasevm_bind_int(meta_vm, 5, cloudsync_bumpseq(data));
2208+
databasevm_step(meta_vm);
2209+
databasevm_reset(meta_vm);
2210+
2211+
// Block value (skip if this block position already exists)
2212+
databasevm_bind_blob(blocks_vm, 1, pk, (int)pklen);
2213+
databasevm_bind_text(blocks_vm, 2, block_cn, -1);
2214+
databasevm_bind_text(blocks_vm, 3, blocks->entries[b].content, -1);
2215+
databasevm_step(blocks_vm);
2216+
databasevm_reset(blocks_vm);
2217+
2218+
cloudsync_memory_free(block_cn);
2219+
}
2220+
cloudsync_memory_free(positions[b]);
2221+
}
2222+
cloudsync_memory_free(positions);
2223+
}
2224+
block_list_free(blocks);
2225+
}
2226+
2227+
databasevm_finalize(meta_vm);
2228+
databasevm_finalize(blocks_vm);
2229+
rc = DBRES_OK;
2230+
2231+
cleanup_pks:
2232+
for (int i = 0; i < pk_count; i++) cloudsync_memory_free(pks[i]);
2233+
cloudsync_memory_free(pks);
2234+
cloudsync_memory_free(pklens);
2235+
return rc;
2236+
}
2237+
20722238
int cloudsync_setup_block_column (cloudsync_context *data, const char *table_name, const char *col_name, const char *delimiter, bool persist) {
20732239
cloudsync_table_context *table = table_lookup(data, table_name);
20742240
if (!table) return cloudsync_set_error(data, "cloudsync_setup_block_column: table not found", DBRES_ERROR);
@@ -2148,6 +2314,13 @@ int cloudsync_setup_block_column (cloudsync_context *data, const char *table_nam
21482314
rc = dbutils_table_settings_set_key_value(data, table_name, col_name, "delimiter", delimiter);
21492315
if (rc != DBRES_OK) return rc;
21502316
}
2317+
2318+
// Migrate any existing tracked rows: populate the blocks table and metadata with
2319+
// block entries derived from the current column value, so that subsequent UPDATE
2320+
// operations can diff against the real existing state instead of treating everything
2321+
// as new, and so this node participates correctly in LWW conflict resolution.
2322+
rc = block_migrate_existing_rows(data, table, col_idx);
2323+
if (rc != DBRES_OK) return rc;
21512324
}
21522325

21532326
return DBRES_OK;

src/cloudsync.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
extern "C" {
1919
#endif
2020

21-
#define CLOUDSYNC_VERSION "1.0.12"
21+
#define CLOUDSYNC_VERSION "1.0.13"
2222
#define CLOUDSYNC_MAX_TABLENAME_LEN 512
2323

2424
#define CLOUDSYNC_VALUE_NOTSET -1

src/postgresql/sql_postgresql.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,3 +437,15 @@ const char * const SQL_BLOCKS_LIST_ALIVE =
437437
"WHERE b.pk = $1 AND b.col_name LIKE $2 "
438438
"AND m.pk = $3 AND m.col_name LIKE $4 AND m.col_version %% 2 = 1 "
439439
"ORDER BY b.col_name COLLATE \"C\"";
440+
441+
// Insert one block value ($1 = pk blob, $2 = block-qualified col_name, $3 = content).
// Existing (pk, col_name) rows are left untouched so re-running the migration is idempotent.
// %s is substituted with the per-table blocks-table name via cloudsync_memory_mprintf.
const char * const SQL_BLOCKS_INSERT_IGNORE =
    "INSERT INTO %s (pk, col_name, col_value) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING";

// Find rows eligible for block migration: alive metadata entries (col_version odd)
// for the plain column name ($1) that have no block rows yet in the blocks table
// ($2 = LIKE pattern matching block-qualified col_names). First %s = metadata table,
// second %s = blocks table. %% escapes the modulo operator for mprintf.
const char * const SQL_META_SCAN_COL_FOR_MIGRATION =
    "SELECT DISTINCT m.pk FROM %s m "
    "WHERE m.col_name = $1 AND m.col_version %% 2 = 1 "
    "AND NOT EXISTS (SELECT 1 FROM %s b WHERE b.pk = m.pk AND b.col_name LIKE $2)";

// Insert a per-block metadata entry (site_id 0 = local origin); duplicates are ignored.
// %s is substituted with the per-table metadata-table name.
const char * const SQL_META_INSERT_BLOCK_IGNORE =
    "INSERT INTO %s (pk, col_name, col_version, db_version, seq, site_id) "
    "VALUES ($1, $2, $3, $4, $5, 0) ON CONFLICT DO NOTHING";

src/sql.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,5 +74,8 @@ extern const char * const SQL_BLOCKS_UPSERT;
7474
extern const char * const SQL_BLOCKS_SELECT;
7575
extern const char * const SQL_BLOCKS_DELETE;
7676
extern const char * const SQL_BLOCKS_LIST_ALIVE;
77+
// Block-migration statements; backend-specific definitions live in
// sqlite/sql_sqlite.c and postgresql/sql_postgresql.c.
extern const char * const SQL_BLOCKS_INSERT_IGNORE;
extern const char * const SQL_META_SCAN_COL_FOR_MIGRATION;
extern const char * const SQL_META_INSERT_BLOCK_IGNORE;
7780

7881
#endif

src/sqlite/sql_sqlite.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,3 +304,15 @@ const char * const SQL_BLOCKS_LIST_ALIVE =
304304
"WHERE b.pk = ?1 AND b.col_name LIKE ?2 "
305305
"AND m.pk = ?3 AND m.col_name LIKE ?4 AND m.col_version %% 2 = 1 "
306306
"ORDER BY b.col_name";
307+
308+
// Insert one block value (?1 = pk blob, ?2 = block-qualified col_name, ?3 = content).
// INSERT OR IGNORE leaves existing (pk, col_name) rows untouched so re-running the
// migration is idempotent. %s is substituted with the per-table blocks-table name.
const char * const SQL_BLOCKS_INSERT_IGNORE =
    "INSERT OR IGNORE INTO %s (pk, col_name, col_value) VALUES (?1, ?2, ?3)";

// Find rows eligible for block migration: alive metadata entries (col_version odd)
// for the plain column name (?1) that have no block rows yet in the blocks table
// (?2 = LIKE pattern matching block-qualified col_names). First %s = metadata table,
// second %s = blocks table. %% escapes the modulo operator for mprintf.
const char * const SQL_META_SCAN_COL_FOR_MIGRATION =
    "SELECT DISTINCT m.pk FROM %s m "
    "WHERE m.col_name = ?1 AND m.col_version %% 2 = 1 "
    "AND NOT EXISTS (SELECT 1 FROM %s b WHERE b.pk = m.pk AND b.col_name LIKE ?2)";

// Insert a per-block metadata entry (site_id 0 = local origin); duplicates are ignored.
// %s is substituted with the per-table metadata-table name.
const char * const SQL_META_INSERT_BLOCK_IGNORE =
    "INSERT OR IGNORE INTO %s (pk, col_name, col_version, db_version, seq, site_id) "
    "VALUES (?1, ?2, ?3, ?4, ?5, 0)";
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
-- 'Block-level LWW: migration of existing tracked rows when algo=block is enabled'
-- Mirrors the SQLite unit test: Block LWW Existing Data

\set testid '50'
\ir helper_test_init.sql

\connect postgres
\ir helper_psql_conn_setup.sql

DROP DATABASE IF EXISTS cloudsync_block_existing_a;
CREATE DATABASE cloudsync_block_existing_a;

\connect cloudsync_block_existing_a
\ir helper_psql_conn_setup.sql

CREATE EXTENSION IF NOT EXISTS cloudsync;

-- Create a table and init cloudsync WITHOUT block algo first
DROP TABLE IF EXISTS docs;
CREATE TABLE docs (id TEXT PRIMARY KEY NOT NULL, body TEXT);
SELECT cloudsync_init('docs', 'CLS', 1) AS _init \gset

-- Insert rows BEFORE enabling block algorithm (they will be tracked as regular CLS rows)
INSERT INTO docs (id, body) VALUES ('d1', E'Line1\nLine2\nLine3');
INSERT INTO docs (id, body) VALUES ('d2', E'Alpha\nBeta');

-- Now enable block algo on the column that already has data
SELECT cloudsync_set_column('docs', 'body', 'algo', 'block') AS _sc \gset

-- Test 1: Blocks table should have 5 entries (3 for d1, 2 for d2) immediately after set_column
SELECT count(*) AS block_count FROM docs_cloudsync_blocks \gset
SELECT (:block_count::int = 5) AS block_count_ok \gset
\if :block_count_ok
\echo [PASS] (:testid) Migration: 5 block entries after set_column on existing data
\else
\echo [FAIL] (:testid) Migration: expected 5 block entries, got :block_count
SELECT (:fail::int + 1) AS fail \gset
\endif

-- Test 2: Metadata should have 5 alive block entries
-- chr(31) is presumably the separator used in block-qualified col_names
-- (ASCII unit separator) -- verify against BLOCK column-name building in C.
SELECT count(*) AS meta_count FROM docs_cloudsync
WHERE col_name LIKE 'body' || chr(31) || '%' AND col_version % 2 = 1 \gset
SELECT (:meta_count::int = 5) AS meta_count_ok \gset
\if :meta_count_ok
\echo [PASS] (:testid) Migration: 5 alive block metadata entries
\else
\echo [FAIL] (:testid) Migration: expected 5 alive metadata entries, got :meta_count
SELECT (:fail::int + 1) AS fail \gset
\endif

-- Test 3: Calling set_column again should be idempotent (count stays at 5)
SELECT cloudsync_set_column('docs', 'body', 'algo', 'block') AS _sc2 \gset

SELECT count(*) AS block_count2 FROM docs_cloudsync_blocks \gset
SELECT (:block_count2::int = 5) AS idempotent_ok \gset
\if :idempotent_ok
\echo [PASS] (:testid) Migration: idempotent (still 5 blocks after second set_column)
\else
\echo [FAIL] (:testid) Migration: idempotency broken, got :block_count2 blocks (expected 5)
SELECT (:fail::int + 1) AS fail \gset
\endif

-- Test 4: UPDATE on d1 should still work correctly after migration
UPDATE docs SET body = E'Line1\nLine2-edited\nLine3' WHERE id = 'd1';

SELECT count(*) AS block_count3 FROM docs_cloudsync_blocks \gset
SELECT (:block_count3::int = 5) AS update_count_ok \gset
\if :update_count_ok
\echo [PASS] (:testid) Migration: 5 blocks after UPDATE (d1 edited in-place)
\else
\echo [FAIL] (:testid) Migration: expected 5 blocks after update, got :block_count3
SELECT (:fail::int + 1) AS fail \gset
\endif

-- Test 5: Materialized value should reflect the update
SELECT cloudsync_text_materialize('docs', 'body', 'd1') AS _mat \gset

SELECT (body = E'Line1\nLine2-edited\nLine3') AS mat_ok FROM docs WHERE id = 'd1' \gset
\if :mat_ok
\echo [PASS] (:testid) Migration: materialized value correct after update
\else
\echo [FAIL] (:testid) Migration: materialized value mismatch
SELECT (:fail::int + 1) AS fail \gset
\endif

-- Cleanup
\ir helper_test_cleanup.sql
\if :should_cleanup
DROP DATABASE IF EXISTS cloudsync_block_existing_a;
\else
\echo [INFO] !!!!!
\endif

test/postgresql/full_test.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
\ir 47_row_filter_advanced.sql
5858
\ir 48_row_filter_multi_table.sql
5959
\ir 49_row_filter_prefill.sql
60+
\ir 50_block_lww_existing_data.sql
6061

6162
-- 'Test summary'
6263
\echo '\nTest summary:'

0 commit comments

Comments
 (0)